-RELEASE=5.2
+RELEASE=5.3
PACKAGE=ceph
-VER=12.2.10
+VER=12.2.11
DEBREL=pve1
SRCDIR=ceph
cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.2.10)
+set(VERSION 12.2.11)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
option(WITH_RADOSGW "Rados Gateway is enabled" ON)
option(WITH_RADOSGW_FCGI_FRONTEND "Rados Gateway's FCGI frontend is enabled" OFF)
option(WITH_RADOSGW_BEAST_FRONTEND "Rados Gateway's Beast frontend is enabled" ON)
+option(WITH_RADOSGW_BEAST_OPENSSL "Rados Gateway's Beast frontend uses OpenSSL" ON)
+
if(WITH_RADOSGW)
find_package(EXPAT REQUIRED)
if(WITH_RADOSGW_FCGI_FRONTEND)
message(WARNING "disabling WITH_RADOSGW_BEAST_FRONTEND, which depends on WITH_BOOST_CONTEXT")
set(WITH_RADOSGW_BEAST_FRONTEND OFF)
endif()
-endif(WITH_RADOSGW)
-
-if (WITH_RADOSGW)
- if (NOT DEFINED OPENSSL_FOUND)
- message(STATUS "Looking for openssl anyways, because radosgw selected")
- find_package(OpenSSL)
- endif()
# https://curl.haxx.se/docs/install.html mentions the
# configure flags for various ssl backends
execute_process(
if (CURL_CONFIG_ERRORS)
message(WARNING "unable to run curl-config; rgw cannot make ssl requests to external systems reliably")
endif()
- find_package(OpenSSL)
+
+ if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL)
+ find_package(OpenSSL REQUIRED)
+ else()
+ find_package(OpenSSL)
+ endif()
+
if (OPENSSL_FOUND)
if (NOT NO_CURL_SSL_LINK)
message(STATUS "libcurl is linked with openssl: explicitly setting locks")
+>= 12.2.11
+----------
+* `cephfs-journal-tool` makes rank argument (--rank) mandatory. Rank is
+ of format `filesystem:rank`, where `filesystem` is the cephfs filesystem
+ and `rank` is the MDS rank on which the operation is to be executed. To
+ operate on all ranks, use `all` or `*` as the rank specifier. Note that,
+ operations that dump journal information to file will now dump to per-rank
+ suffixed dump files. Importing journal information from dump files is
+  disallowed if operation is targeted for all ranks.
+
>= 12.1.2
---------
* When running 'df' on a CephFS filesystem comprising exactly one data pool,
a clean upgrade path is added to the pg log hard limit patches.
See also: http://tracker.ceph.com/issues/36686
+
+12.2.11
+-------
+
+* The default memory utilization for the mons has been increased
+ somewhat. Rocksdb now uses 512 MB of RAM by default, which should
+ be sufficient for small to medium-sized clusters; large clusters
+ should tune this up. Also, the ``mon_osd_cache_size`` has been
+  increased from 10 OSDMaps to 500, which will translate to an
+ additional 500 MB to 1 GB of RAM for large clusters, and much less
+ for small clusters.
+
+* New CephFS file system attributes session_timeout and session_autoclose
+ are configurable via `ceph fs set`. The MDS config options
+ mds_session_timeout, mds_session_autoclose, and mds_max_file_size are now
+ obsolete.
+
+* This release fixes the pg log hard limit bug (https://tracker.ceph.com/issues/23979).
+ A flag called pglog_hardlimit has been introduced. It is off by default.
+ This flag enables the feature that limits the length of the pg log. Users should run
+ 'ceph osd set pglog_hardlimit' after completely upgrading to 12.2.11. Once all the OSDs
+ have this flag set, the length of the pg log will be capped by a hard limit. We do not
+ recommend unsetting this flag beyond this point.
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
-pkgver=12.2.10
+pkgver=12.2.11
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
xmlstarlet
yasm
"
-source="ceph-12.2.10.tar.bz2"
+source="ceph-12.2.11.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
-builddir=$srcdir/ceph-12.2.10
+builddir=$srcdir/ceph-12.2.11
build() {
export CEPH_BUILD_VIRTUALENV=$builddir
# main package definition
#################################################################################
Name: ceph
-Version: 12.2.10
+Version: 12.2.11
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
Group: System/Filesystems
%endif
URL: http://ceph.com/
-Source0: http://ceph.com/download/ceph-12.2.10.tar.bz2
+Source0: http://ceph.com/download/ceph-12.2.11.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
# common
#################################################################################
%prep
-%autosetup -p1 -n ceph-12.2.10
+%autosetup -p1 -n ceph-12.2.11
%build
%if 0%{with cephfs_java}
export CPPFLAGS="$java_inc"
export CFLAGS="$RPM_OPT_FLAGS"
export CXXFLAGS="$RPM_OPT_FLAGS"
+export LDFLAGS="$RPM_LD_FLAGS"
env | sort
export CPPFLAGS="$java_inc"
export CFLAGS="$RPM_OPT_FLAGS"
export CXXFLAGS="$RPM_OPT_FLAGS"
+export LDFLAGS="$RPM_LD_FLAGS"
env | sort
+ceph (12.2.11-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Wed, 30 Jan 2019 15:51:24 +0000
+
ceph (12.2.10-1) stable; urgency=medium
* New upstream release
ceph-test (<< 9.0.3-1646),
librbd1 (<< 0.92-1238),
python-ceph (<< 0.92-1223),
+ radosgw (<< 12.0.3)
Breaks: ceph (<< 10),
ceph-fs-common (<< 11.0),
ceph-test (<< 9.0.3-1646),
librbd1 (<< 0.92-1238),
python-ceph (<< 0.92-1223),
+ radosgw (<< 12.0.3)
Suggests: ceph-base (= ${binary:Version}),
ceph-mds (= ${binary:Version}),
Description: common utilities to mount and interact with a ceph storage cluster
--- /dev/null
+"""
+Adapted from https://gist.github.com/mgedmin/6052926
+
+Sphinx extension to add ReadTheDocs-style "Edit on GitHub" links to the
+sidebar.
+
+Loosely based on https://github.com/astropy/astropy/pull/347
+"""
+
+import os
+import warnings
+
+
+__licence__ = 'BSD (3 clause)'
+
+
+def get_github_url(app, view, path):
+    """Build a GitHub URL for ``path`` under the project's doc/ tree.
+
+    ``view`` selects the GitHub view mode (e.g. 'blob' to show the file,
+    'edit' to open the editor); the project and branch are taken from the
+    ``edit_on_github_project`` / ``edit_on_github_branch`` config values.
+    """
+    return 'https://github.com/{project}/{view}/{branch}/doc/{path}'.format(
+        project=app.config.edit_on_github_project,
+        view=view,
+        branch=app.config.edit_on_github_branch,
+        path=path)
+
+
+def html_page_context(app, pagename, templatename, context, doctree):
+    """Handler for the Sphinx 'html-page-context' event.
+
+    Injects ``show_on_github_url`` and ``edit_on_github_url`` into the
+    template context so page.html can render "Edit on GitHub" links.
+    Skips non-document pages and warns once if the extension is not
+    configured with a project.
+    """
+    if templatename != 'page.html':
+        return
+
+    if not app.config.edit_on_github_project:
+        warnings.warn("edit_on_github_project not specified")
+        return
+
+    # Path of the source document relative to the doc root; used as the
+    # trailing component of the GitHub URL.
+    # NOTE(review): assumes doctree is not None here — only document pages
+    # should reach this point given the templatename check above; confirm.
+    path = os.path.relpath(doctree.get('source'), app.builder.srcdir)
+    show_url = get_github_url(app, 'blob', path)
+    edit_url = get_github_url(app, 'edit', path)
+
+    context['show_on_github_url'] = show_url
+    context['edit_on_github_url'] = edit_url
+
+def setup(app):
+    """Sphinx extension entry point: register config values and the
+    html-page-context handler that adds the GitHub links."""
+    app.add_config_value('edit_on_github_project', '', True)
+    app.add_config_value('edit_on_github_branch', 'master', True)
+    app.connect('html-page-context', html_page_context)
--- /dev/null
+// On page load: show the "development version" or "EOL" banner as
+// appropriate for the branch being browsed, and rewrite the
+// edit-on-github link (rendered for "master") to point at that branch.
+$(function() {
+  var releases_url = "http://docs.ceph.com/docs/master/releases.json";
+
+  // Decide which warning banner (if any) to display for `branch`, using
+  // the release metadata in `data`. Returns true when the edit-on-github
+  // link should be shown (master, or a known branch that is not EOL).
+  function show_edit(branch, data) {
+    if (branch) {
+      if (branch === "master") {
+        $("#dev-warning").show();
+        return true;
+      }
+      if (data && data.releases && branch in data.releases) {
+        // A release carrying an "actual_eol" entry is unsupported.
+        var eol = ("actual_eol" in data.releases[branch]);
+        if (eol) {
+          $("#eol-warning").show();
+        }
+        return !eol;
+      }
+    }
+    // Unknown/unrecognized branch: treat it as a development build.
+    $("#dev-warning").show();
+    return false;
+  }
+
+  // Extract the branch name from a docs.ceph.com/docs/<branch>/ URL,
+  // or return null if the current URL does not match.
+  // NOTE(review): the dots in the pattern are unescaped and match any
+  // character — harmless for these URLs, but worth confirming.
+  function get_branch() {
+    var url = window.location.href;
+    var res = url.match(/docs.ceph.com\/docs\/([a-z]+)\/?/i)
+    if (res) {
+      return res[1]
+    }
+    return null;
+  }
+
+  // Fetch the release list, then patch the edit link for this branch.
+  $.getJSON(releases_url, function(data) {
+    var branch = get_branch();
+    if (show_edit(branch, data)) {
+      // patch the edit-on-github URL for correct branch
+      var url = $("#edit-on-github").attr("href");
+      url = url.replace("master", branch);
+      $("#edit-on-github").attr("href", url);
+      $("#docubetter").show();
+    }
+  });
+});
--- /dev/null
+{% extends "!page.html" %}
+{% block body %}
+
+<div id="dev-warning" class="admonition note" style="display:none;">
+ <p class="first admonition-title">Notice</p>
+ <p class="last">This document is for a development version of Ceph.</p>
+</div>
+
+<div id="eol-warning" class="admonition warning" style="display:none;">
+ <p class="first admonition-title">Warning</p>
+ <p class="last">This document is for an unsupported version of Ceph.</p>
+</div>
+
+{%- if edit_on_github_url %}
+ <div id="docubetter" align="right" style="display:none; padding: 15px; font-weight: bold;">
+ <a id="edit-on-github" href="{{ edit_on_github_url }}" rel="nofollow">{{ _('Edit on GitHub')}}</a> | <a href="https://github.com/ceph/ceph/projects/4">Report a Documentation Bug</a>
+ </div>
+{%- endif %}
+
+ {{ super() }}
+{% endblock %}
Zapping a logical volume::
- ceph-volume lvm zap {vg name/lv name}
+ ceph-volume lvm zap {vg name/lv name}
Zapping a partition::
- ceph-volume lvm zap /dev/sdc1
+ ceph-volume lvm zap /dev/sdc1
-If you are zapping a raw device or partition and would like any vgs or lvs created
-from that device removed use the ``--destroy`` flag. A common use case is to simply
-deploy OSDs using a whole raw device. If you do so and then wish to reuse that device for
-another OSD you must use the ``--destroy`` flag when zapping so that the vgs and lvs that
-ceph-volume created on the raw device will be removed.
+Removing Devices
+----------------
+When zapping, and looking for full removal of the device (lv, vg, or partition)
+use the ``--destroy`` flag. A common use case is to simply deploy OSDs using
+a whole raw device. If you do so and then wish to reuse that device for another
+OSD you must use the ``--destroy`` flag when zapping so that the vgs and lvs
+that ceph-volume created on the raw device will be removed.
+
+.. note:: Multiple devices can be accepted at once, to zap them all
Zapping a raw device and destroying any vgs or lvs present::
- ceph-volume lvm zap /dev/sdc --destroy
+ ceph-volume lvm zap /dev/sdc --destroy
+
+
+This action can be performed on partitions, and logical volumes as well::
+
+ ceph-volume lvm zap /dev/sdc1 --destroy
+ ceph-volume lvm zap osd-vg/data-lv --destroy
+
+
+Finally, multiple devices can be detected if filtering by OSD ID and/or OSD
+FSID. Either identifier can be used or both can be used at the same time. This
+is useful in situations where multiple devices associated with a specific ID
+need to be purged. When using the FSID, the filtering is stricter, and might
+not match other (possibly invalid) devices associated to an ID.
+
+By ID only::
+
+ ceph-volume lvm zap --destroy --osd-id 1
+
+By FSID::
+
+ ceph-volume lvm zap --destroy --osd-fsid 2E8FBE58-0328-4E3B-BFB7-3CACE4E9A6CE
+
+By both::
+
+ ceph-volume lvm zap --destroy --osd-fsid 2E8FBE58-0328-4E3B-BFB7-3CACE4E9A6CE --osd-id 1
+
+
+.. warning:: If the systemd unit associated with the OSD ID to be zapped is
+ detected as running, the tool will refuse to zap until the daemon is stopped.
Splitting and merging
=====================
-An MDS will only consider doing splits and merges if the ``mds_bal_frag``
-setting is true in the MDS's configuration file, and the allow_dirfrags
-setting is true in the filesystem map (set on the mons). These settings
-are both true by default since the *Luminous* (12.2.x) release of Ceph.
+An MDS will only consider doing splits if the allow_dirfrags setting is true in
+the file system map (set on the mons). This setting is true by default since
+the *Luminous* release (12.2.X).
When an MDS identifies a directory fragment to be split, it does not
do the split immediately. Because splitting interrupts metadata IO,
There are three situations in which a client may be evicted automatically:
-On an active MDS daemon, if a client has not communicated with the MDS for
-over ``mds_session_autoclose`` seconds (300 seconds by default), then it
-will be evicted automatically.
+On an active MDS daemon, if a client has not communicated with the MDS for over
+``session_autoclose`` (a file system variable) seconds (300 seconds by
+default), then it will be evicted automatically.
On an active MDS daemon, if a client has not responded to cap revoke messages
for over ``mds_cap_revoke_eviction_timeout`` (configuration option) seconds.
To mount the Ceph file system as a FUSE, you may use the ``ceph-fuse`` command.
For example::
- sudo mkdir /home/usernname/cephfs
+ sudo mkdir /home/username/cephfs
sudo ceph-fuse -m 192.168.0.1:6789 /home/username/cephfs
If you have more than one filesystem, specify which one to mount using
sudo systemctl enable ceph-fuse@/mnt.service
.. _ceph-fuse: ../../man/8/ceph-fuse/
-.. _fstab: ./fstab
+.. _fstab: ../fstab/#fuse
.. _CEPHX Config Reference: ../../rados/configuration/auth-config-ref
the MDS will request clients release their capabilities. If the client
is unresponsive or buggy, it might fail to do so promptly or fail to do
so at all. This message appears if a client has taken longer than
-``mds_session_timeout`` (default 60s) to comply.
+``session_timeout`` (default 60s) to comply.
Message: "Client *name* failing to respond to cache pressure"
Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY
:Type: Boolean
:Default: ``true``
-
-``mds max file size``
-
-:Description: The maximum allowed file size to set when creating a
- new file system.
-
-:Type: 64-bit Integer Unsigned
-:Default: ``1ULL << 40``
-
``mds cache memory limit``
:Description: The memory limit the MDS should enforce for its cache.
:Default: ``24.0*60.0``
-``mds session timeout``
-
-:Description: The interval (in seconds) of client inactivity before Ceph
- times out capabilities and leases.
-
-:Type: Float
-:Default: ``60``
-
-
-``mds session autoclose``
-
-:Description: The interval (in seconds) before Ceph closes
- a laggy client's session.
-
-:Type: Float
-:Default: ``300``
-
-
``mds reconnect timeout``
:Description: The interval (in seconds) to wait for clients to reconnect
:Default: ``0``
-``mds bal frag``
-
-:Description: Determines whether the MDS will fragment directories.
-:Type: Boolean
-:Default: ``false``
-
-
``mds bal split size``
:Description: The maximum directory size before the MDS will split a directory
html_favicon = 'favicon.ico'
html_use_smartypants = True
html_show_sphinx = False
+html_static_path = ["_static"]
html_sidebars = {
'**': ['smarttoc.html', 'searchbox.html'],
}
+sys.path.insert(0, os.path.abspath('_ext'))
+
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.graphviz',
'sphinx.ext.todo',
'sphinxcontrib.ditaa',
'breathe',
+ 'edit_on_github',
]
ditaa = 'ditaa'
todo_include_todos = True
pybind = os.path.join(top_level, 'src/pybind')
if pybind not in sys.path:
sys.path.insert(0, pybind)
+
+# the docs are rendered with github links pointing to master. the javascript
+# snippet in _static/ceph.js rewrites the edit links when a page is loaded, to
+# point to the correct branch.
+edit_on_github_project = 'ceph/ceph'
+edit_on_github_branch = 'master'
+
+# handles edit-on-github and old version warning display
+def setup(app):
+ app.add_javascript('js/ceph.js')
ceph-volume lvm zap /dev/sdc1
+For full removal of the device use the ``--destroy`` flag (allowed for all
+device types)::
+
+ ceph-volume lvm zap --destroy /dev/sdc1
+
+Multiple devices can be removed by specifying the OSD ID and/or the OSD FSID::
+
+ ceph-volume lvm zap --destroy --osd-id 1
+ ceph-volume lvm zap --destroy --osd-id 1 --osd-fsid C9605912-8395-4D76-AFC0-7DFDAC315D59
+
+
Positional arguments:
* <DEVICE> Either in the form of ``vg/lv`` for logical volumes,
# recompile
crushtool -c map.txt -o crushmap
+Reclassify
+==========
+
+The *reclassify* function allows users to transition from older maps that
+maintain parallel hierarchies for OSDs of different types to a modern CRUSH
+map that makes use of the *device class* feature. For more information,
+see http://docs.ceph.com/docs/master/rados/operations/crush-map-edits/#migrating-from-a-legacy-ssd-rule-to-device-classes.
+
Example output from --test
==========================
ceph balancer show <plan-name>
+All plans can be shown with::
+
+ ceph balancer ls
+
Old plans can be discarded with::
ceph balancer rm <plan-name>
ceph-disk prepare --bluestore <device> --block.wal <wal-device> --block-db <db-device>
-Cache size
-==========
+Provisioning strategies
+-----------------------
+Although there are multiple ways to deploy a Bluestore OSD (unlike Filestore
+which had 1) here are two common use cases that should help clarify the
+initial deployment strategy:
+
+.. _bluestore-single-type-device-config:
+
+**block (data) only**
+^^^^^^^^^^^^^^^^^^^^^
+If all the devices are the same type, for example all are spinning drives, and
+there are no fast devices to combine these, it makes sense to just deploy with
+block only and not try to separate ``block.db`` or ``block.wal``. The
+:ref:`ceph-volume-lvm` call for a single ``/dev/sda`` device would look like::
+
+ ceph-volume lvm create --bluestore --data /dev/sda
+
+If logical volumes have already been created for each device (1 LV using 100%
+of the device), then the :ref:`ceph-volume-lvm` call for an lv named
+``ceph-vg/block-lv`` would look like::
+
+ ceph-volume lvm create --bluestore --data ceph-vg/block-lv
+
+.. _bluestore-mixed-device-config:
+
+**block and block.db**
+^^^^^^^^^^^^^^^^^^^^^^
+If there is a mix of fast and slow devices (spinning and solid state),
+it is recommended to place ``block.db`` on the faster device while ``block``
+(data) lives on the slower (spinning drive). Sizing for ``block.db`` should be
+as large as possible to avoid performance penalties otherwise. The
+``ceph-volume`` tool is currently not able to create these automatically, so
+the volume groups and logical volumes need to be created manually.
+
+For the below example, let's assume 4 spinning drives (sda, sdb, sdc, and sdd)
+and 1 solid state drive (sdx). First create the volume groups::
+
+ $ vgcreate ceph-block-0 /dev/sda
+ $ vgcreate ceph-block-1 /dev/sdb
+ $ vgcreate ceph-block-2 /dev/sdc
+ $ vgcreate ceph-block-3 /dev/sdd
+
+Now create the logical volumes for ``block``::
+
+ $ lvcreate -l 100%FREE -n block-0 ceph-block-0
+ $ lvcreate -l 100%FREE -n block-1 ceph-block-1
+ $ lvcreate -l 100%FREE -n block-2 ceph-block-2
+ $ lvcreate -l 100%FREE -n block-3 ceph-block-3
+
+We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB
+SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB::
+
+ $ vgcreate ceph-db-0 /dev/sdx
+ $ lvcreate -L 50GB -n db-0 ceph-db-0
+ $ lvcreate -L 50GB -n db-1 ceph-db-0
+ $ lvcreate -L 50GB -n db-2 ceph-db-0
+ $ lvcreate -L 50GB -n db-3 ceph-db-0
+
+Finally, create the 4 OSDs with ``ceph-volume``::
+
+ $ ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0
+ $ ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1
+ $ ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
+ $ ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
+
+These operations should end up creating 4 OSDs, with ``block`` on the slower
+spinning drives and a 50GB logical volume for each coming from the solid state
+drive.
+
+Sizing
+======
+When using a :ref:`mixed spinning and solid drive setup
+<bluestore-mixed-device-config>` it is important to make a large-enough
+``block.db`` logical volume for Bluestore. Generally, the ``block.db``
+logical volume should be *as large as possible*.
+
+It is recommended that the ``block.db`` size isn't smaller than 4% of
+``block``. For example, if the ``block`` size is 1TB, then ``block.db``
+shouldn't be less than 40GB.
+
+If *not* using a mix of fast and slow devices, it isn't required to create
+separate logical volumes for ``block.db`` (or ``block.wal``). Bluestore will
+automatically manage these within the space of ``block``.
+
+
+Automatic Cache Sizing
+======================
+
+Bluestore can be configured to automatically resize its caches when tcmalloc
+is configured as the memory allocator and the ``bluestore_cache_autotune``
+setting is enabled. This option is currently enabled by default. Bluestore
+will attempt to keep OSD heap memory usage under a designated target size via
+the ``osd_memory_target`` configuration option. This is a best effort
+algorithm and caches will not shrink smaller than the amount specified by
+``osd_memory_cache_min``. Cache ratios will be chosen based on a hierarchy
+of priorities. If priority information is not available, the
+``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio`` options are
+used as fallbacks.
+
+``bluestore_cache_autotune``
+
+:Description: Automatically tune the ratios assigned to different bluestore caches while respecting minimum values.
+:Type: Boolean
+:Required: Yes
+:Default: ``True``
+
+``osd_memory_target``
+
+:Description: When tcmalloc is available and cache autotuning is enabled, try to keep this many bytes mapped in memory. Note: This may not exactly match the RSS memory usage of the process. While the total amount of heap memory mapped by the process should generally stay close to this target, there is no guarantee that the kernel will actually reclaim memory that has been unmapped. During initial development, it was found that some kernels result in the OSD's RSS Memory exceeding the mapped memory by up to 20%. It is hypothesised however, that the kernel generally may be more aggressive about reclaiming unmapped memory when there is a high amount of memory pressure. Your mileage may vary.
+:Type: Unsigned Integer
+:Required: Yes
+:Default: ``4294967296``
+
+``bluestore_cache_autotune_chunk_size``
+
+:Description: The chunk size in bytes to allocate to caches when cache autotune is enabled. When the autotuner assigns memory to different caches, it will allocate memory in chunks. This is done to avoid evictions when there are minor fluctuations in the heap size or autotuned cache ratios.
+:Type: Unsigned Integer
+:Required: No
+:Default: ``33554432``
+
+``bluestore_cache_autotune_interval``
+
+:Description: The number of seconds to wait between rebalances when cache autotune is enabled. This setting changes how quickly the ratios of the difference caches are recomputed. Note: Setting the interval too small can result in high CPU usage and lower performance.
+:Type: Float
+:Required: No
+:Default: ``5``
+
+``osd_memory_base``
+
+:Description: When tcmalloc and cache autotuning is enabled, estimate the minimum amount of memory in bytes the OSD will need. This is used to help the autotuner estimate the expected aggregate memory consumption of the caches.
+:Type: Unsigned Integer
+:Required: No
+:Default: ``805306368``
+
+``osd_memory_expected_fragmentation``
+
+:Description: When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation. This is used to help the autotuner estimate the expected aggregate memory consumption of the caches.
+:Type: Float
+:Required: No
+:Default: ``0.15``
+
+``osd_memory_cache_min``
+
+:Description: When tcmalloc and cache autotuning is enabled, set the minimum amount of memory used for caches. Note: Setting this value too low can result in significant cache thrashing.
+:Type: Unsigned Integer
+:Required: No
+:Default: ``134217728``
+
+``osd_memory_cache_resize_interval``
+
+:Description: When tcmalloc and cache autotuning is enabled, wait this many seconds between resizing caches. This setting changes the total amount of memory available for bluestore to use for caching. Note: Setting the interval too small can result in memory allocator thrashing and lower performance.
+:Type: Float
+:Required: No
+:Default: ``1``
+
+
+Manual Cache Sizing
+===================
The amount of memory consumed by each OSD for BlueStore's cache is
determined by the ``bluestore_cache_size`` configuration option. If
+.. _adding-and-removing-monitors:
+
==========================
Adding/Removing Monitors
==========================
from the cluster at runtime. To bootstrap a monitor, see `Manual Deployment`_
or `Monitor Bootstrap`_.
+.. _adding-monitors:
+
Adding Monitors
===============
ceph-mon -i {mon-id} --public-addr {ip:port}
+.. _removing-monitors:
Removing Monitors
=================
.. important:: A given CRUSH rule may be assigned to multiple pools, but it
is not possible for a single pool to have multiple CRUSH rules.
-
-Placing Different Pools on Different OSDS:
-==========================================
-
-Suppose you want to have most pools default to OSDs backed by large hard drives,
-but have some pools mapped to OSDs backed by fast solid-state drives (SSDs).
-It's possible to have multiple independent CRUSH hierarchies within the same
-CRUSH map. Define two hierarchies with two different root nodes--one for hard
-disks (e.g., "root platter") and one for SSDs (e.g., "root ssd") as shown
-below::
-
- device 0 osd.0
- device 1 osd.1
- device 2 osd.2
- device 3 osd.3
- device 4 osd.4
- device 5 osd.5
- device 6 osd.6
- device 7 osd.7
-
- host ceph-osd-ssd-server-1 {
- id -1
- alg straw
- hash 0
- item osd.0 weight 1.00
- item osd.1 weight 1.00
- }
-
- host ceph-osd-ssd-server-2 {
- id -2
- alg straw
- hash 0
- item osd.2 weight 1.00
- item osd.3 weight 1.00
- }
-
- host ceph-osd-platter-server-1 {
- id -3
- alg straw
- hash 0
- item osd.4 weight 1.00
- item osd.5 weight 1.00
- }
-
- host ceph-osd-platter-server-2 {
- id -4
- alg straw
- hash 0
- item osd.6 weight 1.00
- item osd.7 weight 1.00
- }
-
- root platter {
- id -5
- alg straw
- hash 0
- item ceph-osd-platter-server-1 weight 2.00
- item ceph-osd-platter-server-2 weight 2.00
- }
-
- root ssd {
- id -6
- alg straw
- hash 0
- item ceph-osd-ssd-server-1 weight 2.00
- item ceph-osd-ssd-server-2 weight 2.00
- }
-
- rule data {
- ruleset 0
- type replicated
- min_size 2
- max_size 2
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule metadata {
- ruleset 1
- type replicated
- min_size 0
- max_size 10
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule rbd {
- ruleset 2
- type replicated
- min_size 0
- max_size 10
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule platter {
- ruleset 3
- type replicated
- min_size 0
- max_size 10
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule ssd {
- ruleset 4
- type replicated
- min_size 0
- max_size 4
- step take ssd
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule ssd-primary {
- ruleset 5
- type replicated
- min_size 5
- max_size 10
- step take ssd
- step chooseleaf firstn 1 type host
- step emit
- step take platter
- step chooseleaf firstn -1 type host
- step emit
- }
-
-You can then set a pool to use the SSD rule by::
-
- ceph osd pool set <poolname> crush_ruleset 4
-
-Similarly, using the ``ssd-primary`` rule will cause each placement group in the
-pool to be placed with an SSD as the primary and platters as the replicas.
-
+.. _crush-reclassify:
+
+Migrating from a legacy SSD rule to device classes
+--------------------------------------------------
+
+It used to be necessary to manually edit your CRUSH map and maintain a
+parallel hierarchy for each specialized device type (e.g., SSD) in order to
+write rules that apply to those devices. Since the Luminous release,
+the *device class* feature has enabled this transparently.
+
+However, migrating from an existing, manually customized per-device map to
+the new device class rules in the trivial way will cause all data in the
+system to be reshuffled.
+
+The ``crushtool`` has a few commands that can transform a legacy rule
+and hierarchy so that you can start using the new class-based rules.
+There are three types of transformations possible:
+
+#. ``--reclassify-root <root-name> <device-class>``
+
+ This will take everything in the hierarchy beneath root-name and
+ adjust any rules that reference that root via a ``take
+ <root-name>`` to instead ``take <root-name> class <device-class>``.
+ It renumbers the buckets in such a way that the old IDs are instead
+ used for the specified class's "shadow tree" so that no data
+ movement takes place.
+
+ For example, imagine you have an existing rule like::
+
+ rule replicated_ruleset {
+ id 0
+ type replicated
+ min_size 1
+ max_size 10
+ step take default
+ step chooseleaf firstn 0 type rack
+ step emit
+ }
+
+ If you reclassify the root `default` as class `hdd`, the rule will
+ become::
+
+ rule replicated_ruleset {
+ id 0
+ type replicated
+ min_size 1
+ max_size 10
+ step take default class hdd
+ step chooseleaf firstn 0 type rack
+ step emit
+ }
+
+#. ``--set-subtree-class <bucket-name> <device-class>``
+
+ This will mark every device in the subtree rooted at *bucket-name*
+ with the specified device class.
+
+ This is normally used in conjunction with the ``--reclassify-root``
+ option to ensure that all devices in that root are labeled with the
+ correct class. In some situations, however, some of those devices
+ (correctly) have a different class and we do not want to relabel
+ them. In such cases, one can exclude the ``--set-subtree-class``
+ option. This means that the remapping process will not be perfect,
+ since the previous rule distributed across devices of multiple
+ classes but the adjusted rules will only map to devices of the
+ specified *device-class*, but that often is an accepted level of
+   data movement when the number of outlier devices is small.
+
+#. ``--reclassify-bucket <match-pattern> <device-class> <default-parent>``
+
+   This will allow you to merge a parallel type-specific hierarchy with the normal hierarchy. For example, many users have maps like::
+
+ host node1 {
+ id -2 # do not change unnecessarily
+ # weight 109.152
+ alg straw
+ hash 0 # rjenkins1
+ item osd.0 weight 9.096
+ item osd.1 weight 9.096
+ item osd.2 weight 9.096
+ item osd.3 weight 9.096
+ item osd.4 weight 9.096
+ item osd.5 weight 9.096
+ ...
+ }
+
+ host node1-ssd {
+ id -10 # do not change unnecessarily
+ # weight 2.000
+ alg straw
+ hash 0 # rjenkins1
+ item osd.80 weight 2.000
+ ...
+ }
+
+ root default {
+ id -1 # do not change unnecessarily
+ alg straw
+ hash 0 # rjenkins1
+ item node1 weight 110.967
+ ...
+ }
+
+ root ssd {
+ id -18 # do not change unnecessarily
+ # weight 16.000
+ alg straw
+ hash 0 # rjenkins1
+ item node1-ssd weight 2.000
+ ...
+ }
+
+ This function will reclassify each bucket that matches a
+ pattern. The pattern can look like ``%suffix`` or ``prefix%``.
+ For example, in the above example, we would use the pattern
+ ``%-ssd``. For each matched bucket, the remaining portion of the
+ name (that matches the ``%`` wildcard) specifies the *base bucket*.
+ All devices in the matched bucket are labeled with the specified
+ device class and then moved to the base bucket. If the base bucket
+ does not exist (e.g., ``node12-ssd`` exists but ``node12`` does
+ not), then it is created and linked underneath the specified
+ *default parent* bucket. In each case, we are careful to preserve
+ the old bucket IDs for the new shadow buckets to prevent data
+ movement. Any rules with ``take`` steps referencing the old
+ buckets are adjusted.
+
+#. ``--reclassify-bucket <bucket-name> <device-class> <base-bucket>``
+
+ The same command can also be used without a wildcard to map a
+ single bucket. For example, in the previous example, we want the
+ ``ssd`` bucket to be mapped to the ``default`` bucket.
+
+The final command to convert the map comprised of the above fragments would be something like::
+
+ $ ceph osd getcrushmap -o original
+ $ crushtool -i original --reclassify \
+ --set-subtree-class default hdd \
+ --reclassify-root default hdd \
+ --reclassify-bucket %-ssd ssd default \
+ --reclassify-bucket ssd ssd default \
+ -o adjusted
+
+In order to ensure that the conversion is correct, there is a ``--compare`` command that will test a large sample of inputs to the CRUSH map and ensure that the same result comes back out. These inputs are controlled by the same options that apply to the ``--test`` command. For the above example::
+
+ $ crushtool -i original --compare adjusted
+ rule 0 had 0/10240 mismatched mappings (0)
+ rule 1 had 0/10240 mismatched mappings (0)
+ maps appear equivalent
+
+If there were differences, you'd see what ratio of inputs are remapped
+in the parentheses.
+
+If you are satisfied with the adjusted map, you can apply it to the cluster with something like::
+
+ ceph osd setcrushmap -i adjusted
Tuning CRUSH, the hard way
--------------------------
ceph osd crush tree --show-shadow
+For older clusters created before Luminous that relied on manually
+crafted CRUSH maps to maintain per-device-type hierarchies, there is a
+*reclassify* tool available to help transition to device classes
+without triggering data movement (see :ref:`crush-reclassify`).
+
Weights sets
------------
ceph auth caps client.paul mon 'allow rw' osd 'allow rwx pool=liverpool'
ceph auth caps client.brian-manager mon 'allow *' osd 'allow *'
-To remove a capability, you may reset the capability. If you want the user
-to have no access to a particular daemon that was previously set, specify
-an empty string. For example::
-
- ceph auth caps client.ringo mon ' ' osd ' '
-
See `Authorization (Capabilities)`_ for additional details on capabilities.
Recovery using healthy monitor(s)
---------------------------------
-If there is any survivers, we can always `replace`_ the corrupted one with a
-new one. And after booting up, the new joiner will sync up with a healthy
+If there are any survivors, we can always :ref:`replace <adding-and-removing-monitors>` the corrupted one with a
+new one. After booting up, the new joiner will sync up with a healthy
peer, and once it is fully sync'ed, it will be able to serve the clients.
Recovery using OSDs
a new issue on the `tracker`_.
.. _cluster map: ../../architecture#cluster-map
-.. _replace: ../operation/add-or-rm-mons
.. _tracker: http://tracker.ceph.com/projects/ceph/issues/new
the maximum number of objects. A negative value disables this setting.
- **Maximum Size:** The ``max-size`` option allows you to specify a quota
- for the maximum number of bytes. A negative value disables this setting.
-
+ for the maximum number of bytes. The ``max-size-kb`` option allows you
+ to specify it in KiB. A negative value disables this setting.
+
- **Quota Type:** The ``quota-type`` option sets the scope for the quota.
The options are ``bucket`` and ``user``.
:Default: ``false``
+``rgw trust forwarded https``
+
+:Description: When a proxy in front of radosgw is used for SSL termination, radosgw
+ does not know whether incoming http connections are secure. Enable
+ this option to trust the ``Forwarded`` and ``X-Forwarded-Proto`` headers
+ sent by the proxy when determining whether the connection is secure.
+ This is required for some features, such as server side encryption.
+:Type: Boolean
+:Default: ``false``
+
+
Logging Settings
================
means that the data is sent over HTTP in its unencrypted form, and the Ceph
Object Gateway stores that data in the Ceph Storage Cluster in encrypted form.
+.. note:: Requests for server-side encryption must be sent over a secure HTTPS
+ connection to avoid sending secrets in plaintext. If a proxy is used
+ for SSL termination, ``rgw trust forwarded https`` must be enabled
+ before forwarded requests will be trusted as secure.
+
Customer-Provided Keys
======================
Options
-------
-``port``
+``port`` and ``ssl_port``
:Description: Sets the listening port number. Can be specified multiple
times as in ``port=80 port=8000``.
:Default: ``80``
-``endpoint``
+``endpoint`` and ``ssl_endpoint``
:Description: Sets the listening address in the form ``address[:port]``,
where the address is an IPv4 address string in dotted decimal
- form, or an IPv6 address in hexadecimal notation. The
- optional port defaults to 80. Can be specified multiple times
- as in ``endpoint=::1 endpoint=192.168.0.100:8000``.
+ form, or an IPv6 address in hexadecimal notation surrounded
+ by square brackets. The optional port defaults to 80 for
+ ``endpoint`` and 443 for ``ssl_endpoint``. Can be specified
+ multiple times as in ``endpoint=[::1] endpoint=192.168.0.100:8000``.
:Type: Integer
:Default: None
+``ssl_certificate``
+
+:Description: Path to the SSL certificate file used for SSL-enabled endpoints.
+
+:Type: String
+:Default: None
+
+
+``ssl_private_key``
+
+:Description: Optional path to the private key file used for SSL-enabled
+ endpoints. If one is not given, the ``ssl_certificate`` file
+ is used as the private key.
+
+:Type: String
+:Default: None
+
+
Civetweb
========
RAM
===
-Metadata servers and monitors must be capable of serving their data quickly, so
-they should have plenty of RAM (e.g., 1GB of RAM per daemon instance). OSDs do
-not require as much RAM for regular operations (e.g., 500MB of RAM per daemon
-instance); however, during recovery they need significantly more RAM (e.g., ~1GB
-per 1TB of storage per daemon). Generally, more RAM is better.
+Generally, more RAM is better.
+
+Monitors and managers (ceph-mon and ceph-mgr)
+---------------------------------------------
+
+Monitor and manager daemon memory usage generally scales with the size of the
+cluster. For small clusters, 1-2 GB is generally sufficient. For
+large clusters, you should provide more (5-10 GB). You may also want
+to consider tuning settings like ``mon_osd_cache_size`` or
+``rocksdb_cache_size``.
+
+Metadata servers (ceph-mds)
+---------------------------
+
+The metadata daemon memory utilization depends on how much memory its cache is
+configured to consume. We recommend 1 GB as a minimum for most systems. See
+``mds_cache_memory``.
+
+OSDs (ceph-osd)
+---------------
+
+By default, OSDs that use the BlueStore backend require 3-5 GB of RAM. You can
+adjust the amount of memory the OSD consumes with the ``osd_memory_target`` configuration option when BlueStore is in use. When using the legacy FileStore backend, the operating system page cache is used for caching data, so no tuning is normally needed, and the OSD memory consumption is generally related to the number of PGs per daemon in the system.
Data Storage
ceph-deploy mgr create node1 *Required only for luminous+ builds, i.e >= 12.x builds*
#. Add three OSDs. For the purposes of these instructions, we assume you have an
- unused disk in each node called ``/dev/vdb``. *Be sure that the device is not currently in use and does not contain any important data.*
+ unused disk in each node called ``/dev/vdb``. *Be sure that the device is not currently in use and does not contain any important data.* ::
ceph-deploy osd create {ceph-node}:{device}
CXX_FLAGS?=-std=c++11 -Wall -Wextra -Werror -g
CXX_LIBS?=-lrados -lradosstriper
CXX_INC?=$(LOCAL_LIBRADOS_INC)
-CXX_CC=$(CXX) $(CXX_FLAGS) $(CXX_INC) $(LOCAL_LIBRADOS) $(CXX_LIBS)
+CXX_CC=$(CXX) $(CXX_FLAGS) $(CXX_INC) $(LOCAL_LIBRADOS)
CC?=gcc
CC_FLAGS=-Wall -Wextra -Werror -g
CC_INC=$(LOCAL_LIBRADOS_INC)
CC_LIBS?=-lrados
-CC_CC=$(CC) $(CC_FLAGS) $(CC_INC) $(LOCAL_LIBRADOS) $(CC_LIBS)
+CC_CC=$(CC) $(CC_FLAGS) $(CC_INC) $(LOCAL_LIBRADOS)
# Relative path to the Ceph source:
CEPH_SRC_HOME?=../../src
all-system: all
hello_world_cpp: hello_world.cc
- $(CXX_CC) -o hello_world_cpp hello_world.cc
+ $(CXX_CC) -o hello_world_cpp hello_world.cc $(CXX_LIBS)
hello_radosstriper_cpp: hello_radosstriper.cc
- $(CXX_CC) -o hello_radosstriper_cpp hello_radosstriper.cc
+ $(CXX_CC) -o hello_radosstriper_cpp hello_radosstriper.cc $(CXX_LIBS)
hello_world_c: hello_world_c.c
- $(CC_CC) -o hello_world_c hello_world_c.c
+ $(CC_CC) -o hello_world_c hello_world_c.c $(CC_LIBS)
clean:
rm -f hello_world_cpp hello_radosstriper_cpp hello_world_c
your system librados and headers, use "make all-system".
And executed using
-./librados_hello_world -c ../../src/ceph.conf
+./hello_world_cpp -c ../../src/ceph.conf
(or whatever path to a ceph.conf is appropriate to you, or
by explicitly specifying monitors, user id, and keys).
net/socat \
textproc/expat2 \
textproc/gsed \
+ lang/gawk \
textproc/libxml2 \
textproc/xmlstarlet \
textproc/jq \
--- /dev/null
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+openstack:
+- volumes: # attached to each instance
+ count: 4
+ size: 20 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
openstack:
- volumes: # attached to each instance
count: 4
- size: 10 # GB
+ size: 20 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.1]
+openstack:
+- volumes: # attached to each instance
+ count: 4
+ size: 20 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
openstack:
- volumes: # attached to each instance
count: 4
- size: 10 # GB
+ size: 30 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [client.0]
+- [client.1]
+- [client.2]
+openstack:
+- volumes: # attached to each instance
+ count: 4
+ size: 30 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0, client.1]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.2, client.3]
+openstack:
+- volumes: # attached to each instance
+ count: 4
+ size: 30 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
openstack:
- volumes: # attached to each instance
count: 4
- size: 10 # GB
+ size: 30 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
openstack:
- volumes: # attached to each instance
count: 4
- size: 10 # GB
+ size: 30 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
openstack:
- volumes: # attached to each instance
count: 4
- size: 10 # GB
+ size: 30 # GB
+- machine:
+ disk: 200 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
openstack:
- volumes: # attached to each instance
count: 4
- size: 10 # GB
+ size: 30 # GB
+- machine:
+ disk: 200 # GB
log-rotate:
ceph-mds: 10G
ceph-osd: 10G
exit 1
fi
-if [ ! -d /tmp/ceph-disk-virtualenv -o ! -d /tmp/ceph-detect-init-virtualenv ]; then
+TEMP_DIR=${TMPDIR:-/tmp}
+if [ ! -d $TEMP_DIR/ceph-disk-virtualenv -o ! -d $TEMP_DIR/ceph-detect-init-virtualenv ]; then
echo '/tmp/*-virtualenv directories not built. Please run "make check" first.'
exit 1
fi
#
TIMEOUT=300
PG_NUM=4
-: ${CEPH_BUILD_VIRTUALENV:=/tmp}
+TMPDIR=${TMPDIR:-/tmp}
+CEPH_BUILD_VIRTUALENV=${TMPDIR}
+TESTDIR=${TESTDIR:-${TMPDIR}}
if type xmlstarlet > /dev/null 2>&1; then
XMLSTARLET=xmlstarlet
if [ `uname` = FreeBSD ]; then
SED=gsed
+ AWK=gawk
DIFFCOLOPTS=""
KERNCORE="kern.corefile"
else
SED=sed
+ AWK=awk
termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/')
if [ -n "$termwidth" -a "$termwidth" != "0" ]; then
termwidth="-W ${termwidth}"
function __teardown_btrfs() {
local btrfs_base_dir=$1
- local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}')
- local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir")
+ local btrfs_root=$(df -P . | tail -1 | $AWK '{print $NF}')
+ local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list -t . | $AWK '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir")
for subvolume in $btrfs_dirs; do
sudo btrfs subvolume delete $btrfs_root/$subvolume
done
#######################################################################
-calc() { awk "BEGIN{print $*}"; }
+calc() { $AWK "BEGIN{print $*}"; }
##
# Return a list of numbers that are increasingly larger and whose
local pid_variable=$1
shift
# Execute the command and prepend the output with its pid
- # We enforce to return the exit status of the command and not the awk one.
+ # We enforce to return the exit status of the command and not the sed one.
("$@" |& sed 's/^/'$$': /'; return "${PIPESTATUS[0]}") >&2 &
eval "$pid_variable+=\" $!\""
}
teardown $dir || return 1
}
+function TEST_request_scrub_priority() {
+ local dir=$1
+ local poolname=psr_pool
+ local objname=POBJ
+ local OBJECTS=64
+ local PGS=8
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=1 || return 1
+ run_mgr $dir x || return 1
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0"
+ run_osd $dir 0 $ceph_osd_args || return 1
+
+ create_pool $poolname $PGS $PGS || return 1
+ wait_for_clean || return 1
+
+ local osd=0
+ add_something $dir $poolname $objname noscrub || return 1
+ local primary=$(get_primary $poolname $objname)
+ local pg=$(get_pg $poolname $objname)
+ poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')
+
+ local otherpgs
+ for i in $(seq 0 $(expr $PGS - 1))
+ do
+ opg="${poolid}.${i}"
+ if [ "$opg" = "$pg" ]; then
+ continue
+ fi
+ otherpgs="${otherpgs}${opg} "
+ local other_last_scrub=$(get_last_scrub_stamp $pg)
+ # Fake a schedule scrub
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \
+ trigger_scrub $opg || return 1
+ done
+
+ sleep 15
+ flush_pg_stats
+
+ # Request a regular scrub and it will be done
+ local last_scrub=$(get_last_scrub_stamp $pg)
+ ceph pg scrub $pg
+
+ ceph osd unset noscrub || return 1
+ ceph osd unset nodeep-scrub || return 1
+
+ wait_for_scrub $pg "$last_scrub"
+
+ for opg in $otherpgs $pg
+ do
+ wait_for_scrub $opg "$other_last_scrub"
+ done
+
+ # Verify that the requested scrub ran first
+ grep "log_channel.*scrub ok" $dir/osd.${primary}.log | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1
+
+ return 0
+}
+
+
main osd-scrub-repair "$@"
# Local Variables:
overrides:
ceph_ansible:
+ branch: stable-3.2
vars:
- branch: stable-3.2
ceph_conf_overrides:
global:
osd default pool size: 2
fuse default permissions: false
tasks:
- workunit:
+ timeout: 6h
clients:
all:
- suites/pjd.sh
--- /dev/null
+.qa/cephfs/clusters/1-mds-4-client-coloc.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mds.a, mds.b, client.1, client.2, client.3]
-- [client.0, osd.4, osd.5, osd.6, osd.7]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
- Corrupt dentry
- Scrub error on inode
- Metadata damage detected
+ - MDS_READ_ONLY
+ - force file system read-only
tasks:
- cephfs_test_runner:
fuse default permissions: false
tasks:
- workunit:
+ timeout: 6h
clients:
all:
- suites/pjd.sh
- volumes: # attached to each instance
count: 2
size: 10 # GB
+- machine:
+ disk: 100 # GB
log-rotate:
ceph-mds: 10G
ceph-osd: 10G
--- /dev/null
+.qa/cephfs/clusters/1-mds-2-client.yaml
\ No newline at end of file
--- /dev/null
+.qa/cephfs/clusters/1-mds-3-client.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
-- [client.2]
-- [client.1]
-- [client.0]
-
-openstack:
-- volumes: # attached to each instance
- count: 1
- size: 10 # GB
-
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
-
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
-- [client.1]
-- [client.0]
-
-openstack:
-- volumes: # attached to each instance
- count: 3
- size: 10 # GB
-
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
-
+++ /dev/null
-roles:
-- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mon.b, mds.a, mds.b, client.1]
-- [mds.c, mds.d, mon.c, client.0, osd.4, osd.5, osd.6, osd.7]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
client acl type: posix_acl
tasks:
- workunit:
+ timeout: 6h
clients:
all:
- suites/pjd.sh
--- /dev/null
+.qa/cephfs/clusters/1-mds-1-client-coloc.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2, mds.b-s-a]
-- [mon.b, mgr.x, mds.a, osd.3, osd.4, osd.5, client.0]
-openstack:
-- volumes: # attached to each instance
- count: 3
- size: 10 # GB
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
conf:
global:
ms inject socket failures: 2500
- mds inject delay type: osd mds
+ ms inject delay type: osd mds
ms inject delay probability: .005
ms inject delay max: 1
fuse default permissions: false
tasks:
- workunit:
+ timeout: 6h
clients:
all:
- suites/pjd.sh
tasks:
- kclient:
- workunit:
+ timeout: 6h
clients:
all:
- suites/pjd.sh
- Corrupt dentry
- Scrub error on inode
- Metadata damage detected
+ - MDS_READ_ONLY
+ - force file system read-only
tasks:
- cephfs_test_runner:
fuse default permissions: false
tasks:
- workunit:
+ timeout: 6h
clients:
all:
- suites/pjd.sh
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - overall HEALTH_
+ - \(AUTH_BAD_CAPS\)
+- workunit:
+ clients:
+ all:
+ - mon/test_config_key_caps.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 1
+ osd_max_pg_log_entries: 2
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 1
+ osd_max_pg_log_entries: 2
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 1
+ osd_max_pg_log_entries: 2
- ceph.restart:
daemons: [mon.a,mon.b,mon.c,osd.0, osd.1, osd.2]
- print: "**** done ceph.restart 1st half"
+- exec:
+ osd.0:
+ - ceph osd set pglog_hardlimit && exit 1 || true
+ - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit, should not succeed"
daemons: [osd.3, osd.4, osd.5]
wait-for-healthy: false
wait-for-osds-up: true
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+- print: "**** done `ceph osd require-osd-release luminous`"
+- exec:
+ osd.0:
+ - ceph osd dump --format=json-pretty | grep "flags"
+ - ceph osd set pglog_hardlimit
+ - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit again, should succeed"
+++ /dev/null
-../.qa/
\ No newline at end of file
--- /dev/null
+../../.qa
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes, using one of them as a client,
+ with a separate client-only node.
+ Use xfs beneath the osds.
+ install ceph/luminous v12.2.2 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/luminous v12.2.5 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/luminous v12.2.7 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/luminous v12.2.8 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/luminous v12.2.9 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/luminous v12.2.10 point version
+ run workload and upgrade-sequence in parallel
+
+ install ceph/luminous latest version
+ run workload and upgrade-sequence in parallel
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ - scrub
+ - osd_map_max_advance
+ - wrongly marked
+ - FS_DEGRADED
+ - POOL_APP_NOT_ENABLED
+ - CACHE_POOL_NO_HIT_SET
+ - POOL_FULL
+ - SMALLER_PG
+ - pool\(s\) full
+ - OSD_DOWN
+ - missing hit_sets
+ - CACHE_POOL_NEAR_FULL
+ - PG_AVAILABILITY
+ - PG_DEGRADED
+ - application not enabled
+ - overall HEALTH_
+ fs: xfs
+ conf:
+ mon:
+ mon debug unsafe allow tier with nonempty snaps: true
+ mon warn on pool no app: false
+ osd:
+ osd map max advance: 1000
+ osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ client:
+ rgw_crypt_require_ssl: false
+ rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
+roles:
+- - mon.a
+ - mds.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - mgr.x
+- - mon.b
+ - mon.c
+ - osd.3
+ - osd.4
+ - osd.5
+ - client.0
+- - client.1
+openstack:
+- volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
+tasks:
+- print: "**** v12.2.2 about to install"
+- install:
+ tag: v12.2.2
+  # line below can be removed; it's from a jewel test
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
+- print: "**** done v12.2.2 install"
+- ceph:
+ fs: xfs
+ add_osds_to_crush: true
+- print: "**** done ceph xfs"
+- sequential:
+ - workload
+- print: "**** done workload v12.2.2"
+
+#### upgrade to v12.2.5
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ tag: v12.2.5
+ mon.b:
+ tag: v12.2.5
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.5"
+
+#### upgrade to v12.2.7
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ tag: v12.2.7
+ mon.b:
+ tag: v12.2.7
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.7"
+
+#### upgrade to v12.2.8
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ tag: v12.2.8
+ mon.b:
+ tag: v12.2.8
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.8"
+
+
+#### upgrade to v12.2.9
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ tag: v12.2.9
+ mon.b:
+ tag: v12.2.9
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.9"
+
+#### upgrade to v12.2.10
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ tag: v12.2.10
+ mon.b:
+ tag: v12.2.10
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.10"
+
+
+#### upgrade to latest luminous
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ mon.b:
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous branch"
+
+#######################
+workload:
+ sequential:
+ - workunit:
+ clients:
+ client.0:
+ - suites/blogbench.sh
+workload_luminous:
+ full_sequential:
+ - workunit:
+ tag: v12.2.2
+ clients:
+ client.1:
+ - rados/test.sh
+ - cls
+ env:
+ CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
+ - print: "**** done rados/test.sh & cls workload_luminous"
+ - sequential:
+ - rgw: [client.0]
+ - print: "**** done rgw workload_luminous"
+ - s3tests:
+ client.0:
+ force-branch: ceph-luminous
+ rgw_server: client.0
+ scan_for_encryption_keys: false
+ - print: "**** done s3tests workload_luminous"
+upgrade-sequence_luminous:
+ sequential:
+ - print: "**** done branch: luminous install.upgrade"
+ - ceph.restart: [mds.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [osd.0]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.1]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.2]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.3]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.4]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.5]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.b]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.c]
+ - sleep:
+ duration: 60
+ - print: "**** done ceph.restart all luminous branch mds/osd/mon"
--- /dev/null
+../../../../distros/supported
\ No newline at end of file
--- /dev/null
+../.qa/
\ No newline at end of file
--- /dev/null
+openstack:
+ - machine:
+ disk: 100 # GB
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes,
+ with a separate client-only node.
+ Use xfs beneath the osds.
+overrides:
+ ceph:
+ fs: xfs
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+- - osd.3
+ - osd.4
+ - osd.5
+- - client.0
--- /dev/null
+meta:
+- desc: install ceph/luminous latest
+tasks:
+- install:
+ tag: v12.2.10
+ exclude_packages: ['librados3']
+ extra_packages: ['librados2']
+- print: "**** done install luminous v12.2.10"
+- ceph:
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+ - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph"
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon warn on osd down out interval zero: false
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 1
+ osd_max_pg_log_entries: 2
--- /dev/null
+../.qa/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ install upgrade ceph/-x on one node only
+ 1st half
+ restart : osd.0,1,2
+tasks:
+- install.upgrade:
+ osd.0:
+- print: "**** done install.upgrade osd.0"
+- ceph.restart:
+ daemons: [mon.a,mon.b,mon.c,osd.0, osd.1, osd.2]
+- print: "**** done ceph.restart 1st half"
+- exec:
+ osd.0:
+ - ceph osd set pglog_hardlimit && exit 1 || true
+ - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit, should not succeed"
--- /dev/null
+../.qa/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ randomly kill and revive osd
+ small chance to increase the number of pgs
+overrides:
+ ceph:
+ log-whitelist:
+ - but it is still running
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - log bound mismatch
+tasks:
+- parallel:
+ - stress-tasks
+stress-tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ chance_thrash_cluster_full: 0
+ chance_thrash_pg_upmap: 0
+ chance_thrash_pg_upmap_items: 0
+ disable_objectstore_tool_tests: true
+ chance_force_recovery: 0
+- print: "**** done thrashosds 3-thrash"
--- /dev/null
+../.qa/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ run randomized correctness test for rados operations
+ generate write load with rados bench
+stress-tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+- print: "**** done radosbench 7-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic cls tests for rbd
+stress-tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - cls/test_cls_rbd.sh
+- print: "**** done cls/test_cls_rbd.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+stress-tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd C and C++ api tests
+stress-tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+- print: "**** done rbd/test_librbd.sh 7-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool,
+ using only reads, writes, and deletes
+stress-tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 45
+ write: 45
+ delete: 10
+- print: "**** done rados/readwrite 5-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+stress-tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+- print: "**** done rados/snaps-few-objects 5-workload"
--- /dev/null
+tasks:
+- install.upgrade:
+ osd.3:
+ client.0:
+- ceph.restart:
+ daemons: [osd.3, osd.4, osd.5]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+- exec:
+ osd.0:
+ - ceph osd set pglog_hardlimit
+ - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit again, should succeed"
+
--- /dev/null
+../.qa/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ librbd python api tests
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+- print: "**** done rbd/test_librbd_python.sh 9-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+tasks:
+- rgw:
+ client.0:
+- print: "**** done rgw 9-workload"
+- swift:
+ client.0:
+ rgw_server: client.0
+- print: "**** done swift 9-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
--- /dev/null
+../../../../distros/supported
\ No newline at end of file
--- /dev/null
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes, using one of them as a client,
- with a separate client-only node.
- Use xfs beneath the osds.
- install ceph/luminous v12.2.2 point version
- run workload and upgrade-sequence in parallel
- install ceph/luminous v12.2.5 point version
- run workload and upgrade-sequence in parallel
- install ceph/luminous v12.2.7 point version
- run workload and upgrade-sequence in parallel
- install ceph/luminous v12.2.8 point version
- run workload and upgrade-sequence in parallel
- install ceph/luminous latest version
- run workload and upgrade-sequence in parallel
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- - scrub
- - osd_map_max_advance
- - wrongly marked
- - FS_DEGRADED
- - POOL_APP_NOT_ENABLED
- - CACHE_POOL_NO_HIT_SET
- - POOL_FULL
- - SMALLER_PG
- - pool\(s\) full
- - OSD_DOWN
- - missing hit_sets
- - CACHE_POOL_NEAR_FULL
- - PG_AVAILABILITY
- - PG_DEGRADED
- - application not enabled
- - overall HEALTH_
- fs: xfs
- conf:
- mon:
- mon debug unsafe allow tier with nonempty snaps: true
- mon warn on pool no app: false
- osd:
- osd map max advance: 1000
- osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
- replica_log rgw sdk statelog timeindex user version"
- osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
- replica_log rgw sdk statelog timeindex user version"
- client:
- rgw_crypt_require_ssl: false
- rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
-roles:
-- - mon.a
- - mds.a
- - osd.0
- - osd.1
- - osd.2
- - mgr.x
-- - mon.b
- - mon.c
- - osd.3
- - osd.4
- - osd.5
- - client.0
-- - client.1
-openstack:
-- volumes: # attached to each instance
- count: 3
- size: 30 # GB
-tasks:
-- print: "**** v12.2.2 about to install"
-- install:
- tag: v12.2.2
- # line below can be removed its from jewel test
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
-- print: "**** done v12.2.2 install"
-- ceph:
- fs: xfs
- add_osds_to_crush: true
-- print: "**** done ceph xfs"
-- sequential:
- - workload
-- print: "**** done workload v12.2.2"
-
-#### upgrade to v12.2.5
-- install.upgrade:
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- mon.a:
- tag: v12.2.5
- mon.b:
- tag: v12.2.5
- # Note that client.a IS NOT upgraded at this point
-- parallel:
- - workload_luminous
- - upgrade-sequence_luminous
-- print: "**** done parallel luminous v12.2.5"
-
-#### upgrade to v12.2.7
-- install.upgrade:
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- mon.a:
- tag: v12.2.7
- mon.b:
- tag: v12.2.7
- # Note that client.a IS NOT upgraded at this point
-- parallel:
- - workload_luminous
- - upgrade-sequence_luminous
-- print: "**** done parallel luminous v12.2.7"
-
-#### upgrade to v12.2.8
-- install.upgrade:
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- mon.a:
- tag: v12.2.8
- mon.b:
- tag: v12.2.8
- # Note that client.a IS NOT upgraded at this point
-- parallel:
- - workload_luminous
- - upgrade-sequence_luminous
-- print: "**** done parallel luminous v12.2.8"
-
-#### upgrade to latest luminous
-- install.upgrade:
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- mon.a:
- mon.b:
- # Note that client.a IS NOT upgraded at this point
-- parallel:
- - workload_luminous
- - upgrade-sequence_luminous
-- print: "**** done parallel luminous branch"
-
-#######################
-workload:
- sequential:
- - workunit:
- clients:
- client.0:
- - suites/blogbench.sh
-workload_luminous:
- full_sequential:
- - workunit:
- tag: v12.2.2
- clients:
- client.1:
- - rados/test.sh
- - cls
- env:
- CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
- - print: "**** done rados/test.sh & cls workload_luminous"
- - sequential:
- - rgw: [client.0]
- - print: "**** done rgw workload_luminous"
- - s3tests:
- client.0:
- force-branch: ceph-luminous
- rgw_server: client.0
- scan_for_encryption_keys: false
- - print: "**** done s3tests workload_luminous"
-upgrade-sequence_luminous:
- sequential:
- - print: "**** done branch: luminous install.upgrade"
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - ceph.restart: [osd.0]
- - sleep:
- duration: 30
- - ceph.restart: [osd.1]
- - sleep:
- duration: 30
- - ceph.restart: [osd.2]
- - sleep:
- duration: 30
- - ceph.restart: [osd.3]
- - sleep:
- duration: 30
- - ceph.restart: [osd.4]
- - sleep:
- duration: 30
- - ceph.restart: [osd.5]
- - sleep:
- duration: 60
- - ceph.restart: [mon.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.b]
- - sleep:
- duration: 60
- - ceph.restart: [mon.c]
- - sleep:
- duration: 60
- - print: "**** done ceph.restart all luminous branch mds/osd/mon"
+++ /dev/null
-../../../distros/supported/
\ No newline at end of file
del self._ctx.ceph['ceph'].conf[subsys][key]
write_conf(self._ctx)
- def json_asok(self, command, service_type, service_id):
- proc = self.mon_manager.admin_socket(service_type, service_id, command)
+ def json_asok(self, command, service_type, service_id, timeout=None):
+ if timeout is None:
+ timeout = 15*60
+ proc = self.mon_manager.admin_socket(service_type, service_id, command, timeout=timeout)
response_data = proc.stdout.getvalue()
log.info("_json_asok output: {0}".format(response_data))
if response_data.strip():
self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
def set_max_mds(self, max_mds):
- self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds)
+ self.set_var("max_mds", "%d" % max_mds)
def set_allow_dirfrags(self, yes):
- self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it')
+ self.set_var("allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it')
def get_pgs_per_fs_pool(self):
"""
def _df(self):
return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty"))
- def get_mds_map(self):
- return self.status().get_fsmap(self.id)['mdsmap']
+ def get_mds_map(self, status=None):
+ if status is None:
+ status = self.status()
+ return status.get_fsmap(self.id)['mdsmap']
def get_var(self, var):
return self.status().get_fsmap(self.id)['mdsmap'][var]
return version
- def mds_asok(self, command, mds_id=None):
+ def mds_asok(self, command, mds_id=None, timeout=None):
if mds_id is None:
mds_id = self.get_lone_mds_id()
- return self.json_asok(command, 'mds', mds_id)
+ return self.json_asok(command, 'mds', mds_id, timeout=timeout)
- def rank_asok(self, command, rank=0):
- info = self.get_rank(rank=rank)
- return self.json_asok(command, 'mds', info['name'])
+ def rank_asok(self, command, rank=0, status=None, timeout=None):
+ info = self.get_rank(rank=rank, status=status)
+ return self.json_asok(command, 'mds', info['name'], timeout=timeout)
def read_cache(self, path, depth=None):
cmd = ["dump", "tree", path]
while True:
status = self.status()
if rank is not None:
- mds_info = status.get_rank(self.id, rank)
- current_state = mds_info['state'] if mds_info else None
- log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+ try:
+ mds_info = status.get_rank(self.id, rank)
+ current_state = mds_info['state'] if mds_info else None
+ log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+ except:
+ mdsmap = self.get_mds_map(status=status)
+ if rank in mdsmap['failed']:
+ log.info("Waiting for rank {0} to come back.".format(rank))
+ current_state = None
+ else:
+ raise
elif mds_id is not None:
# mds_info is None if no daemon with this ID exists in the map
mds_info = status.get_mds(mds_id)
"""
return ""
+ def _make_rank(self, rank):
+ return "{}:{}".format(self.name, rank)
+
def _run_tool(self, tool, args, rank=None, quiet=False):
# Tests frequently have [client] configuration that jacks up
# the objecter log level (unlikely to be interesting here)
base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
if rank is not None:
- base_args.extend(["--rank", "%d" % rank])
+ base_args.extend(["--rank", "%s" % str(rank)])
t1 = datetime.datetime.now()
r = self.tool_remote.run(
mds_id = self.mds_ids[0]
return self.mds_daemons[mds_id].remote
- def journal_tool(self, args, rank=None, quiet=False):
+ def journal_tool(self, args, rank, quiet=False):
"""
- Invoke cephfs-journal-tool with the passed arguments, and return its stdout
+ Invoke cephfs-journal-tool with the passed arguments for a rank, and return its stdout
"""
- return self._run_tool("cephfs-journal-tool", args, rank, quiet)
+ fs_rank = self._make_rank(rank)
+ return self._run_tool("cephfs-journal-tool", args, fs_rank, quiet)
def table_tool(self, args, quiet=False):
"""
'--',
self.mountpoint,
],
+ timeout=(15*60)
)
run_cmd = [
def list_connections():
self.client_remote.run(
args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
- check_status=False
+ check_status=False,
+ timeout=(15*60)
)
p = self.client_remote.run(
args=["ls", "/sys/fs/fuse/connections"],
stdout=StringIO(),
- check_status=False
+ check_status=False,
+ timeout=(15*60)
)
if p.exitstatus != 0:
return []
],
stdout=StringIO(),
stderr=StringIO(),
- wait=False
+ wait=False,
+ timeout=(15*60)
)
try:
proc.wait()
# Now that we're mounted, set permissions so that the rest of the test will have
# unrestricted access to the filesystem mount.
- self.client_remote.run(
- args=['sudo', 'chmod', '1777', self.mountpoint])
+ try:
+ stderr = StringIO()
+ self.client_remote.run(args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(15*60), stderr=stderr)
+ except run.CommandFailedError:
+ stderr = stderr.getvalue()
+ if "Read-only file system".lower() in stderr.lower():
+ pass
+ else:
+ raise
def _mountpoint_exists(self):
- return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0
+ return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False, timeout=(15*60)).exitstatus == 0
def umount(self):
try:
'-u',
self.mountpoint,
],
+ timeout=(30*60),
)
except run.CommandFailedError:
log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
run.Raw(';'),
'ps',
'auxf',
- ])
+ ], timeout=(60*15))
# abort the fuse mount, killing all hung processes
if self._fuse_conn:
'-f',
self.mountpoint,
],
- stderr=stderr
+ stderr=stderr,
+ timeout=(60*15)
)
except CommandFailedError:
if self.is_mounted():
'--',
self.mountpoint,
],
- stderr=stderr
+ stderr=stderr,
+ timeout=(60*5)
)
except CommandFailedError:
if "No such file or directory" in stderr.getvalue():
'-rf',
self.mountpoint,
],
+ timeout=(60*5)
)
def _asok_path(self):
# Find the admin socket
p = self.client_remote.run(args=[
- 'python', '-c', pyscript
- ], stdout=StringIO())
+ 'sudo', 'python2', '-c', pyscript
+ ], stdout=StringIO(), timeout=(15*60))
asok_path = p.stdout.getvalue().strip()
log.info("Found client admin socket at {0}".format(asok_path))
# Query client ID from admin socket
p = self.client_remote.run(
args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args,
- stdout=StringIO())
+ stdout=StringIO(), timeout=(15*60))
return json.loads(p.stdout.getvalue())
def get_global_id(self):
run.Raw('>'),
filename,
],
+ timeout=(5*60),
)
def mount(self, mount_path=None, mount_fs_name=None):
'--',
self.mountpoint,
],
+ timeout=(5*60),
)
if mount_path is None:
'-o',
opts
],
+ timeout=(30*60),
)
self.client_remote.run(
- args=['sudo', 'chmod', '1777', self.mountpoint])
+ args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(5*60))
self.mounted = True
cmd.append('-f')
try:
- self.client_remote.run(args=cmd, timeout=(5*60))
+ self.client_remote.run(args=cmd, timeout=(15*60))
except Exception as e:
self.client_remote.run(args=[
'sudo',
'lsof',
run.Raw(';'),
'ps', 'auxf',
- ])
+ ], timeout=(15*60))
raise e
rproc = self.client_remote.run(
'--',
self.mountpoint,
],
+ timeout=(5*60),
)
def _find_debug_dir(self):
p = self.client_remote.run(args=[
'sudo', 'python', '-c', pyscript
- ], stdout=StringIO())
+ ], stdout=StringIO(), timeout=(5*60))
client_id_to_dir = json.loads(p.stdout.getvalue())
try:
p = self.client_remote.run(args=[
'sudo', 'python', '-c', pyscript
- ], stdout=StringIO())
+ ], stdout=StringIO(), timeout=(5*60))
return p.stdout.getvalue()
def get_global_id(self):
# Client B tries to stat the file that client A created
rproc = self.mount_b.write_background("file1")
- # After mds_session_timeout, we should see a health warning (extra lag from
+ # After session_timeout, we should see a health warning (extra lag from
# MDS beacon period)
- mds_session_timeout = float(self.fs.get_config("mds_session_timeout"))
- self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_session_timeout + 10)
+ session_timeout = self.fs.get_var("session_timeout")
+ self.wait_for_health("MDS_CLIENT_LATE_RELEASE", session_timeout + 10)
# Client B should still be stuck
self.assertFalse(rproc.finished)
REQUIRE_ONE_CLIENT_REMOTE = True
CLIENTS_REQUIRED = 2
- LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+ LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
# Environment references
- mds_session_timeout = None
mds_reconnect_timeout = None
ms_max_backoff = None
I/O after failure.
"""
+ session_timeout = self.fs.get_var("session_timeout")
+
# We only need one client
self.mount_b.umount_wait()
# ...then it should block
self.assertFalse(write_blocked.finished)
self.assert_session_state(client_id, "open")
- time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale
+ time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale
self.assertFalse(write_blocked.finished)
self.assert_session_state(client_id, "stale")
REQUIRE_KCLIENT_REMOTE = True
CLIENTS_REQUIRED = 2
- LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+ LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
# Environment references
- mds_session_timeout = None
mds_reconnect_timeout = None
ms_max_backoff = None
self.mount_a.create_destroy()
def test_stale_caps(self):
+ session_timeout = self.fs.get_var("session_timeout")
+
# Capability release from stale session
# =====================================
cap_holder = self.mount_a.open_background()
self.mount_a.kill()
try:
- # Now, after mds_session_timeout seconds, the waiter should
+ # Now, after session_timeout seconds, the waiter should
# complete their operation when the MDS marks the holder's
# session stale.
cap_waiter = self.mount_b.write_background()
cap_waited = b - a
log.info("cap_waiter waited {0}s".format(cap_waited))
- self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
+ self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0,
"Capability handover took {0}, expected approx {1}".format(
- cap_waited, self.mds_session_timeout
+ cap_waited, session_timeout
))
cap_holder.stdin.close()
# Eviction while holding a capability
# ===================================
+ session_timeout = self.fs.get_var("session_timeout")
+
# Take out a write capability on a file on client A,
# and then immediately kill it.
cap_holder = self.mount_a.open_background()
log.info("cap_waiter waited {0}s".format(cap_waited))
# This is the check that it happened 'now' rather than waiting
# for the session timeout
- self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
+ self.assertLess(cap_waited, session_timeout / 2.0,
"Capability handover took {0}, expected less than {1}".format(
- cap_waited, self.mds_session_timeout / 2.0
+ cap_waited, session_timeout / 2.0
))
cap_holder.stdin.close()
if not isinstance(self.mount_a, FuseMount):
raise SkipTest("Require FUSE client to handle signal STOP/CONT")
+ session_timeout = self.fs.get_var("session_timeout")
+
self.mount_a.run_shell(["mkdir", "testdir"])
self.mount_a.run_shell(["touch", "testdir/file1"])
# populate readdir cache
self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])
self.assert_session_state(mount_b_gid, "open")
- time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale
+ time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale
self.assert_session_state(mount_b_gid, "stale")
self.mount_a.run_shell(["touch", "testdir/file2"])
DAMAGED_ON_LS = "damaged_on_ls"
CRASHED = "server crashed"
NO_DAMAGE = "no damage"
+READONLY = "readonly"
FAILED_CLIENT = "client failed"
FAILED_SERVER = "server failed"
mutations = []
# Removals
- for obj_id in objects:
- if obj_id in [
+ for o in objects:
+ if o in [
# JournalPointers are auto-replaced if missing (same path as upgrade)
"400.00000000",
# Missing dirfrags for non-system dirs result in empty directory
expectation = DAMAGED_ON_START
log.info("Expectation on rm '{0}' will be '{1}'".format(
- obj_id, expectation
+ o, expectation
))
mutations.append(MetadataMutation(
- obj_id,
- "Delete {0}".format(obj_id),
- lambda o=obj_id: self.fs.rados(["rm", o]),
+ o,
+ "Delete {0}".format(o),
+ lambda o=o: self.fs.rados(["rm", o]),
expectation
))
# Blatant corruptions
- mutations.extend([
- MetadataMutation(
- o,
- "Corrupt {0}".format(o),
- lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk),
- DAMAGED_ON_START
- ) for o in data_objects
- ])
-
- # Truncations
for obj_id in data_objects:
if obj_id == "500.00000000":
+ # purge queue corruption results in read-only FS
+ mutations.append(MetadataMutation(
+ obj_id,
+ "Corrupt {0}".format(obj_id),
+ lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
+ READONLY
+ ))
+ else:
+ mutations.append(MetadataMutation(
+ obj_id,
+ "Corrupt {0}".format(obj_id),
+ lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
+ DAMAGED_ON_START
+ ))
+
+ # Truncations
+ for o in data_objects:
+ if o == "500.00000000":
# The PurgeQueue is allowed to be empty: Journaler interprets
# an empty header object as an empty journal.
expectation = NO_DAMAGE
o,
"Truncate {0}".format(o),
lambda o=o: self.fs.rados(["truncate", o, "0"]),
- DAMAGED_ON_START
+ expectation
))
# OMAP value corruptions
)
# OMAP header corruptions
- for obj_id in omap_header_objs:
- if re.match("60.\.00000000", obj_id) \
- or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
+ for o in omap_header_objs:
+ if re.match("60.\.00000000", o) \
+ or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
expectation = DAMAGED_ON_START
else:
expectation = NO_DAMAGE
log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
- obj_id, expectation
+ o, expectation
))
mutations.append(
MetadataMutation(
- obj_id,
- "Corrupt omap header on {0}".format(obj_id),
- lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]),
+ o,
+ "Corrupt omap header on {0}".format(o),
+ lambda o=o: self.fs.rados(["setomapheader", o, junk]),
expectation
)
)
else:
log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
results[mutation] = FAILED_SERVER
-
+ elif mutation.expectation == READONLY:
+ proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False)
+ try:
+ proc.wait()
+ except CommandFailedError:
+ stderr = proc.stderr.getvalue()
+ log.info(stderr)
+ if "Read-only file system".lower() in stderr.lower():
+ pass
+ else:
+ raise
else:
try:
wait([proc], 20)
# Drop everything from the MDS cache
self.mds_cluster.mds_stop()
- self.fs.journal_tool(['journal', 'reset'])
+ self.fs.journal_tool(['journal', 'reset'], 0)
self.mds_cluster.mds_fail_restart()
self.fs.wait_for_daemons()
if False:
with self.assertRaises(CommandFailedError):
# Normal reset should fail when no objects are present, we'll use --force instead
- self.fs.journal_tool(["journal", "reset"])
+ self.fs.journal_tool(["journal", "reset"], 0)
- self.fs.journal_tool(["journal", "reset", "--force"])
+ self.fs.journal_tool(["journal", "reset", "--force"], 0)
self.fs.data_scan(["init"])
self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
# ...and the journal is truncated to just a single subtreemap from the
# newly created segment
- summary_output = self.fs.journal_tool(["event", "get", "summary"])
+ summary_output = self.fs.journal_tool(["event", "get", "summary"], 0)
try:
self.assertEqual(summary_output,
dedent(
).strip())
flush_data = self.fs.mds_asok(["flush", "journal"])
self.assertEqual(flush_data['return_code'], 0)
- self.assertEqual(self.fs.journal_tool(["event", "get", "summary"]),
+ self.assertEqual(self.fs.journal_tool(["event", "get", "summary"], 0),
dedent(
"""
Events by type:
# is all that will be in the InoTable in memory)
self.fs.journal_tool(["event", "splice",
- "--inode={0}".format(inos["./file2_sixmegs"]), "summary"])
+ "--inode={0}".format(inos["./file2_sixmegs"]), "summary"], 0)
self.fs.journal_tool(["event", "splice",
- "--inode={0}".format(inos["./file3_sixmegs"]), "summary"])
+ "--inode={0}".format(inos["./file3_sixmegs"]), "summary"], 0)
# Revert to old inotable.
for key, value in inotable_copy.iteritems():
Apply kwargs as MDS configuration settings, enable dirfrags
and restart the MDSs.
"""
- kwargs['mds_bal_frag'] = "true"
for k, v in kwargs.items():
self.ceph_cluster.set_ceph_conf("mds", k, v.__str__())
))
# Verify that cephfs-journal-tool can now read the rewritten journal
- inspect_out = self.fs.journal_tool(["journal", "inspect"])
+ inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
if not inspect_out.endswith(": OK"):
raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
inspect_out
))
- self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
+ self.fs.journal_tool(["event", "get", "json",
+ "--path", "/tmp/journal.json"], 0)
p = self.fs.tool_remote.run(
args=[
"python",
self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
# Execute the dentry recovery, this should populate the backing store
- self.fs.journal_tool(['event', 'recover_dentries', 'list'])
+ self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
# Dentries in ROOT_INO are present
self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
# Now check the MDS can read what we wrote: truncate the journal
# and start the mds.
- self.fs.journal_tool(['journal', 'reset'])
+ self.fs.journal_tool(['journal', 'reset'], 0)
self.fs.mds_fail_restart()
self.fs.wait_for_daemons()
self.fs.mds_stop(active_mds_names[0])
self.fs.mds_fail(active_mds_names[0])
# Invoke recover_dentries quietly, because otherwise log spews millions of lines
- self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
- self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
+ self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True)
+ self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True)
self.fs.table_tool(["0", "reset", "session"])
- self.fs.journal_tool(["journal", "reset"], rank=0)
+ self.fs.journal_tool(["journal", "reset"], 0)
self.fs.erase_mds_objects(1)
self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
'--yes-i-really-mean-it')
import time
import json
import logging
+import time
log = logging.getLogger(__name__)
class TestMisc(CephFSTestCase):
CLIENTS_REQUIRED = 2
- LOAD_SETTINGS = ["mds_session_autoclose"]
- mds_session_autoclose = None
-
def test_getattr_caps(self):
"""
Check if MDS recognizes the 'mask' parameter of open request.
self.mount_a.kill_background(p)
+ def test_root_rctime(self):
+ """
+ Check that the root inode has a non-default rctime on startup.
+ """
+
+ t = time.time()
+ rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
+ log.info("rctime = {}".format(rctime))
+ self.assertGreaterEqual(rctime, t-10)
+
def test_fs_new(self):
data_pool_name = self.fs.get_data_pool_name()
only session
"""
+ session_autoclose = self.fs.get_var("session_autoclose")
+
self.mount_b.umount_wait()
ls_data = self.fs.mds_asok(['session', 'ls'])
self.assert_session_count(1, ls_data)
self.mount_a.kill()
self.mount_a.kill_cleanup()
- time.sleep(self.mds_session_autoclose * 1.5)
+ time.sleep(session_autoclose * 1.5)
ls_data = self.fs.mds_asok(['session', 'ls'])
self.assert_session_count(1, ls_data)
self.mount_a.kill()
self.mount_a.kill_cleanup()
- time.sleep(self.mds_session_autoclose * 1.5)
+ time.sleep(session_autoclose * 1.5)
ls_data = self.fs.mds_asok(['session', 'ls'])
self.assert_session_count(1, ls_data)
ratio = raw_avail / fs_avail
assert 0.9 < ratio < 1.1
+
+ def _run_drop_cache_cmd(self, timeout, use_tell):
+ drop_res = None
+ if use_tell:
+ mds_id = self.fs.get_lone_mds_id()
+ drop_res = json.loads(
+ self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id),
+ "cache", "drop", str(timeout)))
+ else:
+ drop_res = self.fs.mds_asok(["cache", "drop", str(timeout)])
+ return drop_res
+
+ def _drop_cache_command(self, timeout, use_tell=True):
+ self.mount_b.umount_wait()
+ ls_data = self.fs.mds_asok(['session', 'ls'])
+ self.assert_session_count(1, ls_data)
+
+ # create some files
+ self.mount_a.create_n_files("dc-dir/dc-file", 1000)
+ # drop cache
+ drop_res = self._run_drop_cache_cmd(timeout, use_tell)
+
+ self.assertTrue(drop_res['client_recall']['return_code'] == 0)
+ self.assertTrue(drop_res['flush_journal']['return_code'] == 0)
+
+ def _drop_cache_command_timeout(self, timeout, use_tell=True):
+ self.mount_b.umount_wait()
+ ls_data = self.fs.mds_asok(['session', 'ls'])
+ self.assert_session_count(1, ls_data)
+
+ # create some files
+ self.mount_a.create_n_files("dc-dir/dc-file-t", 1000)
+
+ # simulate client death and try drop cache
+ self.mount_a.kill()
+ drop_res = self._run_drop_cache_cmd(timeout, use_tell)
+
+ self.assertTrue(drop_res['client_recall']['return_code'] == -errno.ETIMEDOUT)
+ self.assertTrue(drop_res['flush_journal']['return_code'] == 0)
+
+ self.mount_a.kill_cleanup()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ def test_drop_cache_command_asok(self):
+ """
+ Basic test for checking drop cache command using admin socket.
+ Note that the cache size post trimming is not checked here.
+ """
+ self._drop_cache_command(10, use_tell=False)
+
+ def test_drop_cache_command_tell(self):
+ """
+ Basic test for checking drop cache command using tell interface.
+ Note that the cache size post trimming is not checked here.
+ """
+ self._drop_cache_command(10)
+
+ def test_drop_cache_command_timeout_asok(self):
+ """
+ Check drop cache command with non-responding client using admin
+ socket. Note that the cache size post trimming is not checked here.
+ """
+ self._drop_cache_command_timeout(5, use_tell=False)
+
+ def test_drop_cache_command_timeout_tell(self):
+ """
+ Check drop cache command with non-responding client using tell
+ interface. Note that the cache size post trimming is not checked
+ here.
+ """
+ self._drop_cache_command_timeout(5)
self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
'--yes-i-really-mean-it')
- def get_state(mds_id):
- info = self.mds_cluster.get_mds_info(mds_id)
- return info['state'] if info is not None else None
-
self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
if False:
with self.assertRaises(CommandFailedError):
# Normal reset should fail when no objects are present, we'll use --force instead
- self.fs.journal_tool(["journal", "reset"])
+ self.fs.journal_tool(["journal", "reset"], 0)
self.fs.mds_stop()
self.fs.data_scan(['scan_extents', '--alternate-pool',
recovery_pool, '--filesystem', self.fs.name,
'--force-corrupt', '--force-init',
self.fs.get_data_pool_name()])
- self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
- 'recover_dentries', 'list',
- '--alternate-pool', recovery_pool])
+ self.fs.journal_tool(['event', 'recover_dentries', 'list',
+ '--alternate-pool', recovery_pool], 0)
self.fs.data_scan(['init', '--force-init', '--filesystem',
self.fs.name])
self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
'--force-corrupt', '--force-init',
self.fs.get_data_pool_name()])
- self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
- 'recover_dentries', 'list'])
+ self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
- self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal',
- 'reset', '--force'])
- self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
- 'reset', '--force'])
+ self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
+ self.fs.journal_tool(['journal', 'reset', '--force'], 0)
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
recovery_fs + ":0")
self.recovery_fs.mds_restart()
self.fs.wait_for_daemons()
self.recovery_fs.wait_for_daemons()
- for mds_id in self.recovery_fs.mds_ids:
- self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
+ status = self.recovery_fs.status()
+ for rank in self.recovery_fs.get_ranks(status=status):
+ self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
'injectargs', '--debug-mds=20')
- self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
- 'scrub_path', '/',
- 'recursive', 'repair')
+ self.fs.rank_asok(['scrub_path', '/', 'recursive', 'repair'], rank=rank['rank'], status=status)
log.info(str(self.mds_cluster.status()))
# Mount a client
(remote,) = ctx.cluster.only(client).remotes.keys()
- clone_dir = '{tdir}/clone.{role}'.format(tdir=testdir, role=client)
+ clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=client)
remote.run(args=refspec.clone(git_url, clone_dir))
src_dir = os.path.dirname(__file__)
os.path.join(testdir, 'qemu', 'userdata.' + client),
os.path.join(testdir, 'qemu', 'metadata.' + client),
'{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client),
- '{tdir}/clone.{client}'.format(tdir=testdir, client=client),
+ '{tdir}/qemu_clone.{client}'.format(tdir=testdir, client=client),
],
)
- \(REQUEST_SLOW\)
- \(TOO_FEW_PGS\)
- \(MON_DOWN\)
- - slow requests
+ - slow request
)
if cleanup:
args=['sudo', 'rm', '-rf', '--', scratch_tmp]
- remote.run(logger=log.getChild(role), args=args, timeout=(15*60))
+ remote.run(logger=log.getChild(role), args=args, timeout=(60*60))
finally:
log.info('Stopping %s on %s...', tests, role)
args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir]
-#!/bin/sh -e
+#!/bin/sh -ex
-#check ceph health
ceph -s
-#list pools
rados lspools
-#lisr rbd images
rbd ls
-#check that the monitors work
+# check that the monitors work
ceph osd set nodown
ceph osd unset nodown
--- /dev/null
+#!/usr/bin/env bash
+
+set -x
+set -e
+
+tmp=$(mktemp -d -p /tmp test_mon_config_key_caps.XXXXX)
+entities=()
+
+function cleanup()
+{
+ set +e
+ set +x
+ if [[ -e $tmp/keyring ]] && [[ -e $tmp/keyring.orig ]]; then
+ grep '\[.*\..*\]' $tmp/keyring.orig > $tmp/entities.orig
+ for e in $(grep '\[.*\..*\]' $tmp/keyring | \
+ diff $tmp/entities.orig - | \
+ sed -n 's/^.*\[\(.*\..*\)\]/\1/p');
+ do
+ ceph auth rm $e 2>&1 >& /dev/null
+ done
+ fi
+ #rm -fr $tmp
+}
+
+trap cleanup 0 # cleanup on exit
+
+function expect_false()
+{
+ set -x
+ if "$@"; then return 1; else return 0; fi
+}
+
+# for cleanup purposes
+ceph auth export -o $tmp/keyring.orig
+
+k=$tmp/keyring
+
+# setup a few keys
+ceph config-key ls
+ceph config-key set daemon-private/osd.123/test-foo
+ceph config-key set mgr/test-foo
+ceph config-key set device/test-foo
+ceph config-key set test/foo
+
+allow_aa=client.allow_aa
+allow_bb=client.allow_bb
+allow_cc=client.allow_cc
+
+mgr_a=mgr.a
+mgr_b=mgr.b
+osd_a=osd.100
+osd_b=osd.200
+
+prefix_aa=client.prefix_aa
+prefix_bb=client.prefix_bb
+prefix_cc=client.prefix_cc
+match_aa=client.match_aa
+match_bb=client.match_bb
+
+fail_aa=client.fail_aa
+fail_bb=client.fail_bb
+fail_cc=client.fail_cc
+fail_dd=client.fail_dd
+fail_ee=client.fail_ee
+fail_ff=client.fail_ff
+fail_gg=client.fail_gg
+fail_writes=client.fail_writes
+
+ceph auth get-or-create $allow_aa mon 'allow *'
+ceph auth get-or-create $allow_bb mon 'allow service config-key rwx'
+ceph auth get-or-create $allow_cc mon 'allow command "config-key get"'
+
+ceph auth get-or-create $mgr_a mon 'allow profile mgr'
+ceph auth get-or-create $mgr_b mon 'allow profile mgr'
+ceph auth get-or-create $osd_a mon 'allow profile osd'
+ceph auth get-or-create $osd_b mon 'allow profile osd'
+
+ceph auth get-or-create $prefix_aa mon \
+ "allow command \"config-key get\" with key prefix client/$prefix_aa"
+
+cap="allow command \"config-key set\" with key prefix client/"
+cap="$cap,allow command \"config-key get\" with key prefix client/$prefix_bb"
+ceph auth get-or-create $prefix_bb mon "$cap"
+
+cap="allow command \"config-key get\" with key prefix client/"
+cap="$cap, allow command \"config-key set\" with key prefix client/"
+cap="$cap, allow command \"config-key ls\""
+ceph auth get-or-create $prefix_cc mon "$cap"
+
+cap="allow command \"config-key get\" with key=client/$match_aa/foo"
+ceph auth get-or-create $match_aa mon "$cap"
+cap="allow command \"config-key get\" with key=client/$match_bb/foo"
+cap="$cap,allow command \"config-key set\" with key=client/$match_bb/foo"
+ceph auth get-or-create $match_bb mon "$cap"
+
+ceph auth get-or-create $fail_aa mon 'allow rx'
+ceph auth get-or-create $fail_bb mon 'allow r,allow w'
+ceph auth get-or-create $fail_cc mon 'allow rw'
+ceph auth get-or-create $fail_dd mon 'allow rwx'
+ceph auth get-or-create $fail_ee mon 'allow profile bootstrap-rgw'
+ceph auth get-or-create $fail_ff mon 'allow profile bootstrap-rbd'
+# write commands will require rw; wx is not enough
+ceph auth get-or-create $fail_gg mon 'allow service config-key wx'
+# read commands will only require 'r'; 'rx' should be enough.
+ceph auth get-or-create $fail_writes mon 'allow service config-key rx'
+
+# grab keyring
+ceph auth export -o $k
+
+# keys with all the caps can do whatever
+for c in $allow_aa $allow_bb $allow_cc $mgr_a $mgr_b; do
+ ceph -k $k --name $c config-key get daemon-private/osd.123/test-foo
+ ceph -k $k --name $c config-key get mgr/test-foo
+ ceph -k $k --name $c config-key get device/test-foo
+ ceph -k $k --name $c config-key get test/foo
+done
+
+for c in $osd_a $osd_b; do
+ ceph -k $k --name $c config-key put daemon-private/$c/test-foo
+ ceph -k $k --name $c config-key get daemon-private/$c/test-foo
+ expect_false ceph -k $k --name $c config-key ls
+ expect_false ceph -k $k --name $c config-key get mgr/test-foo
+ expect_false ceph -k $k --name $c config-key get device/test-foo
+ expect_false ceph -k $k --name $c config-key get test/foo
+done
+
+expect_false ceph -k $k --name $osd_a get daemon-private/$osd_b/test-foo
+expect_false ceph -k $k --name $osd_b get daemon-private/$osd_a/test-foo
+
+expect_false ceph -k $k --name $prefix_aa \
+ config-key ls
+expect_false ceph -k $k --name $prefix_aa \
+ config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_aa \
+ config-key set test/bar
+expect_false ceph -k $k --name $prefix_aa \
+ config-key set client/$prefix_aa/foo
+
+# write something so we can read, use a custom entity
+ceph -k $k --name $allow_bb config-key set client/$prefix_aa/foo
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/foo
+# check one writes to the other's prefix, the other is able to read
+ceph -k $k --name $prefix_bb config-key set client/$prefix_aa/bar
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/bar
+
+ceph -k $k --name $prefix_bb config-key set client/$prefix_bb/foo
+ceph -k $k --name $prefix_bb config-key get client/$prefix_bb/foo
+
+expect_false ceph -k $k --name $prefix_bb config-key get client/$prefix_aa/bar
+expect_false ceph -k $k --name $prefix_bb config-key ls
+expect_false ceph -k $k --name $prefix_bb \
+ config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get mgr/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get device/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get test/bar
+expect_false ceph -k $k --name $prefix_bb config-key set test/bar
+
+ceph -k $k --name $prefix_cc config-key set client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key set client/$match_bb/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $prefix_cc config-key set other/prefix
+expect_false ceph -k $k --name $prefix_cc config-key get mgr/test-foo
+ceph -k $k --name $prefix_cc config-key ls >& /dev/null
+
+ceph -k $k --name $match_aa config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_aa config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $match_aa config-key set client/$match_aa/foo
+ceph -k $k --name $match_bb config-key get client/$match_bb/foo
+ceph -k $k --name $match_bb config-key set client/$match_bb/foo
+expect_false ceph -k $k --name $match_bb config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_bb config-key set client/$match_aa/foo
+
+keys=(daemon-private/osd.123/test-foo
+ mgr/test-foo
+ device/test-foo
+ test/foo
+ client/$prefix_aa/foo
+ client/$prefix_bb/foo
+ client/$match_aa/foo
+ client/$match_bb/foo
+)
+# expect these all to fail accessing config-key
+for c in $fail_aa $fail_bb $fail_cc \
+ $fail_dd $fail_ee $fail_ff \
+ $fail_gg; do
+ for m in get set; do
+ for key in ${keys[*]} client/$prefix_aa/foo client/$prefix_bb/foo; do
+ expect_false ceph -k $k --name $c config-key $m $key
+ done
+ done
+done
+
+# fail writes but succeed on reads
+expect_false ceph -k $k --name $fail_writes config-key set client/$match_aa/foo
+expect_false ceph -k $k --name $fail_writes config-key set test/foo
+ceph -k $k --name $fail_writes config-key ls
+ceph -k $k --name $fail_writes config-key get client/$match_aa/foo
+ceph -k $k --name $fail_writes config-key get daemon-private/osd.123/test-foo
+
+echo "OK"
"
BINARIES="${BINARIES_TO_RUN}hello_radosstriper_cpp
"
-DL_PREFIX="http://git.ceph.com/?p=ceph.git;a=blob_plain;f=examples/librados/"
-#DL_PREFIX="https://raw.githubusercontent.com/ceph/ceph/master/examples/librados/"
+DL_PREFIX="http://git.ceph.com/?p=ceph.git;a=blob_plain;hb=luminous;f=examples/librados/"
+#DL_PREFIX="https://raw.githubusercontent.com/ceph/ceph/luminous/examples/librados/"
DESTDIR=$(pwd)
function cleanup () {
#!/bin/bash -ex
-STACK_BRANCH=stable/pike
-TEMPEST_BRANCH=17.2.0
+STACK_BRANCH=stable/rocky
+TEMPEST_BRANCH=19.0.0
STACK_USER=${STACK_USER:-stack}
STACK_GROUP=${STACK_GROUP:-stack}
set -e
set -x
-export BIN="${BIN:-cephfs-journal-tool}"
+export BIN="${BIN:-cephfs-journal-tool --rank=cephfs:0}"
export JOURNAL_FILE=/tmp/journal.bin
export JSON_OUTPUT=/tmp/json.tmp
export BINARY_OUTPUT=/tmp/binary.tmp
#
#
-# Return MAX(1, (number of processors / 2)) by default or NPROC
+# To just look at what this script will do, run it like this:
#
+# $ DRY_RUN=echo ./run-make-check.sh
+#
+
+set -e
+
+trap clean_up_after_myself EXIT
+
+ORIGINAL_CCACHE_CONF="$HOME/.ccache/ccache.conf"
+SAVED_CCACHE_CONF="$HOME/.run-make-check-saved-ccache-conf"
+
+function save_ccache_conf() {
+ test -f $ORIGINAL_CCACHE_CONF && cp $ORIGINAL_CCACHE_CONF $SAVED_CCACHE_CONF || true
+}
+
+function restore_ccache_conf() {
+ test -f $SAVED_CCACHE_CONF && mv $SAVED_CCACHE_CONF $ORIGINAL_CCACHE_CONF || true
+}
+
+function clean_up_after_myself() {
+ rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv*
+ restore_ccache_conf
+}
+
function get_processors() {
if test -n "$NPROC" ; then
echo $NPROC
exit 1
fi
if [ -n "$install_cmd" ]; then
- $DRY_RUN sudo $install_cmd ccache jq $which_pkg
+ $DRY_RUN sudo $install_cmd ccache $which_pkg
else
echo "WARNING: Don't know how to install packages" >&2
echo "This probably means distribution $ID is not supported by run-make-check.sh" >&2
fi
+ if ! type ccache > /dev/null 2>&1 ; then
+ echo "ERROR: ccache could not be installed"
+ exit 1
+ fi
+
if test -f ./install-deps.sh ; then
$DRY_RUN ./install-deps.sh || return 1
+ trap clean_up_after_myself EXIT
fi
# Init defaults after deps are installed. get_processors() depends on coreutils nproc.
DEFAULT_MAKEOPTS=${DEFAULT_MAKEOPTS:--j$(get_processors)}
BUILD_MAKEOPTS=${BUILD_MAKEOPTS:-$DEFAULT_MAKEOPTS}
+ test "$BUILD_MAKEOPTS" && echo "make will run with option(s) $BUILD_MAKEOPTS"
CHECK_MAKEOPTS=${CHECK_MAKEOPTS:-$DEFAULT_MAKEOPTS}
- $DRY_RUN ./do_cmake.sh $@ || return 1
+ if type python2 > /dev/null 2>&1 ; then
+ # gtest-parallel requires Python 2
+ CMAKE_PYTHON_OPTS="-DWITH_GTEST_PARALLEL=ON"
+ else
+ CMAKE_PYTHON_OPTS="-DWITH_PYTHON2=OFF -DWITH_PYTHON3=ON -DMGR_PYTHON_VERSION=3 -DWITH_GTEST_PARALLEL=OFF"
+ fi
+
+ CMAKE_BUILD_OPTS=""
+
+ cat <<EOM
+Note that the binaries produced by this script do not contain correct time
+and git version information, which may make them unsuitable for debugging
+and production use.
+EOM
+ save_ccache_conf
+ # remove the entropy generated by the date/time embedded in the build
+ CMAKE_BUILD_OPTS="$CMAKE_BUILD_OPTS -DENABLE_GIT_VERSION=OFF"
+ $DRY_RUN export SOURCE_DATE_EPOCH="946684800"
+ $DRY_RUN ccache -o sloppiness=time_macros
+ $DRY_RUN ccache -o run_second_cpp=true
+ if [ -n "$JENKINS_HOME" ]; then
+ # Build host has plenty of space available, let's use it to keep
+ # various versions of the built objects. This could increase the cache hit
+ # if the same or similar PRs are running several times
+ $DRY_RUN ccache -o max_size=100G
+ else
+ echo "Current ccache max_size setting:"
+ ccache -p | grep max_size
+ fi
+ $DRY_RUN ccache -z # Reset the ccache statistics
+
+ $DRY_RUN ./do_cmake.sh $CMAKE_BUILD_OPTS $CMAKE_PYTHON_OPTS $@ || return 1
$DRY_RUN cd build
$DRY_RUN make $BUILD_MAKEOPTS tests || return 1
- # prevent OSD EMFILE death on tests
- $DRY_RUN sudo ulimit -n 32768
+
+ $DRY_RUN ccache -s # print the ccache statistics to evaluate the efficiency
+
+ # to prevent OSD EMFILE death on tests, make sure ulimit >= 1024
+ $DRY_RUN ulimit -n $(ulimit -Hn)
+ if [ $(ulimit -n) -lt 1024 ];then
+ echo "***ulimit -n too small, better bigger than 1024 for test***"
+ return 1
+ fi
+
if ! $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure; then
rm -f ${TMPDIR:-/tmp}/ceph-asok.*
return 1
echo "with the ability to run commands as root via sudo."
fi
echo -n "Checking hostname sanity... "
- if hostname --fqdn >/dev/null 2>&1 ; then
+ if $DRY_RUN hostname --fqdn >/dev/null 2>&1 ; then
echo "OK"
else
echo "NOT OK"
echo "Please fix 'hostname --fqdn', otherwise 'make check' will fail"
return 1
fi
- if run "$@" ; then
- rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv*
- echo "cmake check: successful run on $(git rev-parse HEAD)"
- return 0
- else
- rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv*
- return 1
- fi
+ run "$@" && echo "make check: successful run on $(git rev-parse HEAD)"
}
main "$@"
-177915764b752804194937482a39e95e0ca3de94
-v12.2.10
+26dc3775efc7bb286a1d6d66faee0ba30ea23eee
+v12.2.11
switch (protocol) {
case CEPH_AUTH_CEPHX:
+ // if there is no session key, there is no session handler.
+ if (key.get_type() == CEPH_CRYPTO_NONE) {
+ return nullptr;
+ }
return new CephxSessionHandler(cct, key, features);
case CEPH_AUTH_NONE:
return new AuthNoneSessionHandler(cct, key);
pathdir = os.path.dirname(path)
if not os.path.exists(pathdir):
os.makedirs(pathdir)
- os.chmod(pathdir, 0770)
+ os.chmod(pathdir, 0o770)
os.chown(pathdir, get_ceph_uid(), get_ceph_gid())
while wait_count > 0:
try:
- with file(tmp, 'w') as f:
- os.fchmod(f.fileno(), 0600)
+ with open(tmp, 'w') as f:
+ os.fchmod(f.fileno(), 0o600)
os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid())
LOG.info('Talking to monitor...')
pathdir = os.path.dirname(path)
if not os.path.exists(pathdir):
os.makedirs(pathdir)
- os.chmod(pathdir, 0770)
+ os.chmod(pathdir, 0o770)
os.chown(pathdir, get_ceph_uid(), get_ceph_gid())
while wait_count > 0:
try:
- with file(tmp, 'w') as f:
- os.fchmod(f.fileno(), 0600)
+ with open(tmp, 'w') as f:
+ os.fchmod(f.fileno(), 0o600)
os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid())
LOG.info('Talking to monitor...')
returncode = subprocess.call(
"""
Removes a volume group.
"""
+ if not vg_name:
+ logger.warning('Skipping removal of invalid VG name: "%s"', vg_name)
+ return
fail_msg = "Unable to remove vg %s" % vg_name
process.run(
[
if not system.device_is_mounted(source, destination=destination):
prepare_utils.mount_osd(source, osd_id, is_vdo=is_vdo)
+ # ensure that the OSD destination is always chowned properly
+ system.chown(destination)
+
# always re-do the symlink regardless if it exists, so that the journal
# device path that may have changed can be mapped correctly every time
destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id)
db_device_path = get_osd_device_path(osd_lv, lvs, 'db', dmcrypt_secret=dmcrypt_secret)
wal_device_path = get_osd_device_path(osd_lv, lvs, 'wal', dmcrypt_secret=dmcrypt_secret)
- # Once symlinks are removed, the osd dir can be 'primed again.
+    # Once symlinks are removed, the osd dir can be 'primed' again. chown first,
+ # regardless of what currently exists so that ``prime-osd-dir`` can succeed
+ # even if permissions are somehow messed up
+ system.chown(osd_path)
prime_command = [
'ceph-bluestore-tool', '--cluster=%s' % conf.cluster,
'prime-osd-dir', '--dev', osd_lv_path,
self.argv = argv
def get_devices(self):
- all_devices = disk.get_devices()
# remove devices with partitions
- # XXX Should be optional when getting device info
- for device, detail in all_devices.items():
- if detail.get('partitions') != {}:
- del all_devices[device]
- devices = sorted(all_devices.items(), key=lambda x: (x[0], x[1]['size']))
- return device_formatter(devices)
+ devices = [(device, details) for device, details in
+ disk.get_devices().items() if details.get('partitions') == {}]
+ size_sort = lambda x: (x[0], x[1]['size'])
+ return device_formatter(sorted(devices, key=size_sort))
def print_help(self):
return self._help.format(
from __future__ import print_function
-import json
from ceph_volume.util import disk, prepare
from ceph_volume.api import lvm
from . import validators
+from .strategies import Strategy
+from .strategies import MixedStrategy
from ceph_volume.devices.lvm.create import Create
from ceph_volume.devices.lvm.prepare import Prepare
from ceph_volume.util import templates
from ceph_volume.exceptions import SizeAllocationError
-class SingleType(object):
+class SingleType(Strategy):
"""
Support for all SSDs, or all HDDS
"""
def __init__(self, devices, args):
- self.args = args
- self.osds_per_device = args.osds_per_device
- self.devices = devices
- # TODO: add --fast-devices and --slow-devices so these can be customized
- self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
- self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
- self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices}
- if self.devices:
- self.validate()
- self.compute()
- else:
- self.computed["changed"] = False
+ super(SingleType, self).__init__(devices, args)
+ self.validate_compute()
@staticmethod
def type():
return "bluestore.SingleType"
- @property
- def total_osds(self):
- if self.hdds:
- return len(self.hdds) * self.osds_per_device
- else:
- return len(self.ssds) * self.osds_per_device
-
- def report_json(self):
- print(json.dumps(self.computed, indent=4, sort_keys=True))
-
def report_pretty(self):
string = ""
if self.args.filtered_devices:
Create(command).main()
-class MixedType(object):
+class MixedType(MixedStrategy):
def __init__(self, devices, args):
- self.args = args
- self.devices = devices
- self.osds_per_device = args.osds_per_device
- # TODO: add --fast-devices and --slow-devices so these can be customized
- self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
- self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
- self.computed = {'osds': [], 'filtered_devices': args.filtered_devices}
+ super(MixedType, self).__init__(devices, args)
self.block_db_size = self.get_block_size()
self.system_vgs = lvm.VolumeGroups()
self.dbs_needed = len(self.hdds) * self.osds_per_device
- if self.devices:
- self.validate()
- self.compute()
- else:
- self.computed["changed"] = False
+ self.validate_compute()
@staticmethod
def type():
return "bluestore.MixedType"
- def report_json(self):
- print(json.dumps(self.computed, indent=4, sort_keys=True))
-
def get_block_size(self):
if self.args.block_db_size:
return disk.Size(b=self.args.block_db_size)
else:
Create(command).main()
- def get_common_vg(self):
- # find all the vgs associated with the current device
- for ssd in self.ssds:
- for pv in ssd.pvs_api:
- vg = self.system_vgs.get(vg_name=pv.vg_name)
- if not vg:
- continue
- # this should give us just one VG, it would've been caught by
- # the validator otherwise
- return vg
-
def validate(self):
"""
HDDs represent data devices, and solid state devices are for block.db,
from __future__ import print_function
-import json
from ceph_volume.util import disk, prepare
from ceph_volume.api import lvm
from . import validators
+from .strategies import Strategy
+from .strategies import MixedStrategy
from ceph_volume.devices.lvm.create import Create
from ceph_volume.devices.lvm.prepare import Prepare
from ceph_volume.util import templates
return prepare.get_journal_size(lv_format=False)
-class SingleType(object):
+class SingleType(Strategy):
"""
Support for all SSDs, or all HDDs, data and journal LVs will be colocated
in the same device
"""
def __init__(self, devices, args):
- self.args = args
- self.osds_per_device = args.osds_per_device
- self.devices = devices
- self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
- self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
- self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices}
+ super(SingleType, self).__init__(devices, args)
self.journal_size = get_journal_size(args)
- if self.devices:
- self.validate()
- self.compute()
- else:
- self.computed["changed"] = False
+ self.validate_compute()
@staticmethod
def type():
return "filestore.SingleType"
- @property
- def total_osds(self):
- if self.hdds:
- return len(self.hdds) * self.osds_per_device
- else:
- return len(self.ssds) * self.osds_per_device
-
- def report_json(self):
- print(json.dumps(self.computed, indent=4, sort_keys=True))
-
def report_pretty(self):
string = ""
if self.args.filtered_devices:
Create(command).main()
-class MixedType(object):
+class MixedType(MixedStrategy):
"""
Supports HDDs with SSDs, journals will be placed on SSDs, while HDDs will
be used fully for data.
"""
def __init__(self, devices, args):
- self.args = args
- self.osds_per_device = args.osds_per_device
- self.devices = devices
- self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
- self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
- self.computed = {'osds': [], 'vg': None, 'filtered_devices': args.filtered_devices}
+ super(MixedType, self).__init__(devices, args)
self.blank_ssds = []
self.journals_needed = len(self.hdds) * self.osds_per_device
self.journal_size = get_journal_size(args)
self.system_vgs = lvm.VolumeGroups()
- if self.devices:
- self.validate()
- self.compute()
- else:
- self.computed["changed"] = False
+ self.validate_compute()
@staticmethod
def type():
return "filestore.MixedType"
- def report_json(self):
- print(json.dumps(self.computed, indent=4, sort_keys=True))
-
- @property
- def total_osds(self):
- if self.hdds:
- return len(self.hdds) * self.osds_per_device
- else:
- return len(self.ssds) * self.osds_per_device
-
def report_pretty(self):
string = ""
if self.args.filtered_devices:
print(string)
- def get_common_vg(self):
- # find all the vgs associated with the current device
- for ssd in self.ssds:
- for pv in ssd.pvs_api:
- vg = self.system_vgs.get(vg_name=pv.vg_name)
- if not vg:
- continue
- # this should give us just one VG, it would've been caught by
- # the validator otherwise
- return vg
-
def validate(self):
"""
Ensure that the minimum requirements for this type of scenario is
--- /dev/null
+import json
+
+class Strategy(object):
+
+ def __init__(self, devices, args):
+ self.args = args
+ self.osds_per_device = args.osds_per_device
+ self.devices = devices
+ self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
+ self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
+ self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices}
+
+ def validate_compute(self):
+ if self.devices:
+ self.validate()
+ self.compute()
+ else:
+ self.computed["changed"] = False
+
+ def report_json(self):
+ print(json.dumps(self.computed, indent=4, sort_keys=True))
+
+ @property
+ def total_osds(self):
+ if self.hdds:
+ return len(self.hdds) * self.osds_per_device
+ else:
+ return len(self.ssds) * self.osds_per_device
+
+ # protect against base class instantiation and incomplete implementations.
+ # We could also use the abc module and implement this as an
+ # AbstractBaseClass
+ def compute(self):
+ raise NotImplementedError('compute() must be implemented in a child class')
+
+ def execute(self):
+ raise NotImplementedError('execute() must be implemented in a child class')
+
+class MixedStrategy(Strategy):
+
+ def get_common_vg(self):
+ # find all the vgs associated with the current device
+ for ssd in self.ssds:
+ for pv in ssd.pvs_api:
+ vg = self.system_vgs.get(vg_name=pv.vg_name)
+ if not vg:
+ continue
+ # this should give us just one VG, it would've been caught by
+ # the validator otherwise
+ return vg
import argparse
+import os
import logging
from textwrap import dedent
from ceph_volume import decorators, terminal, process
from ceph_volume.api import lvm as api
-from ceph_volume.util import system, encryption, disk
+from ceph_volume.util import system, encryption, disk, arg_validators
+from ceph_volume.util.device import Device
+from ceph_volume.systemd import systemctl
logger = logging.getLogger(__name__)
mlogger = terminal.MultiLogger(__name__)
])
+def find_associated_devices(osd_id=None, osd_fsid=None):
+ """
+ From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the
+ system that match those tag values, further detect if any partitions are
+ part of the OSD, and then return the set of LVs and partitions (if any).
+ """
+ lv_tags = {}
+ if osd_id:
+ lv_tags['ceph.osd_id'] = osd_id
+ if osd_fsid:
+ lv_tags['ceph.osd_fsid'] = osd_fsid
+ lvs = api.Volumes()
+ lvs.filter(lv_tags=lv_tags)
+ if not lvs:
+ raise RuntimeError('Unable to find any LV for zapping OSD: %s' % osd_id or osd_fsid)
+
+ devices_to_zap = ensure_associated_lvs(lvs)
+
+ return [Device(path) for path in set(devices_to_zap) if path]
+
+
+def ensure_associated_lvs(lvs):
+ """
+ Go through each LV and ensure if backing devices (journal, wal, block)
+ are LVs or partitions, so that they can be accurately reported.
+ """
+ # look for many LVs for each backing type, because it is possible to
+ # receive a filtering for osd.1, and have multiple failed deployments
+ # leaving many journals with osd.1 - usually, only a single LV will be
+ # returned
+ journal_lvs = lvs._filter(lv_tags={'ceph.type': 'journal'})
+ db_lvs = lvs._filter(lv_tags={'ceph.type': 'db'})
+ wal_lvs = lvs._filter(lv_tags={'ceph.type': 'wal'})
+ backing_devices = [
+ (journal_lvs, 'journal'),
+ (db_lvs, 'block'),
+ (wal_lvs, 'wal')
+ ]
+
+ verified_devices = []
+
+ for lv in lvs:
+ # go through each lv and append it, otherwise query `blkid` to find
+ # a physical device. Do this for each type (journal,db,wal) regardless
+ # if they have been processed in the previous LV, so that bad devices
+ # with the same ID can be caught
+ for ceph_lvs, _type in backing_devices:
+ if ceph_lvs:
+ verified_devices.extend([l.lv_path for l in ceph_lvs])
+ continue
+
+ # must be a disk partition, by querying blkid by the uuid we are
+ # ensuring that the device path is always correct
+ try:
+ device_uuid = lv.tags['ceph.%s_uuid' % _type]
+ except KeyError:
+ # Bluestore will not have ceph.journal_uuid, and Filestore
+                # will not have ceph.db_uuid
+ continue
+
+ osd_device = disk.get_device_from_partuuid(device_uuid)
+ if not osd_device:
+ # if the osd_device is not found by the partuuid, then it is
+ # not possible to ensure this device exists anymore, so skip it
+ continue
+ verified_devices.append(osd_device)
+
+ verified_devices.append(lv.lv_path)
+
+ # reduce the list from all the duplicates that were added
+ return list(set(verified_devices))
+
+
class Zap(object):
help = 'Removes all data and filesystems from a logical volume or partition.'
if dmcrypt and dmcrypt_uuid:
self.dmcrypt_close(dmcrypt_uuid)
+ def zap_lv(self, device):
+ """
+ Device examples: vg-name/lv-name, /dev/vg-name/lv-name
+ Requirements: Must be a logical volume (LV)
+ """
+ lv = api.get_lv(lv_name=device.lv_name, vg_name=device.vg_name)
+ self.unmount_lv(lv)
+
+ wipefs(device.abspath)
+ zap_data(device.abspath)
+
+ if self.args.destroy:
+ lvs = api.Volumes()
+ lvs.filter(vg_name=device.vg_name)
+ if len(lvs) <= 1:
+ mlogger.info('Only 1 LV left in VG, will proceed to destroy volume group %s', device.vg_name)
+ api.remove_vg(device.vg_name)
+ else:
+ mlogger.info('More than 1 LV left in VG, will proceed to destroy LV only')
+ mlogger.info('Removing LV because --destroy was given: %s', device.abspath)
+ api.remove_lv(device.abspath)
+ elif lv:
+ # just remove all lvm metadata, leaving the LV around
+ lv.clear_tags()
+
+ def zap_partition(self, device):
+ """
+ Device example: /dev/sda1
+ Requirements: Must be a partition
+ """
+ if device.is_encrypted:
+ # find the holder
+ holders = [
+ '/dev/%s' % holder for holder in device.sys_api.get('holders', [])
+ ]
+ for mapper_uuid in os.listdir('/dev/mapper'):
+ mapper_path = os.path.join('/dev/mapper', mapper_uuid)
+ if os.path.realpath(mapper_path) in holders:
+ self.dmcrypt_close(mapper_uuid)
+
+ if system.device_is_mounted(device.abspath):
+ mlogger.info("Unmounting %s", device.abspath)
+ system.unmount(device.abspath)
+
+ wipefs(device.abspath)
+ zap_data(device.abspath)
+
+ if self.args.destroy:
+ mlogger.info("Destroying partition since --destroy was used: %s" % device.abspath)
+ disk.remove_partition(device)
+
+ def zap_lvm_member(self, device):
+ """
+ An LVM member may have more than one LV and or VG, for example if it is
+ a raw device with multiple partitions each belonging to a different LV
+
+ Device example: /dev/sda
+ Requirements: An LV or VG present in the device, making it an LVM member
+ """
+ for lv in device.lvs:
+ self.zap_lv(Device(lv.lv_path))
+
+
+ def zap_raw_device(self, device):
+ """
+ Any whole (raw) device passed in as input will be processed here,
+ checking for LVM membership and partitions (if any).
+
+ Device example: /dev/sda
+ Requirements: None
+ """
+ if not self.args.destroy:
+ # the use of dd on a raw device causes the partition table to be
+ # destroyed
+ mlogger.warning(
+ '--destroy was not specified, but zapping a whole device will remove the partition table'
+ )
+
+ # look for partitions and zap those
+ for part_name in device.sys_api.get('partitions', {}).keys():
+ self.zap_partition(Device('/dev/%s' % part_name))
+
+ wipefs(device.abspath)
+ zap_data(device.abspath)
+
@decorators.needs_root
- def zap(self, args):
- for device in args.devices:
- if disk.is_mapper_device(device):
+ def zap(self, devices=None):
+ devices = devices or self.args.devices
+
+ for device in devices:
+ mlogger.info("Zapping: %s", device.abspath)
+ if device.is_mapper:
terminal.error("Refusing to zap the mapper device: {}".format(device))
raise SystemExit(1)
- lv = api.get_lv_from_argument(device)
- if lv:
- # we are zapping a logical volume
- path = lv.lv_path
- self.unmount_lv(lv)
- else:
- # we are zapping a partition
- #TODO: ensure device is a partition
- path = device
- # check to if it is encrypted to close
- partuuid = disk.get_partuuid(device)
- if encryption.status("/dev/mapper/{}".format(partuuid)):
- dmcrypt_uuid = partuuid
- self.dmcrypt_close(dmcrypt_uuid)
-
- mlogger.info("Zapping: %s", path)
-
- # check if there was a pv created with the
- # name of device
- pvs = api.PVolumes()
- pvs.filter(pv_name=device)
- vgs = set([pv.vg_name for pv in pvs])
- for pv in pvs:
- vg_name = pv.vg_name
- lv = None
- if pv.lv_uuid:
- lv = api.get_lv(vg_name=vg_name, lv_uuid=pv.lv_uuid)
-
- if lv:
- self.unmount_lv(lv)
-
- if args.destroy:
- for vg_name in vgs:
- mlogger.info("Destroying volume group %s because --destroy was given", vg_name)
- api.remove_vg(vg_name)
- if not lv:
- mlogger.info("Destroying physical volume %s because --destroy was given", device)
- api.remove_pv(device)
-
- wipefs(path)
- zap_data(path)
-
- if lv and not pvs:
- if args.destroy:
- lvs = api.Volumes()
- lvs.filter(vg_name=lv.vg_name)
- if len(lvs) <= 1:
- mlogger.info('Only 1 LV left in VG, will proceed to destroy volume group %s', lv.vg_name)
- api.remove_vg(lv.vg_name)
- else:
- mlogger.info('More than 1 LV left in VG, will proceed to destroy LV only')
- mlogger.info('Removing LV because --destroy was given: %s', lv)
- api.remove_lv(lv)
- else:
- # just remove all lvm metadata, leaving the LV around
- lv.clear_tags()
-
- terminal.success("Zapping successful for: %s" % ", ".join(args.devices))
+ if device.is_lvm_member:
+ self.zap_lvm_member(device)
+ if device.is_lv:
+ self.zap_lv(device)
+ if device.is_partition:
+ self.zap_partition(device)
+ if device.is_device:
+ self.zap_raw_device(device)
+
+ if self.args.devices:
+ terminal.success(
+ "Zapping successful for: %s" % ", ".join([str(d) for d in self.args.devices])
+ )
+ else:
+ terminal.success(
+ "Zapping successful for OSD: %s" % self.args.osd_id or self.args.osd_fsid
+ )
+
+ @decorators.needs_root
+ def zap_osd(self):
+ if self.args.osd_id:
+ osd_is_running = systemctl.osd_is_active(self.args.osd_id)
+ if osd_is_running:
+ mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id)
+ mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id)
+ raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id)
+ devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid)
+ self.zap(devices)
def dmcrypt_close(self, dmcrypt_uuid):
dmcrypt_path = "/dev/mapper/{}".format(dmcrypt_uuid)
ceph-volume lvm zap /dev/sda /dev/sdb /db/sdc
+ Zapping devices associated with an OSD ID:
+
+ ceph-volume lvm zap --osd-id 1
+
+ Optionally include the OSD FSID
+
+ ceph-volume lvm zap --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D
+
If the --destroy flag is given and you are zapping a raw device or partition
then all vgs and lvs that exist on that raw device or partition will be destroyed.
'devices',
metavar='DEVICES',
nargs='*',
+ type=arg_validators.ValidDevice(gpt_ok=True),
default=[],
help='Path to one or many lv (as vg/lv), partition (as /dev/sda1) or device (as /dev/sda)'
)
+
parser.add_argument(
'--destroy',
action='store_true',
default=False,
help='Destroy all volume groups and logical volumes if you are zapping a raw device or partition',
)
+
+ parser.add_argument(
+ '--osd-id',
+ help='Specify an OSD ID to detect associated devices for zapping',
+ )
+
+ parser.add_argument(
+ '--osd-fsid',
+ help='Specify an OSD FSID to detect associated devices for zapping',
+ )
+
if len(self.argv) == 0:
print(sub_command_help)
return
- args = parser.parse_args(self.argv)
- self.zap(args)
+
+ self.args = parser.parse_args(self.argv)
+
+ if self.args.osd_id or self.args.osd_fsid:
+ self.zap_osd()
+ else:
+ self.zap()
# -*- coding: utf-8 -*-
import argparse
-import pprint
+import json
from ceph_volume.util.device import Devices, Device
def format_report(self, inventory):
if self.args.format == 'json':
- print(inventory.json_report())
+ print(json.dumps(inventory.json_report()))
elif self.args.format == 'json-pretty':
- pprint.pprint(inventory.json_report())
+ print(json.dumps(inventory.json_report(), indent=4, sort_keys=True))
else:
print(inventory.pretty_report())
@pytest.fixture
def device_info(monkeypatch):
- def apply(devices=None, lsblk=None, lv=None, blkid=None):
+ def apply(devices=None, lsblk=None, lv=None, blkid=None, udevadm=None):
devices = devices if devices else {}
lsblk = lsblk if lsblk else {}
blkid = blkid if blkid else {}
+ udevadm = udevadm if udevadm else {}
lv = Factory(**lv) if lv else None
monkeypatch.setattr("ceph_volume.sys_info.devices", {})
monkeypatch.setattr("ceph_volume.util.device.disk.get_devices", lambda: devices)
monkeypatch.setattr("ceph_volume.util.device.lvm.get_lv", lambda vg_name, lv_uuid: lv)
monkeypatch.setattr("ceph_volume.util.device.disk.lsblk", lambda path: lsblk)
monkeypatch.setattr("ceph_volume.util.device.disk.blkid", lambda path: blkid)
+ monkeypatch.setattr("ceph_volume.util.disk.udevadm_property", lambda *a, **kw: udevadm)
return apply
from ceph_volume.devices.lvm import batch
+class TestBatchSmoke(object):
+
+ def test_batch_instance(self, is_root):
+ b = batch.Batch([])
+ b.main()
+
+
class TestFilterDevices(object):
def test_filter_used_device(self, factory):
--- /dev/null
+import pytest
+from ceph_volume.api import lvm as api
+from ceph_volume.devices.lvm import zap
+
+
+class TestFindAssociatedDevices(object):
+
+ def test_no_lvs_found_that_match_id(self, volumes, monkeypatch, device_info):
+ monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+ tags = 'ceph.osd_id=9,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags)
+ volumes.append(osd)
+ with pytest.raises(RuntimeError):
+ zap.find_associated_devices(osd_id=10)
+
+ def test_no_lvs_found_that_match_fsid(self, volumes, monkeypatch, device_info):
+ monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+ tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags)
+ volumes.append(osd)
+ with pytest.raises(RuntimeError):
+ zap.find_associated_devices(osd_fsid='aaaa-lkjh')
+
+ def test_no_lvs_found_that_match_id_fsid(self, volumes, monkeypatch, device_info):
+ monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+ tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags)
+ volumes.append(osd)
+ with pytest.raises(RuntimeError):
+ zap.find_associated_devices(osd_id='9', osd_fsid='aaaa-lkjh')
+
+ def test_no_ceph_lvs_found(self, volumes, monkeypatch):
+ monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags='')
+ volumes.append(osd)
+ with pytest.raises(RuntimeError):
+ zap.find_associated_devices(osd_id=100)
+
+ def test_lv_is_matched_id(self, volumes, monkeypatch):
+ monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(osd)
+ result = zap.find_associated_devices(osd_id='0')
+ assert result[0].abspath == '/dev/VolGroup/lv'
+
+ def test_lv_is_matched_fsid(self, volumes, monkeypatch):
+ monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(osd)
+ result = zap.find_associated_devices(osd_fsid='asdf-lkjh')
+ assert result[0].abspath == '/dev/VolGroup/lv'
+
+ def test_lv_is_matched_id_fsid(self, volumes, monkeypatch):
+ monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(osd)
+ result = zap.find_associated_devices(osd_id='0', osd_fsid='asdf-lkjh')
+ assert result[0].abspath == '/dev/VolGroup/lv'
+
+
+class TestEnsureAssociatedLVs(object):
+
+ def test_nothing_is_found(self, volumes):
+ result = zap.ensure_associated_lvs(volumes)
+ assert result == []
+
+ def test_data_is_found(self, volumes):
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/data', lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert result == ['/dev/VolGroup/data']
+
+ def test_block_is_found(self, volumes):
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert result == ['/dev/VolGroup/block']
+
+ def test_block_and_partition_are_found(self, volumes, monkeypatch):
+ monkeypatch.setattr(zap.disk, 'get_device_from_partuuid', lambda x: '/dev/sdb1')
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert '/dev/sdb1' in result
+ assert '/dev/VolGroup/block' in result
+
+ def test_journal_is_found(self, volumes):
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert result == ['/dev/VolGroup/lv']
+
+ def test_multiple_journals_are_found(self, volumes):
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal'
+ for i in range(3):
+ osd = api.Volume(
+ lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert '/dev/VolGroup/lv0' in result
+ assert '/dev/VolGroup/lv1' in result
+ assert '/dev/VolGroup/lv2' in result
+
+ def test_multiple_dbs_are_found(self, volumes):
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=db'
+ for i in range(3):
+ osd = api.Volume(
+ lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert '/dev/VolGroup/lv0' in result
+ assert '/dev/VolGroup/lv1' in result
+ assert '/dev/VolGroup/lv2' in result
+
+ def test_multiple_wals_are_found(self, volumes):
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.wal_uuid=x,ceph.type=wal'
+ for i in range(3):
+ osd = api.Volume(
+ lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert '/dev/VolGroup/lv0' in result
+ assert '/dev/VolGroup/lv1' in result
+ assert '/dev/VolGroup/lv2' in result
+
+ def test_multiple_backing_devs_are_found(self, volumes):
+ for _type in ['journal', 'db', 'wal']:
+ tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.wal_uuid=x,ceph.type=%s' % _type
+ osd = api.Volume(
+ lv_name='volume%s' % _type, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % _type, lv_tags=tags)
+ volumes.append(osd)
+ result = zap.ensure_associated_lvs(volumes)
+ assert '/dev/VolGroup/lvjournal' in result
+ assert '/dev/VolGroup/lvwal' in result
+ assert '/dev/VolGroup/lvdb' in result
'/dev/mapper/foo',
'/dev/dm-0',
])
- def test_can_not_zap_mapper_device(self, capsys, is_root, device_name):
+ def test_can_not_zap_mapper_device(self, monkeypatch, device_info, capsys, is_root, device_name):
+ monkeypatch.setattr('os.path.exists', lambda x: True)
+ device_info()
with pytest.raises(SystemExit):
lvm.zap.Zap(argv=[device_name]).main()
stdout, stderr = capsys.readouterr()
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: stop ceph-osd daemons
+ service:
+ name: "ceph-osd@{{ item }}"
+ state: stopped
+ with_items: "{{ osd_ids }}"
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
+ - name: purge osds
+ command: "ceph --cluster {{ cluster }} osd purge osd.{{ item }} --yes-i-really-mean-it"
+ with_items: "{{ osd_ids }}"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: zap devices used for OSDs
+ command: "ceph-volume --cluster {{ cluster }} lvm zap --osd-id {{ item }} --destroy"
+ with_items: "{{ osd_ids }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
# retest to ensure cluster came back up correctly
testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+ # test zap OSDs by ID
+ ansible-playbook -vv -i {changedir}/hosts {changedir}/test_zap.yml
+
vagrant destroy {env:VAGRANT_DESTROY_FLAGS:"--force"}
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
--- /dev/null
+../../../playbooks/test_zap.yml
\ No newline at end of file
environment:
CEPH_VOLUME_DEBUG: 1
+ # partitions have been completely removed, so re-create them again
+ - name: re-create partition /dev/sdd for lvm data usage
+ parted:
+ device: /dev/sdd
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ label: gpt
+ state: present
+
- name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ # partitions have been completely removed, so re-create them again
+ - name: re-create partition /dev/sdd for lvm data usage
+ parted:
+ device: /dev/sdd
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ label: gpt
+ state: present
+
+ - name: re-create partition /dev/sdd lvm journals
+ parted:
+ device: /dev/sdd
+ number: 2
+ part_start: 50%
+ part_end: 100%
+ unit: '%'
+ state: present
+ label: gpt
+
- name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ - name: re-create partition /dev/sdc1
+ parted:
+ device: /dev/sdc
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ state: present
+ label: gpt
+
- name: prepare osd.0 again using test_group/data-lv1
command: "ceph-volume --cluster {{ cluster }} lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ # partitions have been completely removed, so re-create them again
+ - name: re-create partition /dev/sdd for lvm data usage
+ parted:
+ device: /dev/sdd
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ label: gpt
+ state: present
+
- name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ - name: find all OSD directories
+ find:
+ paths: /var/lib/ceph/osd
+ recurse: no
+ file_type: directory
+ register: osd_directories
+
+ - name: find all OSD symlinks
+ find:
+ paths: /var/lib/ceph/osd
+ recurse: yes
+ depth: 2
+ file_type: link
+ register: osd_symlinks
+
+ # set the OSD dir and the block/block.db links to root:root permissions, to
+ # ensure that the OSD will be able to activate regardless
+ - file:
+ path: "{{ item.path }}"
+ owner: root
+ group: root
+ with_items:
+ - "{{ osd_directories.files }}"
+
+ - file:
+ path: "{{ item.path }}"
+ owner: root
+ group: root
+ with_items:
+ - "{{ osd_symlinks.files }}"
+
- name: activate all to start the previously prepared osd.0
command: "ceph-volume lvm activate --all"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ # partitions have been completely removed, so re-create them again
+ - name: re-create partition /dev/sdd for lvm data usage
+ parted:
+ device: /dev/sdd
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ label: gpt
+ state: present
+
+ - name: re-create partition /dev/sdd lvm journals
+ parted:
+ device: /dev/sdd
+ number: 2
+ part_start: 50%
+ part_end: 100%
+ unit: '%'
+ state: present
+ label: gpt
+
- name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ - name: find all OSD paths
+ find:
+ paths: /var/lib/ceph/osd
+ recurse: no
+ file_type: directory
+ register: osd_paths
+
+    # set all OSD paths to root:root to ensure that the OSD will be able to
+ # activate regardless
+ - name: mangle permissions to root
+ file:
+ path: "{{ item.path }}"
+ owner: root
+ group: root
+ recurse: yes
+ with_items:
+ - "{{ osd_paths.files }}"
+
+ - name: stop ceph-osd@2 daemon
+ service:
+ name: ceph-osd@2
+ state: stopped
+
+ - name: stop ceph-osd@1 daemon
+ service:
+ name: ceph-osd@1
+ state: stopped
+
- name: activate all to start the previously prepared osd.0
command: "ceph-volume lvm activate --filestore --all"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ # partitions have been completely removed, so re-create them again
+ - name: re-create partition /dev/sdd for lvm data usage
+ parted:
+ device: /dev/sdd
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ label: gpt
+ state: present
+
- name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ # partitions have been completely removed, so re-create them again
+ - name: re-create partition /dev/sdd for lvm data usage
+ parted:
+ device: /dev/sdd
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ label: gpt
+ state: present
+
+ - name: re-create partition /dev/sdd lvm journals
+ parted:
+ device: /dev/sdd
+ number: 2
+ part_start: 50%
+ part_end: 100%
+ unit: '%'
+ state: present
+ label: gpt
+
- name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
+ - name: re-create partition /dev/sdc1
+ parted:
+ device: /dev/sdc
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ state: present
+ label: gpt
+
- name: prepare osd.0 again using test_group/data-lv1
command: "ceph-volume --cluster {{ cluster }} lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0"
environment:
become: True
any_errors_fatal: true
roles:
- - role: ceph-defaults
- tags: ['ceph_update_config']
- - role: ceph-handler
- - role: ceph-common
+ - ceph-defaults
+ - ceph-facts
+ - ceph-handler
+ - ceph-common
tasks:
- name: rsync ceph-volume to test nodes on centos
synchronize:
disk = device.Device("vg/lv")
assert disk.is_lv
- def test_is_device(self, device_info):
+ def test_vgs_is_empty(self, device_info, pvolumes, monkeypatch):
+ BarPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(BarPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ lsblk = {"TYPE": "disk"}
+ device_info(lsblk=lsblk)
+ disk = device.Device("/dev/nvme0n1")
+ assert disk.vgs == []
+
+ def test_vgs_is_not_empty(self, device_info, pvolumes, monkeypatch):
+ BarPVolume = api.PVolume(vg_name='foo', lv_uuid='111', pv_name='/dev/nvme0n1', pv_uuid="0000", pv_tags={})
+ pvolumes.append(BarPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ lsblk = {"TYPE": "disk"}
+ device_info(lsblk=lsblk)
+ disk = device.Device("/dev/nvme0n1")
+ assert len(disk.vgs) == 1
+
+ def test_device_is_device(self, device_info, pvolumes):
data = {"/dev/sda": {"foo": "bar"}}
lsblk = {"TYPE": "device"}
device_info(devices=data, lsblk=lsblk)
disk = device.Device("/dev/sda")
- assert disk.is_device
+ assert disk.is_device is True
+
+ def test_disk_is_device(self, device_info, pvolumes):
+ data = {"/dev/sda": {"foo": "bar"}}
+ lsblk = {"TYPE": "disk"}
+ device_info(devices=data, lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert disk.is_device is True
def test_is_partition(self, device_info, pvolumes):
data = {"/dev/sda": {"foo": "bar"}}
disk = device.Device("/dev/mapper/foo")
assert disk.is_mapper
+ def test_dm_is_mapper_device(self, device_info):
+ device_info()
+ disk = device.Device("/dev/dm-4")
+ assert disk.is_mapper
+
def test_is_not_mapper_device(self, device_info):
device_info()
disk = device.Device("/dev/sda")
disk = device.Device("/dev/sda")
assert disk.is_ceph_disk_member
+ def test_is_ceph_disk_member_not_available(self, device_info):
+ lsblk = {"PARTLABEL": "ceph data"}
+ device_info(lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert disk.is_ceph_disk_member
+ assert not disk.available
+ assert "Used by ceph-disk" in disk.rejected_reasons
+
def test_is_not_ceph_disk_member_lsblk(self, device_info):
lsblk = {"PARTLABEL": "gluster partition"}
device_info(lsblk=lsblk)
disk = device.Device("/dev/sda")
assert not disk.used_by_ceph
+ def test_get_device_id(self, device_info):
+ udev = {k:k for k in ['ID_VENDOR', 'ID_MODEL', 'ID_SCSI_SERIAL']}
+ device_info(udevadm=udev)
+ disk = device.Device("/dev/sda")
+ assert disk._get_device_id() == 'ID_VENDOR_ID_MODEL_ID_SCSI_SERIAL'
+
+
+
+class TestDeviceEncryption(object):
+
+ def test_partition_is_not_encrypted_lsblk(self, device_info, pvolumes):
+ lsblk = {'TYPE': 'part', 'FSTYPE': 'xfs'}
+ device_info(lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert disk.is_encrypted is False
+
+ def test_partition_is_encrypted_lsblk(self, device_info, pvolumes):
+ lsblk = {'TYPE': 'part', 'FSTYPE': 'crypto_LUKS'}
+ device_info(lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert disk.is_encrypted is True
+
+ def test_partition_is_not_encrypted_blkid(self, device_info, pvolumes):
+ lsblk = {'TYPE': 'part'}
+ blkid = {'TYPE': 'ceph data'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ assert disk.is_encrypted is False
+
+ def test_partition_is_encrypted_blkid(self, device_info, pvolumes):
+ lsblk = {'TYPE': 'part'}
+ blkid = {'TYPE': 'crypto_LUKS'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ assert disk.is_encrypted is True
+
+ def test_mapper_is_encrypted_luks1(self, device_info, pvolumes, monkeypatch):
+ status = {'type': 'LUKS1'}
+ monkeypatch.setattr(device, 'encryption_status', lambda x: status)
+ lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/mapper/uuid")
+ assert disk.is_encrypted is True
+
+ def test_mapper_is_encrypted_luks2(self, device_info, pvolumes, monkeypatch):
+ status = {'type': 'LUKS2'}
+ monkeypatch.setattr(device, 'encryption_status', lambda x: status)
+ lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/mapper/uuid")
+ assert disk.is_encrypted is True
+
+ def test_mapper_is_encrypted_plain(self, device_info, pvolumes, monkeypatch):
+ status = {'type': 'PLAIN'}
+ monkeypatch.setattr(device, 'encryption_status', lambda x: status)
+ lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/mapper/uuid")
+ assert disk.is_encrypted is True
+
+ def test_mapper_is_not_encrypted_plain(self, device_info, pvolumes, monkeypatch):
+ monkeypatch.setattr(device, 'encryption_status', lambda x: {})
+ lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/mapper/uuid")
+ assert disk.is_encrypted is False
+
+ def test_lv_is_encrypted_blkid(self, device_info, pvolumes):
+ lsblk = {'TYPE': 'lvm'}
+ blkid = {'TYPE': 'crypto_LUKS'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ disk.lv_api = {}
+ assert disk.is_encrypted is True
+
+ def test_lv_is_not_encrypted_blkid(self, factory, device_info, pvolumes):
+ lsblk = {'TYPE': 'lvm'}
+ blkid = {'TYPE': 'xfs'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ disk.lv_api = factory(encrypted=None)
+ assert disk.is_encrypted is False
+
+ def test_lv_is_encrypted_lsblk(self, device_info, pvolumes):
+ lsblk = {'FSTYPE': 'crypto_LUKS', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ disk.lv_api = {}
+ assert disk.is_encrypted is True
+
+ def test_lv_is_not_encrypted_lsblk(self, factory, device_info, pvolumes):
+ lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ disk.lv_api = factory(encrypted=None)
+ assert disk.is_encrypted is False
+
+ def test_lv_is_encrypted_lvm_api(self, factory, device_info, pvolumes):
+ lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ disk.lv_api = factory(encrypted=True)
+ assert disk.is_encrypted is True
+
+ def test_lv_is_not_encrypted_lvm_api(self, factory, device_info, pvolumes):
+ lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+ blkid = {'TYPE': 'mapper'}
+ device_info(lsblk=lsblk, blkid=blkid)
+ disk = device.Device("/dev/sda")
+ disk.lv_api = factory(encrypted=False)
+ assert disk.is_encrypted is False
+
class TestDeviceOrdering(object):
assert result['UUID'] == '62416664-cbaf-40bd-9689-10bd337379c3'
assert result['TYPE'] == 'xfs'
+class TestUdevadmProperty(object):
+
+ def test_good_output(self, stub_call):
+ output = """ID_MODEL=SK_hynix_SC311_SATA_512GB
+ID_PART_TABLE_TYPE=gpt
+ID_SERIAL_SHORT=MS83N71801150416A""".split()
+ stub_call((output, [], 0))
+ result = disk.udevadm_property('dev/sda')
+ assert result['ID_MODEL'] == 'SK_hynix_SC311_SATA_512GB'
+ assert result['ID_PART_TABLE_TYPE'] == 'gpt'
+ assert result['ID_SERIAL_SHORT'] == 'MS83N71801150416A'
+
+ def test_property_filter(self, stub_call):
+ output = """ID_MODEL=SK_hynix_SC311_SATA_512GB
+ID_PART_TABLE_TYPE=gpt
+ID_SERIAL_SHORT=MS83N71801150416A""".split()
+ stub_call((output, [], 0))
+ result = disk.udevadm_property('dev/sda', ['ID_MODEL',
+ 'ID_SERIAL_SHORT'])
+ assert result['ID_MODEL'] == 'SK_hynix_SC311_SATA_512GB'
+ assert 'ID_PART_TABLE_TYPE' not in result
+
+ def test_fail_on_broken_output(self, stub_call):
+ output = ["ID_MODEL:SK_hynix_SC311_SATA_512GB"]
+ stub_call((output, [], 0))
+ with pytest.raises(ValueError):
+ disk.udevadm_property('dev/sda')
+
class TestDeviceFamily(object):
assert len(result) == 1
assert result == [ceph_data_path]
+ def test_sda1_partition(self, tmpfile, tmpdir):
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ block_sda_path = os.path.join(block_path, 'sda')
+ block_sda1_path = os.path.join(block_sda_path, 'sda1')
+ block_sda1_holders = os.path.join(block_sda1_path, 'holders')
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ dev_sda1_path = os.path.join(dev_path, 'sda1')
+ os.makedirs(block_sda_path)
+ os.makedirs(block_sda1_path)
+ os.makedirs(dev_sda1_path)
+ os.makedirs(block_sda1_holders)
+ os.makedirs(dev_sda_path)
+ tmpfile('size', '1024', directory=block_sda_path)
+ tmpfile('partition', '1', directory=block_sda1_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert dev_sda_path in list(result.keys())
+ assert '/dev/sda1' in list(result.keys())
+ assert result['/dev/sda1']['holders'] == []
+
def test_sda_size(self, tmpfile, tmpdir):
block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
block_sda_path = os.path.join(block_path, 'sda')
file_name = '/path/does/not/exist'
encryption.dmcrypt_close(file_name)
assert fake_run.calls == []
+
+
+class TestDmcryptKey(object):
+
+ def test_dmcrypt_with_default_size(self, conf_ceph_stub):
+ conf_ceph_stub('[global]\nfsid=asdf-lkjh')
+ result = encryption.create_dmcrypt_key()
+ assert len(result) == 172
+
+ def test_dmcrypt_with_custom_size(self, conf_ceph_stub):
+ conf_ceph_stub('''
+ [global]
+ fsid=asdf
+ [osd]
+ osd_dmcrypt_size=8
+ ''')
+ result = encryption.create_dmcrypt_key()
+ assert len(result) == 172
class TestStrToInt(object):
- def test_passing_a_float_str(self):
- result = util.str_to_int("1.99")
+ def test_passing_a_float_str_comma(self):
+ result = util.str_to_int("1,99")
assert result == 1
- def test_passing_a_float_does_not_round(self):
- result = util.str_to_int("1.99", round_down=False)
+ def test_passing_a_float_does_not_round_comma(self):
+ result = util.str_to_int("1,99", round_down=False)
+ assert result == 2
+
+ @pytest.mark.parametrize("value", ['2', 2])
+ def test_passing_an_int(self, value):
+ result = util.str_to_int(value)
+ assert result == 2
+
+ @pytest.mark.parametrize("value", ['1.99', 1.99])
+ def test_passing_a_float(self, value):
+ result = util.str_to_int(value)
+ assert result == 1
+
+ @pytest.mark.parametrize("value", ['1.99', 1.99])
+ def test_passing_a_float_does_not_round(self, value):
+ result = util.str_to_int(value, round_down=False)
assert result == 2
def test_text_is_not_an_integer_like(self):
util.str_to_int("1.4GB")
assert str(error.value) == "Unable to convert to integer: '1.4GB'"
+ def test_input_is_not_string(self):
+ with pytest.raises(RuntimeError) as error:
+ util.str_to_int(None)
+ assert str(error.value) == "Unable to convert to integer: 'None'"
+
def true_responses(upper_casing=False):
if upper_casing:
def test_trueish(self, response):
fake_input = lambda x: response
qx = 'what the what?'
- assert util.prompt_bool(qx, _raw_input=fake_input) is True
+ assert util.prompt_bool(qx, input_=fake_input) is True
@pytest.mark.parametrize('response', false_responses())
def test_falseish(self, response):
fake_input = lambda x: response
qx = 'what the what?'
- assert util.prompt_bool(qx, _raw_input=fake_input) is False
+ assert util.prompt_bool(qx, input_=fake_input) is False
def test_try_again_true(self):
responses = ['g', 'h', 'y']
fake_input = lambda x: responses.pop(0)
qx = 'what the what?'
- assert util.prompt_bool(qx, _raw_input=fake_input) is True
+ assert util.prompt_bool(qx, input_=fake_input) is True
def test_try_again_false(self):
responses = ['g', 'h', 'n']
fake_input = lambda x: responses.pop(0)
qx = 'what the what?'
- assert util.prompt_bool(qx, _raw_input=fake_input) is False
+ assert util.prompt_bool(qx, input_=fake_input) is False
from math import floor
from ceph_volume import terminal
+try:
+ input = raw_input # pylint: disable=redefined-builtin
+except NameError:
+ pass
logger = logging.getLogger(__name__)
"""
Parses a string number into an integer, optionally converting to a float
and rounding down.
+
+ Some LVM values may come with a comma instead of a dot to define decimals.
+ This function normalizes a comma into a dot
"""
error_msg = "Unable to convert to integer: '%s'" % str(string)
try:
- integer = float(string)
+ integer = float(string.replace(',', '.'))
+ except AttributeError:
+        # this might be an integer already, so try to use it, otherwise raise
+ # the original exception
+ if isinstance(string, (int, float)):
+ integer = string
+ else:
+ logger.exception(error_msg)
+ raise RuntimeError(error_msg)
except (TypeError, ValueError):
logger.exception(error_msg)
raise RuntimeError(error_msg)
raise ValueError("Invalid input value: %s" % val)
-def prompt_bool(question, _raw_input=None):
+def prompt_bool(question, input_=None):
"""
Interface to prompt a boolean (or boolean-like) response from a user.
Usually a confirmation.
"""
- input_prompt = _raw_input or raw_input
+ input_prompt = input_ or input
prompt_format = '--> {question} '.format(question=question)
response = input_prompt(prompt_format)
try:
terminal.error('Valid true responses are: y, yes, <Enter>')
terminal.error('Valid false responses are: n, no')
terminal.error('That response was invalid, please try again')
- return prompt_bool(question, _raw_input=input_prompt)
+ return prompt_bool(question, input_=input_prompt)
class ValidDevice(object):
- def __init__(self, as_string=False):
+ def __init__(self, as_string=False, gpt_ok=False):
self.as_string = as_string
+ self.gpt_ok = gpt_ok
def __call__(self, string):
device = Device(string)
error = None
if not device.exists:
error = "Unable to proceed with non-existing device: %s" % string
- elif device.has_gpt_headers:
+ # FIXME this is not a nice API, this validator was meant to catch any
+ # non-existing devices upfront, not check for gpt headers. Now this
+ # needs to optionally skip checking gpt headers which is beyond
+ # verifying if the device exists. The better solution would be to
+ # configure this with a list of checks that can be excluded/included on
+ # __init__
+ elif device.has_gpt_headers and not self.gpt_ok:
error = "GPT headers found, they must be removed on: %s" % string
if error:
{dev:<25} {size:<12} {rot!s:<7} {available!s:<9} {model}"""
+def encryption_status(abspath):
+ """
+ Helper function to run ``encryption.status()``. It is done here to avoid
+ a circular import issue (encryption module imports from this module) and to
+ ease testing by allowing monkeypatching of this function.
+ """
+ from ceph_volume.util import encryption
+ return encryption.status(abspath)
+
+
class Devices(object):
"""
A container for Device instances with reporting
self._is_lvm_member = None
self._parse()
self.available, self.rejected_reasons = self._check_reject_reasons()
+ self.device_id = self._get_device_id()
def __lt__(self, other):
'''
output['lvs'] = [lv.report() for lv in self.lvs]
return output
+ def _get_device_id(self):
+ """
+ Please keep this implementation in sync with get_device_id() in
+ src/common/blkdev.cc
+ """
+ props = ['ID_VENDOR','ID_MODEL','ID_SERIAL_SHORT', 'ID_SERIAL',
+ 'ID_SCSI_SERIAL']
+ p = disk.udevadm_property(self.abspath, props)
+ if 'ID_VENDOR' in p and 'ID_MODEL' in p and 'ID_SCSI_SERIAL' in p:
+ dev_id = '_'.join([p['ID_VENDOR'], p['ID_MODEL'],
+ p['ID_SCSI_SERIAL']])
+ elif 'ID_MODEL' in p and 'ID_SERIAL_SHORT' in p:
+ dev_id = '_'.join([p['ID_MODEL'], p['ID_SERIAL_SHORT']])
+ elif 'ID_SERIAL' in p:
+ dev_id = p['ID_SERIAL']
+ if dev_id.startswith('MTFD'):
+ # Micron NVMes hide the vendor
+ dev_id = 'Micron_' + dev_id
+ else:
+ # the else branch should fallback to using sysfs and ioctl to
+ # retrieve device_id on FreeBSD. Still figuring out if/how the
+ # python ioctl implementation does that on FreeBSD
+ dev_id = ''
+ dev_id.replace(' ', '_')
+ return dev_id
+
def _set_lvm_membership(self):
if self._is_lvm_member is None:
# this is contentious, if a PV is recognized by LVM but has no
pvs.filter(pv_name=path)
has_vgs = [pv.vg_name for pv in pvs if pv.vg_name]
if has_vgs:
+ self.vgs = list(set(has_vgs))
# a pv can only be in one vg, so this should be safe
self.vg_name = has_vgs[0]
self._is_lvm_member = True
lv = lvm.get_lv(vg_name=pv.vg_name, lv_uuid=pv.lv_uuid)
if lv:
self.lvs.append(lv)
+ else:
+ self.vgs = []
return self._is_lvm_member
def _get_pv_paths(self):
@property
def is_ceph_disk_member(self):
- return self.ceph_disk.is_member
+ is_member = self.ceph_disk.is_member
+ if self.sys_api.get("partitions"):
+ for part in self.sys_api.get("partitions").keys():
+ part = Device("/dev/%s" % part)
+ if part.is_ceph_disk_member:
+ is_member = True
+ break
+ return is_member
@property
def is_mapper(self):
- return self.path.startswith('/dev/mapper')
+ return self.path.startswith(('/dev/mapper', '/dev/dm-'))
@property
def is_lv(self):
@property
def is_device(self):
if self.disk_api:
- return self.disk_api['TYPE'] == 'device'
+ is_device = self.disk_api['TYPE'] == 'device'
+ is_disk = self.disk_api['TYPE'] == 'disk'
+ if is_device or is_disk:
+ return True
return False
+ @property
+ def is_encrypted(self):
+ """
+ Only correct for LVs, device mappers, and partitions. Will report a ``None``
+ for raw devices.
+ """
+ crypt_reports = [self.blkid_api.get('TYPE', ''), self.disk_api.get('FSTYPE', '')]
+ if self.is_lv:
+ # if disk APIs are reporting this is encrypted use that:
+ if 'crypto_LUKS' in crypt_reports:
+ return True
+ # if ceph-volume created this, then a tag would let us know
+ elif self.lv_api.encrypted:
+ return True
+ return False
+ elif self.is_partition:
+ return 'crypto_LUKS' in crypt_reports
+ elif self.is_mapper:
+ active_mapper = encryption_status(self.abspath)
+ if active_mapper:
+ # normalize a bit to ensure same values regardless of source
+ encryption_type = active_mapper['type'].lower().strip('12') # turn LUKS1 or LUKS2 into luks
+ return True if encryption_type in ['plain', 'luks'] else False
+ else:
+ return False
+ else:
+ return None
+
@property
def used_by_ceph(self):
# only filter out data devices as journals could potentially be reused
]
rejected = [reason for (k, v, reason) in reasons if
self.sys_api.get(k, '') == v]
+ if self.is_ceph_disk_member:
+ rejected.append("Used by ceph-disk")
+
return len(rejected) == 0, rejected
return ' '.join(out).strip()
+def remove_partition(device):
+ """
+ Removes a partition using parted
+
+ :param device: A ``Device()`` object
+ """
+ parent_device = '/dev/%s' % device.disk_api['PKNAME']
+ udev_info = udevadm_property(device.abspath)
+ partition_number = udev_info.get('ID_PART_ENTRY_NUMBER')
+ if not partition_number:
+ raise RuntimeError('Unable to detect the partition number for device: %s' % device.abspath)
+
+ process.run(
+ ['parted', parent_device, '--script', '--', 'rm', partition_number]
+ )
+
+
def _stat_is_device(stat_obj):
"""
Helper function that will interpret ``os.stat`` output directly, so that other
return devices
+def udevadm_property(device, properties=[]):
+ """
+ Query udevadm for information about device properties.
+ Optionally pass a list of properties to return. A requested property might
+ not be returned if not present.
+
+ Expected output format::
+ # udevadm info --query=property --name=/dev/sda :(
+ DEVNAME=/dev/sda
+ DEVTYPE=disk
+ ID_ATA=1
+ ID_BUS=ata
+ ID_MODEL=SK_hynix_SC311_SATA_512GB
+ ID_PART_TABLE_TYPE=gpt
+ ID_PART_TABLE_UUID=c8f91d57-b26c-4de1-8884-0c9541da288c
+ ID_PATH=pci-0000:00:17.0-ata-3
+ ID_PATH_TAG=pci-0000_00_17_0-ata-3
+ ID_REVISION=70000P10
+ ID_SERIAL=SK_hynix_SC311_SATA_512GB_MS83N71801150416A
+ TAGS=:systemd:
+ USEC_INITIALIZED=16117769
+ ...
+ """
+ out = _udevadm_info(device)
+ ret = {}
+ for line in out:
+ p, v = line.split('=', 1)
+ if not properties or p in properties:
+ ret[p] = v
+ return ret
+
+
+def _udevadm_info(device):
+ """
+ Call udevadm and return the output
+ """
+ cmd = ['udevadm', 'info', '--query=property', device]
+ out, _err, _rc = process.call(cmd)
+ return out
+
+
def lsblk(device, columns=None, abspath=False):
"""
Create a dictionary of identifying values for a device using ``lsblk``.
folder_path = os.path.join(sys_block_path, folder)
if os.path.exists(os.path.join(folder_path, 'partition')):
contents = get_file_contents(os.path.join(folder_path, 'partition'))
- if '1' in contents:
+ if contents:
part = {}
partname = folder
part_sys_block_path = os.path.join(sys_block_path, partname)
part['sectorsize'] = get_file_contents(
part_sys_block_path + "/queue/hw_sector_size", 512)
part['size'] = human_readable_size(float(part['sectors']) * 512)
+ part['holders'] = []
+ for holder in os.listdir(part_sys_block_path + '/holders'):
+ part['holders'].append(holder)
partition_metadata[partname] = part
return partition_metadata
metadata['path'] = diskname
metadata['locked'] = is_locked_raw_device(metadata['path'])
+ for part_name, part_metadata in metadata['partitions'].items():
+ part_abspath = '/dev/%s' % part_name
+ device_facts[part_abspath] = part_metadata
+
device_facts[diskname] = metadata
return device_facts
)
# The size of the key is defined in bits, so we must transform that
# value to bytes (dividing by 8) because we read in bytes, not bits
- random_string = os.urandom(dmcrypt_key_size / 8)
+ random_string = os.urandom(int(dmcrypt_key_size / 8))
key = base64.b64encode(random_string).decode('utf-8')
return key
'cryptsetup',
'--key-file',
'-',
+ '--allow-discards', # allow discards (aka TRIM) requests for device
'open',
device,
mapping,
'cryptsetup',
'--key-file',
'-',
+ '--allow-discards', # allow discards (aka TRIM) requests for device
'luksOpen',
device,
mapping,
f->dump_int("mds_epoch", mdsmap->get_epoch());
f->dump_int("osd_epoch", osd_epoch);
f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
+ f->dump_bool("blacklisted", blacklisted);
}
}
return o.is_blacklisted(myaddr);});
}
+  // Keep subscribing to the next osdmap until this client
+  // is no longer blacklisted.
+ if (blacklisted) {
+ objecter->maybe_request_map();
+ }
+
if (objecter->osdmap_full_flag()) {
_handle_full_flag(-1);
} else {
void Client::handle_mds_map(MMDSMap* m)
{
+ mds_gid_t old_inc, new_inc;
if (m->get_epoch() <= mdsmap->get_epoch()) {
ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
<< " is identical to or older than our "
<< mdsmap->get_epoch() << dendl;
m->put();
return;
- }
+ }
ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
if (!mdsmap->is_up(mds)) {
session->con->mark_down();
} else if (mdsmap->get_inst(mds) != session->inst) {
+ old_inc = oldmap->get_incarnation(mds);
+ new_inc = mdsmap->get_incarnation(mds);
+ if (old_inc != new_inc) {
+ ldout(cct, 1) << "mds incarnation changed from "
+ << old_inc << " to " << new_inc << dendl;
+ oldstate = MDSMap::STATE_NULL;
+ }
session->con->mark_down();
session->inst = mdsmap->get_inst(mds);
// When new MDS starts to take over, notify kernel to trim unused entries
continue; // no change
session->mds_state = newstate;
+ if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
+      // missed reconnect; close the session so that it can be reopened
+ _closed_mds_session(session);
+ continue;
+ }
if (newstate == MDSMap::STATE_RECONNECT) {
session->con = messenger->get_connection(session->inst);
send_reconnect(session);
tcap->cap_id = m->peer.cap_id;
tcap->seq = m->peer.seq - 1;
tcap->issue_seq = tcap->seq;
- tcap->mseq = m->peer.mseq;
tcap->issued |= cap->issued;
tcap->implemented |= cap->issued;
if (cap == in->auth_cap)
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
const struct iovec *iov, int iovcnt)
{
+ uint64_t fpos = 0;
+
if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
return -EFBIG;
}
}
offset = f->pos;
- f->pos = offset+size;
+ fpos = offset+size;
unlock_fh_pos(f);
}
lat -= start;
logger->tinc(l_c_wrlat, lat);
+ if (fpos) {
+ lock_fh_pos(f);
+ f->pos = fpos;
+ unlock_fh_pos(f);
+ }
totalwritten = size;
r = (int)totalwritten;
#define LOCK_PREFIX "lock."
-static int read_lock(cls_method_context_t hctx, const string& name, lock_info_t *lock)
+static int clean_lock(cls_method_context_t hctx)
+{
+ int r = cls_cxx_remove(hctx);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int read_lock(cls_method_context_t hctx,
+ const string& name,
+ lock_info_t *lock)
{
bufferlist bl;
string key = LOCK_PREFIX;
map<locker_id_t, locker_info_t>::iterator iter = lock->lockers.begin();
while (iter != lock->lockers.end()) {
- map<locker_id_t, locker_info_t>::iterator next = iter;
- ++next;
-
struct locker_info_t& info = iter->second;
if (!info.expiration.is_zero() && info.expiration < now) {
CLS_LOG(20, "expiring locker");
- lock->lockers.erase(iter);
+ iter = lock->lockers.erase(iter);
+ } else {
+ ++iter;
}
+ }
- iter = next;
+ if (lock->lockers.empty() && cls_lock_is_ephemeral(lock->lock_type)) {
+ r = clean_lock(hctx);
+ if (r < 0) {
+ CLS_ERR("error, on read, cleaning lock object %s", cpp_strerror(r).c_str());
+ }
}
return 0;
const string& cookie,
const string& tag)
{
- bool exclusive = lock_type == LOCK_EXCLUSIVE;
+ bool exclusive = cls_lock_is_exclusive(lock_type);
lock_info_t linfo;
- bool fail_if_exists = (flags & LOCK_FLAG_RENEW) == 0;
+ bool fail_if_exists = (flags & LOCK_FLAG_MAY_RENEW) == 0;
+ bool fail_if_does_not_exist = flags & LOCK_FLAG_MUST_RENEW;
- CLS_LOG(20, "requested lock_type=%s fail_if_exists=%d", cls_lock_type_str(lock_type), fail_if_exists);
- if (lock_type != LOCK_EXCLUSIVE &&
- lock_type != LOCK_SHARED)
+ CLS_LOG(20,
+ "requested lock_type=%s fail_if_exists=%d fail_if_does_not_exist=%d",
+ cls_lock_type_str(lock_type), fail_if_exists, fail_if_does_not_exist);
+ if (!cls_lock_is_valid(lock_type)) {
return -EINVAL;
+ }
if (name.empty())
return -EINVAL;
+ if (!fail_if_exists && fail_if_does_not_exist) {
+ // at most one of LOCK_FLAG_MAY_RENEW and LOCK_FLAG_MUST_RENEW may
+ // be set since they have different implications if the lock does
+ // not already exist
+ return -EINVAL;
+ }
+
// see if there's already a locker
int r = read_lock(hctx, name, &linfo);
if (r < 0 && r != -ENOENT) {
CLS_ERR("Could not read lock info: %s", cpp_strerror(r).c_str());
return r;
}
+
map<locker_id_t, locker_info_t>& lockers = linfo.lockers;
map<locker_id_t, locker_info_t>::iterator iter;
CLS_LOG(20, "existing_lock_type=%s", cls_lock_type_str(existing_lock_type));
iter = lockers.find(id);
if (iter != lockers.end()) {
- if (fail_if_exists) {
+ if (fail_if_exists && !fail_if_does_not_exist) {
return -EEXIST;
} else {
lockers.erase(iter); // remove old entry
}
+ } else if (fail_if_does_not_exist) {
+ return -ENOENT;
}
if (!lockers.empty()) {
* entity or cookie is wrong), or -errno on other error.
*/
static int remove_lock(cls_method_context_t hctx,
- const string& name,
- entity_name_t& locker,
- const string& cookie)
+ const string& name,
+ entity_name_t& locker,
+ const string& cookie)
{
// get current lockers
lock_info_t linfo;
}
lockers.erase(iter);
- r = write_lock(hctx, name, linfo);
+ if (cls_lock_is_ephemeral(linfo.lock_type)) {
+ ceph_assert(lockers.empty());
+ r = clean_lock(hctx);
+ } else {
+ r = write_lock(hctx, name, linfo);
+ }
return r;
}
* is wrong), or -errno on other (unexpected) error.
*/
static int break_lock(cls_method_context_t hctx,
- bufferlist *in, bufferlist *out)
+ bufferlist *in, bufferlist *out)
{
CLS_LOG(20, "break_lock");
cls_lock_break_op op;
return -EINVAL;
}
- if (op.type != LOCK_EXCLUSIVE && op.type != LOCK_SHARED) {
+ if (!cls_lock_is_valid(op.type)) {
return -EINVAL;
}
return -EINVAL;
}
- if (op.type != LOCK_EXCLUSIVE && op.type != LOCK_SHARED) {
+ if (!cls_lock_is_valid(op.type)) {
return -EINVAL;
}
rados_op->exec("lock", "set_cookie", in);
}
+ void Lock::assert_locked_shared(ObjectOperation *op)
+ {
+ assert_locked(op, name, LOCK_SHARED, cookie, tag);
+ }
+
void Lock::assert_locked_exclusive(ObjectOperation *op)
{
assert_locked(op, name, LOCK_EXCLUSIVE, cookie, tag);
}
- void Lock::assert_locked_shared(ObjectOperation *op)
+ void Lock::assert_locked_exclusive_ephemeral(ObjectOperation *op)
{
- assert_locked(op, name, LOCK_SHARED, cookie, tag);
+ assert_locked(op, name, LOCK_EXCLUSIVE_EPHEMERAL, cookie, tag);
}
void Lock::lock_shared(ObjectWriteOperation *op)
cookie, tag, description, duration, flags);
}
+ void Lock::lock_exclusive_ephemeral(ObjectWriteOperation *op)
+ {
+ lock(op, name, LOCK_EXCLUSIVE_EPHEMERAL,
+ cookie, tag, description, duration, flags);
+ }
+
+ int Lock::lock_exclusive_ephemeral(IoCtx *ioctx, const string& oid)
+ {
+ return lock(ioctx, oid, name, LOCK_EXCLUSIVE_EPHEMERAL,
+ cookie, tag, description, duration, flags);
+ }
+
void Lock::unlock(ObjectWriteOperation *op)
{
rados::cls::lock::unlock(op, name, cookie);
#ifndef CEPH_CLS_LOCK_CLIENT_H
#define CEPH_CLS_LOCK_CLIENT_H
+#include <chrono>
+
#include "cls/lock/cls_lock_types.h"
namespace librados {
void set_tag(const std::string& t) { tag = t; }
void set_description(const std::string& desc) { description = desc; }
void set_duration(const utime_t& e) { duration = e; }
- void set_renew(bool renew) {
+ void set_duration(const ceph::timespan& d) {
+ duration = utime_t(ceph::real_clock::time_point::min() + d);
+ }
+
+ void set_may_renew(bool renew) {
if (renew) {
- flags |= LOCK_FLAG_RENEW;
+ flags |= LOCK_FLAG_MAY_RENEW;
+ flags &= ~LOCK_FLAG_MUST_RENEW; // if may then not must
} else {
- flags &= ~LOCK_FLAG_RENEW;
+ flags &= ~LOCK_FLAG_MAY_RENEW;
+ }
+ }
+
+ void set_must_renew(bool renew) {
+ if (renew) {
+ flags |= LOCK_FLAG_MUST_RENEW;
+ flags &= ~LOCK_FLAG_MAY_RENEW; // if must then not may
+ } else {
+ flags &= ~LOCK_FLAG_MUST_RENEW;
}
}
- void assert_locked_exclusive(librados::ObjectOperation *rados_op);
void assert_locked_shared(librados::ObjectOperation *rados_op);
+ void assert_locked_exclusive(librados::ObjectOperation *rados_op);
+ void assert_locked_exclusive_ephemeral(librados::ObjectOperation *rados_op);
/* ObjectWriteOperation */
- void lock_exclusive(librados::ObjectWriteOperation *ioctx);
void lock_shared(librados::ObjectWriteOperation *ioctx);
+ void lock_exclusive(librados::ObjectWriteOperation *ioctx);
+
+ // Be careful when using an exclusive ephemeral lock; it is
+ // intended strictly for cases when a lock object exists
+ // solely for a lock in a given process and the object is no
+ // longer needed when the lock is unlocked or expired, as the
+ // cls back-end will make an effort to delete it.
+ void lock_exclusive_ephemeral(librados::ObjectWriteOperation *ioctx);
void unlock(librados::ObjectWriteOperation *ioctx);
- void break_lock(librados::ObjectWriteOperation *ioctx, const entity_name_t& locker);
+ void break_lock(librados::ObjectWriteOperation *ioctx,
+ const entity_name_t& locker);
/* IoCtx */
- int lock_exclusive(librados::IoCtx *ioctx, const std::string& oid);
int lock_shared(librados::IoCtx *ioctx, const std::string& oid);
+ int lock_exclusive(librados::IoCtx *ioctx, const std::string& oid);
+
+ // NB: see above comment on exclusive ephemeral locks
+ int lock_exclusive_ephemeral(librados::IoCtx *ioctx,
+ const std::string& oid);
int unlock(librados::IoCtx *ioctx, const std::string& oid);
int break_lock(librados::IoCtx *ioctx, const std::string& oid,
const entity_name_t& locker);
i->tag = "tag";
i->description = "description";
i->duration = utime_t(5, 0);
- i->flags = LOCK_FLAG_RENEW;
+ i->flags = LOCK_FLAG_MAY_RENEW;
o.push_back(i);
o.push_back(new cls_lock_lock_op);
}
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
#ifndef CEPH_CLS_LOCK_OPS_H
#define CEPH_CLS_LOCK_OPS_H
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
#ifndef CEPH_CLS_LOCK_TYPES_H
#define CEPH_CLS_LOCK_TYPES_H
#include "msg/msg_types.h"
/* lock flags */
-#define LOCK_FLAG_RENEW 0x1 /* idempotent lock acquire */
+#define LOCK_FLAG_MAY_RENEW 0x1 /* idempotent lock acquire */
+#define LOCK_FLAG_MUST_RENEW 0x2 /* lock must already be acquired */
enum ClsLockType {
- LOCK_NONE = 0,
- LOCK_EXCLUSIVE = 1,
- LOCK_SHARED = 2,
+ LOCK_NONE = 0,
+ LOCK_EXCLUSIVE = 1,
+ LOCK_SHARED = 2,
+ LOCK_EXCLUSIVE_EPHEMERAL = 3, /* lock object is removed @ unlock */
};
static inline const char *cls_lock_type_str(ClsLockType type)
return "exclusive";
case LOCK_SHARED:
return "shared";
+ case LOCK_EXCLUSIVE_EPHEMERAL:
+ return "exclusive-ephemeral";
default:
return "<unknown>";
}
}
+inline bool cls_lock_is_exclusive(ClsLockType type) {
+ return LOCK_EXCLUSIVE == type || LOCK_EXCLUSIVE_EPHEMERAL == type;
+}
+
+inline bool cls_lock_is_ephemeral(ClsLockType type) {
+ return LOCK_EXCLUSIVE_EPHEMERAL == type;
+}
+
+inline bool cls_lock_is_valid(ClsLockType type) {
+ return LOCK_SHARED == type ||
+ LOCK_EXCLUSIVE == type ||
+ LOCK_EXCLUSIVE_EPHEMERAL == type;
+}
+
namespace rados {
namespace cls {
namespace lock {
if (ret < 0) {
return ret;
}
+ }
+
+ removing = existed && op.delete_marker;
+ if (!removing) {
ret = other_obj.unlink();
if (ret < 0) {
return ret;
}
}
-
- removing = existed && op.delete_marker;
} else {
removing = (existed && !obj.is_delete_marker() && op.delete_marker);
}
static int rgw_clear_bucket_resharding(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
- cls_rgw_set_bucket_resharding_op op;
+ cls_rgw_clear_bucket_resharding_op op;
bufferlist::iterator in_iter = in->begin();
try {
return true;
}
-void cls_rgw_bucket_init(ObjectWriteOperation& o)
+// note: currently only called by testing code
+void cls_rgw_bucket_init_index(ObjectWriteOperation& o)
{
bufferlist in;
o.exec(RGW_CLASS, RGW_BUCKET_INIT_INDEX, in);
}
static bool issue_bucket_index_init_op(librados::IoCtx& io_ctx,
- const string& oid, BucketIndexAioManager *manager) {
+ const string& oid,
+ BucketIndexAioManager *manager) {
bufferlist in;
librados::ObjectWriteOperation op;
op.create(true);
return manager->aio_operate(io_ctx, oid, &op);
}
+static bool issue_bucket_index_clean_op(librados::IoCtx& io_ctx,
+ const string& oid,
+ BucketIndexAioManager *manager) {
+ bufferlist in;
+ librados::ObjectWriteOperation op;
+ op.remove();
+ return manager->aio_operate(io_ctx, oid, &op);
+}
+
static bool issue_bucket_set_tag_timeout_op(librados::IoCtx& io_ctx,
const string& oid, uint64_t timeout, BucketIndexAioManager *manager) {
bufferlist in;
void CLSRGWIssueBucketIndexInit::cleanup()
{
// Do best effort removal
- for (map<int, string>::iterator citer = objs_container.begin(); citer != iter; ++citer) {
+ for (auto citer = objs_container.begin(); citer != iter; ++citer) {
io_ctx.remove(citer->second);
}
}
+int CLSRGWIssueBucketIndexClean::issue_op(int shard_id, const string& oid)
+{
+ return issue_bucket_index_clean_op(io_ctx, oid, &manager);
+}
+
int CLSRGWIssueSetTagTimeout::issue_op(int shard_id, const string& oid)
{
return issue_bucket_set_tag_timeout_op(io_ctx, oid, tag_timeout, &manager);
{
return issue_set_bucket_resharding(io_ctx, oid, entry, &manager);
}
-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
#ifndef CEPH_CLS_RGW_CLIENT_H
#define CEPH_CLS_RGW_CLIENT_H
};
/* bucket index */
-void cls_rgw_bucket_init(librados::ObjectWriteOperation& o);
+void cls_rgw_bucket_init_index(librados::ObjectWriteOperation& o);
class CLSRGWConcurrentIO {
protected:
virtual void reset_container(map<int, string>& objs) {}
public:
- CLSRGWConcurrentIO(librados::IoCtx& ioc, map<int, string>& _objs_container,
- uint32_t _max_aio) : io_ctx(ioc), objs_container(_objs_container), max_aio(_max_aio) {}
- virtual ~CLSRGWConcurrentIO() {}
+
+ CLSRGWConcurrentIO(librados::IoCtx& ioc,
+ map<int, string>& _objs_container,
+ uint32_t _max_aio) :
+ io_ctx(ioc), objs_container(_objs_container), max_aio(_max_aio)
+ {}
+
+ virtual ~CLSRGWConcurrentIO()
+ {}
int operator()() {
int ret = 0;
CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio) {}
};
+
+class CLSRGWIssueBucketIndexClean : public CLSRGWConcurrentIO {
+protected:
+ int issue_op(int shard_id, const string& oid) override;
+ int valid_ret_code() override {
+ return -ENOENT;
+ }
+
+public:
+ CLSRGWIssueBucketIndexClean(librados::IoCtx& ioc,
+ map<int, string>& _bucket_objs,
+ uint32_t _max_aio) :
+ CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio)
+ {}
+};
+
+
class CLSRGWIssueSetTagTimeout : public CLSRGWConcurrentIO {
uint64_t tag_timeout;
protected:
int cls_rgw_reshard_get_head(librados::IoCtx& io_ctx, const string& oid, cls_rgw_reshard_entry& entry);
void cls_rgw_reshard_remove(librados::ObjectWriteOperation& op, const cls_rgw_reshard_entry& entry);
-/* resharding attribute */
+/* resharding attribute on bucket index shard headers */
int cls_rgw_set_bucket_resharding(librados::IoCtx& io_ctx, const string& oid,
const cls_rgw_bucket_instance_entry& entry);
int cls_rgw_clear_bucket_resharding(librados::IoCtx& io_ctx, const string& oid);
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
#ifndef CEPH_CLS_RGW_TYPES_H
#define CEPH_CLS_RGW_TYPES_H
CLS_RGW_RESHARD_DONE = 2,
};
+static inline std::string to_string(const enum cls_rgw_reshard_status status)
+{
+ switch (status) {
+ case CLS_RGW_RESHARD_NONE:
+ return "CLS_RGW_RESHARD_NONE";
+ break;
+ case CLS_RGW_RESHARD_IN_PROGRESS:
+ return "CLS_RGW_RESHARD_IN_PROGRESS";
+ break;
+ case CLS_RGW_RESHARD_DONE:
+ return "CLS_RGW_RESHARD_DONE";
+ break;
+ default:
+ break;
+ };
+ return "Unknown reshard status";
+}
+
struct cls_rgw_bucket_instance_entry {
cls_rgw_reshard_status reshard_status{CLS_RGW_RESHARD_NONE};
string new_bucket_instance_id;
#define CEPH_COND_H
#include "include/Context.h"
-
-class Cond {
- // my bits
- pthread_cond_t _c;
-
- Mutex *waiter_mutex;
-
- // don't allow copying.
- void operator=(Cond &C);
- Cond(const Cond &C);
-
- public:
- Cond() : waiter_mutex(NULL) {
- int r = pthread_cond_init(&_c,NULL);
- assert(r == 0);
- }
- virtual ~Cond() {
- pthread_cond_destroy(&_c);
- }
-
- int Wait(Mutex &mutex) {
- // make sure this cond is used with one mutex only
- assert(waiter_mutex == NULL || waiter_mutex == &mutex);
- waiter_mutex = &mutex;
-
- assert(mutex.is_locked());
-
- mutex._pre_unlock();
- int r = pthread_cond_wait(&_c, &mutex._m);
- mutex._post_lock();
- return r;
- }
-
- int WaitUntil(Mutex &mutex, utime_t when) {
- // make sure this cond is used with one mutex only
- assert(waiter_mutex == NULL || waiter_mutex == &mutex);
- waiter_mutex = &mutex;
-
- assert(mutex.is_locked());
-
- struct timespec ts;
- when.to_timespec(&ts);
-
- mutex._pre_unlock();
- int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
- mutex._post_lock();
-
- return r;
- }
-
- int WaitInterval(Mutex &mutex, utime_t interval) {
- utime_t when = ceph_clock_now();
- when += interval;
- return WaitUntil(mutex, when);
- }
-
- template<typename Duration>
- int WaitInterval(Mutex &mutex, Duration interval) {
- ceph::real_time when(ceph::real_clock::now());
- when += interval;
-
- struct timespec ts = ceph::real_clock::to_timespec(when);
-
- mutex._pre_unlock();
- int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
- mutex._post_lock();
-
- return r;
- }
-
- int SloppySignal() {
- int r = pthread_cond_broadcast(&_c);
- return r;
- }
- int Signal() {
- // make sure signaler is holding the waiter's lock.
- assert(waiter_mutex == NULL ||
- waiter_mutex->is_locked());
-
- int r = pthread_cond_broadcast(&_c);
- return r;
- }
- int SignalOne() {
- // make sure signaler is holding the waiter's lock.
- assert(waiter_mutex == NULL ||
- waiter_mutex->is_locked());
-
- int r = pthread_cond_signal(&_c);
- return r;
- }
- int SignalAll() {
- // make sure signaler is holding the waiter's lock.
- assert(waiter_mutex == NULL ||
- waiter_mutex->is_locked());
-
- int r = pthread_cond_broadcast(&_c);
- return r;
- }
-};
+#include "CondVar.h"
/**
* context to signal a cond
--- /dev/null
+#ifndef CEPH_COND_VAR_H
+#define CEPH_COND_VAR_H
+
+#include "include/utime.h"
+
+#include "Clock.h"
+#include "Mutex.h"
+#include "pthread.h"
+
+class Cond {
+ // my bits
+ pthread_cond_t _c;
+
+ Mutex *waiter_mutex;
+
+ // don't allow copying.
+ void operator=(Cond &C);
+ Cond(const Cond &C);
+
+ public:
+ Cond() : waiter_mutex(NULL) {
+ int r = pthread_cond_init(&_c,NULL);
+ assert(r == 0);
+ }
+ virtual ~Cond() {
+ pthread_cond_destroy(&_c);
+ }
+
+ int Wait(Mutex &mutex) {
+ // make sure this cond is used with one mutex only
+ assert(waiter_mutex == NULL || waiter_mutex == &mutex);
+ waiter_mutex = &mutex;
+
+ assert(mutex.is_locked());
+
+ mutex._pre_unlock();
+ int r = pthread_cond_wait(&_c, &mutex._m);
+ mutex._post_lock();
+ return r;
+ }
+
+ int WaitUntil(Mutex &mutex, utime_t when) {
+ // make sure this cond is used with one mutex only
+ assert(waiter_mutex == NULL || waiter_mutex == &mutex);
+ waiter_mutex = &mutex;
+
+ assert(mutex.is_locked());
+
+ struct timespec ts;
+ when.to_timespec(&ts);
+
+ mutex._pre_unlock();
+ int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+ mutex._post_lock();
+
+ return r;
+ }
+
+ int WaitInterval(Mutex &mutex, utime_t interval) {
+ utime_t when = ceph_clock_now();
+ when += interval;
+ return WaitUntil(mutex, when);
+ }
+
+ template<typename Duration>
+ int WaitInterval(Mutex &mutex, Duration interval) {
+ ceph::real_time when(ceph::real_clock::now());
+ when += interval;
+
+ struct timespec ts = ceph::real_clock::to_timespec(when);
+
+ mutex._pre_unlock();
+ int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+ mutex._post_lock();
+
+ return r;
+ }
+
+ int SloppySignal() {
+ int r = pthread_cond_broadcast(&_c);
+ return r;
+ }
+ int Signal() {
+ // make sure signaler is holding the waiter's lock.
+ assert(waiter_mutex == NULL ||
+ waiter_mutex->is_locked());
+
+ int r = pthread_cond_broadcast(&_c);
+ return r;
+ }
+ int SignalOne() {
+ // make sure signaler is holding the waiter's lock.
+ assert(waiter_mutex == NULL ||
+ waiter_mutex->is_locked());
+
+ int r = pthread_cond_signal(&_c);
+ return r;
+ }
+ int SignalAll() {
+ // make sure signaler is holding the waiter's lock.
+ assert(waiter_mutex == NULL ||
+ waiter_mutex->is_locked());
+
+ int r = pthread_cond_broadcast(&_c);
+ return r;
+ }
+};
+
+#endif // CEPH_COND_VAR_H
while (i != sdata->ops_in_flight_sharded.end() &&
i->get_initiated() < too_old) {
- if (!i->warn_interval_multiplier)
+ if (!i->warn_interval_multiplier) {
+ ++i;
continue;
+ }
(*slow)++;
K key; // klass
ListPairs lp;
Klass(K& k) :
- key(k)
- {}
+ key(k) {
+ }
+ ~Klass() {
+ lp.clear_and_dispose(DelItem<ListPair>());
+ }
friend bool operator< (const Klass &a, const Klass &b)
{ return a.key < b.key; }
friend bool operator> (const Klass &a, const Klass &b)
Kit next;
SubQueue(unsigned& p) :
key(p),
- next(klasses.begin())
- {}
+ next(klasses.begin()) {
+ }
+ ~SubQueue() {
+ klasses.clear_and_dispose(DelItem<Klass>());
+ }
friend bool operator< (const SubQueue &a, const SubQueue &b)
{ return a.key < b.key; }
friend bool operator> (const SubQueue &a, const SubQueue &b)
Queue() :
total_prio(0),
max_cost(0),
- size(0)
- {}
+ size(0) {
+ }
+ ~Queue() {
+ queues.clear_and_dispose(DelItem<SubQueue>());
+ }
bool empty() const {
return !size;
}
}
}
+ uint64_t buffer::list::get_wasted_space() const
+ {
+ if (_buffers.size() == 1)
+ return _buffers.back().wasted();
+
+ std::vector<const raw*> raw_vec;
+ raw_vec.reserve(_buffers.size());
+ for (const auto& p : _buffers)
+ raw_vec.push_back(p.get_raw());
+ std::sort(raw_vec.begin(), raw_vec.end());
+
+ uint64_t total = 0;
+ const raw *last = nullptr;
+ for (const auto r : raw_vec) {
+ if (r == last)
+ continue;
+ last = r;
+ total += r->len;
+ }
+ // If multiple buffers are sharing the same raw buffer and they overlap
+ // with each other, the wasted space will be underestimated.
+ if (total <= length())
+ return 0;
+ return total - length();
+ }
+
void buffer::list::rebuild()
{
if (_len == 0) {
class LockdepObs : public md_config_obs_t {
public:
- explicit LockdepObs(CephContext *cct) : m_cct(cct), m_registered(false) {
+ explicit LockdepObs(CephContext *cct)
+ : m_cct(cct), m_registered(false), lock("lock_dep_obs", false, true) {
}
~LockdepObs() override {
if (m_registered) {
void handle_conf_change(const md_config_t *conf,
const std::set <std::string> &changed) override {
+ Mutex::Locker locker(lock);
if (conf->lockdep && !m_registered) {
lockdep_register_ceph_context(m_cct);
m_registered = true;
private:
CephContext *m_cct;
bool m_registered;
+ Mutex lock;
};
class MempoolObs : public md_config_obs_t,
public AdminSocketHook {
CephContext *cct;
+ Mutex lock;
public:
- explicit MempoolObs(CephContext *cct) : cct(cct) {
+ explicit MempoolObs(CephContext *cct)
+ : cct(cct), lock("mem_pool_obs", false, true) {
cct->_conf->add_observer(this);
int r = cct->get_admin_socket()->register_command(
"dump_mempools",
void handle_conf_change(const md_config_t *conf,
const std::set <std::string> &changed) override {
+ Mutex::Locker locker(lock);
if (changed.count("mempool_debug")) {
mempool::set_debug_mode(cct->_conf->mempool_debug);
}
*/
class LogObs : public md_config_obs_t {
ceph::logging::Log *log;
+ Mutex lock;
public:
- explicit LogObs(ceph::logging::Log *l) : log(l) {}
+ explicit LogObs(ceph::logging::Log *l)
+ : log(l), lock("log_obs", false, true) {
+ }
const char** get_tracked_conf_keys() const override {
static const char *KEYS[] = {
void handle_conf_change(const md_config_t *conf,
const std::set <std::string> &changed) override {
+ Mutex::Locker locker(lock);
// stderr
if (changed.count("log_to_stderr") || changed.count("err_to_stderr")) {
int l = conf->log_to_stderr ? 99 : (conf->err_to_stderr ? -1 : -2);
std::string cmd_vartype_stringify(const cmd_vartype& v);
+struct bad_cmd_get : public std::exception {
+ std::string desc;
+ bad_cmd_get(const std::string& f, const cmdmap_t& cmdmap) {
+ desc = "bad or missing field '" + f + "'";
+ }
+ const char *what() const throw() override {
+ return desc.c_str();
+ }
+};
+
template <typename T>
-bool
-cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val)
+bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k,
+ T& val)
{
if (cmdmap.count(k)) {
try {
val = boost::get<T>(cmdmap.find(k)->second);
return true;
- } catch (boost::bad_get) {
+ } catch (boost::bad_get&) {
handle_bad_get(cct, k, typeid(T).name());
}
}
return false;
}
+template <typename T>
+bool cmd_getval_throws(CephContext *cct, const cmdmap_t& cmdmap,
+ const std::string& k, T& val)
+{
+ if (cmdmap.count(k)) {
+ try {
+ val = boost::get<T>(cmdmap.find(k)->second);
+ return true;
+ } catch (boost::bad_get&) {
+ throw bad_cmd_get(k, cmdmap);
+ }
+ }
+ return false;
+}
+
// with default
template <typename T>
-void
-cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val, const T& defval)
+void cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k,
+ T& val, const T& defval)
{
if (!cmd_getval(cct, cmdmap, k, val))
val = defval;
}
+template <typename T>
+bool cmd_getval_throws(
+ CephContext *cct, const cmdmap_t& cmdmap, const std::string& k,
+ T& val, const T& defval)
+{
+ if (cmdmap.count(k)) {
+ try {
+ val = boost::get<T>(cmdmap.find(k)->second);
+ return true;
+ } catch (boost::bad_get&) {
+ throw bad_cmd_get(k, cmdmap);
+ }
+ } else {
+ val = defval;
+ return true;
+ }
+}
+
template <typename T>
void
cmd_putval(CephContext *cct, cmdmap_t& cmdmap, const std::string& k, const T& val)
#include "osd/osd_types.h"
#include "common/errno.h"
#include "common/hostname.h"
+#include "common/backport14.h"
#include <boost/type_traits.hpp>
obs_map_t::value_type val(*k, observer_);
observers.insert(val);
}
+ obs_call_gate.emplace(observer_, ceph::make_unique<CallGate>());
}
void md_config_t::remove_observer(md_config_obs_t* observer_)
{
Mutex::Locker l(lock);
+
+ call_gate_close(observer_);
+ obs_call_gate.erase(observer_);
+
bool found_obs = false;
for (obs_map_t::iterator o = observers.begin(); o != observers.end(); ) {
if (o->second == observer_) {
void md_config_t::apply_changes(std::ostream *oss)
{
- Mutex::Locker l(lock);
- /*
- * apply changes until the cluster name is assigned
- */
- if (cluster.size())
- _apply_changes(oss);
+ rev_obs_map_t rev_obs;
+ {
+ Mutex::Locker l(lock);
+ /*
+ * apply changes until the cluster name is assigned
+ */
+ if (cluster.size()) {
+ for_each_change(
+ oss, [this, &rev_obs](md_config_obs_t *obs, const std::string &key) {
+ map_observer_changes(obs, key, &rev_obs);
+ });
+ }
+ }
+
+ call_observers(rev_obs);
}
bool md_config_t::_internal_field(const string& s)
return false;
}
-void md_config_t::_apply_changes(std::ostream *oss)
+void md_config_t::for_each_change(std::ostream *oss, config_gather_cb callback)
{
- /* Maps observers to the configuration options that they care about which
- * have changed. */
- typedef std::map < md_config_obs_t*, std::set <std::string> > rev_obs_map_t;
-
expand_all_meta();
// expand_all_meta could have modified anything. Copy it all out again.
update_legacy_val(option, ptr);
}
- // create the reverse observer mapping, mapping observers to the set of
- // changed keys that they'll get.
- rev_obs_map_t robs;
std::set <std::string> empty_set;
char buf[128];
char *bufptr = (char*)buf;
}
}
for (obs_map_t::iterator r = range.first; r != range.second; ++r) {
- rev_obs_map_t::value_type robs_val(r->second, empty_set);
- pair < rev_obs_map_t::iterator, bool > robs_ret(robs.insert(robs_val));
- std::set <std::string> &keys(robs_ret.first->second);
- keys.insert(key);
+ callback(r->second, key);
}
}
changed.clear();
-
- // Make any pending observer callbacks
- for (rev_obs_map_t::const_iterator r = robs.begin(); r != robs.end(); ++r) {
- md_config_obs_t *obs = r->first;
- obs->handle_conf_change(this, r->second);
- }
-
}
void md_config_t::call_all_observers()
{
- std::map<md_config_obs_t*,std::set<std::string> > obs;
+ rev_obs_map_t rev_obs;
{
Mutex::Locker l(lock);
expand_all_meta();
for (auto r = observers.begin(); r != observers.end(); ++r) {
- obs[r->second].insert(r->first);
+ map_observer_changes(r->second, r->first, &rev_obs);
}
}
- for (auto p = obs.begin();
- p != obs.end();
- ++p) {
- p->first->handle_conf_change(this, p->second);
- }
+
+ call_observers(rev_obs);
}
int md_config_t::injectargs(const std::string& s, std::ostream *oss)
{
int ret;
- Mutex::Locker l(lock);
- char b[s.length()+1];
- strcpy(b, s.c_str());
- std::vector<const char*> nargs;
- char *p = b;
- while (*p) {
- nargs.push_back(p);
- while (*p && *p != ' ') p++;
- if (!*p)
- break;
- *p++ = 0;
- while (*p && *p == ' ') p++;
- }
- ret = parse_injectargs(nargs, oss);
- if (!nargs.empty()) {
- *oss << " failed to parse arguments: ";
- std::string prefix;
- for (std::vector<const char*>::const_iterator i = nargs.begin();
- i != nargs.end(); ++i) {
- *oss << prefix << *i;
- prefix = ",";
+ rev_obs_map_t rev_obs;
+ {
+ Mutex::Locker l(lock);
+
+ char b[s.length()+1];
+ strcpy(b, s.c_str());
+ std::vector<const char*> nargs;
+ char *p = b;
+ while (*p) {
+ nargs.push_back(p);
+ while (*p && *p != ' ') p++;
+ if (!*p)
+ break;
+ *p++ = 0;
+ while (*p && *p == ' ') p++;
+ }
+ ret = parse_injectargs(nargs, oss);
+ if (!nargs.empty()) {
+ *oss << " failed to parse arguments: ";
+ std::string prefix;
+ for (std::vector<const char*>::const_iterator i = nargs.begin();
+ i != nargs.end(); ++i) {
+ *oss << prefix << *i;
+ prefix = ",";
+ }
+ *oss << "\n";
+ ret = -EINVAL;
}
- *oss << "\n";
- ret = -EINVAL;
+
+ for_each_change(
+ oss, [this, &rev_obs](md_config_obs_t *obs, const std::string &key) {
+ map_observer_changes(obs, key, &rev_obs);
+ });
}
- _apply_changes(oss);
+
+ call_observers(rev_obs);
return ret;
}
::complain_about_parse_errors(cct, &parse_errors);
}
+void md_config_t::call_observers(rev_obs_map_t &rev_obs) {
+ for (auto p : rev_obs) {
+ p.first->handle_conf_change(this, p.second);
+ // this can be done outside the lock as call_gate_enter()
+ // and remove_observer() are serialized via lock
+ call_gate_leave(p.first);
+ }
+}
+
+void md_config_t::map_observer_changes(md_config_obs_t *obs, const std::string &key,
+ rev_obs_map_t *rev_obs) {
+ ceph_assert(lock.is_locked());
+
+ auto p = rev_obs->emplace(obs, std::set<std::string>{});
+
+ p.first->second.emplace(key);
+ if (p.second) {
+ // this needs to be done under lock as once this lock is
+ // dropped (before calling observers) a remove_observer()
+ // can sneak in and cause havoc.
+ call_gate_enter(p.first->first);
+ }
+}
#include "common/entity_name.h"
#include "common/code_environment.h"
#include "common/Mutex.h"
+#include "common/CondVar.h"
#include "log/SubsystemMap.h"
#include "common/config_obs.h"
#include "common/options.h"
* while another thread is reading them, either.
*/
struct md_config_t {
+private:
+ class CallGate {
+ private:
+ uint32_t call_count = 0;
+ Mutex lock;
+ Cond cond;
+ public:
+ CallGate()
+ : lock("call::gate::lock", false, true) {
+ }
+
+ void enter() {
+ Mutex::Locker locker(lock);
+ ++call_count;
+ }
+ void leave() {
+ Mutex::Locker locker(lock);
+ ceph_assert(call_count > 0);
+ if (--call_count == 0) {
+ cond.Signal();
+ }
+ }
+ void close() {
+ Mutex::Locker locker(lock);
+ while (call_count != 0) {
+ cond.Wait(lock);
+ }
+ }
+ };
+
+ void call_gate_enter(md_config_obs_t *obs) {
+ auto p = obs_call_gate.find(obs);
+ ceph_assert(p != obs_call_gate.end());
+ p->second->enter();
+ }
+ void call_gate_leave(md_config_obs_t *obs) {
+ auto p = obs_call_gate.find(obs);
+ ceph_assert(p != obs_call_gate.end());
+ p->second->leave();
+ }
+ void call_gate_close(md_config_obs_t *obs) {
+ auto p = obs_call_gate.find(obs);
+ ceph_assert(p != obs_call_gate.end());
+ p->second->close();
+ }
+
+ typedef std::unique_ptr<CallGate> CallGateRef;
+ std::map<md_config_obs_t*, CallGateRef> obs_call_gate;
+
+ typedef std::map<md_config_obs_t*, std::set<std::string>> rev_obs_map_t;
+ typedef std::function<void(md_config_obs_t*, const std::string&)> config_gather_cb;
+
+ void call_observers(rev_obs_map_t &rev_obs);
+ void map_observer_changes(md_config_obs_t *obs, const std::string &key,
+ rev_obs_map_t *rev_obs);
+
public:
typedef boost::variant<int64_t md_config_t::*,
uint64_t md_config_t::*,
// Expand all metavariables. Make any pending observer callbacks.
void apply_changes(std::ostream *oss);
- void _apply_changes(std::ostream *oss);
+ void for_each_change(std::ostream *oss, config_gather_cb callback);
bool _internal_field(const string& k);
void call_all_observers();
#endif
struct hobject_t {
+public:
+ static const int64_t POOL_META = -1;
+ static const int64_t POOL_TEMP_START = -2; // and then negative
+
+ static bool is_temp_pool(int64_t pool) {
+ return pool <= POOL_TEMP_START;
+ }
+ static int64_t get_temp_pool(int64_t pool) {
+ return POOL_TEMP_START - pool;
+ }
+ static bool is_meta_pool(int64_t pool) {
+ return pool == POOL_META;
+ }
+
+public:
object_t oid;
snapid_t snap;
private:
bool max;
uint32_t nibblewise_key_cache;
uint32_t hash_reverse_bits;
- static const int64_t POOL_META = -1;
- static const int64_t POOL_TEMP_START = -2; // and then negative
- friend class spg_t; // for POOL_TEMP_START
public:
int64_t pool;
string nspace;
}
bool is_temp() const {
- return pool <= POOL_TEMP_START && pool != INT64_MIN;
+ return is_temp_pool(pool) && pool != INT64_MIN;
}
bool is_meta() const {
- return pool == POOL_META;
+ return is_meta_pool(pool);
+ }
+ int64_t get_logical_pool() const {
+ if (is_temp_pool(pool))
+ return get_temp_pool(pool); // it's reversible
+ else
+ return pool;
}
hobject_t() : snap(0), hash(0), max(false), pool(INT64_MIN) {
hobject_t make_temp_hobject(const string& name) const {
return hobject_t(object_t(name), "", CEPH_NOSNAP,
hash,
- hobject_t::POOL_TEMP_START - pool, "");
+ get_temp_pool(pool),
+ "");
}
void swap(hobject_t &o) {
OPTION(journaler_prefetch_periods, OPT_INT) // * journal object size
OPTION(journaler_prezero_periods, OPT_INT) // * journal object size
OPTION(mds_data, OPT_STR)
-OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
// max xattr kv pairs size for each dir/file
OPTION(mds_max_xattr_pairs_size, OPT_U32)
OPTION(mds_max_file_recover, OPT_U32)
OPTION(mds_beacon_grace, OPT_FLOAT)
OPTION(mds_enforce_unique_name, OPT_BOOL)
-OPTION(mds_session_timeout, OPT_FLOAT) // cap bits and leases time out if client unresponsive or not returning its caps
OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist clients whose sessions are dropped due to timeout
OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands
OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation?
OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps
OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
-OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart
- // make it (mds_session_timeout - mds_beacon_grace)
+ // make it (mdsmap.session_timeout - mds_beacon_grace)
OPTION(mds_tick_interval, OPT_FLOAT)
OPTION(mds_dirstat_min_interval, OPT_FLOAT) // try to avoid propagating more often than this
OPTION(mds_scatter_nudge_interval, OPT_FLOAT) // how quickly dirstat changes propagate up the hierarchy
OPTION(mds_bal_sample_interval, OPT_DOUBLE) // every 3 seconds
OPTION(mds_bal_replicate_threshold, OPT_FLOAT)
OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT)
-OPTION(mds_bal_frag, OPT_BOOL)
OPTION(mds_bal_split_size, OPT_INT)
OPTION(mds_bal_split_rd, OPT_FLOAT)
OPTION(mds_bal_split_wr, OPT_FLOAT)
OPTION(osd_peering_wq_batch_size, OPT_U64)
OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64)
OPTION(osd_op_pq_min_cost, OPT_U64)
-OPTION(osd_disk_threads, OPT_INT)
+OPTION(osd_remove_threads, OPT_INT)
+OPTION(osd_recovery_threads, OPT_INT)
OPTION(osd_disk_thread_ioprio_class, OPT_STR) // rt realtime be best effort idle
OPTION(osd_disk_thread_ioprio_priority, OPT_INT) // 0-7
OPTION(osd_recover_clone_overlap, OPT_BOOL) // preserve clone_overlap during recovery/migration
OPTION(osd_op_history_slow_op_size, OPT_U32) // Max number of slow ops to track
OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over this threshold
OPTION(osd_target_transaction_size, OPT_INT) // to adjust various transactions that batch smaller items
+OPTION(osd_delete_sleep, OPT_FLOAT) // seconds to sleep between removal transactions
OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe)
OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections
OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT) // how much to add at a time
OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT) // how much to reclaim at a time
OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT) // how often (sec) to balance free space between bluefs and bluestore
+// how often (sec) to dump allocation failures that happened during bluefs rebalance
+OPTION(bluestore_bluefs_balance_failure_dump_interval, OPT_FLOAT)
+
// If you want to use spdk driver, you need to specify NVMe serial number here
// with "spdk:" prefix.
// Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
OPTION(bluestore_csum_min_block, OPT_U32)
OPTION(bluestore_csum_max_block, OPT_U32)
+OPTION(bluestore_retry_disk_reads, OPT_U64)
OPTION(bluestore_min_alloc_size, OPT_U32)
OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
OPTION(bluestore_shard_finishers, OPT_BOOL)
OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
+OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)
OPTION(kstore_max_ops, OPT_U64)
OPTION(kstore_max_bytes, OPT_U64)
OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled
+OPTION(rgw_trust_forwarded_https, OPT_BOOL) // trust Forwarded and X-Forwarded-Proto headers for ssl termination
OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl
OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects
OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms
.set_description(""),
Option("mon_osd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
- .set_default(10)
+ .set_default(500)
.set_description(""),
Option("mon_cpu_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(65536)
.set_description(""),
- Option("osd_disk_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ Option("osd_remove_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_recovery_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(1)
.set_description(""),
.set_default(30)
.set_description(""),
+ Option("osd_delete_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description("Time in seconds to sleep before next removal transaction"),
+
Option("osd_failsafe_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.97)
.set_description(""),
.set_description(""),
Option("rocksdb_cache_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
- .set_default(128_M)
+ .set_default(512_M)
.set_description(""),
Option("rocksdb_cache_row_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("osd_memory_expected_fragmentation", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(0.15)
+ .set_min_max(0.0, 1.0)
.add_see_also("bluestore_cache_autotune")
.set_description("When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation."),
.set_default(1)
.set_description("How frequently (in seconds) to balance free space between BlueFS and BlueStore"),
+ Option("bluestore_bluefs_balance_failure_dump_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description("How frequently (in seconds) to dump information on "
+ "allocation failure occurred during BlueFS space rebalance"),
+
Option("bluestore_spdk_mem", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(512)
.set_description(""),
.set_description("Maximum block size to checksum")
.add_see_also("bluestore_csum_min_block"),
+ Option("bluestore_retry_disk_reads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_min_max(0, 255)
+ .set_description("Number of read retries on checksum validation error")
+ .set_long_description("Retries to read data from the disk this many times when checksum validation fails to handle spurious read errors gracefully."),
+
Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.add_tag("mkfs")
.set_default(0)
.set_description(""),
+ Option("bluestore_debug_inject_csum_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0.0)
+ .set_description("inject crc verification errors into bluestore device reads"),
+
// -----------------------------------------
// kstore
.set_default(120)
.set_description(""),
+ Option("rgw_trust_forwarded_https", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description("Trust Forwarded and X-Forwarded-Proto headers")
+ .set_long_description(
+ "When a proxy in front of radosgw is used for ssl termination, radosgw "
+ "does not know whether incoming http connections are secure. Enable "
+ "this option to trust the Forwarded and X-Forwarded-Proto headers sent "
+ "by the proxy when determining whether the connection is secure. This "
+ "is required for some features, such as server side encryption.")
+ .add_see_also("rgw_crypt_require_ssl"),
+
Option("rgw_crypt_require_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description("Requests including encryption key headers must be sent over ssl"),
"of RGW instances under heavy use. If you would like "
"to turn off cache expiry, set this value to zero."),
+ Option("rgw_max_listing_results", Option::TYPE_UINT,
+ Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_min_max(1, 100000)
+ .add_service("rgw")
+ .set_description("Upper bound on results in listing operations, ListBucket max-keys")
+ .set_long_description("This caps the maximum permitted value for listing-like operations in RGW S3. "
+ "Affects ListBucket(max-keys), "
+ "ListBucketVersions(max-keys), "
+ "ListBucketMultiPartUploads(max-uploads), "
+ "ListMultipartUploadParts(max-parts)"),
});
}
.set_default("/var/lib/ceph/mds/$cluster-$id")
.set_description(""),
- Option("mds_max_file_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
- .set_default(1ULL << 40)
- .set_description(""),
-
Option("mds_max_xattr_pairs_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(64 << 10)
.set_description(""),
.set_default(15)
.set_description(""),
+ Option("mds_heartbeat_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(15)
+ .set_description("tolerance in seconds for MDS internal heartbeat"),
+
Option("mds_enforce_unique_name", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
- Option("mds_session_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(60)
- .set_description(""),
-
Option("mds_session_blacklist_on_timeout", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_default(30)
.set_description(""),
- Option("mds_session_autoclose", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(300)
- .set_description(""),
-
Option("mds_health_summarize_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10)
.set_description(""),
.set_default(0)
.set_description(""),
- Option("mds_bal_frag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
- .set_default(true)
- .set_description(""),
-
Option("mds_bal_split_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10000)
.set_description(""),
Option("mds_cap_revoke_eviction_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description("number of seconds after which clients which have not responded to cap revoke messages by the MDS are evicted."),
+
+ Option("mds_dump_cache_threshold_formatter", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(1_G)
+ .set_description("threshold for cache usage to disallow \"dump cache\" operation to formatter")
+ .set_long_description("Disallow MDS from dumping caches to formatter via \"dump cache\" command if cache usage exceeds this threshold."),
+
+ Option("mds_dump_cache_threshold_file", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description("threshold for cache usage to disallow \"dump cache\" operation to file")
+ .set_long_description("Disallow MDS from dumping caches to file via \"dump cache\" command if cache usage exceeds this threshold."),
});
}
{
for (iter_t p = i->children.begin(); p != i->children.end(); p++) {
if ((int)p->value.id().to_long() == crush_grammar::_bucket) {
- iter_t firstline = p->children.begin() + 3;
- string tag = string_node(firstline->children[0]);
- if (tag == "id") {
+ for (iter_t firstline = p->children.begin() + 3;
+ firstline != p->children.end();
+ ++firstline) {
+ string tag = string_node(firstline->children[0]);
+ if (tag != "id") {
+ break;
+ }
int id = int_node(firstline->children[1]);
//err << "saw bucket id " << id << std::endl;
id_item[id] = string();
return 0;
}
+
+int CrushTester::compare(CrushWrapper& crush2)
+{
+ if (min_rule < 0 || max_rule < 0) {
+ min_rule = 0;
+ max_rule = crush.get_max_rules() - 1;
+ }
+ if (min_x < 0 || max_x < 0) {
+ min_x = 0;
+ max_x = 1023;
+ }
+
+ // initial osd weights
+ vector<__u32> weight;
+
+ /*
+ * note device weight is set by crushtool
+   * (likely given via a command line option)
+ */
+ for (int o = 0; o < crush.get_max_devices(); o++) {
+ if (device_weight.count(o)) {
+ weight.push_back(device_weight[o]);
+ } else if (crush.check_item_present(o)) {
+ weight.push_back(0x10000);
+ } else {
+ weight.push_back(0);
+ }
+ }
+
+ // make adjustments
+ adjust_weights(weight);
+
+ map<int,int> bad_by_rule;
+
+ int ret = 0;
+ for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) {
+ if (!crush.rule_exists(r)) {
+ if (output_statistics)
+ err << "rule " << r << " dne" << std::endl;
+ continue;
+ }
+ if (ruleset >= 0 &&
+ crush.get_rule_mask_ruleset(r) != ruleset) {
+ continue;
+ }
+ int minr = min_rep, maxr = max_rep;
+ if (min_rep < 0 || max_rep < 0) {
+ minr = crush.get_rule_mask_min_size(r);
+ maxr = crush.get_rule_mask_max_size(r);
+ }
+ int bad = 0;
+ for (int nr = minr; nr <= maxr; nr++) {
+ for (int x = min_x; x <= max_x; ++x) {
+ vector<int> out;
+ crush.do_rule(r, x, out, nr, weight, 0);
+ vector<int> out2;
+ crush2.do_rule(r, x, out2, nr, weight, 0);
+ if (out != out2) {
+ ++bad;
+ }
+ }
+ }
+ if (bad) {
+ ret = -1;
+ }
+ int max = (maxr - minr + 1) * (max_x - min_x + 1);
+ double ratio = (double)bad / (double)max;
+ cout << "rule " << r << " had " << bad << "/" << max
+ << " mismatched mappings (" << ratio << ")" << std::endl;
+ }
+ if (ret) {
+ cerr << "warning: maps are NOT equivalent" << std::endl;
+ } else {
+ cout << "maps appear equivalent" << std::endl;
+ }
+ return ret;
+}
void check_overlapped_rules() const;
int test();
int test_with_fork(int timeout);
+
+ int compare(CrushWrapper& other);
};
#endif
assert(0 == "no available class id");
}
-void CrushWrapper::reweight(CephContext *cct)
+int CrushWrapper::set_subtree_class(
+ const string& subtree,
+ const string& new_class)
{
+ if (!name_exists(subtree)) {
+ return -ENOENT;
+ }
+
+ int new_class_id = -1;
+ if (class_exists(new_class)) {
+ new_class_id = get_class_id(new_class);
+ } else {
+ for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) ;
+ class_name[new_class_id] = new_class;
+ class_rname[new_class] = new_class_id;
+ }
+
+ int id = get_item_id(subtree);
+ list<int> q = { id };
+ while (!q.empty()) {
+ int id = q.front();
+ q.pop_front();
+ crush_bucket *b = get_bucket(id);
+ if (IS_ERR(b)) {
+ return PTR_ERR(b);
+ }
+ for (unsigned i = 0; i < b->size; ++i) {
+ int item = b->items[i];
+ if (item >= 0) {
+ class_map[item] = new_class_id;
+ } else {
+ q.push_back(item);
+ }
+ }
+ }
+ return 0;
+}
+
+int CrushWrapper::reclassify(
+ CephContext *cct,
+ ostream& out,
+ const map<string,string>& classify_root,
+ const map<string,pair<string,string>>& classify_bucket
+ )
+{
+ map<int,string> reclassified_bucket; // orig_id -> class
+
+ // classify_root
+ for (auto& i : classify_root) {
+ string root = i.first;
+ if (!name_exists(root)) {
+ out << "root " << root << " does not exist" << std::endl;
+ return -EINVAL;
+ }
+ int root_id = get_item_id(root);
+ string new_class = i.second;
+ int new_class_id = -1;
+ out << "classify_root " << root << " (" << root_id
+ << ") as " << new_class << std::endl;
+ if (class_exists(new_class)) {
+ new_class_id = get_class_id(new_class);
+ out << " new class " << new_class << " exists as " << new_class_id
+ << std::endl;
+ } else {
+ for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) ;
+ class_name[new_class_id] = new_class;
+ class_rname[new_class] = new_class_id;
+ out << " created new class " << new_class << " as " << new_class_id
+ << std::endl;
+ }
+
+ // validate rules
+ for (unsigned j = 0; j < crush->max_rules; j++) {
+ if (crush->rules[j]) {
+ auto rule = crush->rules[j];
+ for (unsigned k = 0; k < rule->len; ++k) {
+ if (rule->steps[k].op == CRUSH_RULE_TAKE) {
+ int step_item = get_rule_arg1(j, k);
+ int original_item;
+ int c;
+ int res = split_id_class(step_item, &original_item, &c);
+ if (res < 0)
+ return res;
+ if (c >= 0) {
+ if (original_item == root_id) {
+ out << " rule " << j << " includes take on root "
+ << root << " class " << c << std::endl;
+ return -EINVAL;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // rebuild new buckets for root
+ //cout << "before class_bucket: " << class_bucket << std::endl;
+ map<int,int> renumber;
+ list<int> q;
+ q.push_back(root_id);
+ while (!q.empty()) {
+ int id = q.front();
+ q.pop_front();
+ crush_bucket *bucket = get_bucket(id);
+ if (IS_ERR(bucket)) {
+ out << "cannot find bucket " << id
+ << ": " << cpp_strerror(PTR_ERR(bucket)) << std::endl;
+ return PTR_ERR(bucket);
+ }
+
+ // move bucket
+ int new_id = get_new_bucket_id();
+ out << " renumbering bucket " << id << " -> " << new_id << std::endl;
+ renumber[id] = new_id;
+ crush->buckets[-1-new_id] = bucket;
+ bucket->id = new_id;
+ crush->buckets[-1-id] = crush_make_bucket(crush,
+ bucket->alg,
+ bucket->hash,
+ bucket->type,
+ 0, NULL, NULL);
+ crush->buckets[-1-id]->id = id;
+ for (auto& i : choose_args) {
+ i.second.args[-1-new_id] = i.second.args[-1-id];
+ memset(&i.second.args[-1-id], 0, sizeof(i.second.args[0]));
+ }
+ class_bucket.erase(id);
+ class_bucket[new_id][new_class_id] = id;
+ name_map[new_id] = string(get_item_name(id));
+ name_map[id] = string(get_item_name(id)) + "~" + new_class;
+
+ for (unsigned j = 0; j < bucket->size; ++j) {
+ if (bucket->items[j] < 0) {
+ q.push_front(bucket->items[j]);
+ } else {
+        // we don't reclassify the device here; if the user wants that,
+ // they can pass --set-subtree-class separately.
+ }
+ }
+ }
+ //cout << "mid class_bucket: " << class_bucket << std::endl;
+
+ for (int i = 0; i < crush->max_buckets; ++i) {
+ crush_bucket *b = crush->buckets[i];
+ if (!b) {
+ continue;
+ }
+ for (unsigned j = 0; j < b->size; ++j) {
+ if (renumber.count(b->items[j])) {
+ b->items[j] = renumber[b->items[j]];
+ }
+ }
+ }
+
+ int r = rebuild_roots_with_classes();
+ if (r < 0) {
+ out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ //cout << "final class_bucket: " << class_bucket << std::endl;
+ }
+
+ // classify_bucket
+ map<int,int> send_to; // source bucket -> dest bucket
+ map<int,map<int,int>> new_class_bucket;
+ map<int,string> new_bucket_names;
+ map<int,map<string,string>> new_buckets;
+ map<string,int> new_bucket_by_name;
+ for (auto& i : classify_bucket) {
+ const string& match = i.first; // prefix% or %suffix
+ const string& new_class = i.second.first;
+ const string& default_parent = i.second.second;
+ if (!name_exists(default_parent)) {
+ out << "default parent " << default_parent << " does not exist"
+ << std::endl;
+ return -EINVAL;
+ }
+ int default_parent_id = get_item_id(default_parent);
+ crush_bucket *default_parent_bucket = get_bucket(default_parent_id);
+ assert(default_parent_bucket);
+ string default_parent_type_name = get_type_name(default_parent_bucket->type);
+
+ out << "classify_bucket " << match << " as " << new_class
+ << " default bucket " << default_parent
+ << " (" << default_parent_type_name << ")" << std::endl;
+
+ int new_class_id = -1;
+ if (class_exists(new_class)) {
+ new_class_id = get_class_id(new_class);
+ out << " new class " << new_class << " exists as " << new_class_id
+ << std::endl;
+ } else {
+ for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) {
+ }
+ class_name[new_class_id] = new_class;
+ class_rname[new_class] = new_class_id;
+ out << " created new class " << new_class << " as " << new_class_id
+ << std::endl;
+ }
+
+ for (int j = 0; j < crush->max_buckets; ++j) {
+ crush_bucket *b = crush->buckets[j];
+ if (!b || is_shadow_item(b->id)) {
+ continue;
+ }
+ string name = get_item_name(b->id);
+ if (name.length() < match.length()) {
+ continue;
+ }
+ string basename;
+ if (match[0] == '%') {
+ if (match.substr(1) != name.substr(name.size() - match.size() + 1)) {
+ continue;
+ }
+ basename = name.substr(0, name.size() - match.size() + 1);
+ } else if (match[match.size() - 1] == '%') {
+ if (match.substr(0, match.size() - 1) !=
+ name.substr(0, match.size() - 1)) {
+ continue;
+ }
+ basename = name.substr(match.size() - 1);
+ } else if (match == name) {
+ basename = default_parent;
+ } else {
+ continue;
+ }
+ cout << "match " << match << " to " << name << " basename " << basename
+ << std::endl;
+ // look up or create basename bucket
+ int base_id;
+ if (name_exists(basename)) {
+ base_id = get_item_id(basename);
+ cout << " have base " << base_id << std::endl;
+ } else if (new_bucket_by_name.count(basename)) {
+ base_id = new_bucket_by_name[basename];
+ cout << " already creating base " << base_id << std::endl;
+ } else {
+ base_id = get_new_bucket_id();
+ crush->buckets[-1-base_id] = crush_make_bucket(crush,
+ b->alg,
+ b->hash,
+ b->type,
+ 0, NULL, NULL);
+ crush->buckets[-1-base_id]->id = base_id;
+ name_map[base_id] = basename;
+ new_bucket_by_name[basename] = base_id;
+ cout << " created base " << base_id << std::endl;
+
+ new_buckets[base_id][default_parent_type_name] = default_parent;
+ }
+ send_to[b->id] = base_id;
+ new_class_bucket[base_id][new_class_id] = b->id;
+ new_bucket_names[b->id] = basename + "~" + get_class_name(new_class_id);
+
+ // make sure devices are classified
+ for (unsigned i = 0; i < b->size; ++i) {
+ int item = b->items[i];
+ if (item >= 0) {
+ class_map[item] = new_class_id;
+ }
+ }
+ }
+ }
+
+  // invalidate cached reverse name maps so name lookups below see the new buckets
+ have_rmaps = false;
+
+ // copy items around
+ //cout << "send_to " << send_to << std::endl;
set<int> roots;
find_roots(&roots);
- for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
- if (*p >= 0)
+ for (auto& i : send_to) {
+ crush_bucket *from = get_bucket(i.first);
+ crush_bucket *to = get_bucket(i.second);
+ cout << "moving items from " << from->id << " (" << get_item_name(from->id)
+ << ") to " << to->id << " (" << get_item_name(to->id) << ")"
+ << std::endl;
+ for (unsigned j = 0; j < from->size; ++j) {
+ int item = from->items[j];
+ int r;
+ map<string,string> to_loc;
+ to_loc[get_type_name(to->type)] = get_item_name(to->id);
+ if (item >= 0) {
+ if (subtree_contains(to->id, item)) {
+ continue;
+ }
+ map<string,string> from_loc;
+ from_loc[get_type_name(from->type)] = get_item_name(from->id);
+ auto w = get_item_weightf_in_loc(item, from_loc);
+ r = insert_item(cct, item,
+ w,
+ get_item_name(item),
+ to_loc);
+ } else {
+ if (!send_to.count(item)) {
+ lderr(cct) << "item " << item << " in bucket " << from->id
+ << " is not also a reclassified bucket" << dendl;
+ return -EINVAL;
+ }
+ int newitem = send_to[item];
+ if (subtree_contains(to->id, newitem)) {
+ continue;
+ }
+ r = link_bucket(cct, newitem, to_loc);
+ }
+ if (r != 0) {
+ cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ }
+ }
+
+ // make sure new buckets have parents
+ for (auto& i : new_buckets) {
+ int parent;
+ if (get_immediate_parent_id(i.first, &parent) < 0) {
+ cout << "new bucket " << i.first << " missing parent, adding at "
+ << i.second << std::endl;
+ int r = link_bucket(cct, i.first, i.second);
+ if (r != 0) {
+ cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ }
+ }
+
+ // set class mappings
+ //cout << "pre class_bucket: " << class_bucket << std::endl;
+ for (auto& i : new_class_bucket) {
+ for (auto& j : i.second) {
+ class_bucket[i.first][j.first] = j.second;
+ }
+
+ }
+ //cout << "post class_bucket: " << class_bucket << std::endl;
+ for (auto& i : new_bucket_names) {
+ name_map[i.first] = i.second;
+ }
+
+ int r = rebuild_roots_with_classes();
+ if (r < 0) {
+ out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ //cout << "final class_bucket: " << class_bucket << std::endl;
+
+ return 0;
+}
+
+int CrushWrapper::get_new_bucket_id()
+{
+ int id = -1;
+ while (crush->buckets[-1-id] &&
+ -1-id < crush->max_buckets) {
+ id--;
+ }
+ if (-1-id == crush->max_buckets) {
+ ++crush->max_buckets;
+ crush->buckets = (struct crush_bucket**)realloc(
+ crush->buckets,
+ sizeof(crush->buckets[0]) * crush->max_buckets);
+ for (auto& i : choose_args) {
+ assert(i.second.size == crush->max_buckets - 1);
+ ++i.second.size;
+ i.second.args = (struct crush_choose_arg*)realloc(
+ i.second.args,
+ sizeof(i.second.args[0]) * i.second.size);
+ }
+ }
+ return id;
+}
+
+void CrushWrapper::reweight(CephContext *cct)
+{
+ set<int> roots;
+ find_nonshadow_roots(&roots);
+ for (auto id : roots) {
+ if (id >= 0)
continue;
- crush_bucket *b = get_bucket(*p);
- ldout(cct, 5) << "reweight bucket " << *p << dendl;
+ crush_bucket *b = get_bucket(id);
+ ldout(cct, 5) << "reweight root bucket " << id << dendl;
int r = crush_reweight_bucket(crush, b);
assert(r == 0);
+
+ for (auto& i : choose_args) {
+ //cout << "carg " << i.first << std::endl;
+ vector<uint32_t> w; // discard top-level weights
+ reweight_bucket(b, i.second, &w);
+ }
+ }
+ int r = rebuild_roots_with_classes();
+ ceph_assert(r == 0);
+}
+
+void CrushWrapper::reweight_bucket(
+ crush_bucket *b,
+ crush_choose_arg_map& arg_map,
+ vector<uint32_t> *weightv)
+{
+ int idx = -1 - b->id;
+ unsigned npos = arg_map.args[idx].weight_set_positions;
+ //cout << __func__ << " " << b->id << " npos " << npos << std::endl;
+ weightv->resize(npos);
+ for (unsigned i = 0; i < b->size; ++i) {
+ int item = b->items[i];
+ if (item >= 0) {
+ for (unsigned pos = 0; pos < npos; ++pos) {
+ (*weightv)[pos] += arg_map.args[idx].weight_set->weights[i];
+ }
+ } else {
+ vector<uint32_t> subw(npos);
+ crush_bucket *sub = get_bucket(item);
+ assert(sub);
+ reweight_bucket(sub, arg_map, &subw);
+ for (unsigned pos = 0; pos < npos; ++pos) {
+ (*weightv)[pos] += subw[pos];
+        // overwrite the stored bucket weight with the recomputed subtree weight for this entry
+ arg_map.args[idx].weight_set->weights[i] = subw[pos];
+ }
+ }
}
+ //cout << __func__ << " finish " << b->id << " " << *weightv << std::endl;
}
int CrushWrapper::add_simple_rule_at(
return adjust_item_weight_in_loc(cct, id, (int)(weight * (float)0x10000), loc);
}
void reweight(CephContext *cct);
+ void reweight_bucket(crush_bucket *b,
+ crush_choose_arg_map& arg_map,
+ vector<uint32_t> *weightv);
int adjust_subtree_weight(CephContext *cct, int id, int weight);
int adjust_subtree_weightf(CephContext *cct, int id, float weight) {
**/
int detach_bucket(CephContext *cct, int item);
+ int get_new_bucket_id();
+
public:
int get_max_buckets() const {
if (!crush) return -EINVAL;
/* remove unused roots generated for class devices */
int trim_roots_with_class();
+ int reclassify(
+ CephContext *cct,
+ ostream& out,
+ const map<string,string>& classify_root,
+ const map<string,pair<string,string>>& classify_bucket
+ );
+
+ int set_subtree_class(const string& name, const string& class_name);
+
void start_choose_profile() {
free(crush->choose_tries);
/*
return *this;
}
+ uint64_t get_wasted_space() const;
unsigned get_num_buffers() const { return _buffers.size(); }
const ptr& front() const { return _buffers.front(); }
const ptr& back() const { return _buffers.back(); }
DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT)
DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
CEPH_FEATURE_RADOS_BACKOFF | \
CEPH_FEATURE_OSD_RECOVERY_DELETES | \
CEPH_FEATURE_CEPHX_V2 | \
+ CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \
0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
#define CEPH_SETATTR_ATIME (1 << 4)
#define CEPH_SETATTR_SIZE (1 << 5)
#define CEPH_SETATTR_CTIME (1 << 6)
-#define CEPH_SETATTR_BTIME (1 << 9)
-#endif
#define CEPH_SETATTR_MTIME_NOW (1 << 7)
#define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME (1 << 9)
+#endif
#define CEPH_SETATTR_KILL_SGUID (1 << 10)
/*
# define CEPH_SETATTR_ATIME 16
# define CEPH_SETATTR_SIZE 32
# define CEPH_SETATTR_CTIME 64
+# define CEPH_SETATTR_MTIME_NOW 128
+# define CEPH_SETATTR_ATIME_NOW 256
# define CEPH_SETATTR_BTIME 512
#endif
/* Defined if boost::context is available */
#cmakedefine HAVE_BOOST_CONTEXT
+/* Defined if OpenSSL is available for the rgw beast frontend */
+#cmakedefine WITH_RADOSGW_BEAST_OPENSSL
+
#endif /* CONFIG_H */
#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */
+#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */
/* these are hidden in 'ceph status' view */
#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \
CEPH_OSDMAP_REQUIRE_LUMINOUS | \
CEPH_OSDMAP_RECOVERY_DELETES | \
CEPH_OSDMAP_SORTBITWISE | \
- CEPH_OSDMAP_PURGED_SNAPDIRS)
+ CEPH_OSDMAP_PURGED_SNAPDIRS | \
+ CEPH_OSDMAP_PGLOG_HARDLIMIT)
#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \
CEPH_OSDMAP_REQUIRE_KRAKEN | \
CEPH_OSDMAP_REQUIRE_LUMINOUS)
bool operator!=(const NObjectIterator& rhs) const;
const ListObject& operator*() const;
const ListObject* operator->() const;
- NObjectIterator &operator++(); // Preincrement
- NObjectIterator operator++(int); // Postincrement
+ NObjectIterator &operator++(); //< Preincrement; errors are thrown as exceptions
+ NObjectIterator operator++(int); //< Postincrement; errors are thrown as exceptions
friend class IoCtx;
friend class NObjectIteratorImpl;
/// get current hash position of the iterator, rounded to the current pg
uint32_t get_pg_hash_position() const;
- /// move the iterator to a given hash position. this may (will!) be rounded to the nearest pg.
+ /// move the iterator to a given hash position. this may (will!) be rounded
+ /// to the nearest pg. errors are thrown as exceptions
uint32_t seek(uint32_t pos);
- /// move the iterator to a given cursor position
+ /// move the iterator to a given cursor position. errors are thrown as exceptions
uint32_t seek(const ObjectCursor& cursor);
/// get current cursor position
std::list<librados::locker_t> *lockers);
- /// Start enumerating objects for a pool
+ /// Start enumerating objects for a pool. Errors are thrown as exceptions.
NObjectIterator nobjects_begin();
NObjectIterator nobjects_begin(const bufferlist &filter);
- /// Start enumerating objects for a pool starting from a hash position
+ /// Start enumerating objects for a pool starting from a hash position.
+ /// Errors are thrown as exceptions.
NObjectIterator nobjects_begin(uint32_t start_hash_position);
NObjectIterator nobjects_begin(uint32_t start_hash_position,
const bufferlist &filter);
- /// Start enumerating objects for a pool starting from cursor
+ /// Start enumerating objects for a pool starting from cursor. Errors are
+ /// thrown as exceptions.
NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor);
NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
const bufferlist &filter);
using std::set;
using std::vector;
using std::list;
-using std::runtime_error;
#define dout_subsys ceph_subsys_rados
#undef dout_prefix
return;
}
else if (ret) {
- ostringstream oss;
- oss << "rados returned " << cpp_strerror(ret);
- throw std::runtime_error(oss.str());
+ throw std::system_error(-ret, std::system_category(),
+ "rados_nobjects_list_next");
}
if (cur_obj.impl == NULL)
tracepoint(librbd, snap_list_exit, -EINVAL, 0);
return -EINVAL;
}
+ memset(snaps, 0, sizeof(*snaps) * *max_snaps);
int r = librbd::snap_list(ictx, cpp_snaps);
if (r == -ENOENT) {
if (m_new_size < m_original_size && !m_allow_shrink) {
ldout(cct, 1) << " shrinking the image is not permitted" << dendl;
+ image_ctx.io_work_queue->unblock_writes();
this->async_complete(-EINVAL);
return nullptr;
}
int64_t old_pool = inode.layout.pool_id;
mark_dirty(front.inode.version, ls);
+ bool new_export_pin = inode.export_pin != front.inode.export_pin;
inode = front.inode;
+ if (new_export_pin)
+ maybe_export_pin(true);
if (inode.is_backtrace_updated())
mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
assert(is_dir());
assert(is_projected());
get_projected_inode()->export_pin = rank;
- maybe_export_pin(true);
}
mds_rank_t CInode::get_export_pin(bool inherit) const
while (true) {
if (in->is_system())
break;
- const CDentry *pdn = in->get_projected_parent_dn();
+ const CDentry *pdn = in->get_parent_dn();
if (!pdn)
break;
- const mempool_inode *pi = in->get_projected_inode();
// ignore export pin for unlinked directory
- if (pi->nlink == 0)
+ if (in->get_inode().nlink == 0)
break;
- if (pi->export_pin >= 0)
- return pi->export_pin;
+ if (in->get_inode().export_pin >= 0)
+ return in->get_inode().export_pin;
if (!inherit)
break;
int d_type() const { return IFTODT(inode.mode); }
mempool_inode& get_inode() { return inode; }
+ const mempool_inode& get_inode() const { return inode; }
CDentry* get_parent_dn() { return parent; }
const CDentry* get_parent_dn() const { return parent; }
const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
auto fs = std::make_shared<Filesystem>();
fs->mds_map.epoch = epoch;
fs->mds_map.fs_name = std::string(name);
- fs->mds_map.max_mds = 1;
fs->mds_map.data_pools.push_back(data_pool);
fs->mds_map.metadata_pool = metadata_pool;
fs->mds_map.cas_pool = -1;
- fs->mds_map.max_file_size = g_conf->mds_max_file_size;
fs->mds_map.compat = compat;
fs->mds_map.created = ceph_clock_now();
fs->mds_map.modified = ceph_clock_now();
- fs->mds_map.session_timeout = g_conf->mds_session_timeout;
- fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
fs->mds_map.enabled = true;
if (features & CEPH_FEATURE_SERVER_JEWEL) {
fs->fscid = next_filesystem_id++;
// Carry forward what makes sense
new_fs->fscid = fs->fscid;
new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
- new_fs->mds_map.max_mds = 1;
new_fs->mds_map.data_pools = fs->mds_map.data_pools;
new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
new_fs->mds_map.fs_name = fs->mds_map.fs_name;
- new_fs->mds_map.max_file_size = g_conf->mds_max_file_size;
new_fs->mds_map.compat = compat;
new_fs->mds_map.created = ceph_clock_now();
new_fs->mds_map.modified = ceph_clock_now();
- new_fs->mds_map.session_timeout = g_conf->mds_session_timeout;
- new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
new_fs->mds_map.enabled = true;
{
assert(!in->is_auth());
- int wanted = in->get_caps_wanted() & ~CEPH_CAP_PIN;
+ int wanted = in->get_caps_wanted() & in->get_caps_allowed_ever() & ~CEPH_CAP_PIN;
if (wanted != in->replica_caps_wanted) {
// wait for single auth
if (in->is_ambiguous_auth()) {
// filter wanted based on what we could ever give out (given auth/replica status)
bool need_flush = m->flags & CLIENT_CAPS_SYNC;
- int new_wanted = m->get_wanted() & head_in->get_caps_allowed_ever();
+ int new_wanted = m->get_wanted();
if (new_wanted != cap->wanted()) {
- if (!need_flush && (new_wanted & ~cap->pending())) {
+ if (!need_flush && in->is_auth() && (new_wanted & ~cap->pending())) {
// exapnding caps. make sure we aren't waiting for a log flush
need_flush = _need_flush_mdlog(head_in, new_wanted & ~cap->pending());
}
/**
* Return true if any currently revoking caps exceed the
- * mds_session_timeout threshold.
+ * session_timeout threshold.
*/
bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking,
double timeout) const
utime_t age = now - cap->get_last_revoke_stamp();
dout(20) << __func__ << " age = " << age << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
- if (age <= g_conf->mds_session_timeout) {
- dout(20) << __func__ << " age below timeout " << g_conf->mds_session_timeout << dendl;
+ if (age <= mds->mdsmap->get_session_timeout()) {
+ dout(20) << __func__ << " age below timeout " << mds->mdsmap->get_session_timeout() << dendl;
break;
} else {
++i;
}
}
// exponential backoff of warning intervals
- if (age > g_conf->mds_session_timeout * (1 << cap->get_num_revoke_warnings())) {
+ if (age > mds->mdsmap->get_session_timeout() * (1 << cap->get_num_revoke_warnings())) {
cap->inc_num_revoke_warnings();
stringstream ss;
ss << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
else if (lock->get_state() != LOCK_SYNC &&
!lock->is_wrlocked() && // drain wrlocks first!
!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
- !(wanted & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) &&
+ !(wanted & CEPH_CAP_GWR) &&
!((lock->get_state() == LOCK_MIX) &&
in->is_dir() && in->has_subtree_or_exporting_dirfrag()) // if we are a delegation point, stay where we are
//((wanted & CEPH_CAP_RD) ||
void MDBalancer::maybe_fragment(CDir *dir, bool hot)
{
// split/merge
- if (g_conf->mds_bal_frag && bal_fragment_interval > 0 &&
+ if (bal_fragment_interval > 0 &&
dir->is_auth() &&
!dir->inode->is_base() && // not root/base (for now at least)
!dir->inode->is_stray()) { // not straydir
memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
if (in->inode.is_dir()) {
in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
- ++in->inode.rstat.rsubdirs;
+ in->inode.rstat.rsubdirs = 1; /* itself */
+ in->inode.rstat.rctime = in->inode.ctime;
} else {
in->inode.layout = default_file_layout;
++in->inode.rstat.rfiles;
adjust_subtree_auth(rootdir, mds->get_nodeid());
rootdir->dir_rep = CDir::REP_ALL; //NONE;
- rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
- rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
-
- root->inode.dirstat = rootdir->fnode.fragstat;
- root->inode.rstat = rootdir->fnode.rstat;
- ++root->inode.rstat.rsubdirs;
- root->inode.accounted_rstat = root->inode.rstat;
+ assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
+ assert(rootdir->fnode.fragstat == root->inode.dirstat);
+ assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
+  /* Do not update rootdir rstat information of the fragment; rstat upkeep magic
+   * assumes version 0 is stale/invalid.
+   */
rootdir->mark_complete();
rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
// MDCache::shutdown_export_strays() always exports strays to mds.0
if (who == mds_rank_t(0))
- shutdown_exported_strays.clear();
+ shutdown_exporting_strays.clear();
show_subtrees();
}
if (cache_toofull()) {
last_recall_state = clock::now();
- mds->server->recall_client_state();
+ mds->server->recall_client_state(-1.0, false, nullptr);
}
// If the cache size had exceeded its limit, but we're back in bounds
// Fully trim the log so that all objects in cache are clean and may be
// trimmed by a future MDCache::trim. Note that MDSRank::tick does not
// trim the log such that the cache eventually becomes clean.
- mds->mdlog->trim(0);
+ if (mds->mdlog->get_num_segments() > 0) {
+ auto ls = mds->mdlog->get_current_segment();
+ if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
+ // Current segment contains events other than subtreemap or
+ // there are dirty dirfrags (see CDir::log_mark_dirty())
+ mds->mdlog->start_new_segment();
+ mds->mdlog->flush();
+ }
+ }
+ mds->mdlog->trim_all();
if (mds->mdlog->get_num_segments() > 1) {
dout(7) << "still >1 segments, waiting for log to trim" << dendl;
return false;
assert(!migrator->is_exporting());
assert(!migrator->is_importing());
+ // replicas may dirty scatter locks
+ if (myin && myin->is_replicated()) {
+ dout(7) << "still have replicated objects" << dendl;
+ return false;
+ }
+
if ((myin && myin->is_auth_pinned()) ||
(mydir && mydir->is_auth_pinned())) {
dout(7) << "still have auth pinned objects" << dendl;
if (!mds->mdlog->is_capped()) {
dout(7) << "capping the log" << dendl;
mds->mdlog->cap();
- mds->mdlog->trim();
}
+ if (!mds->mdlog->empty())
+ mds->mdlog->trim(0);
+
if (!mds->mdlog->empty()) {
dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
<< " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
bool MDCache::shutdown_export_strays()
{
+ static const unsigned MAX_EXPORTING = 100;
+
if (mds->get_nodeid() == 0)
return true;
-
- dout(10) << "shutdown_export_strays" << dendl;
+
+ if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
+ return false;
+
+ dout(10) << "shutdown_export_strays " << shutdown_export_next.first
+ << " '" << shutdown_export_next.second << "'" << dendl;
bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
+ bool all_exported = false;
- bool done = true;
+again:
+ auto next = shutdown_export_next;
- list<CDir*> dfs;
for (int i = 0; i < NUM_STRAY; ++i) {
- if (!strays[i] ||
- !strays[i]->state_test(CInode::STATE_STRAYPINNED))
+ CInode *strayi = strays[i];
+ if (!strayi ||
+ !strayi->state_test(CInode::STATE_STRAYPINNED))
+ continue;
+ if (strayi->ino() < next.first.ino)
continue;
- strays[i]->get_dirfrags(dfs);
- }
- for (std::list<CDir*>::iterator dfs_i = dfs.begin();
- dfs_i != dfs.end(); ++dfs_i)
- {
- CDir *dir = *dfs_i;
+ deque<CDir*> dfls;
+ strayi->get_dirfrags(dfls);
- if (!dir->is_complete()) {
- dir->fetch(0);
- done = false;
- if (!mds0_active)
- break;
- }
-
- for (auto &p : dir->items) {
- CDentry *dn = p.second;
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- if (dnl->is_null())
+ while (!dfls.empty()) {
+ CDir *dir = dfls.front();
+ dfls.pop_front();
+
+ if (dir->dirfrag() < next.first)
continue;
- done = false;
- if (!mds0_active)
- break;
-
- if (dn->state_test(CDentry::STATE_PURGING)) {
- // Don't try to migrate anything that is actually
- // being purged right now
- continue;
+ if (next.first < dir->dirfrag()) {
+ next.first = dir->dirfrag();
+ next.second.clear();
+ }
+
+ if (!dir->is_complete()) {
+ MDSInternalContextBase *fin = nullptr;
+ if (shutdown_exporting_strays.empty()) {
+ fin = new MDSInternalContextWrapper(mds,
+ new FunctionContext([this](int r) {
+ shutdown_export_strays();
+ })
+ );
+ }
+ dir->fetch(fin);
+ goto done;
}
- if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
- shutdown_exported_strays.insert(dnl->get_inode()->ino());
- stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
+ CDir::dentry_key_map::iterator it;
+ if (next.second.empty()) {
+ it = dir->begin();
} else {
- dout(10) << "already exporting " << *dn << dendl;
+ auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
+ it = dir->lower_bound(dentry_key_t(0, next.second, hash));
+ }
+
+ for (; it != dir->end(); ++it) {
+ CDentry *dn = it->second;
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ if (dnl->is_null())
+ continue;
+
+ if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
+ next.second = string(it->first.name);
+ goto done;
+ }
+
+ auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
+ if (!ret.second) {
+ dout(10) << "already exporting/purging " << *dn << dendl;
+ continue;
+ }
+
+ // Don't try to migrate anything that is actually
+ // being purged right now
+ if (!dn->state_test(CDentry::STATE_PURGING))
+ stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
+
+ if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
+ ++it;
+ if (it != dir->end()) {
+ next.second = string(it->first.name);
+ } else {
+ if (dfls.empty())
+ next.first.ino.val++;
+ else
+ next.first = dfls.front()->dirfrag();
+ next.second.clear();
+ }
+ goto done;
+ }
}
}
}
- return done;
+ if (shutdown_exporting_strays.empty()) {
+ dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
+ if (first_df < shutdown_export_next.first ||
+ !shutdown_export_next.second.empty()) {
+ shutdown_export_next.first = first_df;
+ shutdown_export_next.second.clear();
+ goto again;
+ }
+ all_exported = true;
+ }
+
+done:
+ shutdown_export_next = next;
+ return all_exported;
}
// ========= messaging ==============
show_func(p.second);
}
-int MDCache::cache_status(Formatter *f)
+void MDCache::cache_status(Formatter *f)
{
f->open_object_section("cache");
f->close_section();
f->close_section();
- return 0;
}
int MDCache::dump_cache(boost::string_view file_name)
boost::string_view dump_root, int depth)
{
int r = 0;
+
+ // dumping large caches may cause mds to hang or worse get killed.
+ // so, disallow the dump if the cache size exceeds the configured
+ // threshold, which is 1G for formatter and unlimited for file (note
+ // that this can be jacked up by the admin... and is nothing but foot
+ // shooting, but the option itself is for devs and hence dangerous to
+ // tune). TODO: remove this when fixed.
+ uint64_t threshold = f ?
+ g_conf->get_val<uint64_t>("mds_dump_cache_threshold_formatter") :
+ g_conf->get_val<uint64_t>("mds_dump_cache_threshold_file");
+
+ if (threshold && cache_size() > threshold) {
+ if (f) {
+ std::stringstream ss;
+ ss << "cache usage exceeds dump threshold";
+ f->open_object_section("result");
+ f->dump_string("error", ss.str());
+ f->close_section();
+ } else {
+ derr << "cache usage exceeds dump threshold" << dendl;
+ r = -EINVAL;
+ }
+ return r;
+ }
+
+ r = 0;
int fd = -1;
if (f) {
// shutdown
private:
- set<inodeno_t> shutdown_exported_strays;
+ set<inodeno_t> shutdown_exporting_strays;
+ pair<dirfrag_t, string> shutdown_export_next;
public:
void shutdown_start();
void shutdown_check();
bool shutdown_pass();
- bool shutdown_export_strays();
bool shutdown(); // clear cache (ie at shutodwn)
+ bool shutdown_export_strays();
+ void shutdown_export_stray_finish(inodeno_t ino) {
+ if (shutdown_exporting_strays.erase(ino))
+ shutdown_export_strays();
+ }
bool did_shutdown_log_cap;
int dump_cache(Formatter *f);
int dump_cache(boost::string_view dump_root, int depth, Formatter *f);
- int cache_status(Formatter *f);
+ void cache_status(Formatter *f);
void dump_resolve_status(Formatter *f) const;
void dump_rejoin_status(Formatter *f) const;
plb.add_u64(l_mdl_evexd, "evexd", "Current expired events");
plb.add_u64(l_mdl_segexg, "segexg", "Expiring segments");
plb.add_u64(l_mdl_segexd, "segexd", "Current expired segments");
- plb.add_u64_counter(l_mdl_replayed, "replayed", "Events replayed");
+ plb.add_u64_counter(l_mdl_replayed, "replayed", "Events replayed",
+ "repl", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_time_avg(l_mdl_jlat, "jlat", "Journaler flush latency");
plb.add_u64_counter(l_mdl_evex, "evex", "Total expired events");
plb.add_u64_counter(l_mdl_evtrm, "evtrm", "Trimmed events");
map<uint64_t,LogSegment*>::iterator p = segments.begin();
while (p != segments.end() &&
- p->first < last_seq && p->second->end <= safe_pos) {
+ p->first < last_seq &&
+ p->second->end < safe_pos) { // next segment should have been started
LogSegment *ls = p->second;
++p;
if (journaler->get_read_pos() == journaler->get_write_pos()) {
dout(10) << "replay - journal empty, done." << dendl;
mds->mdcache->trim();
+ if (mds->is_standby_replay())
+ mds->update_mlogger();
if (c) {
c->complete(0);
}
asok_hook,
"show cache status");
assert(r == 0);
+ r = admin_socket->register_command("cache drop",
+ "cache drop name=timeout,type=CephInt,range=0,req=false",
+ asok_hook,
+ "drop cache");
+ assert(r == 0);
r = admin_socket->register_command("dump tree",
"dump tree "
"name=root,type=CephString,req=true "
"mds_cache_reservation",
"mds_health_cache_threshold",
"mds_cache_mid",
+ "mds_dump_cache_threshold_formatter",
+ "mds_dump_cache_threshold_file",
// MDBalancer
"mds_bal_fragment_interval",
// PurgeQueue
m->put();
}
-
struct MDSCommand {
string cmdstring;
string helpstring;
"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
"show heap usage info (available only if compiled with tcmalloc)", \
"mds", "*", "cli,rest")
+COMMAND("cache drop name=timeout,type=CephInt,range=0,req=false", "trim cache and optionally "
+ "request client to release all caps and flush the journal", "mds",
+ "r", "cli,rest")
};
}
else {
bool handled = mds_rank->handle_command(cmdmap, m, &r, &ds, &ss,
- need_reply);
+ run_later, need_reply);
if (!handled) {
// MDSDaemon doesn't know this command
ss << "unrecognized command! " << prefix;
//because add_observer is called after set_up_admin_socket
//so we can use asok_hook to avoid assert in the remove_observer
- if (asok_hook != NULL)
+ if (asok_hook != NULL) {
+ mds_lock.Unlock();
g_conf->remove_observer(this);
+ mds_lock.Lock();
+ }
clean_up_admin_socket();
utime_t get_session_timeout() const {
return utime_t(session_timeout,0);
}
+ void set_session_timeout(uint32_t t) {
+ session_timeout = t;
+ }
utime_t get_session_autoclose() const {
return utime_t(session_autoclose, 0);
}
+ void set_session_autoclose(uint32_t t) {
+ session_autoclose = t;
+ }
uint64_t get_max_filesize() const { return max_file_size; }
void set_max_filesize(uint64_t m) { max_file_size = m; }
}
}
+ /**
+ * Get MDS rank incarnation if the rank is up, else -1
+ */
+ mds_gid_t get_incarnation(mds_rank_t m) const {
+ std::map<mds_rank_t, mds_gid_t>::const_iterator u = up.find(m);
+ if (u == up.end())
+ return MDS_GID_NONE;
+ return (mds_gid_t)get_inc_gid(u->second);
+ }
+
int get_inc_gid(mds_gid_t gid) const {
auto mds_info_entry = mds_info.find(gid);
if (mds_info_entry != mds_info.end())
#undef dout_prefix
#define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
+class C_Flush_Journal : public MDSInternalContext {
+public:
+ C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds,
+ std::ostream *ss, Context *on_finish)
+ : MDSInternalContext(mds),
+ mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish),
+ whoami(mds->whoami), incarnation(mds->incarnation) {
+ }
+
+ void send() {
+ assert(mds->mds_lock.is_locked());
+
+ dout(20) << __func__ << dendl;
+
+ if (mdcache->is_readonly()) {
+ dout(5) << __func__ << ": read-only FS" << dendl;
+ complete(-EROFS);
+ return;
+ }
+
+ if (!mds->is_active()) {
+ dout(5) << __func__ << ": MDS not active, no-op" << dendl;
+ complete(0);
+ return;
+ }
+
+ flush_mdlog();
+ }
+
+private:
+
+ void flush_mdlog() {
+ dout(20) << __func__ << dendl;
+
+ // I need to seal off the current segment, and then mark all
+ // previous segments for expiry
+ mdlog->start_new_segment();
+
+ Context *ctx = new FunctionContext([this](int r) {
+ handle_flush_mdlog(r);
+ });
+
+ // Flush initially so that all the segments older than our new one
+    // will be eligible for expiry
+ mdlog->flush();
+ mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
+ }
+
+ void handle_flush_mdlog(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ if (r != 0) {
+ *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+ complete(r);
+ return;
+ }
+
+ clear_mdlog();
+ }
+
+ void clear_mdlog() {
+ dout(20) << __func__ << dendl;
+
+ Context *ctx = new FunctionContext([this](int r) {
+ handle_clear_mdlog(r);
+ });
+
+ // Because we may not be the last wait_for_safe context on MDLog,
+ // and subsequent contexts might wake up in the middle of our
+ // later trim_all and interfere with expiry (by e.g. marking
+ // dirs/dentries dirty on previous log segments), we run a second
+ // wait_for_safe here. See #10368
+ mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
+ }
+
+ void handle_clear_mdlog(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ if (r != 0) {
+ *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+ complete(r);
+ return;
+ }
+
+ trim_mdlog();
+ }
+
+ void trim_mdlog() {
+ // Put all the old log segments into expiring or expired state
+ dout(5) << __func__ << ": beginning segment expiry" << dendl;
+
+ int ret = mdlog->trim_all();
+ if (ret != 0) {
+ *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log";
+ complete(ret);
+ return;
+ }
+
+ expire_segments();
+ }
+
+ void expire_segments() {
+ dout(20) << __func__ << dendl;
+
+ // Attach contexts to wait for all expiring segments to expire
+ MDSGatherBuilder *expiry_gather = new MDSGatherBuilder(g_ceph_context);
+
+ const auto &expiring_segments = mdlog->get_expiring_segments();
+ for (auto p : expiring_segments) {
+ p->wait_for_expiry(expiry_gather->new_sub());
+ }
+ dout(5) << __func__ << ": waiting for " << expiry_gather->num_subs_created()
+ << " segments to expire" << dendl;
+
+ if (!expiry_gather->has_subs()) {
+ trim_segments();
+ delete expiry_gather;
+ return;
+ }
+
+ Context *ctx = new FunctionContext([this](int r) {
+ handle_expire_segments(r);
+ });
+ expiry_gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
+ expiry_gather->activate();
+ }
+
+ void handle_expire_segments(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ ceph_assert(r == 0); // MDLog is not allowed to raise errors via
+ // wait_for_expiry
+ trim_segments();
+ }
+
+ void trim_segments() {
+ dout(20) << __func__ << dendl;
+
+ Context *ctx = new C_OnFinisher(new FunctionContext([this](int _) {
+ Mutex::Locker locker(mds->mds_lock);
+ trim_expired_segments();
+ }), mds->finisher);
+ ctx->complete(0);
+ }
+
+ void trim_expired_segments() {
+ dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now "
+ << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+ << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+ // Now everyone I'm interested in is expired
+ mdlog->trim_expired_segments();
+
+ dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now "
+ << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+ << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+ write_journal_head();
+ }
+
+ void write_journal_head() {
+ dout(20) << __func__ << dendl;
+
+ Context *ctx = new FunctionContext([this](int r) {
+ Mutex::Locker locker(mds->mds_lock);
+ handle_write_head(r);
+ });
+ // Flush the journal header so that readers will start from after
+ // the flushed region
+ mdlog->get_journaler()->write_head(ctx);
+ }
+
+ void handle_write_head(int r) {
+ if (r != 0) {
+ *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
+ } else {
+ dout(5) << __func__ << ": write_head complete, all done!" << dendl;
+ }
+
+ complete(r);
+ }
+
+ void finish(int r) override {
+ dout(20) << __func__ << ": r=" << r << dendl;
+ on_finish->complete(r);
+ }
+
+ MDCache *mdcache;
+ MDLog *mdlog;
+ std::ostream *ss;
+ Context *on_finish;
+
+ // so as to use dout
+ mds_rank_t whoami;
+ int incarnation;
+};
+
+class C_Drop_Cache : public MDSInternalContext {
+public:
+ C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog,
+ MDSRank *mds, uint64_t recall_timeout,
+ Formatter *f, Context *on_finish)
+ : MDSInternalContext(mds),
+ server(server), mdcache(mdcache), mdlog(mdlog),
+ recall_timeout(recall_timeout), f(f), on_finish(on_finish),
+ whoami(mds->whoami), incarnation(mds->incarnation) {
+ }
+
+ void send() {
+    // not really a hard requirement here, but let's ensure this in
+ // case we change the logic here.
+ assert(mds->mds_lock.is_locked());
+
+ dout(20) << __func__ << dendl;
+ recall_client_state();
+ }
+
+private:
+ // context which completes itself (with -ETIMEDOUT) after a specified
+ // timeout or when explicitly completed, whichever comes first. Note
+  // that the context does not destroy itself after completion -- it
+ // needs to be explicitly freed.
+ class C_ContextTimeout : public MDSInternalContext {
+ public:
+ C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish)
+ : MDSInternalContext(mds),
+ timeout(timeout),
+ lock("mds::context::timeout", false, true),
+ on_finish(on_finish) {
+ }
+ ~C_ContextTimeout() {
+ ceph_assert(timer_task == nullptr);
+ }
+
+ void start_timer() {
+ if (!timeout) {
+ return;
+ }
+
+ timer_task = new FunctionContext([this](int _) {
+ timer_task = nullptr;
+ complete(-ETIMEDOUT);
+ });
+ mds->timer.add_event_after(timeout, timer_task);
+ }
+
+ void finish(int r) override {
+ Context *ctx = nullptr;
+ {
+ Mutex::Locker locker(lock);
+ std::swap(on_finish, ctx);
+ }
+ if (ctx != nullptr) {
+ ctx->complete(r);
+ }
+ }
+ void complete(int r) override {
+ if (timer_task != nullptr) {
+ mds->timer.cancel_event(timer_task);
+ }
+
+ finish(r);
+ }
+
+ uint64_t timeout;
+ Mutex lock;
+ Context *on_finish = nullptr;
+ Context *timer_task = nullptr;
+ };
+
+ void recall_client_state() {
+ dout(20) << __func__ << dendl;
+
+ f->open_object_section("result");
+
+ MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
+ server->recall_client_state(1.0, true, gather);
+ if (!gather->has_subs()) {
+ handle_recall_client_state(0);
+ delete gather;
+ return;
+ }
+
+ C_ContextTimeout *ctx = new C_ContextTimeout(
+ mds, recall_timeout, new FunctionContext([this](int r) {
+ handle_recall_client_state(r);
+ }));
+
+ ctx->start_timer();
+ gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
+ gather->activate();
+ }
+
+ void handle_recall_client_state(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ // client recall section
+ f->open_object_section("client_recall");
+ f->dump_int("return_code", r);
+ f->dump_string("message", cpp_strerror(r));
+ f->close_section();
+
+ // we can still continue after recall timeout
+ trim_cache();
+ }
+
+ void trim_cache() {
+ dout(20) << __func__ << dendl;
+
+ if (!mdcache->trim(UINT64_MAX)) {
+ cmd_err(f, "failed to trim cache");
+ complete(-EINVAL);
+ return;
+ }
+
+ flush_journal();
+ }
+
+ void flush_journal() {
+ dout(20) << __func__ << dendl;
+
+ Context *ctx = new FunctionContext([this](int r) {
+ handle_flush_journal(r);
+ });
+
+ C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, mds, &ss, ctx);
+ flush_journal->send();
+ }
+
+ void handle_flush_journal(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ if (r != 0) {
+ cmd_err(f, ss.str());
+ complete(r);
+ return;
+ }
+
+ // journal flush section
+ f->open_object_section("flush_journal");
+ f->dump_int("return_code", r);
+ f->dump_string("message", ss.str());
+ f->close_section();
+
+ cache_status();
+ }
+
+ void cache_status() {
+ dout(20) << __func__ << dendl;
+
+ // cache status section
+ mdcache->cache_status(f);
+ f->close_section();
+
+ complete(0);
+ }
+
+ void finish(int r) override {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ on_finish->complete(r);
+ }
+
+ Server *server;
+ MDCache *mdcache;
+ MDLog *mdlog;
+ uint64_t recall_timeout;
+ Formatter *f;
+ Context *on_finish;
+
+ int retval = 0;
+ std::stringstream ss;
+
+ // so as to use dout
+ mds_rank_t whoami;
+ int incarnation;
+
+ void cmd_err(Formatter *f, boost::string_view err) {
+ f->reset();
+ f->open_object_section("result");
+ f->dump_string("error", err);
+ f->close_section();
+ }
+};
+
MDSRank::MDSRank(
mds_rank_t whoami_,
Mutex &mds_lock_,
// Purge Queue operates inside mds_lock when we're calling into
// it, and outside when in background, so must handle both cases.
if (mds_lock.is_locked_by_me()) {
- damaged();
+ handle_write_error(r);
} else {
- damaged_unlocked();
+ Mutex::Locker l(mds_lock);
+ handle_write_error(r);
}
}
)
bool MDSRankDispatcher::ms_dispatch(Message *m)
{
- bool ret;
+ if (m->get_source().is_client()) {
+ Session *session = static_cast<Session*>(m->get_connection()->get_priv());
+ if (session)
+ session->last_seen = Session::clock::now();
+ }
+
inc_dispatch_depth();
- ret = _dispatch(m, true);
+ bool ret = _dispatch(m, true);
dec_dispatch_depth();
return ret;
}
// NB not enabling suicide grace, because the mon takes care of killing us
// (by blacklisting us) when we fail to send beacons, and it's simpler to
// only have one way of dying.
- g_ceph_context->get_heartbeat_map()->reset_timeout(hb, g_conf->mds_beacon_grace, 0);
+ auto grace = g_conf->get_val<double>("mds_heartbeat_grace");
+ g_ceph_context->get_heartbeat_map()->reset_timeout(hb, grace, 0);
}
bool MDSRank::is_stale_message(Message *m) const
imported_session->auth_caps = session->auth_caps;
assert(session->get_nref() == 1);
imported_session->connection->set_priv(imported_session->get());
+ imported_session->last_seen = session->last_seen;
session = imported_session;
}
}
<< cpp_strerror(r);
damaged();
assert(r == 0); // Unreachable, damaged() calls respawn()
+ } else if (r == -EROFS) {
+ dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl;
} else {
// Completely unexpected error, give up and die
dout(0) << "boot_start encountered an error, failing" << dendl;
{
if (standby_replaying) {
/* Go around for another pass of replaying in standby */
- dout(4) << "standby_replay_restart (as standby)" << dendl;
+ dout(5) << "Restarting replay as standby-replay" << dendl;
mdlog->get_journaler()->reread_head_and_probe(
new C_MDS_StandbyReplayRestartFinish(
this,
void MDSRank::replay_done()
{
- dout(1) << "replay_done" << (standby_replaying ? " (as standby)" : "") << dendl;
+ if (!standby_replaying) {
+ dout(1) << "Finished replaying journal" << dendl;
+ } else {
+ dout(5) << "Finished replaying journal as standby-replay" << dendl;
+ }
if (is_standby_replay()) {
// The replay was done in standby state, and we are still in that state
// Reformat and come back here
if (mdlog->get_journaler()->get_stream_format() < g_conf->mds_journal_format) {
- dout(4) << "reformatting journal on standbyreplay->replay transition" << dendl;
+ dout(4) << "reformatting journal on standby-replay->replay transition" << dendl;
mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
return;
}
}
} else if (command == "cache status") {
Mutex::Locker l(mds_lock);
- int r = mdcache->cache_status(f);
+ mdcache->cache_status(f);
+ } else if (command == "cache drop") {
+ int64_t timeout;
+ if (!cmd_getval(g_ceph_context, cmdmap, "timeout", timeout)) {
+ timeout = 0;
+ }
+
+ C_SaferCond cond;
+ command_cache_drop((uint64_t)timeout, f, &cond);
+ int r = cond.wait();
if (r != 0) {
- ss << "Failed to get cache status: " << cpp_strerror(r);
+ f->flush(ss);
}
} else if (command == "dump tree") {
string root;
return true;
}
-class C_MDS_Send_Command_Reply : public MDSInternalContext
-{
+class C_MDS_Send_Command_Reply : public MDSInternalContext {
protected:
MCommand *m;
public:
C_MDS_Send_Command_Reply(MDSRank *_mds, MCommand *_m) :
MDSInternalContext(_mds), m(_m) { m->get(); }
- void send (int r, boost::string_view out_str) {
+
+ void send(int r, boost::string_view ss) {
+ std::stringstream ds;
+ send(r, ss, ds);
+ }
+
+ void send(int r, boost::string_view ss, std::stringstream &ds) {
bufferlist bl;
- MDSDaemon::send_command_reply(m, mds, r, bl, out_str);
+ bl.append(ds);
+ MDSDaemon::send_command_reply(m, mds, r, bl, ss);
m->put();
}
- void finish (int r) override {
+
+ void finish(int r) override {
send(r, "");
}
};
f->close_section(); // results
}
-/**
- * Wrapper around _command_flush_journal that
- * handles serialization of result
- */
-void MDSRank::command_flush_journal(Formatter *f)
-{
- assert(f != NULL);
+// synchronous wrapper around "journal flush" asynchronous context
+// execution.
+void MDSRank::command_flush_journal(Formatter *f) {
+ ceph_assert(f != NULL);
+ C_SaferCond cond;
std::stringstream ss;
- const int r = _command_flush_journal(&ss);
- f->open_object_section("result");
- f->dump_string("message", ss.str());
- f->dump_int("return_code", r);
- f->close_section();
-}
-
-/**
- * Implementation of "flush journal" asok command.
- *
- * @param ss
- * Optionally populate with a human readable string describing the
- * reason for any unexpected return status.
- */
-int MDSRank::_command_flush_journal(std::stringstream *ss)
-{
- assert(ss != NULL);
- Mutex::Locker l(mds_lock);
-
- if (mdcache->is_readonly()) {
- dout(5) << __func__ << ": read-only FS" << dendl;
- return -EROFS;
- }
-
- if (!is_active()) {
- dout(5) << __func__ << ": MDS not active, no-op" << dendl;
- return 0;
- }
-
- // I need to seal off the current segment, and then mark all previous segments
- // for expiry
- mdlog->start_new_segment();
- int r = 0;
-
- // Flush initially so that all the segments older than our new one
- // will be elegible for expiry
- {
- C_SaferCond mdlog_flushed;
- mdlog->flush();
- mdlog->wait_for_safe(new MDSInternalContextWrapper(this, &mdlog_flushed));
- mds_lock.Unlock();
- r = mdlog_flushed.wait();
- mds_lock.Lock();
- if (r != 0) {
- *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
- return r;
- }
- }
-
- // Because we may not be the last wait_for_safe context on MDLog, and
- // subsequent contexts might wake up in the middle of our later trim_all
- // and interfere with expiry (by e.g. marking dirs/dentries dirty
- // on previous log segments), we run a second wait_for_safe here.
- // See #10368
{
- C_SaferCond mdlog_cleared;
- mdlog->wait_for_safe(new MDSInternalContextWrapper(this, &mdlog_cleared));
- mds_lock.Unlock();
- r = mdlog_cleared.wait();
- mds_lock.Lock();
- if (r != 0) {
- *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
- return r;
- }
+ Mutex::Locker locker(mds_lock);
+ C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, this, &ss, &cond);
+ flush_journal->send();
}
+ int r = cond.wait();
- // Put all the old log segments into expiring or expired state
- dout(5) << __func__ << ": beginning segment expiry" << dendl;
- r = mdlog->trim_all();
- if (r != 0) {
- *ss << "Error " << r << " (" << cpp_strerror(r) << ") while trimming log";
- return r;
- }
-
- // Attach contexts to wait for all expiring segments to expire
- MDSGatherBuilder expiry_gather(g_ceph_context);
-
- const std::set<LogSegment*> &expiring_segments = mdlog->get_expiring_segments();
- for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
- i != expiring_segments.end(); ++i) {
- (*i)->wait_for_expiry(expiry_gather.new_sub());
- }
- dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created()
- << " segments to expire" << dendl;
-
- if (expiry_gather.has_subs()) {
- C_SaferCond cond;
- expiry_gather.set_finisher(new MDSInternalContextWrapper(this, &cond));
- expiry_gather.activate();
-
- // Drop mds_lock to allow progress until expiry is complete
- mds_lock.Unlock();
- int r = cond.wait();
- mds_lock.Lock();
-
- assert(r == 0); // MDLog is not allowed to raise errors via wait_for_expiry
- }
-
- dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now " << std::hex <<
- mdlog->get_journaler()->get_expire_pos() << "/" <<
- mdlog->get_journaler()->get_trimmed_pos() << dendl;
-
- // Now everyone I'm interested in is expired
- mdlog->trim_expired_segments();
-
- dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now " << std::hex <<
- mdlog->get_journaler()->get_expire_pos() << "/" <<
- mdlog->get_journaler()->get_trimmed_pos() << dendl;
-
- // Flush the journal header so that readers will start from after the flushed region
- C_SaferCond wrote_head;
- mdlog->get_journaler()->write_head(&wrote_head);
- mds_lock.Unlock(); // Drop lock to allow messenger dispatch progress
- r = wrote_head.wait();
- mds_lock.Lock();
- if (r != 0) {
- *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
- return r;
- }
-
- dout(5) << __func__ << ": write_head complete, all done!" << dendl;
-
- return 0;
+ f->open_object_section("result");
+ f->dump_string("message", ss.str());
+ f->dump_int("return_code", r);
+ f->close_section();
}
-
void MDSRank::command_get_subtrees(Formatter *f)
{
assert(f != NULL);
int *r,
std::stringstream *ds,
std::stringstream *ss,
+ Context **run_later,
bool *need_reply)
{
assert(r != nullptr);
return true;
}
- Formatter *f = new JSONFormatter(true);
- dump_sessions(filter, f);
- f->flush(*ds);
- delete f;
+ JSONFormatter f(true);
+ dump_sessions(filter, &f);
+ f.flush(*ds);
return true;
} else if (prefix == "session evict" || prefix == "client evict") {
std::vector<std::string> filter_args;
*need_reply = false;
return true;
} else if (prefix == "damage ls") {
- Formatter *f = new JSONFormatter(true);
- damage_table.dump(f);
- f->flush(*ds);
- delete f;
+ JSONFormatter f(true);
+ damage_table.dump(&f);
+ f.flush(*ds);
return true;
} else if (prefix == "damage rm") {
damage_entry_id_t id = 0;
}
damage_table.erase(id);
+ return true;
+ } else if (prefix == "cache drop") {
+ int64_t timeout;
+ if (!cmd_getval(g_ceph_context, cmdmap, "timeout", timeout)) {
+ timeout = 0;
+ }
+
+ JSONFormatter *f = new JSONFormatter(true);
+ C_MDS_Send_Command_Reply *reply = new C_MDS_Send_Command_Reply(this, m);
+ Context *on_finish = new FunctionContext([this, f, reply](int r) {
+ cache_drop_send_reply(f, reply, r);
+ delete f;
+ delete reply;
+ });
+
+ *need_reply = false;
+ *run_later = new C_OnFinisher(
+ new FunctionContext([this, timeout, f, on_finish](int _) {
+ command_cache_drop((uint64_t)timeout, f, on_finish);
+ }), finisher);
+
return true;
} else {
return false;
}
}
+void MDSRank::cache_drop_send_reply(Formatter *f, C_MDS_Send_Command_Reply *reply, int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ std::stringstream ds;
+ std::stringstream ss;
+ if (r != 0) {
+ f->flush(ss);
+ } else {
+ f->flush(ds);
+ }
+
+ reply->send(r, ss.str(), ds);
+}
+
+void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish) {
+ dout(20) << __func__ << dendl;
+
+ Mutex::Locker locker(mds_lock);
+ C_Drop_Cache *request = new C_Drop_Cache(server, mdcache, mdlog, this,
+ timeout, f, on_finish);
+ request->send();
+}
+
epoch_t MDSRank::get_osd_epoch() const
{
return objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
class Finisher;
class MMDSMap;
class ScrubStack;
+class C_MDS_Send_Command_Reply;
/**
* The public part of this class's interface is what's exposed to all
int incarnation;
public:
+
+ friend class C_Flush_Journal;
+ friend class C_Drop_Cache;
+
mds_rank_t get_nodeid() const { return whoami; }
int64_t get_metadata_pool();
std::ostream &ss,
Formatter *f);
int _command_export_dir(boost::string_view path, mds_rank_t dest);
- int _command_flush_journal(std::stringstream *ss);
CDir *_command_dirfrag_get(
const cmdmap_t &cmdmap,
std::ostream &ss);
+ void cache_drop_send_reply(Formatter *f, C_MDS_Send_Command_Reply *reply, int r);
+ void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish);
+
protected:
Messenger *messenger;
MonClient *monc;
int *r,
std::stringstream *ds,
std::stringstream *ss,
+ Context **run_later,
bool *need_reply);
void dump_sessions(const SessionFilter &filter, Formatter *f) const;
#include "PurgeQueue.h"
+#include <string.h>
#define dout_context cct
#define dout_subsys ceph_subsys_mds
if (logger) {
g_ceph_context->get_perfcounters_collection()->remove(logger.get());
}
+ delete on_error;
}
void PurgeQueue::create_logger()
void PurgeQueue::activate()
{
Mutex::Locker l(lock);
+
+ if (readonly) {
+ dout(10) << "skipping activate: PurgeQueue is readonly" << dendl;
+ return;
+ }
+
if (journaler.get_read_pos() == journaler.get_write_pos())
return;
finish_contexts(g_ceph_context, waiting_for_recovery);
} else {
derr << "Error " << r << " loading Journaler" << dendl;
- on_error->complete(r);
+ _go_readonly(r);
}
}));
}
void PurgeQueue::wait_for_recovery(Context* c)
{
Mutex::Locker l(lock);
- if (recovered)
+ if (recovered) {
c->complete(0);
- else
+ } else if (readonly) {
+ dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl;
+ c->complete(-EROFS);
+ } else {
waiting_for_recovery.push_back(c);
+ }
}
void PurgeQueue::_recover()
if (journaler.get_error()) {
int r = journaler.get_error();
derr << "Error " << r << " recovering write_pos" << dendl;
- on_error->complete(r);
+ _go_readonly(r);
return;
}
journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
journaler.write_head(new FunctionContext([this](int r) {
Mutex::Locker l(lock);
- recovered = true;
- finish_contexts(g_ceph_context, waiting_for_recovery);
+ if (r) {
+ _go_readonly(r);
+ } else {
+ recovered = true;
+ finish_contexts(g_ceph_context, waiting_for_recovery);
+ }
}));
}
dout(4) << "pushing inode 0x" << std::hex << pi.ino << std::dec << dendl;
Mutex::Locker l(lock);
+ if (readonly) {
+ dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl;
+ completion->complete(-EROFS);
+ return;
+ }
+
// Callers should have waited for open() before using us
assert(!journaler.is_readonly());
if (!could_consume) {
// Usually, it is not necessary to explicitly flush here, because the reader
// will get flushes generated inside Journaler::is_readable. However,
- // if we remain in a can_consume()==false state for a long period then
+ // if we remain in a _can_consume()==false state for a long period then
// we should flush in order to allow MDCache to drop its strays rather
// than having them wait for purgequeue to progress.
if (!delayed_flush) {
return ops_required;
}
-bool PurgeQueue::can_consume()
+bool PurgeQueue::_can_consume()
{
+ if (readonly) {
+ dout(10) << "can't consume: PurgeQueue is readonly" << dendl;
+ return false;
+ }
+
dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
<< in_flight.size() << "/" << g_conf->mds_max_purge_files
<< " files" << dendl;
}
}
+void PurgeQueue::_go_readonly(int r)
+{
+ if (readonly) return;
+ dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl;
+ readonly = true;
+ on_error->complete(r);
+ on_error = nullptr;
+ journaler.set_readonly();
+ finish_contexts(g_ceph_context, waiting_for_recovery, r);
+}
+
bool PurgeQueue::_consume()
{
assert(lock.is_locked_by_me());
bool could_consume = false;
- while(can_consume()) {
+ while(_can_consume()) {
if (delayed_flush) {
// We are now going to read from the journal, so any proactive
if (int r = journaler.get_error()) {
derr << "Error " << r << " recovering write_pos" << dendl;
- on_error->complete(r);
+ _go_readonly(r);
return could_consume;
}
if (r == 0) {
_consume();
} else if (r != -EAGAIN) {
- on_error->complete(r);
+ _go_readonly(r);
}
}));
}
} catch (const buffer::error &err) {
derr << "Decode error at read_pos=0x" << std::hex
<< journaler.get_read_pos() << dendl;
- on_error->complete(0);
+ _go_readonly(EIO);
}
dout(20) << " executing item (0x" << std::hex << item.ino
<< std::dec << ")" << dendl;
in_flight[expire_to] = item;
logger->set(l_pq_executing, in_flight.size());
- ops_in_flight += _calculate_ops(item);
+ auto ops = _calculate_ops(item);
+ ops_in_flight += ops;
logger->set(l_pq_executing_ops, ops_in_flight);
SnapContext nullsnapc;
} else {
derr << "Invalid item (action=" << item.action << ") in purge queue, "
"dropping it" << dendl;
+ ops_in_flight -= ops;
+ logger->set(l_pq_executing_ops, ops_in_flight);
in_flight.erase(expire_to);
logger->set(l_pq_executing, in_flight.size());
return;
// expire_pos doesn't fall too far behind our progress when consuming
// a very long queue.
if (in_flight.empty() || journaler.write_head_needed()) {
- journaler.write_head(new FunctionContext([this](int r){
- journaler.trim();
- }));
+ journaler.write_head(nullptr);
}
}), &finisher));
{
Mutex::Locker l(lock);
+ if (readonly) {
+ dout(10) << "skipping; PurgeQueue is readonly" << dendl;
+ return;
+ }
+
uint64_t pg_count = 0;
objecter->with_osdmap([&](const OSDMap& o) {
// Number of PGs across all data pools
size_t *in_flight_count
)
{
+ Mutex::Locker l(lock);
+
+ if (readonly) {
+ dout(10) << "skipping drain; PurgeQueue is readonly" << dendl;
+ return true;
+ }
+
assert(progress != nullptr);
assert(progress_total != nullptr);
assert(in_flight_count != nullptr);
CephContext *cct;
const mds_rank_t rank;
Mutex lock;
+ bool readonly = false;
int64_t metadata_pool;
uint32_t _calculate_ops(const PurgeItem &item) const;
- bool can_consume();
+ bool _can_consume();
// How many bytes were remaining when drain() was first called,
// used for indicating progress.
bool recovered;
std::list<Context*> waiting_for_recovery;
+ void _go_readonly(int r);
+
public:
void init();
void activate();
m->put();
}
+
+void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
+ if (!session->is_open() ||
+ !session->connection.get() ||
+ !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
+ return;
+ }
+
+ version_t seq = session->wait_for_flush(gather->new_sub());
+ mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
+}
+
void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
assert(session);
- if (!session->is_open() ||
- !session->connection.get() ||
- !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
- continue;
- version_t seq = session->wait_for_flush(gather.new_sub());
- mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
+ flush_session(session, &gather);
}
}
// (caps go stale, lease die)
double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
- while (1) {
- Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
- if (!session) break;
- auto last_cap_renew_span = std::chrono::duration<double>(now-session->last_cap_renew).count();
- if (last_cap_renew_span < cutoff) {
- dout(20) << "laggiest active session is " << session->info.inst << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
- break;
+
+ const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
+ if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
+ std::vector<Session*> new_stale;
+
+ for (auto session : *(sessions_p1->second)) {
+ auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "laggiest active session is " << session->info.inst
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ break;
+ }
+
+ if (session->last_seen > session->last_cap_renew) {
+ last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "laggiest active session is " << session->info.inst
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ continue;
+ }
+ }
+
+ dout(10) << "new stale session " << session->info.inst
+ << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
+ new_stale.push_back(session);
}
- dout(10) << "new stale session " << session->info.inst << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
- mds->sessionmap.set_state(session, Session::STATE_STALE);
- mds->locker->revoke_stale_caps(session);
- mds->locker->remove_stale_leases(session);
- mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
- finish_flush_session(session, session->get_push_seq());
+ for (auto session : new_stale) {
+ mds->sessionmap.set_state(session, Session::STATE_STALE);
+ mds->locker->revoke_stale_caps(session);
+ mds->locker->remove_stale_leases(session);
+ mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
+ finish_flush_session(session, session->get_push_seq());
+ }
}
// autoclose
// Collect a list of sessions exceeding the autoclose threshold
std::vector<Session *> to_evict;
- const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
- if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
+ const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
+ if (sessions_p2 == mds->sessionmap.by_state.end() || sessions_p2->second->empty()) {
return;
}
- const auto &stale_sessions = sessions_p->second;
+ const auto &stale_sessions = sessions_p2->second;
assert(stale_sessions != nullptr);
for (const auto &session: *stale_sessions) {
// clients will get the mdsmap and discover we're reconnecting via the monitor.
- reconnect_start = ceph_clock_now();
+ reconnect_start = clock::now();
dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
mds->sessionmap.dump();
}
return;
}
- utime_t delay = ceph_clock_now();
- delay -= reconnect_start;
+ auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
bool deny = false;
}
mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
+ reconnect_last_seen = clock::now();
+
// remove from gather set
client_reconnect_gather.erase(from);
if (client_reconnect_gather.empty())
void Server::reconnect_tick()
{
if (reconnect_evicting) {
- dout(4) << "reconnect_tick: waiting for evictions" << dendl;
+ dout(7) << "reconnect_tick: waiting for evictions" << dendl;
return;
}
- utime_t reconnect_end = reconnect_start;
- reconnect_end += g_conf->mds_reconnect_timeout;
- if (ceph_clock_now() >= reconnect_end &&
- !client_reconnect_gather.empty()) {
- dout(10) << "reconnect timed out" << dendl;
+ if (client_reconnect_gather.empty())
+ return;
- // If we're doing blacklist evictions, use this to wait for them before
- // proceeding to reconnect_gather_finish
- MDSGatherBuilder gather(g_ceph_context);
+ auto now = clock::now();
+ auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
+ if (elapse1 < g_conf->mds_reconnect_timeout)
+ return;
- for (set<client_t>::iterator p = client_reconnect_gather.begin();
- p != client_reconnect_gather.end();
- ++p) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
- assert(session);
- dout(1) << "reconnect gave up on " << session->info.inst << dendl;
-
- mds->clog->warn() << "evicting unresponsive client " << *session
- << ", after waiting " << g_conf->mds_reconnect_timeout
- << " seconds during MDS startup";
-
- if (g_conf->mds_session_blacklist_on_timeout) {
- std::stringstream ss;
- mds->evict_client(session->info.inst.name.num(), false, true, ss,
- gather.new_sub());
- } else {
- kill_session(session, NULL);
- }
+ vector<Session*> remaining_sessions;
+ remaining_sessions.reserve(client_reconnect_gather.size());
+ for (auto c : client_reconnect_gather) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
+ ceph_assert(session);
+ remaining_sessions.push_back(session);
+ // client re-sends cap flush messages before the reconnect message
+ if (session->last_seen > reconnect_last_seen)
+ reconnect_last_seen = session->last_seen;
+ }
- failed_reconnects++;
- }
- client_reconnect_gather.clear();
+ auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
+ if (elapse2 < g_conf->mds_reconnect_timeout / 2) {
+ dout(7) << "reconnect_tick: last seen " << elapse2
+ << " seconds ago, extending reconnect interval" << dendl;
+ return;
+ }
+
+ dout(7) << "reconnect timed out, " << remaining_sessions.size()
+ << " clients have not reconnected in time" << dendl;
+
+ // If we're doing blacklist evictions, use this to wait for them before
+ // proceeding to reconnect_gather_finish
+ MDSGatherBuilder gather(g_ceph_context);
+
+ for (auto session : remaining_sessions) {
+ dout(1) << "reconnect gives up on " << session->info.inst << dendl;
- if (gather.has_subs()) {
- dout(1) << "reconnect will complete once clients are evicted" << dendl;
- gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
- [this](int r){reconnect_gather_finish();})));
- gather.activate();
- reconnect_evicting = true;
+ mds->clog->warn() << "evicting unresponsive client " << *session
+ << ", after waiting " << elapse1
+ << " seconds during MDS startup";
+
+ if (g_conf->mds_session_blacklist_on_timeout) {
+ std::stringstream ss;
+ mds->evict_client(session->get_client().v, false, true, ss,
+ gather.new_sub());
} else {
- reconnect_gather_finish();
+ kill_session(session, NULL);
}
+
+ failed_reconnects++;
+ }
+ client_reconnect_gather.clear();
+
+ if (gather.has_subs()) {
+ dout(1) << "reconnect will complete once clients are evicted" << dendl;
+ gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
+ [this](int r){reconnect_gather_finish();})));
+ gather.activate();
+ reconnect_evicting = true;
+ } else {
+ reconnect_gather_finish();
}
}
* to trim some caps, and consequently unpin some inodes in the MDCache so
* that it can trim too.
*/
-void Server::recall_client_state(void)
-{
+void Server::recall_client_state(double ratio, bool flush_client_session,
+ MDSGatherBuilder *gather) {
+ if (flush_client_session) {
+ assert(gather != nullptr);
+ }
+
/* try to recall at least 80% of all caps */
uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
/* ratio: determine the amount of caps to recall from each client. Use
* percentage full over the cache reservation. Cap the ratio at 80% of client
* caps. */
- double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());
+ if (ratio < 0.0)
+ ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
- dout(10) << "recall_client_state " << ratio
- << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
- << dendl;
+ dout(10) << __func__ << ": ratio=" << ratio << ", caps per client "
+ << min_caps_per_client << "-" << max_caps_per_client << dendl;
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
+
for (auto &session : sessions) {
if (!session->is_open() ||
+ !session->connection.get() ||
!session->info.inst.name.is_client())
continue;
MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
m->head.max_caps = newlim;
mds->send_message_client(m, session);
+ if (flush_client_session) {
+ flush_session(session, gather);
+ }
session->notify_recall_sent(newlim);
}
}
if (in->is_dir()) {
assert(straydn);
mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
-
- in->maybe_export_pin(true);
}
journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
{
dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
- CDentry::linkage_t *destdnl = destdn->get_linkage();
+ CInode *in = destdn->get_linkage()->get_inode();
+
+ inodeno_t migrated_stray;
+ if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
+ migrated_stray = in->ino();
list<MDSInternalContextBase*> finished;
if (r == 0) {
// unfreeze+singleauth inode
// hmm, do i really need to delay this?
if (mdr->more()->is_inode_exporter) {
-
- CInode *in = destdnl->get_inode();
-
// drop our pins
// we exported, clear out any xlocks that we moved to another MDS
set<SimpleLock*>::iterator i = mdr->xlocks.begin();
bufferlist::iterator bp = mdr->more()->inode_import.begin();
::decode(peer_imported, bp);
- dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
- mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
- mdr->slave_to_mds, peer_imported, finished);
+ dout(10) << " finishing inode export on " << *in << dendl;
+ mdcache->migrator->finish_export_inode(in, ceph_clock_now(), mdr->slave_to_mds,
+ peer_imported, finished);
mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
// unfreeze
- assert(destdnl->get_inode()->is_frozen_inode());
- destdnl->get_inode()->unfreeze_inode(finished);
+ assert(in->is_frozen_inode());
+ in->unfreeze_inode(finished);
}
// singleauth
// witness list from the master, and they failed before we tried prep again.
if (mdr->more()->rollback_bl.length()) {
if (mdr->more()->is_inode_exporter) {
- dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
- destdnl->get_inode()->abort_export();
+ dout(10) << " reversing inode export of " << *in << dendl;
+ in->abort_export();
}
if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
mdcache->request_finish(mdr);
}
}
+
+ if (migrated_stray && mds->is_stopping())
+ mdcache->shutdown_export_stray_finish(migrated_stray);
}
void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
int failed_reconnects;
bool reconnect_evicting; // true if I am waiting for evictions to complete
// before proceeding to reconnect_gather_finish
+ time reconnect_start = time::min();
+ time reconnect_last_seen = time::min();
+ set<client_t> client_reconnect_gather; // clients i need a reconnect msg from.
double cap_revoke_eviction_timeout = 0;
void handle_osd_map();
// -- sessions and recovery --
- utime_t reconnect_start;
- set<client_t> client_reconnect_gather; // clients i need a reconnect msg from.
bool waiting_for_reconnect(client_t c) const;
void dump_reconnect_status(Formatter *f) const;
void reconnect_tick();
void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
- void recall_client_state(void);
+ void recall_client_state(double ratio, bool flush_client_session,
+ MDSGatherBuilder *gather);
void force_clients_readonly();
// -- requests --
private:
void reply_client_request(MDRequestRef& mdr, MClientReply *reply);
+ void flush_session(Session *session, MDSGatherBuilder *gather);
};
#endif
xlist<Capability*> caps; // inodes with caps; front=most recently used
xlist<ClientLease*> leases; // metadata leases to clients
time last_cap_renew = time::min();
+ time last_seen = time::min();
public:
version_t inc_push_seq() { return ++cap_push_seq; }
mds->mdlog->submit_entry(le, new C_PurgeStrayLogged(this, dn, pdv,
mds->mdlog->get_current_segment()));
-
- logger->set(l_mdc_num_strays, num_strays);
}
}
}
// drop inode
+ inodeno_t ino = in->ino();
if (in->is_dirty())
in->mark_clean();
- in->mdcache->remove_inode(in);
+ mds->mdcache->remove_inode(in);
+
+ if (mds->is_stopping())
+ mds->mdcache->shutdown_export_stray_finish(ino);
}
void StrayManager::enqueue(CDentry *dn, bool trunc)
return false; // not until some snaps are deleted.
}
- in->mdcache->clear_dirty_bits_for_stray(in);
+ mds->mdcache->clear_dirty_bits_for_stray(in);
if (!in->remote_parents.empty()) {
// unlink any stale remote snap dentry.
dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
+ in->pop_and_dirty_projected_inode(ls);
+
+ in->state_clear(CInode::STATE_PURGING);
dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
dn->put(CDentry::PIN_PURGING);
- in->pop_and_dirty_projected_inode(ls);
-
eval_stray(dn);
+
+ if (!dn->state_test(CDentry::STATE_PURGING) && mds->is_stopping())
+ mds->mdcache->shutdown_export_stray_finish(in->ino());
}
f.reset(Formatter::create("json-pretty"));
// only include state from services that are in the persisted service map
f->open_object_section("service_status");
- ServiceMap s;
- cluster_state.with_servicemap([&](const ServiceMap& service_map) {
- s = service_map;
- });
- for (auto& p : s.services) {
+ for (auto& p : pending_service_map.services) {
f->open_object_section(p.first.c_str());
for (auto& q : p.second.daemons) {
f->open_object_section(q.first.c_str());
ss << "pg " << pgid << " primary osd." << acting_primary
<< " is not currently connected";
cmdctx->reply(-EAGAIN, ss);
+ return true;
}
vector<pg_t> pgs = { pgid };
for (auto& con : p->second) {
for (const auto &t : report->declare_types) {
types.insert(std::make_pair(t.path, t));
session->declared_types.insert(t.path);
- instances.insert(std::pair<std::string, PerfCounterInstance>(
- t.path, PerfCounterInstance(t.type)));
}
// Remove any old types
for (const auto &t : report->undeclare_types) {
DECODE_START(1, p);
for (const auto &t_path : session->declared_types) {
const auto &t = types.at(t_path);
+ auto instances_it = instances.find(t_path);
+ // Always check the instance exists, as we don't prevent yet
+ // multiple sessions from daemons with the same name, and one
+ // session clearing stats created by another on open.
+ if (instances_it == instances.end()) {
+ instances_it = instances.insert({t_path, t.type}).first;
+ }
uint64_t val = 0;
uint64_t avgcount = 0;
uint64_t avgcount2 = 0;
if (t.type & PERFCOUNTER_LONGRUNAVG) {
::decode(avgcount, p);
::decode(avgcount2, p);
- instances.at(t_path).push_avg(now, val, avgcount);
+ instances_it->second.push_avg(now, val, avgcount);
} else {
- instances.at(t_path).push(now, val);
+ instances_it->second.push(now, val);
}
}
DECODE_FINISH(p);
dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
switch (m->get_type()) {
case MSG_MON_COMMAND:
- return preprocess_command(op);
+ try {
+ return preprocess_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case CEPH_MSG_AUTH:
return prep_auth(op, false);
dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
switch (m->get_type()) {
case MSG_MON_COMMAND:
- return prepare_command(op);
+ try {
+ return prepare_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case MSG_MON_GLOBAL_ID:
return prepare_global_id(op);
case CEPH_MSG_AUTH:
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
if (prefix == "auth add" ||
prefix == "auth del" ||
prefix == "auth rm" ||
// entity might not be supplied, but if it is, it should be valid
string entity_name;
- cmd_getval(g_ceph_context, cmdmap, "entity", entity_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "entity", entity_name);
EntityName entity;
if (!entity_name.empty() && !entity.from_str(entity_name)) {
ss << "invalid entity_auth " << entity_name;
}
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
if (prefix == "auth export") {
string entity_name;
EntityName entity;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
MonSession *session = m->get_session();
return true;
}
- cmd_getval(g_ceph_context, cmdmap, "caps", caps_vec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "caps", caps_vec);
if ((caps_vec.size() % 2) != 0) {
ss << "bad capabilities request; odd number of arguments";
err = -EINVAL;
goto done;
}
- cmd_getval(g_ceph_context, cmdmap, "entity", entity_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "entity", entity_name);
if (!entity_name.empty() && !entity.from_str(entity_name)) {
ss << "bad entity name";
err = -EINVAL;
return true;
} else if (prefix == "fs authorize") {
string filesystem;
- cmd_getval(g_ceph_context, cmdmap, "filesystem", filesystem);
+ cmd_getval_throws(g_ceph_context, cmdmap, "filesystem", filesystem);
string mds_cap_string, osd_cap_string;
string osd_cap_wanted = "r";
return false;
}
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
string key;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
+ cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
if (prefix == "config-key get") {
ret = store_get(key, rdata);
bufferlist data;
string val;
- if (cmd_getval(g_ceph_context, cmdmap, "val", val)) {
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "val", val)) {
// they specified a value in the command instead of a file
data.append(val);
} else if (cmd->get_data_len() > 0) {
std::stringstream &ss) override
{
string flag_name;
- cmd_getval(g_ceph_context, cmdmap, "flag_name", flag_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "flag_name", flag_name);
string flag_val;
- cmd_getval(g_ceph_context, cmdmap, "val", flag_val);
+ cmd_getval_throws(g_ceph_context, cmdmap, "val", flag_val);
string confirm;
- cmd_getval(g_ceph_context, cmdmap, "confirm", confirm);
+ cmd_getval_throws(g_ceph_context, cmdmap, "confirm", confirm);
if (flag_name == "enable_multiple") {
bool flag_bool = false;
assert(m_paxos->is_plugged());
string metadata_name;
- cmd_getval(g_ceph_context, cmdmap, "metadata", metadata_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "metadata", metadata_name);
int64_t metadata = mon->osdmon()->osdmap.lookup_pg_pool_name(metadata_name);
if (metadata < 0) {
ss << "pool '" << metadata_name << "' does not exist";
}
string force_str;
- cmd_getval(g_ceph_context,cmdmap, "force", force_str);
+ cmd_getval_throws(g_ceph_context,cmdmap, "force", force_str);
bool force = (force_str == "--force");
const pool_stat_t *stat = mon->pgservice->get_pool_stat(metadata);
if (stat) {
}
string data_name;
- cmd_getval(g_ceph_context, cmdmap, "data", data_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "data", data_name);
int64_t data = mon->osdmon()->osdmap.lookup_pg_pool_name(data_name);
if (data < 0) {
ss << "pool '" << data_name << "' does not exist";
}
string fs_name;
- cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
if (fs_name.empty()) {
// Ensure fs name is not empty so that we can implement
// commmands that refer to FS by name in future.
string sure;
if ((std::find(data_pools.begin(), data_pools.end(), data) != data_pools.end()
|| fs->mds_map.get_metadata_pool() == metadata)
- && ((!cmd_getval(g_ceph_context, cmdmap, "sure", sure)
+ && ((!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure)
|| sure != "--allow-dangerous-metadata-overlay"))) {
ss << "Filesystem '" << fs_name
<< "' is already using one of the specified RADOS pools. This should ONLY be done in emergencies and after careful reading of the documentation. Pass --allow-dangerous-metadata-overlay to permit this.";
std::stringstream &ss) override
{
std::string fs_name;
- if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name) || fs_name.empty()) {
ss << "Missing filesystem name";
return -EINVAL;
}
}
string var;
- if (!cmd_getval(g_ceph_context, cmdmap, "var", var) || var.empty()) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "var", var) || var.empty()) {
ss << "Invalid variable";
return -EINVAL;
}
string val;
string interr;
int64_t n = 0;
- if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "val", val)) {
return -EINVAL;
}
// we got a string. see if it contains an int.
if (enable_inline) {
string confirm;
- if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "confirm", confirm) ||
confirm != "--yes-i-really-mean-it") {
ss << EXPERIMENTAL_WARNING;
return -EPERM;
{
fs->mds_map.set_standby_count_wanted(n);
});
+ } else if (var == "session_timeout") {
+ if (interr.length()) {
+ ss << var << " requires an integer value";
+ return -EINVAL;
+ }
+ if (n < 30) {
+ ss << var << " must be at least 30s";
+ return -ERANGE;
+ }
+ fsmap.modify_filesystem(
+ fs->fscid,
+ [n](std::shared_ptr<Filesystem> fs)
+ {
+ fs->mds_map.set_session_timeout((uint32_t)n);
+ });
+ } else if (var == "session_autoclose") {
+ if (interr.length()) {
+ ss << var << " requires an integer value";
+ return -EINVAL;
+ }
+ if (n < 30) {
+ ss << var << " must be at least 30s";
+ return -ERANGE;
+ }
+ fsmap.modify_filesystem(
+ fs->fscid,
+ [n](std::shared_ptr<Filesystem> fs)
+ {
+ fs->mds_map.set_session_autoclose((uint32_t)n);
+ });
} else {
ss << "unknown variable " << var;
return -EINVAL;
assert(m_paxos->is_plugged());
string poolname;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
std::string fs_name;
- if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name)
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name)
|| fs_name.empty()) {
ss << "Missing filesystem name";
return -EINVAL;
std::stringstream &ss) override
{
std::string fs_name;
- cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
auto fs = fsmap.get_filesystem(fs_name);
if (fs == nullptr) {
ss << "filesystem '" << fs_name << "' does not exist";
// (redundant while there is only one FS, but command
// syntax should apply to multi-FS future)
string fs_name;
- cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
auto fs = fsmap.get_filesystem(fs_name);
if (fs == nullptr) {
// Consider absence success to make deletes idempotent
// Check for confirmation flag
string sure;
- cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
if (sure != "--yes-i-really-mean-it") {
ss << "this is a DESTRUCTIVE operation and will make data in your filesystem permanently" \
" inaccessible. Add --yes-i-really-mean-it if you are sure you wish to continue.";
std::stringstream &ss) override
{
string fs_name;
- cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
auto fs = fsmap.get_filesystem(fs_name);
if (fs == nullptr) {
ss << "filesystem '" << fs_name << "' does not exist";
// Check for confirmation flag
string sure;
- cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
if (sure != "--yes-i-really-mean-it") {
ss << "this is a potentially destructive operation, only for use by experts in disaster recovery. "
"Add --yes-i-really-mean-it if you are sure you wish to continue.";
std::stringstream &ss) override
{
string poolname;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
std::string fs_name;
- if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name)
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name)
|| fs_name.empty()) {
ss << "Missing filesystem name";
return -EINVAL;
dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
switch (m->get_type()) {
case MSG_MON_COMMAND:
- return preprocess_command(op);
+ try {
+ return preprocess_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case MSG_LOG:
return preprocess_log(op);
dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
switch (m->get_type()) {
case MSG_MON_COMMAND:
- return prepare_command(op);
+ try {
+ return prepare_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case MSG_LOG:
return prepare_log(op);
default:
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
if (prefix == "log last") {
int64_t num = 20;
- cmd_getval(g_ceph_context, cmdmap, "num", num);
+ cmd_getval_throws(g_ceph_context, cmdmap, "num", num);
if (f) {
f->open_array_section("tail");
}
std::string level_str;
clog_type level;
- if (cmd_getval(g_ceph_context, cmdmap, "level", level_str)) {
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "level", level_str)) {
level = LogEntry::str_to_level(level_str);
if (level == CLOG_UNKNOWN) {
ss << "Invalid severity '" << level_str << "'";
}
std::string channel;
- if (!cmd_getval(g_ceph_context, cmdmap, "channel", channel)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "channel", channel)) {
channel = CLOG_CHANNEL_DEFAULT;
}
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
MonSession *session = m->get_session();
if (!session) {
if (prefix == "log") {
vector<string> logtext;
- cmd_getval(g_ceph_context, cmdmap, "logtext", logtext);
+ cmd_getval_throws(g_ceph_context, cmdmap, "logtext", logtext);
LogEntry le;
le.who = m->get_orig_source_inst();
le.name = session->entity_name;
return preprocess_beacon(op);
case MSG_MON_COMMAND:
- return preprocess_command(op);
+ try {
+ return preprocess_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case MSG_MDS_OFFLOAD_TARGETS:
return preprocess_offload_targets(op);
return prepare_beacon(op);
case MSG_MON_COMMAND:
- return prepare_command(op);
+ try {
+ return prepare_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case MSG_MDS_OFFLOAD_TARGETS:
return prepare_offload_targets(op);
});
}
- if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
+ if (info.state == MDSMap::STATE_STOPPING &&
+ state != MDSMap::STATE_STOPPING &&
+ state != MDSMap::STATE_STOPPED) {
// we can't transition to any other states from STOPPING
dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
<< dendl;
case MSG_MGR_BEACON:
return preprocess_beacon(op);
case MSG_MON_COMMAND:
- return preprocess_command(op);
+ try {
+ return preprocess_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
+
default:
mon->no_reply(op);
derr << "Unhandled message type " << m->get_type() << dendl;
return prepare_beacon(op);
case MSG_MON_COMMAND:
- return prepare_command(op);
+ try {
+ return prepare_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
default:
mon->no_reply(op);
}
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
int r = 0;
if (prefix == "mgr dump") {
int64_t epoch = 0;
- cmd_getval(g_ceph_context, cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
+ cmd_getval_throws(g_ceph_context, cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
if (epoch == (int64_t)map.get_epoch()) {
f->dump_object("mgrmap", map);
} else {
f->flush(rdata);
} else if (prefix == "mgr metadata") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "id", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "id", name);
if (name.size() > 0 && !map.have_name(name)) {
ss << "mgr." << name << " does not exist";
r = -ENOENT;
goto reply;
}
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
if (name.size()) {
f->open_object_section("mgr_metadata");
if (!f)
f.reset(Formatter::create("json-pretty"));
string field;
- cmd_getval(g_ceph_context, cmdmap, "property", field);
+ cmd_getval_throws(g_ceph_context, cmdmap, "property", field);
count_metadata(field, f.get());
f->flush(rdata);
r = 0;
}
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
int r = 0;
if (prefix == "mgr fail") {
string who;
- cmd_getval(g_ceph_context, cmdmap, "who", who);
+ cmd_getval_throws(g_ceph_context, cmdmap, "who", who);
std::string err;
uint64_t gid = strict_strtol(who.c_str(), 10, &err);
}
} else if (prefix == "mgr module enable") {
string module;
- cmd_getval(g_ceph_context, cmdmap, "module", module);
+ cmd_getval_throws(g_ceph_context, cmdmap, "module", module);
if (module.empty()) {
r = -EINVAL;
goto out;
}
string force;
- cmd_getval(g_ceph_context, cmdmap, "force", force);
+ cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
if (!pending_map.all_support_module(module) &&
force != "--force") {
ss << "all mgr daemons do not support module '" << module << "', pass "
pending_map.modules.insert(module);
} else if (prefix == "mgr module disable") {
string module;
- cmd_getval(g_ceph_context, cmdmap, "module", module);
+ cmd_getval_throws(g_ceph_context, cmdmap, "module", module);
if (module.empty()) {
r = -EINVAL;
goto out;
}
return MON_CAP_ALL;
}
+ // we don't allow config-key service to be accessed with blanket caps other
+ // than '*' (i.e., 'any'), and that should have been checked by the caller
+ // via 'is_allow_all()'.
+ if (s == "config-key") {
+ return 0;
+ }
return allow;
}
quoted_string %=
lexeme['"' >> +(char_ - '"') >> '"'] |
lexeme['\'' >> +(char_ - '\'') >> '\''];
- unquoted_word %= +char_("a-zA-Z0-9_.-");
+ unquoted_word %= +char_("a-zA-Z0-9_./-");
str %= quoted_string | unquoted_word;
spaces = +(lit(' ') | lit('\n') | lit('\t'));
"name=fs_name,type=CephString " \
"name=var,type=CephChoices,strings=max_mds|max_file_size"
"|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer" \
- "|standby_count_wanted " \
+ "|standby_count_wanted|session_timeout|session_autoclose " \
"name=val,type=CephString " \
"name=confirm,type=CephString,req=false", \
- "set mds parameter <var> to <val>", "mds", "rw", "cli,rest")
+ "set fs parameter <var> to <val>", "mds", "rw", "cli,rest")
COMMAND("fs flag set name=flag_name,type=CephChoices,strings=enable_multiple "
"name=val,type=CephString " \
"name=confirm,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"list all erasure code profiles", \
"osd", "r", "cli,rest")
COMMAND("osd set " \
- "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds " \
+ "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds|pglog_hardlimit " \
"name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set <key>", "osd", "rw", "cli,rest")
COMMAND("osd unset " \
state = STATE_SHUTDOWN;
+ lock.Unlock();
g_conf->remove_observer(this);
+ lock.Lock();
if (admin_hook) {
AdminSocket* admin_socket = cct->get_admin_socket();
remove_all_sessions();
+ log_client.shutdown();
+
+ // unlock before msgr shutdown...
+ lock.Unlock();
+
+ // shutdown messenger before removing logger from perfcounter collection,
+ // otherwise _ms_dispatch() will try to update deleted logger
+ messenger->shutdown();
+ mgr_messenger->shutdown();
+
if (logger) {
cct->get_perfcounters_collection()->remove(logger);
delete logger;
delete cluster_logger;
cluster_logger = NULL;
}
-
- log_client.shutdown();
-
- // unlock before msgr shutdown...
- lock.Unlock();
-
- messenger->shutdown(); // last thing! ceph_mon.cc will delete mon.
- mgr_messenger->shutdown();
}
void Monitor::wait_for_paxos_write()
switch (m->get_type()) {
// READs
case MSG_MON_COMMAND:
- return preprocess_command(op);
+ try {
+ return preprocess_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case MSG_MON_JOIN:
return preprocess_join(op);
default:
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
MonSession *session = m->get_session();
if (!session) {
}
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
if (prefix == "mon stat") {
epoch_t epoch;
int64_t epochnum;
- cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)0);
+ cmd_getval_throws(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)0);
epoch = epochnum;
MonMap *p = mon->monmap;
bool list_with_value = false;
string with_value;
- if (cmd_getval(g_ceph_context, cmdmap, "with_value", with_value) &&
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "with_value", with_value) &&
with_value == "--with-value") {
list_with_value = true;
}
switch (m->get_type()) {
case MSG_MON_COMMAND:
- return prepare_command(op);
+ try {
+ return prepare_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case MSG_MON_JOIN:
return prepare_join(op);
default:
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
MonSession *session = m->get_session();
if (!session) {
bool propose = false;
if (prefix == "mon add") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
string addrstr;
- cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "addr", addrstr);
entity_addr_t addr;
bufferlist rdata;
} else if (prefix == "mon remove" ||
prefix == "mon rm") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
if (!monmap.contains(name)) {
err = 0;
ss << "mon." << name << " does not exist or has already been removed";
* 'mon flag set/unset'.
*/
string feature_name;
- if (!cmd_getval(g_ceph_context, cmdmap, "feature_name", feature_name)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "feature_name", feature_name)) {
ss << "missing required feature name";
err = -EINVAL;
goto reply;
}
string sure;
- if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) ||
sure != "--yes-i-really-mean-it") {
ss << "please specify '--yes-i-really-mean-it' if you "
<< "really, **really** want to set feature '"
<< "persistent = " << pending_map.persistent_features
// output optional nevertheless, for auditing purposes.
<< ", optional = " << pending_map.optional_features << dendl;
-
} else {
ss << "unknown command " << prefix;
err = -EINVAL;
int next_up_primary, next_acting_primary;
next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
&next_acting, &next_acting_primary);
- if (acting == next_acting && next_up != next_acting)
+ if (acting == next_acting &&
+ !(up != acting && next_up == next_acting))
return; // no change since last epoch
if (acting.empty())
switch (m->get_type()) {
// READs
case MSG_MON_COMMAND:
- return preprocess_command(op);
+ try {
+ return preprocess_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case CEPH_MSG_MON_GET_OSDMAP:
return preprocess_get_osdmap(op);
return prepare_beacon(op);
case MSG_MON_COMMAND:
- return prepare_command(op);
+ try {
+ return prepare_command(op);
+ }
+ catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
case CEPH_MSG_POOLOP:
return prepare_pool_op(op);
}
}
+ // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+ // we are reusing a jewel feature bit that was retired in luminous.
+ if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+ osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
+ !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
+ mon->clog->info() << "disallowing boot of OSD "
+ << m->get_orig_source_inst()
+ << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
+ goto ignore;
+ }
+
// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst() &&
// check privilege, ignore if failed
MonSession *session = m->get_session();
+ mon->no_reply(op);
if (!session)
goto ignore;
if (!session->caps.is_capable(
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
if (prefix == "osd stat") {
epoch_t epoch = 0;
int64_t epochnum;
- cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
+ cmd_getval_throws(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
epoch = epochnum;
bufferlist osdmap_bl;
rdata.append(ds);
} else if (prefix == "osd tree") {
vector<string> states;
- cmd_getval(g_ceph_context, cmdmap, "states", states);
+ cmd_getval_throws(g_ceph_context, cmdmap, "states", states);
unsigned filter = 0;
for (auto& s : states) {
if (s == "up") {
ss << p->get_crush_version();
} else if (prefix == "osd ls-tree") {
string bucket_name;
- cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", bucket_name);
set<int> osds;
r = p->get_osds_by_bucket_name(bucket_name, &osds);
if (r == -ENOENT) {
goto reply;
} else if (prefix == "osd find") {
int64_t osd;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
ss << "unable to parse osd id value '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
r = -EINVAL;
goto reply;
}
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
f->open_object_section("osd_location");
f->dump_int("osd", osd);
f->dump_stream("ip") << osdmap.get_addr(osd);
+ f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
f->open_object_section("crush_location");
map<string,string> loc = osdmap.crush->get_full_location(osd);
for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
} else if (prefix == "osd metadata") {
int64_t osd = -1;
if (cmd_vartype_stringify(cmdmap["id"]).size() &&
- !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
+ !cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
ss << "unable to parse osd id value '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
r = -EINVAL;
goto reply;
}
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
if (osd >= 0) {
f->open_object_section("osd_metadata");
if (!f)
f.reset(Formatter::create("json-pretty"));
string field;
- cmd_getval(g_ceph_context, cmdmap, "property", field);
+ cmd_getval_throws(g_ceph_context, cmdmap, "property", field);
count_metadata(field, f.get());
f->flush(rdata);
r = 0;
} else if (prefix == "osd map") {
string poolstr, objstr, namespacestr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
- cmd_getval(g_ceph_context, cmdmap, "object", objstr);
- cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "object", objstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "nspace", namespacestr);
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
if (pool < 0) {
} else if (prefix == "pg map") {
pg_t pgid;
string pgidstr;
- cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr);
if (!pgid.parse(pgidstr.c_str())) {
ss << "invalid pgid '" << pgidstr << "'";
r = -EINVAL;
}
} else if (prefix == "osd lspools") {
int64_t auid;
- cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
+ cmd_getval_throws(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
if (f)
f->open_array_section("pools");
for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
} else if (prefix == "osd pool ls") {
string detail;
- cmd_getval(g_ceph_context, cmdmap, "detail", detail);
+ cmd_getval_throws(g_ceph_context, cmdmap, "detail", detail);
if (!f && detail == "detail") {
ostringstream ss;
osdmap.print_pools(ss);
} else if (prefix == "osd crush get-tunable") {
string tunable;
- cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
+ cmd_getval_throws(g_ceph_context, cmdmap, "tunable", tunable);
ostringstream rss;
if (f)
f->open_object_section("tunable");
} else if (prefix == "osd pool get") {
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
if (pool < 0) {
ss << "unrecognized pool '" << poolstr << "'";
const pg_pool_t *p = osdmap.get_pg_pool(pool);
string var;
- cmd_getval(g_ceph_context, cmdmap, "var", var);
+ cmd_getval_throws(g_ceph_context, cmdmap, "var", var);
typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
const choices_map_t ALL_CHOICES = {
osdmap, f.get(), &ss, &rdata);
} else if (prefix == "osd pool get-quota") {
string pool_name;
- cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
if (poolid < 0) {
}
} else if (prefix == "osd crush rule ls-by-class") {
string class_name;
- cmd_getval(g_ceph_context, cmdmap, "class", class_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "class", class_name);
if (class_name.empty()) {
ss << "no class specified";
r = -EINVAL;
}
} else if (prefix == "osd crush rule dump") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
if (name == "") {
f->open_array_section("rules");
rdata.append(rs.str());
} else if (prefix == "osd crush dump") {
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
f->open_object_section("crush_map");
osdmap.crush->dump(f.get());
rdata.append(rs.str());
} else if (prefix == "osd crush show-tunables") {
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
f->open_object_section("crush_map_tunables");
osdmap.crush->dump_tunables(f.get());
rdata.append(rs.str());
} else if (prefix == "osd crush tree") {
string shadow;
- cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
+ cmd_getval_throws(g_ceph_context, cmdmap, "shadow", shadow);
bool show_shadow = shadow == "--show-shadow";
boost::scoped_ptr<Formatter> f(Formatter::create(format));
if (f) {
}
} else if (prefix == "osd crush ls") {
string name;
- if (!cmd_getval(g_ceph_context, cmdmap, "node", name)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "node", name)) {
ss << "no node specified";
r = -EINVAL;
goto reply;
f->flush(rdata);
} else if (prefix == "osd crush class ls-osd") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "class", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "class", name);
set<int> osds;
osdmap.crush->get_devices_by_class(name, &osds);
if (f) {
f->flush(rdata);
} else if (prefix == "osd erasure-code-profile get") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
if (!osdmap.has_erasure_code_profile(name)) {
ss << "unknown erasure code profile '" << name << "'";
r = -ENOENT;
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
"json-pretty"));
string pool_name;
- cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
string app;
- cmd_getval(g_ceph_context, cmdmap, "app", app);
+ cmd_getval_throws(g_ceph_context, cmdmap, "app", app);
string key;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
+ cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
if (pool_name.empty()) {
// all
stringstream& ss)
{
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
if (pool < 0) {
ss << "unrecognized pool '" << poolstr << "'";
return -ENOENT;
}
string var;
- cmd_getval(g_ceph_context, cmdmap, "var", var);
+ cmd_getval_throws(g_ceph_context, cmdmap, "var", var);
pg_pool_t p = *osdmap.get_pg_pool(pool);
if (pending_inc.new_pools.count(pool))
int64_t n = 0;
double f = 0;
int64_t uf = 0; // micro-f
- if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
- // wasn't a string; maybe an older mon forwarded json with an int?
- if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
- return -EINVAL; // no value!
- } else {
- // we got a string. see if it contains an int.
- n = strict_strtoll(val.c_str(), 10, &interr);
- // or a float
- f = strict_strtod(val.c_str(), &floaterr);
- uf = llrintl(f * (double)1000000.0);
- }
+ cmd_getval(g_ceph_context, cmdmap, "val", val);
+
+ // parse string as both int and float; different fields use different types.
+ n = strict_strtoll(val.c_str(), 10, &interr);
+ f = strict_strtod(val.c_str(), &floaterr);
+ uf = llrintl(f * (double)1000000.0);
if (!p.is_tier() &&
(var == "hit_set_type" || var == "hit_set_period" ||
return r;
}
string force;
- cmd_getval(g_ceph_context,cmdmap, "force", force);
+ cmd_getval_throws(g_ceph_context,cmdmap, "force", force);
if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
force != "--yes-i-really-mean-it") {
ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
} else if (var == "hashpspool") {
uint64_t flag = pg_pool_t::get_flag_by_name(var);
string force;
- cmd_getval(g_ceph_context, cmdmap, "force", force);
+ cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
if (force != "--yes-i-really-mean-it") {
ss << "are you SURE? this will remap all placement groups in this pool,"
" this triggers large data movement,"
stringstream& ss)
{
string pool_name;
- cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
if (pool < 0) {
ss << "unrecognized pool '" << pool_name << "'";
}
string app;
- cmd_getval(g_ceph_context, cmdmap, "app", app);
+ cmd_getval_throws(g_ceph_context, cmdmap, "app", app);
bool app_exists = (p.application_metadata.count(app) > 0);
if (boost::algorithm::ends_with(prefix, "enable")) {
}
string force;
- cmd_getval(g_ceph_context, cmdmap, "force", force);
+ cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
if (!app_exists && !p.application_metadata.empty() &&
force != "--yes-i-really-mean-it") {
} else if (boost::algorithm::ends_with(prefix, "disable")) {
string force;
- cmd_getval(g_ceph_context, cmdmap, "force", force);
+ cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
if (force != "--yes-i-really-mean-it") {
ss << "Are you SURE? Disabling an application within a pool might result "
}
string key;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
+ cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
if (key.empty()) {
ss << "key must be provided";
}
string value;
- cmd_getval(g_ceph_context, cmdmap, "value", value);
+ cmd_getval_throws(g_ceph_context, cmdmap, "value", value);
if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
ss << "value '" << value << "' too long; max length "
<< MAX_POOL_APPLICATION_LENGTH;
}
string key;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
+ cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
auto it = p.application_metadata[app].find(key);
if (it == p.application_metadata[app].end()) {
ss << "application '" << app << "' on pool '" << pool_name
* If `id` is specified, and the osd has been previously marked
* as destroyed, then the `id` will be reused.
*/
- if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "uuid", uuidstr)) {
ss << "requires the OSD's UUID to be specified.";
return -EINVAL;
} else if (!uuid.parse(uuidstr.c_str())) {
return -EINVAL;
}
- if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "id", id) &&
(id < 0)) {
ss << "invalid OSD id; must be greater or equal than zero.";
return -EINVAL;
map<int32_t, uint32_t>* weights)
{
string weights_str;
- if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "weights", weights_str)) {
return -EINVAL;
}
std::replace(begin(weights_str), end(weights_str), '\'', '"');
int err = 0;
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
boost::scoped_ptr<Formatter> f(Formatter::create(format));
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
int64_t osdid;
string name;
if (prefix != "osd pg-temp" &&
prefix != "osd pg-upmap" &&
prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
- osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
+ osdid_present = cmd_getval_throws(g_ceph_context, cmdmap, "id", osdid);
}
if (osdid_present) {
ostringstream oss;
}
int64_t prior_version = 0;
- if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "prior_version", prior_version)) {
if (prior_version == osdmap.get_crush_version() - 1) {
// see if we are a resend of the last update. this is imperfect
// (multiple racing updaters may not both get reliable success)
}
string device_class;
- if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "class", device_class)) {
err = -EINVAL; // no value!
goto reply;
}
bool stop = false;
vector<string> idvec;
- cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
CrushWrapper newcrush;
_get_pending_crush(newcrush);
set<int> updated;
} else if (prefix == "osd crush rm-device-class") {
bool stop = false;
vector<string> idvec;
- cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
CrushWrapper newcrush;
_get_pending_crush(newcrush);
set<int> updated;
}
} else if (prefix == "osd crush class rename") {
string srcname, dstname;
- if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname)) {
err = -EINVAL;
goto reply;
}
- if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname)) {
err = -EINVAL;
goto reply;
}
} else if (prefix == "osd crush add-bucket") {
// os crush add-bucket <name> <type>
string name, typestr;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
- cmd_getval(g_ceph_context, cmdmap, "type", typestr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "type", typestr);
if (!_have_pending_crush() &&
_get_stable_crush().name_exists(name)) {
goto update;
} else if (prefix == "osd crush rename-bucket") {
string srcname, dstname;
- cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
- cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname);
err = crush_rename_bucket(srcname, dstname, &ss);
if (err == -EALREADY) // equivalent to success for idempotency
goto reply;
}
string poolname, mode;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
pool = osdmap.lookup_pg_pool_name(poolname.c_str());
if (pool < 0) {
ss << "pool '" << poolname << "' not found";
err = -ENOENT;
goto reply;
}
- cmd_getval(g_ceph_context, cmdmap, "mode", mode);
+ cmd_getval_throws(g_ceph_context, cmdmap, "mode", mode);
if (mode != "flat" && mode != "positional") {
ss << "unrecognized weight-set mode '" << mode << "'";
err = -EINVAL;
int64_t pool;
if (prefix == "osd crush weight-set rm") {
string poolname;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
pool = osdmap.lookup_pg_pool_name(poolname.c_str());
if (pool < 0) {
ss << "pool '" << poolname << "' not found";
prefix == "osd crush weight-set reweight-compat") {
string poolname, item;
vector<double> weight;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
- cmd_getval(g_ceph_context, cmdmap, "item", item);
- cmd_getval(g_ceph_context, cmdmap, "weight", weight);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "item", item);
+ cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight);
CrushWrapper newcrush;
_get_pending_crush(newcrush);
int64_t pool;
}
double weight;
- if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight)) {
ss << "unable to parse weight value '"
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
err = -EINVAL;
string args;
vector<string> argvec;
- cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
map<string,string> loc;
CrushWrapper::parse_loc_map(argvec, &loc);
}
double weight;
- if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight)) {
ss << "unable to parse weight value '"
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
err = -EINVAL;
string args;
vector<string> argvec;
- cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
map<string,string> loc;
CrushWrapper::parse_loc_map(argvec, &loc);
string args;
vector<string> argvec;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
- cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
map<string,string> loc;
CrushWrapper::parse_loc_map(argvec, &loc);
} while (false);
} else if (prefix == "osd crush swap-bucket") {
string source, dest, force;
- cmd_getval(g_ceph_context, cmdmap, "source", source);
- cmd_getval(g_ceph_context, cmdmap, "dest", dest);
- cmd_getval(g_ceph_context, cmdmap, "force", force);
+ cmd_getval_throws(g_ceph_context, cmdmap, "source", source);
+ cmd_getval_throws(g_ceph_context, cmdmap, "dest", dest);
+ cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
CrushWrapper newcrush;
_get_pending_crush(newcrush);
if (!newcrush.name_exists(source)) {
} else if (prefix == "osd crush link") {
// osd crush link <name> <loc1> [<loc2> ...]
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
vector<string> argvec;
- cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
map<string,string> loc;
CrushWrapper::parse_loc_map(argvec, &loc);
_get_pending_crush(newcrush);
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
if (!osdmap.crush->name_exists(name)) {
err = 0;
bool unlink_only = prefix == "osd crush unlink";
string ancestor_str;
- if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
if (!newcrush.name_exists(ancestor_str)) {
err = -ENOENT;
ss << "ancestor item '" << ancestor_str
_get_pending_crush(newcrush);
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
if (!newcrush.name_exists(name)) {
err = -ENOENT;
ss << "device '" << name << "' does not appear in the crush map";
goto reply;
}
double w;
- if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
ss << "unable to parse weight value '"
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
err = -EINVAL;
_get_pending_crush(newcrush);
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
if (!newcrush.name_exists(name)) {
err = -ENOENT;
ss << "device '" << name << "' does not appear in the crush map";
goto reply;
}
double w;
- if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
ss << "unable to parse weight value '"
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
err = -EINVAL;
err = 0;
string profile;
- cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+ cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
if (profile == "legacy" || profile == "argonaut") {
newcrush.set_tunables_legacy();
} else if (profile == "bobtail") {
err = 0;
string tunable;
- cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
+ cmd_getval_throws(g_ceph_context, cmdmap, "tunable", tunable);
int64_t value = -1;
- if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "value", value)) {
err = -EINVAL;
ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
goto reply;
} else if (prefix == "osd crush rule create-simple") {
string name, root, type, mode;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
- cmd_getval(g_ceph_context, cmdmap, "root", root);
- cmd_getval(g_ceph_context, cmdmap, "type", type);
- cmd_getval(g_ceph_context, cmdmap, "mode", mode);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "root", root);
+ cmd_getval_throws(g_ceph_context, cmdmap, "type", type);
+ cmd_getval_throws(g_ceph_context, cmdmap, "mode", mode);
if (mode == "")
mode = "firstn";
} else if (prefix == "osd crush rule create-replicated") {
string name, root, type, device_class;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
- cmd_getval(g_ceph_context, cmdmap, "root", root);
- cmd_getval(g_ceph_context, cmdmap, "type", type);
- cmd_getval(g_ceph_context, cmdmap, "class", device_class);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "root", root);
+ cmd_getval_throws(g_ceph_context, cmdmap, "type", type);
+ cmd_getval_throws(g_ceph_context, cmdmap, "class", device_class);
if (!device_class.empty()) {
if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
} else if (prefix == "osd erasure-code-profile rm") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
goto wait;
} else if (prefix == "osd erasure-code-profile set") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
vector<string> profile;
- cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+ cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
bool force;
if (profile.size() > 0 && profile.back() == "--force") {
profile.pop_back();
if (err)
goto reply;
string name, poolstr;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
string profile;
- cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+ cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
if (profile == "")
profile = "default";
if (profile == "default") {
} else if (prefix == "osd crush rule rm") {
string name;
- cmd_getval(g_ceph_context, cmdmap, "name", name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
if (!osdmap.crush->rule_exists(name)) {
ss << "rule " << name << " does not exist";
} else if (prefix == "osd crush rule rename") {
string srcname;
string dstname;
- cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
- cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname);
if (srcname.empty() || dstname.empty()) {
ss << "must specify both source rule name and destination rule name";
err = -EINVAL;
} else if (prefix == "osd setmaxosd") {
int64_t newmax;
- if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "newmax", newmax)) {
ss << "unable to parse 'newmax' value '"
<< cmd_vartype_stringify(cmdmap["newmax"]) << "'";
err = -EINVAL;
goto reply;
}
double n;
- if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "ratio", n)) {
ss << "unable to parse 'ratio' value '"
<< cmd_vartype_stringify(cmdmap["ratio"]) << "'";
err = -EINVAL;
goto reply;
}
string v;
- cmd_getval(g_ceph_context, cmdmap, "version", v);
+ cmd_getval_throws(g_ceph_context, cmdmap, "version", v);
int vno = ceph_release_from_name(v.c_str());
if (vno <= 0) {
ss << "version " << v << " is not recognized";
goto reply;
}
string sure;
- cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
if (sure != "--yes-i-really-mean-it") {
FeatureMap m;
mon->get_combined_feature_map(&m);
} else if (prefix == "osd set") {
string sure;
- cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
string key;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
+ cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
if (key == "full")
return prepare_set_flag(op, CEPH_OSDMAP_FULL);
else if (key == "pause")
err = -EPERM;
goto reply;
}
+ } else if (key == "pglog_hardlimit") {
+ if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply;
+ }
+ // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+ // we are reusing a jewel feature bit that was retired in luminous.
+ if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+ (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
+ || sure == "--yes-i-really-mean-it")) {
+ return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
+ } else {
+ ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
+ err = -EPERM;
+ goto reply;
+ }
} else if (key == "require_jewel_osds") {
if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
ss << "Not advisable to continue since no OSDs are up. Pass "
} else if (prefix == "osd unset") {
string key;
- cmd_getval(g_ceph_context, cmdmap, "key", key);
+ cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
if (key == "full")
return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
else if (key == "pause")
} else if (prefix == "osd require-osd-release") {
string release;
- cmd_getval(g_ceph_context, cmdmap, "release", release);
+ cmd_getval_throws(g_ceph_context, cmdmap, "release", release);
string sure;
- cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
ss << "the sortbitwise flag must be set first";
err = -EPERM;
bool verbose = true;
vector<string> idvec;
- cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
for (unsigned j = 0; j < idvec.size() && !stop; j++) {
set<int> osds;
bool stop = false;
vector<string> idvec;
- cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
for (unsigned j = 0; j < idvec.size() && !stop; j++) {
set<int> osds;
bool stop = false;
vector<string> idvec;
- cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+ cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
for (unsigned j = 0; j < idvec.size() && !stop; j++) {
}
} else if (prefix == "osd pg-temp") {
string pgidstr;
- if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
ss << "unable to parse 'pgid' value '"
<< cmd_vartype_stringify(cmdmap["pgid"]) << "'";
err = -EINVAL;
vector<int64_t> id_vec;
vector<int32_t> new_pg_temp;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
ss << "unable to parse 'id' value(s) '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
err = -EINVAL;
goto update;
} else if (prefix == "osd primary-temp") {
string pgidstr;
- if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
ss << "unable to parse 'pgid' value '"
<< cmd_vartype_stringify(cmdmap["pgid"]) << "'";
err = -EINVAL;
}
int64_t osd;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
ss << "unable to parse 'id' value '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
err = -EINVAL;
if (err < 0)
goto reply;
string pgidstr;
- if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
ss << "unable to parse 'pgid' value '"
<< cmd_vartype_stringify(cmdmap["pgid"]) << "'";
err = -EINVAL;
case OP_PG_UPMAP:
{
vector<int64_t> id_vec;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
ss << "unable to parse 'id' value(s) '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
err = -EINVAL;
case OP_PG_UPMAP_ITEMS:
{
vector<int64_t> id_vec;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
ss << "unable to parse 'id' value(s) '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
err = -EINVAL;
goto update;
} else if (prefix == "osd primary-affinity") {
int64_t id;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
ss << "invalid osd id value '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
err = -EINVAL;
goto reply;
}
double w;
- if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
ss << "unable to parse 'weight' value '"
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
err = -EINVAL;
}
} else if (prefix == "osd reweight") {
int64_t id;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
ss << "unable to parse osd id value '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
err = -EINVAL;
goto reply;
}
double w;
- if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
ss << "unable to parse weight value '"
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
err = -EINVAL;
return true;
} else if (prefix == "osd lost") {
int64_t id;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
ss << "unable to parse osd id value '"
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
err = -EINVAL;
goto reply;
}
string sure;
- if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
ss << "are you SURE? this might mean real, permanent data loss. pass "
"--yes-i-really-mean-it if you really do.";
err = -EPERM;
}
int64_t id;
- if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
ss << "unable to parse osd id value '"
<< cmd_vartype_stringify(cmdmap["id"]) << "";
err = -EINVAL;
}
string sure;
- if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) ||
sure != "--yes-i-really-mean-it") {
ss << "Are you SURE? This will mean real, permanent data loss, as well "
<< "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
// optional id provided?
int64_t id = -1, cmd_id = -1;
- if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "id", cmd_id)) {
if (cmd_id < 0) {
ss << "invalid osd id value '" << cmd_id << "'";
err = -EINVAL;
uuid_d uuid;
string uuidstr;
- if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
+ if (cmd_getval_throws(g_ceph_context, cmdmap, "uuid", uuidstr)) {
if (!uuid.parse(uuidstr.c_str())) {
ss << "invalid uuid value '" << uuidstr << "'";
err = -EINVAL;
return true;
} else if (prefix == "osd blacklist") {
string addrstr;
- cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "addr", addrstr);
entity_addr_t addr;
if (!addr.parse(addrstr.c_str(), 0)) {
ss << "unable to parse address " << addrstr;
}
else {
string blacklistop;
- cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
+ cmd_getval_throws(g_ceph_context, cmdmap, "blacklistop", blacklistop);
if (blacklistop == "add") {
utime_t expires = ceph_clock_now();
double d;
// default one hour
- cmd_getval(g_ceph_context, cmdmap, "expire", d,
+ cmd_getval_throws(g_ceph_context, cmdmap, "expire", d,
g_conf->mon_osd_blacklist_default_expire);
expires += d;
}
} else if (prefix == "osd pool mksnap") {
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
if (pool < 0) {
ss << "unrecognized pool '" << poolstr << "'";
goto reply;
}
string snapname;
- cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "snap", snapname);
const pg_pool_t *p = osdmap.get_pg_pool(pool);
if (p->is_unmanaged_snaps_mode()) {
ss << "pool " << poolstr << " is in unmanaged snaps mode";
return true;
} else if (prefix == "osd pool rmsnap") {
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
if (pool < 0) {
ss << "unrecognized pool '" << poolstr << "'";
goto reply;
}
string snapname;
- cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
+ cmd_getval_throws(g_ceph_context, cmdmap, "snap", snapname);
const pg_pool_t *p = osdmap.get_pg_pool(pool);
if (p->is_unmanaged_snaps_mode()) {
ss << "pool " << poolstr << " is in unmanaged snaps mode";
} else if (prefix == "osd pool create") {
int64_t pg_num;
int64_t pgp_num;
- cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
- cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
+ cmd_getval_throws(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
string pool_type_str;
- cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool_type", pool_type_str);
if (pool_type_str.empty())
pool_type_str = g_conf->osd_pool_default_type;
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id >= 0) {
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
bool implicit_rule_creation = false;
int64_t expected_num_objects = 0;
string rule_name;
- cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
+ cmd_getval_throws(g_ceph_context, cmdmap, "rule", rule_name);
string erasure_code_profile;
- cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
+ cmd_getval_throws(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
if (pool_type == pg_pool_t::TYPE_ERASURE) {
if (erasure_code_profile == "")
rule_name = poolstr;
}
}
- cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
+ cmd_getval_throws(g_ceph_context, cmdmap, "expected_num_objects",
expected_num_objects, int64_t(0));
} else {
//NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
}
rule_name = erasure_code_profile;
} else { // cmd is well-formed
- cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
+ cmd_getval_throws(g_ceph_context, cmdmap, "expected_num_objects",
expected_num_objects, int64_t(0));
}
}
}
int64_t fast_read_param;
- cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
+ cmd_getval_throws(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
FastReadType fast_read = FAST_READ_DEFAULT;
if (fast_read_param == 0)
fast_read = FAST_READ_OFF;
prefix == "osd pool rm") {
// osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
string poolstr, poolstr2, sure;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
- cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
- cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool2", poolstr2);
+ cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
if (pool < 0) {
ss << "pool '" << poolstr << "' does not exist";
goto update;
} else if (prefix == "osd pool rename") {
string srcpoolstr, destpoolstr;
- cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
- cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "destpool", destpoolstr);
int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
if (err)
goto reply;
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
goto reply;
}
string tierpoolstr;
- cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
if (tierpool_id < 0) {
ss << "unrecognized pool '" << tierpoolstr << "'";
// make sure new tier is empty
string force_nonempty;
- cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
+ cmd_getval_throws(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
if (pstats && pstats->stats.sum.num_objects != 0 &&
force_nonempty != "--force-nonempty") {
} else if (prefix == "osd tier remove" ||
prefix == "osd tier rm") {
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
goto reply;
}
string tierpoolstr;
- cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
if (tierpool_id < 0) {
ss << "unrecognized pool '" << tierpoolstr << "'";
if (err)
goto reply;
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
goto reply;
}
string overlaypoolstr;
- cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
if (overlaypool_id < 0) {
ss << "unrecognized pool '" << overlaypoolstr << "'";
} else if (prefix == "osd tier remove-overlay" ||
prefix == "osd tier rm-overlay") {
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
if (err)
goto reply;
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
goto reply;
}
string modestr;
- cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "mode", modestr);
pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
if (mode < 0) {
ss << "'" << modestr << "' is not a valid cache mode";
}
string sure;
- cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
mode != pg_pool_t::CACHEMODE_NONE &&
mode != pg_pool_t::CACHEMODE_PROXY &&
if (err)
goto reply;
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
goto reply;
}
string tierpoolstr;
- cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
if (tierpool_id < 0) {
ss << "unrecognized pool '" << tierpoolstr << "'";
}
int64_t size = 0;
- if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
+ if (!cmd_getval_throws(g_ceph_context, cmdmap, "size", size)) {
ss << "unable to parse 'size' value '"
<< cmd_vartype_stringify(cmdmap["size"]) << "'";
err = -EINVAL;
return true;
} else if (prefix == "osd pool set-quota") {
string poolstr;
- cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
if (pool_id < 0) {
ss << "unrecognized pool '" << poolstr << "'";
}
string field;
- cmd_getval(g_ceph_context, cmdmap, "field", field);
+ cmd_getval_throws(g_ceph_context, cmdmap, "field", field);
if (field != "max_objects" && field != "max_bytes") {
ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
err = -EINVAL;
// val could contain unit designations, so we treat as a string
string val;
- cmd_getval(g_ceph_context, cmdmap, "val", val);
+ cmd_getval_throws(g_ceph_context, cmdmap, "val", val);
string tss;
int64_t value;
if (field == "max_objects") {
} else if (prefix == "osd force-create-pg") {
pg_t pgid;
string pgidstr;
- cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
+ cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr);
if (!pgid.parse(pgidstr.c_str())) {
ss << "invalid pgid '" << pgidstr << "'";
err = -EINVAL;
return 0;
}
+// returns true if specified device is attached
+// i.e. 'id' is a valid bdev slot index and a BlockDevice instance
+// is actually present in that slot
+bool BlueFS::is_device(unsigned id)
+{
+  // out-of-range ids and empty (nullptr) slots are both "not attached"
+  return !(id >= MAX_BDEV || bdev[id] == nullptr);
+}
+
+// returns true if specified device is under full bluefs control
+// and hence can be expanded
+bool BlueFS::is_device_expandable(unsigned id)
+{
+  // not attached -> cannot be expanded
+  if (id >= MAX_BDEV || bdev[id] == nullptr) {
+    return false;
+  }
+  switch(id) {
+  case BDEV_WAL:
+    // WAL device, when present, is always dedicated to bluefs
+    return true;
+
+  case BDEV_DB:
+    // true if DB volume is non-shared
+    // (a separate SLOW device implies DB is a dedicated bluefs volume;
+    // with no SLOW device the DB slot is the shared main device)
+    return bdev[BDEV_SLOW] != nullptr;
+  }
+  // any other slot (e.g. the shared/slow main device) is not expandable
+  return false;
+}
+
int BlueFS::mkfs(uuid_d osd_uuid)
{
std::unique_lock<std::mutex> l(lock);
/// get current extents that we own for given block device
int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
+ // returns true if specified device is attached
+ bool is_device(unsigned id);
+
+ // returns true if specified device is under full bluefs control
+ // and hence can be expanded
+ bool is_device_expandable(unsigned id);
+
int open_for_write(
const string& dir,
const string& file,
cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}
-void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
+void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
{
- std::lock_guard<std::recursive_mutex> l(cache->lock);
-
auto i = writing.begin();
while (i != writing.end()) {
if (i->seq > seq) {
}
}
+// Complete pending writes up to 'seq' on this shared blob's buffer space.
+// Takes the owning collection's cache lock itself (callers must NOT hold it)
+// and guards against the collection's cache pointer being swapped between
+// reading it and acquiring the lock, retrying until the two agree.
+void BlueStore::SharedBlob::finish_write(uint64_t seq)
+{
+  while (true) {
+    // snapshot the cache pointer, then lock that cache
+    Cache *cache = coll->cache;
+    std::lock_guard<std::recursive_mutex> l(cache->lock);
+    // re-check under the lock: if the sb moved to another cache in the
+    // meantime, drop the (now wrong) lock and retry with the new cache
+    if (coll->cache != cache) {
+      ldout(coll->store->cct, 20) << __func__
+				  << " raced with sb cache update, was " << cache
+				  << ", now " << coll->cache << ", retrying"
+				  << dendl;
+      continue;
+    }
+    // safe: 'cache' is current and its lock is held
+    bc._finish_write(cache, seq);
+    break;
+  }
+}
+
// SharedBlobSet
#undef dout_prefix
uint64_t target = store->osd_memory_target;
uint64_t base = store->osd_memory_base;
double fragmentation = store->osd_memory_expected_fragmentation;
- uint64_t cache_max = ((1.0 - fragmentation) * target) - base;
uint64_t cache_min = store->osd_memory_cache_min;
+ uint64_t cache_max = cache_min;
+ uint64_t limited_target = (1.0 - fragmentation) * target;
+ if (limited_target > base + cache_min) {
+ cache_max = limited_target - base;
+ }
size_t heap_size = 0;
size_t unmapped = 0;
"collection");
b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
"Read EIO errors propagated to high level callers");
+ b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
+ "Read operations that required at least one retry due to failed checksum validation");
logger = b.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
return 0;
}
+// Dump the allocator state after a bluefs rebalance (gift allocation)
+// failure, rate-limited to at most once per
+// bluestore_bluefs_balance_failure_dump_interval seconds.
+// An interval <= 0 disables dumping entirely.
+void BlueStore::_dump_alloc_on_rebalance_failure()
+{
+  auto dump_interval =
+    cct->_conf->bluestore_bluefs_balance_failure_dump_interval;
+  // only dump when enabled and the previous dump's cool-down has expired
+  if (dump_interval > 0 &&
+    next_dump_on_bluefs_balance_failure <= ceph_clock_now()) {
+    alloc->dump();
+    // schedule the earliest time the next dump is permitted
+    next_dump_on_bluefs_balance_failure = ceph_clock_now();
+    next_dump_on_bluefs_balance_failure += dump_interval;
+  }
+}
+
int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
{
int ret = 0;
0, 0, &exts);
if (alloc_len <= 0) {
- dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
+ dout(0) << __func__ << " no allocate on 0x" << std::hex << gift
<< " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
alloc->unreserve(gift);
- alloc->dump();
+ _dump_alloc_on_rebalance_failure();
return 0;
} else if (alloc_len < (int64_t)gift) {
- dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
+ dout(0) << __func__ << " insufficient allocate on 0x" << std::hex << gift
<< " min_alloc_size 0x" << min_alloc_size
<< " allocated 0x" << alloc_len
<< std::dec << dendl;
alloc->unreserve(gift - alloc_len);
- alloc->dump();
+ _dump_alloc_on_rebalance_failure();
}
for (auto& p : exts) {
bluestore_pextent_t e = bluestore_pextent_t(p);
uint64_t offset,
size_t length,
bufferlist& bl,
- uint32_t op_flags)
+ uint32_t op_flags,
+ uint64_t retry_count)
{
FUNCTRACE();
int r = 0;
bufferlist& compressed_bl = *p++;
if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
b2r_it->second.front().logical_offset) < 0) {
- return -EIO;
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
bufferlist raw_bl;
r = _decompress(compressed_bl, &raw_bl);
for (auto& reg : b2r_it->second) {
if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
reg.logical_offset) < 0) {
- return -EIO;
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
if (buffered) {
bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
assert(pos == length);
assert(pr == pr_end);
r = bl.length();
+ if (retry_count) {
+ logger->inc(l_bluestore_reads_with_retries);
+ dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
+ << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
+ }
return r;
}
uint64_t bad_csum;
utime_t start = ceph_clock_now();
int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
+ if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
+ (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
+ derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
+ bad = blob_xoffset;
+ r = -1;
+ bad_csum = 0xDEADBEEF;
+ }
if (r < 0) {
if (r == -1) {
PExtentVector pex;
assert(txc->state == TransContext::STATE_FINISHING);
for (auto& sb : txc->shared_blobs_written) {
- sb->bc.finish_write(sb->get_cache(), txc->seq);
+ sb->finish_write(txc->seq);
}
txc->shared_blobs_written.clear();
dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
<< " target_blob_size 0x" << std::hex << wctx->target_blob_size
+ << " compress=" << (int)wctx->compress
+ << " buffered=" << (int)wctx->buffered
<< std::dec << dendl;
}
c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
r = 0;
+ // hold a ref to new Onode in old name position, to ensure we don't drop
+ // it from the cache before this txc commits (or else someone may come along
+ // and read newo's metadata via the old name).
+ txc->note_modified_object(oldo);
+
out:
dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
<< new_oid << " = " << r << dendl;
l_bluestore_extent_compress,
l_bluestore_gc_merged,
l_bluestore_read_eio,
+ l_bluestore_reads_with_retries,
l_bluestore_last
};
b->cache_private = _discard(cache, offset, bl.length());
_add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
}
- void finish_write(Cache* cache, uint64_t seq);
+ void _finish_write(Cache* cache, uint64_t seq);
void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
std::lock_guard<std::recursive_mutex> l(cache->lock);
Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
void put_ref(uint64_t offset, uint32_t length,
PExtentVector *r, set<SharedBlob*> *maybe_unshared_blobs);
+ void finish_write(uint64_t seq);
+
friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
return l.get_sbid() == r.get_sbid();
}
unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
bool bluefs_single_shared_device = true;
utime_t bluefs_last_balance;
+ utime_t next_dump_on_bluefs_balance_failure;
KeyValueDB *db = nullptr;
BlockDevice *bdev = nullptr;
void _open_statfs();
+ void _dump_alloc_on_rebalance_failure();
int _reconcile_bluefs_freespace();
int _balance_bluefs_freespace(PExtentVector *extents);
void _commit_bluefs_freespace(const PExtentVector& extents);
uint64_t offset,
size_t len,
bufferlist& bl,
- uint32_t op_flags = 0);
+ uint32_t op_flags = 0,
+ uint64_t retry_count = 0);
private:
int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
}
}
+// Find, among the candidate device paths in 'devs', the one whose bdev
+// label description matches the given bluefs device id:
+//   BDEV_SLOW -> "main", BDEV_DB -> "bluefs db", BDEV_WAL -> "bluefs wal".
+// Returns a pointer into the matching string in 'devs' (valid only while
+// 'devs' is alive and unmodified), or nullptr when no device matches.
+// NOTE: exits the process (EXIT_FAILURE) if any label cannot be read.
+const char* find_device_path(
+  int id,
+  CephContext *cct,
+  const vector<string>& devs)
+{
+  for (auto& i : devs) {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct, i, &label);
+    if (r < 0) {
+      // unreadable label is fatal for this tool-side helper
+      cerr << "unable to read label for " << i << ": "
+	    << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if ((id == BlueFS::BDEV_SLOW && label.description == "main") ||
+        (id == BlueFS::BDEV_DB && label.description == "bluefs db") ||
+        (id == BlueFS::BDEV_WAL && label.description == "bluefs wal")) {
+      return i.c_str();
+    }
+  }
+  return nullptr;
+}
+
BlueFS *open_bluefs(
CephContext *cct,
const string& path,
env_to_vec(args);
auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
- CODE_ENVIRONMENT_UTILITY, 0);
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
common_init_finish(cct.get());
if (action == "fsck" ||
<< cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
}
- label.meta[key] = value;
+ if (key == "size") {
+ label.size = strtoull(value.c_str(), nullptr, 10);
+ } else if (key =="osd_uuid") {
+ label.osd_uuid.parse(value.c_str());
+ } else if (key =="btime") {
+ uint64_t epoch;
+ uint64_t nsec;
+ int r = utime_t::parse_date(value.c_str(), &epoch, &nsec);
+ if (r == 0) {
+ label.btime = utime_t(epoch, nsec);
+ }
+ } else if (key =="description") {
+ label.description = value;
+ } else {
+ label.meta[key] = value;
+ }
r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
if (r < 0) {
cerr << "unable to write label for " << devs.front() << ": "
}
else if (action == "bluefs-bdev-expand") {
BlueFS *fs = open_bluefs(cct.get(), path, devs);
- cout << "start:" << std::endl;
fs->dump_block_extents(cout);
+ cout << "Expanding..." << std::endl;
for (int devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
+ if (!fs->is_device(devid)) {
+ continue;
+ }
+ if (!fs->is_device_expandable(devid)) {
+ cout << devid
+ << " : can't be expanded. Bypassing..."
+ << std::endl;
+ continue;
+ }
interval_set<uint64_t> before;
fs->get_block_extents(devid, &before);
+ assert(!before.empty());
uint64_t end = before.range_end();
uint64_t size = fs->get_block_device_size(devid);
if (end < size) {
- cout << "expanding dev " << devid << " from 0x" << std::hex
+ cout << devid
+ <<" : expanding " << " from 0x" << std::hex
<< end << " to 0x" << size << std::dec << std::endl;
fs->add_block_extent(devid, end, size-end);
+ const char* path = find_device_path(devid, cct.get(), devs);
+ if (path == nullptr) {
+ cerr << devid
+ <<": can't find device path " << std::endl;
+ continue;
+ }
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), path, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << path << ": "
+ << cpp_strerror(r) << std::endl;
+ continue;
+ }
+ label.size = size;
+ r = BlueStore::_write_bdev_label(cct.get(), path, label);
+ if (r < 0) {
+ cerr << "unable to write label for " << path << ": "
+ << cpp_strerror(r) << std::endl;
+ continue;
+ }
+ cout << devid
+ <<" : size label updated to " << size
+ << std::endl;
}
}
delete fs;
out: \
complete_inject_failure(); \
return r; \
- } catch (RetryException) { \
+ } catch (RetryException&) { \
failed = true; \
} catch (...) { \
ceph_abort(); \
monc(osd->monc),
peering_wq(osd->peering_wq),
recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
- &osd->disk_tp),
+ &osd->recovery_tp),
class_handler(osd->class_handler),
pg_epoch_lock("OSDService::pg_epoch_lock"),
publish_lock("OSDService::publish_lock"),
"osd_peering_tp_threads"),
osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
get_num_op_threads()),
- disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
+ remove_tp(cct, "OSD::remove_tp", "tp_osd_remove", cct->_conf->osd_remove_threads, "osd_remove_threads"),
+ recovery_tp(cct, "OSD::recovery_tp", "tp_osd_recovery", cct->_conf->osd_recovery_threads, "osd_recovery_threads"),
command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
session_waiting_lock("OSD::session_waiting_lock"),
osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
store,
cct->_conf->osd_remove_thread_timeout,
cct->_conf->osd_remove_thread_suicide_timeout,
- &disk_tp),
+ &remove_tp),
service(this)
{
monc->set_messenger(client_messenger);
service.max_oldest_map = superblock.oldest_map;
osd_op_tp.start();
- disk_tp.start();
+ remove_tp.start();
+ recovery_tp.start();
command_tp.start();
set_disk_tp_priority();
command_tp.stop();
dout(10) << "command tp stopped" << dendl;
- disk_tp.drain();
- disk_tp.stop();
- dout(10) << "disk tp paused (new)" << dendl;
+ remove_tp.drain();
+ remove_tp.stop();
+ dout(10) << "remove tp paused (new)" << dendl;
+
+ recovery_tp.drain();
+ recovery_tp.stop();
+ dout(10) << "recovery tp paused (new)" << dendl;
dout(10) << "stopping agent" << dendl;
service.agent_stop();
#ifdef PG_DEBUG_REFS
service.dump_live_pgids();
#endif
+
+ osd_lock.Unlock();
cct->_conf->remove_observer(this);
+ osd_lock.Lock();
dout(10) << "syncing store" << dendl;
enable_disable_fuse(true);
bool OSD::heartbeat_reset(Connection *con)
{
+ Mutex::Locker l(heartbeat_lock);
HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
if (s) {
- heartbeat_lock.Lock();
if (is_stopping()) {
- heartbeat_lock.Unlock();
s->put();
return true;
}
} else {
dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
}
- heartbeat_lock.Unlock();
s->put();
}
return true;
}
}
+ check_ops_in_flight();
mgrc.update_osd_health(get_health_metrics());
service.kick_recovery_queue();
tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
cont = dstate->pause_clearing();
handle.suspend_tp_timeout();
waiter.wait();
+ if (cct->_conf->osd_delete_sleep) {
+ utime_t t;
+ t.set_from_double(cct->_conf->osd_delete_sleep);
+ lgeneric_subdout(cct, osd, 10) << __func__ << " inject delay of " << t << dendl;
+ t.sleep();
+ }
handle.reset_tp_timeout();
if (cont)
cont = dstate->resume_clearing();
"osd", "rw", "cli,rest")
};
+namespace {
+  // RAII inverse of a lock guard: UNLOCKS the given mutex on construction
+  // and re-locks it on destruction. Used to safely drop osd_lock around
+  // operations (e.g. config changes) that may re-enter code taking it,
+  // even if those operations throw or return early.
+  class unlock_guard {
+    Mutex& m;
+  public:
+    explicit unlock_guard(Mutex& mutex)
+      : m(mutex)
+    {
+      m.Unlock();
+    }
+    // non-copyable: exactly one re-lock must happen per unlock
+    unlock_guard(unlock_guard&) = delete;
+    ~unlock_guard() {
+      m.Lock();
+    }
+  };
+}
+
void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
int r = 0;
string args = argsvec.front();
for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
args += " " + *a;
- osd_lock.Unlock();
+ unlock_guard unlock{osd_lock};
r = cct->_conf->injectargs(args, &ss);
- osd_lock.Lock();
}
else if (prefix == "config set") {
std::string key;
std::string val;
cmd_getval(cct, cmdmap, "key", key);
cmd_getval(cct, cmdmap, "value", val);
- osd_lock.Unlock();
+ unlock_guard unlock{osd_lock};
r = cct->_conf->set_val(key, val, true, &ss);
if (r == 0) {
cct->_conf->apply_changes(nullptr);
}
- osd_lock.Lock();
}
else if (prefix == "cluster_log") {
vector<string> msg;
cmd_getval(cct, cmdmap, "delay", delay);
ostringstream oss;
oss << delay;
+ unlock_guard unlock{osd_lock};
r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
if (r != 0) {
ss << "kick_recovery_wq: error setting "
void OSD::handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed)
{
+ Mutex::Locker l(osd_lock);
if (changed.count("osd_max_backfills")) {
service.local_reserver.set_max(cct->_conf->osd_max_backfills);
service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
return;
int cls =
ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
- if (cls < 0)
+ if (cls < 0) {
derr << __func__ << cpp_strerror(cls) << ": "
<< "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
<< " but only the following values are allowed: idle, be or rt" << dendl;
- else
- disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+ } else {
+ remove_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+ recovery_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+ }
}
// --------------------------------
ThreadPool peering_tp;
ShardedThreadPool osd_op_tp;
- ThreadPool disk_tp;
+ ThreadPool remove_tp;
+ ThreadPool recovery_tp;
ThreadPool command_tp;
void set_disk_tp_priority();
to_check.insert(p.first);
}
for (auto& pg : to_check) {
- auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
- if (crush_rule < 0) {
- lderr(cct) << __func__ << " unable to load crush-rule of pg "
- << pg << dendl;
+ if (!tmpmap.pg_exists(pg)) {
+ ldout(cct, 0) << __func__ << " pg " << pg << " is gone" << dendl;
+ to_cancel.insert(pg);
continue;
}
+ auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
map<int, float> weight_map;
auto it = rule_weight_map.find(crush_rule);
if (it == rule_weight_map.end()) {
tmpmap.pg_to_raw_up(pg, &raw, &primary);
set<int> parents;
for (auto osd : raw) {
+ // skip non-existent/down osd for erasure-coded PGs
+ if (osd == CRUSH_ITEM_NONE)
+ continue;
if (type > 0) {
auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule);
- if (parent >= 0) {
+ if (parent < 0) {
+ auto r = parents.insert(parent);
+ if (!r.second) {
+ // two up-set osds come from same parent
+ to_cancel.insert(pg);
+ break;
+ }
+ } else {
lderr(cct) << __func__ << " unable to get parent of raw osd."
<< osd << " of pg " << pg
<< dendl;
- break;
- }
- auto r = parents.insert(parent);
- if (!r.second) {
- // two up-set osds come from same parent
- to_cancel.insert(pg);
- break;
+ // continue to do checks below
}
}
// the above check validates collision only
}
}
}
+ tmpmap.clean_pg_upmaps(cct, pending_inc);
}
int OSDMap::apply_incremental(const Incremental &inc)
s += ",recovery_deletes";
if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
s += ",purged_snapdirs";
+ if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
+ s += ",pglog_hardlimit";
if (s.length())
s.erase(0, 1);
return s;
int OSDMap::clean_pg_upmaps(
CephContext *cct,
- Incremental *pending_inc)
+ Incremental *pending_inc) const
{
ldout(cct, 10) << __func__ << dendl;
int changed = 0;
pg_to_raw_osds(p.first, &raw, &primary);
mempool::osdmap::vector<pair<int,int>> newmap;
for (auto& q : p.second) {
- if (std::find(raw.begin(), raw.end(), q.first) != raw.end()) {
- newmap.push_back(q);
+ if (std::find(raw.begin(), raw.end(), q.first) == raw.end()) {
+ // cancel mapping if source osd does not exist anymore
+ continue;
+ }
+ if (q.second != CRUSH_ITEM_NONE && q.second < max_osd &&
+ q.second >= 0 && osd_weight[q.second] == 0) {
+ // cancel mapping if target osd is out
+ continue;
}
+ newmap.push_back(q);
}
if (newmap.empty()) {
ldout(cct, 10) << " removing no-op pg_upmap_items " << p.first << " "
multimap<float,int> deviation_osd; // deviation(pgs), osd
set<int> overfull;
for (auto& i : pgs_by_osd) {
+ // make sure osd is still there (belongs to this crush-tree)
+ assert(osd_weight.count(i.first));
float target = osd_weight[i.first] * pgs_per_weight;
float deviation = (float)i.second.size() - target;
ldout(cct, 20) << " osd." << i.first
for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
int osd = p->second;
float deviation = p->first;
- // make sure osd is still there (belongs to this crush-tree)
- assert(osd_weight.count(osd));
float target = osd_weight[osd] * pgs_per_weight;
assert(target > 0);
if (deviation/target < max_deviation_ratio) {
int clean_pg_upmaps(
CephContext *cct,
- Incremental *pending_inc);
+ Incremental *pending_inc) const;
bool try_pg_upmap(
CephContext *cct,
int get_osds_by_bucket_name(const string &name, set<int> *osds) const;
+ bool have_pg_upmaps(pg_t pg) const {
+ return pg_upmap.count(pg) ||
+ pg_upmap_items.count(pg);
+ }
+
/*
* handy helpers to build simple maps...
*/
t.omap_setkeys(coll, pgmeta_oid, km);
}
-void PG::trim_log()
-{
- assert(is_primary());
- calc_trim_to();
- dout(10) << __func__ << " to " << pg_trim_to << dendl;
- if (pg_trim_to != eversion_t()) {
- // inform peers to trim log
- assert(!actingbackfill.empty());
- for (set<pg_shard_t>::iterator i = actingbackfill.begin();
- i != actingbackfill.end();
- ++i) {
- if (*i == pg_whoami) continue;
- osd->send_message_osd_cluster(
- i->osd,
- new MOSDPGTrim(
- get_osdmap()->get_epoch(),
- spg_t(info.pgid.pgid, i->shard),
- pg_trim_to),
- get_osdmap()->get_epoch());
- }
-
- // trim primary as well
- pg_log.trim(pg_trim_to, info);
- dirty_info = true;
- }
-}
-
void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
{
// raise last_complete only if we were previously up to date
roll_forward_to));
}
- pg_log.trim(trim_to, info);
+ dout(10) << __func__ << " approx pg log length = "
+ << pg_log.get_log().approx_size() << dendl;
+ dout(10) << __func__ << " transaction_applied = "
+ << transaction_applied << dendl;
+ if (!transaction_applied)
+ dout(10) << __func__ << " " << pg_whoami
+ << " is backfill target" << dendl;
+ pg_log.trim(trim_to, info, transaction_applied);
// update the local pg, pg log
dirty_info = true;
return;
utime_t reg_stamp;
- if (scrubber.must_scrub ||
- (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
+ bool must = false;
+ if (scrubber.must_scrub) {
+ // Set the smallest time that isn't utime_t()
+ reg_stamp = utime_t(0,1);
+ must = true;
+ } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
reg_stamp = ceph_clock_now();
+ must = true;
} else {
reg_stamp = info.history.last_scrub_stamp;
}
reg_stamp,
scrub_min_interval,
scrub_max_interval,
- scrubber.must_scrub);
+ must);
}
void PG::unreg_next_scrub()
bool done;
t.register_on_applied_sync(
new C_SafeCond(&my_lock, &my_cond, &done, &r));
- r = osd->store->apply_transaction(osr.get(), std::move(t));
+ r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
if (r != 0) {
derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
<< dendl;
pg->publish_stats_to_osd();
}
- // trim pglog on recovered
- pg->trim_log();
-
// adjust acting set? (e.g. because backfill completed...)
bool history_les_bound = false;
if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
virtual void calc_trim_to() = 0;
+ virtual void calc_trim_to_aggressive() = 0;
+
void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
pg_missing_t& omissing, pg_shard_t from);
void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
}
+ bool hard_limit_pglog() const {
+ return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
+ }
+
void init_primary_up_acting(
const vector<int> &newup,
const vector<int> &newacting,
ObjectStore::Transaction &t,
bool transaction_applied = true);
bool check_log_for_corruption(ObjectStore *store);
- void trim_log();
std::string get_corrupt_pg_log_name() const;
static int read_info(
set<string>* trimmed_dups,
eversion_t *write_from_dups)
{
- if (complete_to != log.end() &&
- complete_to->version <= s) {
- generic_dout(0) << " bad trim to " << s << " when complete_to is "
- << complete_to->version
- << " on " << *this << dendl;
- }
-
assert(s <= can_rollback_to);
+ if (complete_to != log.end())
+ lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl;
auto earliest_dup_version =
log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
const pg_log_entry_t &e = *log.begin();
if (e.version > s)
break;
- generic_dout(20) << "trim " << e << dendl;
+ lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl;
if (trimmed)
trimmed->insert(e.version);
unindex(e); // remove from index,
// add to dup list
- generic_dout(20) << "earliest_dup_version = " << earliest_dup_version << dendl;
+ lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl;
if (e.version.version >= earliest_dup_version) {
if (write_from_dups != nullptr && *write_from_dups > e.version) {
- generic_dout(20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
+ lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
*write_from_dups = e.version;
}
dups.push_back(pg_log_dup_t(e));
}
}
+ bool reset_complete_to = false;
+ // we are trimming past complete_to, so reset complete_to
+ if (complete_to != log.end() && e.version >= complete_to->version)
+ reset_complete_to = true;
if (rollback_info_trimmed_to_riter == log.rend() ||
e.version == rollback_info_trimmed_to_riter->version) {
log.pop_front();
} else {
log.pop_front();
}
+
+ // reset complete_to to the beginning of the log
+ if (reset_complete_to) {
+ lgeneric_subdout(cct, osd, 20) << " moving complete_to " << " to "
+ << log.begin()->version << dendl;
+ complete_to = log.begin();
+ }
}
while (!dups.empty()) {
const auto& e = *dups.begin();
if (e.version.version >= earliest_dup_version)
break;
- generic_dout(20) << "trim dup " << e << dendl;
+ lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl;
if (trimmed_dups)
trimmed_dups->insert(e.get_key_name());
if (indexed_data & PGLOG_INDEXED_DUPS) {
void PGLog::trim(
eversion_t trim_to,
- pg_info_t &info)
+ pg_info_t &info,
+ bool transaction_applied)
{
+ dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl;
// trim?
if (trim_to > log.tail) {
- // We shouldn't be trimming the log past last_complete
- assert(trim_to <= info.last_complete);
+ dout(10) << __func__ << " missing = " << missing.num_missing() << dendl;
+ // Don't assert for backfill_targets
+ // or whenever there are missing items
+ if (transaction_applied && (missing.num_missing() == 0))
+ assert(trim_to <= info.last_complete);
dout(10) << "trim " << log << " to " << trim_to << dendl;
log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
info.log_tail = log.tail;
+ if (log.complete_to != log.log.end())
+ dout(10) << " after trim complete_to " << log.complete_to->version << dendl;
}
}
void trim(
eversion_t trim_to,
- pg_info_t &info);
+ pg_info_t &info,
+ bool transaction_applied = true);
void roll_forward_to(
eversion_t roll_forward_to,
size_t target = cct->_conf->osd_min_pg_log_entries;
if (is_degraded() ||
state_test(PG_STATE_RECOVERING |
- PG_STATE_RECOVERY_WAIT |
- PG_STATE_BACKFILLING |
- PG_STATE_BACKFILL_WAIT |
- PG_STATE_BACKFILL_TOOFULL)) {
+ PG_STATE_RECOVERY_WAIT |
+ PG_STATE_BACKFILLING |
+ PG_STATE_BACKFILL_WAIT |
+ PG_STATE_BACKFILL_TOOFULL)) {
target = cct->_conf->osd_max_pg_log_entries;
}
- eversion_t limit = MIN(
+ eversion_t limit = std::min(
min_last_complete_ondisk,
pg_log.get_can_rollback_to());
if (limit != eversion_t() &&
limit != pg_trim_to &&
pg_log.get_log().approx_size() > target) {
- size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
- cct->_conf->osd_pg_log_trim_max);
+ size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
+ cct->_conf->osd_pg_log_trim_max);
if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
- cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+ cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
return;
}
list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
new_trim_to = it->version;
++it;
if (new_trim_to > limit) {
- new_trim_to = limit;
- dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
- break;
+ new_trim_to = limit;
+ dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
+ break;
}
}
dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
}
}
+// Hard-limit variant of calc_trim_to(): picks a trim point that enforces the
+// configured pg log length cap, allowed to trim past last_complete but never
+// past min(log head, can_rollback_to).
+void PrimaryLogPG::calc_trim_to_aggressive()
+{
+  size_t target = cct->_conf->osd_min_pg_log_entries;
+  if (is_degraded() ||
+      state_test(PG_STATE_RECOVERING |
+                 PG_STATE_RECOVERY_WAIT |
+                 PG_STATE_BACKFILLING |
+                 PG_STATE_BACKFILL_WAIT |
+                 PG_STATE_BACKFILL_TOOFULL)) {
+    // keep a longer log while peers may still need entries for recovery
+    target = cct->_conf->osd_max_pg_log_entries;
+  }
+  // limit pg log trimming up to the can_rollback_to value
+  eversion_t limit = std::min(
+    pg_log.get_head(),
+    pg_log.get_can_rollback_to());
+  dout(10) << __func__ << " limit = " << limit << dendl;
+
+  if (limit != eversion_t() &&
+      limit != pg_trim_to &&
+      pg_log.get_log().approx_size() > target) {
+    dout(10) << __func__ << " approx pg log length =  "
+             << pg_log.get_log().approx_size() << dendl;
+    // trim at most osd_pg_log_trim_max entries per pass
+    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
+                                  cct->_conf->osd_pg_log_trim_max);
+    dout(10) << __func__ << " num_to_trim =  " << num_to_trim << dendl;
+    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
+        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+      // below the minimum batch size; defer trimming to a later pass
+      return;
+    }
+    auto it = pg_log.get_log().log.begin(); // oldest log entry
+    auto rit = pg_log.get_log().log.rbegin();
+    eversion_t by_n_to_keep; // start from tail
+    eversion_t by_n_to_trim = eversion_t::max(); // start from head
+    // walk from both ends at once: by_n_to_keep is the version of the
+    // target-th newest entry, by_n_to_trim the num_to_trim-th oldest
+    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
+      i++;
+      if (i > target && by_n_to_keep == eversion_t()) {
+        by_n_to_keep = rit->version;
+      }
+      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
+        by_n_to_trim = it->version;
+      }
+      if (by_n_to_keep != eversion_t() &&
+          by_n_to_trim != eversion_t::max()) {
+        break;
+      }
+    }
+
+    if (by_n_to_keep == eversion_t()) {
+      // log shorter than target; nothing to trim
+      return;
+    }
+
+    // take the most conservative of the three candidate trim points
+    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
+    dout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
+    assert(pg_trim_to <= pg_log.get_head());
+  }
+}
+
PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
const PGPool &_pool, spg_t p) :
PG(o, curmap, _pool, p),
assert(op->may_write() || op->may_cache());
// trim log?
- calc_trim_to();
+ if (hard_limit_pglog())
+ calc_trim_to_aggressive();
+ else
+ calc_trim_to();
// verify that we are doing this in order?
if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
op.alloc_hint.expected_write_size,
op.alloc_hint.flags);
- ctx->delta_stats.num_wr++;
result = 0;
}
break;
dout(20) << __func__ << " " << repop << dendl;
issue_repop(repop, ctx.get());
eval_repop(repop);
- calc_trim_to();
+ if (hard_limit_pglog())
+ calc_trim_to_aggressive();
+ else
+ calc_trim_to();
repop->put();
}
assert(r == 0);
});
- calc_trim_to();
+ if (hard_limit_pglog())
+ calc_trim_to_aggressive();
+ else
+ calc_trim_to();
}
void PrimaryLogPG::cancel_log_updates()
void apply_and_flush_repops(bool requeue);
void calc_trim_to() override;
+ void calc_trim_to_aggressive() override;
int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr);
int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);
unsigned get_split_bits(unsigned pg_num) const;
bool contains(int bits, const ghobject_t& oid) {
- return oid.match(bits, ps());
+ return
+ (int64_t)m_pool == oid.hobj.get_logical_pool() &&
+ oid.match(bits, ps());
}
bool contains(int bits, const hobject_t& oid) {
- return oid.match(bits, ps());
+ return
+ (int64_t)m_pool == oid.get_logical_pool() &&
+ oid.match(bits, ps());
}
hobject_t get_hobj_start() const;
return ghobject_t(
hobject_t(object_t(name), "", CEPH_NOSNAP,
pgid.ps(),
- hobject_t::POOL_TEMP_START - pgid.pool(), ""),
+ hobject_t::get_temp_pool(pgid.pool()),
+ ""),
ghobject_t::NO_GEN,
shard);
}
// adjust safe_pos
auto it = pending_safe.find(start);
assert(it != pending_safe.end());
+ uint64_t min_next_safe_pos = pending_safe.begin()->second;
pending_safe.erase(it);
if (pending_safe.empty())
safe_pos = next_safe_pos;
else
- safe_pos = pending_safe.begin()->second;
+ safe_pos = min_next_safe_pos;
ldout(cct, 10) << "_finish_flush safe from " << start
<< ", pending_safe " << pending_safe
// prefetch?
_prefetch();
+
+ // If bufferlist consists of discontiguous memory, decoding types whose
+ // denc_traits needs contiguous memory is inefficient. The bufferlist may
+ // get copied to temporary memory multiple times (copy_shallow() in
+ // src/include/denc.h actually does deep copy)
+ if (bl.get_num_buffers() > 1)
+ bl.rebuild();
return true;
}
#include "include/assert.h"
#define MAX_FLUSH_UNDER_LOCK 20 ///< max bh's we start writeback on
-#define BUFFER_MEMORY_WEIGHT 12 // memory usage of BufferHead, count in (1<<n)
+#define BUFFER_MEMORY_WEIGHT CEPH_PAGE_SHIFT // memory usage of BufferHead, count in (1<<n)
using std::chrono::seconds;
/// while holding the lock
++p;
if (p != data.end() && can_merge_bh(bh, p->second))
merge_left(bh, p->second);
+
+ maybe_rebuild_buffer(bh);
+}
+
+// Rebuild bh's bufferlist into a single contiguous allocation when it is
+// fragmented (more than one buffer) and the wasted space is significant:
+// more than half the payload AND larger than one BufferHead memory unit.
+void ObjectCacher::Object::maybe_rebuild_buffer(BufferHead *bh)
+{
+  auto& bl = bh->bl;
+  if (bl.get_num_buffers() <= 1)
+    return; // already contiguous; nothing to do
+
+  auto wasted = bl.get_wasted_space();
+  if (wasted * 2 > bl.length() &&
+      wasted > (1U << BUFFER_MEMORY_WEIGHT))
+    bl.rebuild();
}
/*
if (cur + max >= bh->end()) {
// we want right bit (one splice)
final = split(bh, cur); // just split it, take right half.
+ maybe_rebuild_buffer(bh);
replace_journal_tid(final, tid);
++p;
assert(p->second == final);
} else {
// we want middle bit (two splices)
final = split(bh, cur);
+ maybe_rebuild_buffer(bh);
++p;
assert(p->second == final);
- split(final, cur+max);
+ auto right = split(final, cur+max);
+ maybe_rebuild_buffer(right);
replace_journal_tid(final, tid);
}
} else {
// whole bufferhead, piece of cake.
} else {
// we want left bit (one splice)
- split(bh, cur + max); // just split
+ auto right = split(bh, cur + max); // just split
+ maybe_rebuild_buffer(right);
}
if (final) {
oc->mark_dirty(bh);
// split bh at truncation point?
if (bh->start() < s) {
split(bh, s);
+ maybe_rebuild_buffer(bh);
continue;
}
// split bh at truncation point?
if (bh->start() < off) {
split(bh, off);
+ maybe_rebuild_buffer(bh);
++p;
continue;
}
assert(bh->start() >= off);
if (bh->end() > off + len) {
- split(bh, off + len);
+ auto right = split(bh, off + len);
+ maybe_rebuild_buffer(right);
}
++p;
void merge_left(BufferHead *left, BufferHead *right);
bool can_merge_bh(BufferHead *left, BufferHead *right);
void try_merge_bh(BufferHead *bh);
+ void maybe_rebuild_buffer(BufferHead *bh);
bool is_cached(loff_t off, loff_t len) const;
bool include_all_cached_data(loff_t off, loff_t len);
initialized = false;
+ wl.unlock();
cct->_conf->remove_observer(this);
+ wl.lock();
map<int,OSDSession*>::iterator p;
while (!osd_sessions.empty()) {
}
bool unpaused = false;
- if (t->paused && !target_should_be_paused(t)) {
- t->paused = false;
+ bool should_be_paused = target_should_be_paused(t);
+ if (t->paused && !should_be_paused) {
unpaused = true;
}
+ t->paused = should_be_paused;
bool legacy_change =
t->pgid != pgid ||
op->tid = 0;
m->get_redirect().combine_with_locator(op->target.target_oloc,
op->target.target_oid.name);
- op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED | CEPH_OSD_FLAG_IGNORE_OVERLAY);
+ op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED |
+ CEPH_OSD_FLAG_IGNORE_CACHE |
+ CEPH_OSD_FLAG_IGNORE_OVERLAY);
_op_submit(op, sul, NULL);
m->put();
return;
else:
return pool_id
- def create_group(self, group_id):
+ def create_group(self, group_id, mode=0o755):
# Prevent craftily-named volume groups from colliding with the meta
# files.
if group_id.endswith(META_FILE_EXT):
raise ValueError("group ID cannot end with '{0}'.".format(
META_FILE_EXT))
path = self._get_group_path(group_id)
- self._mkdir_p(path)
+ self._mkdir_p(path, mode)
def destroy_group(self, group_id):
path = self._get_group_path(group_id)
else:
self.fs.rmdir(path)
- def _mkdir_p(self, path):
+ def _mkdir_p(self, path, mode=0o755):
try:
self.fs.stat(path)
except cephfs.ObjectNotFound:
try:
self.fs.stat(subpath)
except cephfs.ObjectNotFound:
- self.fs.mkdir(subpath, 0o755)
+ self.fs.mkdir(subpath, mode)
- def create_volume(self, volume_path, size=None, data_isolated=False, namespace_isolated=True):
+ def create_volume(self, volume_path, size=None, data_isolated=False, namespace_isolated=True,
+ mode=0o755):
"""
Set up metadata, pools and auth for a volume.
path = self._get_path(volume_path)
log.info("create_volume: {0}".format(path))
- self._mkdir_p(path)
+ self._mkdir_p(path, mode)
if size is not None:
self.fs.setxattr(path, 'ceph.quota.max_bytes', to_bytes(size), 0)
dir_path, self.rados.conf_get('client_snapdir'), snapshot_name
)
- def _snapshot_create(self, dir_path, snapshot_name):
+ def _snapshot_create(self, dir_path, snapshot_name, mode=0o755):
# TODO: raise intelligible exception for clusters where snaps are disabled
- self.fs.mkdir(self._snapshot_path(dir_path, snapshot_name), 0o755)
+ self.fs.mkdir(self._snapshot_path(dir_path, snapshot_name), mode)
def _snapshot_destroy(self, dir_path, snapshot_name):
"""
except cephfs.ObjectNotFound:
log.warn("Snapshot was already gone: {0}".format(snapshot_name))
- def create_snapshot_volume(self, volume_path, snapshot_name):
- self._snapshot_create(self._get_path(volume_path), snapshot_name)
+ def create_snapshot_volume(self, volume_path, snapshot_name, mode=0o755):
+ self._snapshot_create(self._get_path(volume_path), snapshot_name, mode)
def destroy_snapshot_volume(self, volume_path, snapshot_name):
self._snapshot_destroy(self._get_path(volume_path), snapshot_name)
- def create_snapshot_group(self, group_id, snapshot_name):
+ def create_snapshot_group(self, group_id, snapshot_name, mode=0o755):
if group_id is None:
raise RuntimeError("Group ID may not be None")
- return self._snapshot_create(self._get_group_path(group_id), snapshot_name)
+ return self._snapshot_create(self._get_group_path(group_id), snapshot_name,
+ mode)
def destroy_snapshot_group(self, group_id, snapshot_name):
if group_id is None:
"desc": "Show an optimization plan",
"perm": "r",
},
+ {
+ "cmd": "balancer ls",
+ "desc": "List all plans",
+ "perm": "r",
+ },
{
"cmd": "balancer execute name=plan,type=CephString",
"desc": "Execute an optimization plan",
self.log.warn("Handling command: '%s'" % str(command))
if command['prefix'] == 'balancer status':
s = {
- 'plans': self.plans.keys(),
+ 'plans': list(self.plans.keys()),
'active': self.active,
'mode': self.get_config('mode', default_mode),
}
elif command['prefix'] == 'balancer reset':
self.plans = {}
return (0, '', '')
+ elif command['prefix'] == 'balancer ls':
+ return (0, json.dumps([p for p in self.plans], indent=4), '')
elif command['prefix'] == 'balancer dump':
plan = self.plans.get(command['plan'])
if not plan:
}
self.log.debug('score_by_root %s' % pe.score_by_root)
+ # get the list of score metrics, comma separated
+ metrics = self.get_config('crush_compat_metrics', 'pgs,objects,bytes').split(',')
+
# total score is just average of normalized stddevs
pe.score = 0.0
for r, vs in six.iteritems(pe.score_by_root):
for k, v in six.iteritems(vs):
- pe.score += v
- pe.score /= 3 * len(roots)
+ if k in metrics:
+ pe.score += v
+ pe.score /= len(metrics) * len(roots)
return pe
def evaluate(self, ms, pools, verbose=False):
self.log.error(detail)
return -errno.EOPNOTSUPP, detail
- key = 'pgs' # pgs objects or bytes
+ # rebalance by pgs, objects, or bytes
+ metrics = self.get_config('crush_compat_metrics', 'pgs,objects,bytes').split(',')
+ key = metrics[0] # balancing using the first score metric
+ if key not in ['pgs', 'bytes', 'objects']:
+ self.log.warn("Invalid crush_compat balancing key %s. Using 'pgs'." % key)
+ key = 'pgs'
# go
best_ws = copy.deepcopy(orig_ws)
for daemon, counters in six.iteritems(self.get_all_perf_counters()):
svc_type, svc_id = daemon.split(".", 1)
metadata = self.get_metadata(svc_type, svc_id)
+ if not metadata:
+ continue
for path, counter_info in counters.items():
if counter_info['type'] & self.PERFCOUNTER_HISTOGRAM:
RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')
-DISK_OCCUPATION = ( 'ceph_daemon', 'device','instance')
+DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device', 'wal_device', 'instance')
NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
osd_metadata = self.get_metadata("osd", str(id_))
if osd_metadata is None:
continue
- dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
- osd_dev_node = None
- for dev_key in dev_keys:
- val = osd_metadata.get(dev_key, None)
- if val and val != "unknown":
- osd_dev_node = val
- break
+
+ osd_objectstore = osd_metadata.get('osd_objectstore', None)
+ if osd_objectstore == "filestore":
+ # collect filestore backend device
+ osd_dev_node = osd_metadata.get('backend_filestore_dev_node', None)
+ # collect filestore journal device
+ osd_wal_dev_node = osd_metadata.get('osd_journal', '')
+ osd_db_dev_node = ''
+ elif osd_objectstore == "bluestore":
+ # collect bluestore backend device
+ osd_dev_node = osd_metadata.get('bluestore_bdev_dev_node', None)
+ # collect bluestore wal backend
+ osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
+ # collect bluestore db backend
+ osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
+ if osd_dev_node and osd_dev_node == "unknown":
+ osd_dev_node = None
+
osd_hostname = osd_metadata.get('hostname', None)
if osd_dev_node and osd_hostname:
self.log.debug("Got dev for osd {0}: {1}/{2}".format(
self.metrics['disk_occupation'].set(1, (
"osd.{0}".format(id_),
osd_dev_node,
+ osd_db_dev_node,
+ osd_wal_dev_node,
osd_hostname
))
else:
# Implemented osd commands
OSD_IMPLEMENTED_COMMANDS = [
- 'scrub', 'deep_scrub', 'repair'
+ 'scrub', 'deep-scrub', 'repair'
]
# Valid values for the 'var' argument to 'ceph osd pool set'
if tag == 'seq':
return
- request = [x for x in self.requests if x.is_running(tag)]
+ with self.requests_lock:
+ request = [x for x in self.requests if x.is_running(tag)]
+
if len(request) != 1:
self.log.warn("Unknown request '%s'" % str(tag))
return
def submit_request(self, _request, **kwargs):
- request = CommandsRequest(_request)
with self.requests_lock:
+ request = CommandsRequest(_request)
self.requests.append(request)
if kwargs.get('wait', 0):
while not request.is_finished():
dns = self.get_latest("mds", daemon_info['name'], "mds_mem.dn")
activity = "Evts: " + self.format_dimless(
- self.get_rate("mds", daemon_info['name'], "mds_log.replay"),
+ self.get_rate("mds", daemon_info['name'], "mds_log.replayed"),
5
) + "/s"
output += "\n" + standby_table.get_string() + "\n"
if len(mds_versions) == 1:
- output += "MDS version: {0}".format(mds_versions.keys()[0])
+ output += "MDS version: {0}".format(list(mds_versions)[0])
else:
version_table = PrettyTable(["version", "daemons"])
for version, daemons in six.iteritems(mds_versions):
RBD_FEATURES_ALL = _RBD_FEATURES_ALL
RBD_FLAG_OBJECT_MAP_INVALID = _RBD_FLAG_OBJECT_MAP_INVALID
+RBD_FLAG_FAST_DIFF_INVALID = _RBD_FLAG_FAST_DIFF_INVALID
RBD_MIRROR_MODE_DISABLED = _RBD_MIRROR_MODE_DISABLED
RBD_MIRROR_MODE_IMAGE = _RBD_MIRROR_MODE_IMAGE
add_dependencies(rgw_a civetweb_h)
-target_include_directories(rgw_a PUBLIC
- ${FCGI_INCLUDE_DIR}
- "../rapidjson/include"
- )
-target_compile_definitions(rgw_a PUBLIC BOOST_COROUTINES_NO_DEPRECATION_WARNING)
+target_include_directories(rgw_a SYSTEM PUBLIC "../rapidjson/include")
target_link_libraries(rgw_a librados cls_lock_client cls_rgw_client cls_refcount_client
cls_log_client cls_statelog_client cls_timeindex_client cls_version_client
${EXPAT_LIBRARIES}
${OPENLDAP_LIBRARIES} ${CRYPTO_LIBS})
-if (WITH_CURL_OPENSSL)
+if (WITH_CURL_OPENSSL OR (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL))
target_link_libraries(rgw_a ${OPENSSL_LIBRARIES})
-endif (WITH_CURL_OPENSSL)
+endif()
+if (WITH_RADOSGW_BEAST_FRONTEND)
+ target_compile_definitions(rgw_a PUBLIC BOOST_COROUTINES_NO_DEPRECATION_WARNING)
+ target_link_libraries(rgw_a Boost::coroutine Boost::context)
+endif()
set(radosgw_srcs
rgw_loadgen_process.cc
add_library(radosgw_a STATIC ${radosgw_srcs}
$<TARGET_OBJECTS:civetweb_common_objs>)
-target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES})
+if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL)
+ target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES})
+endif()
add_executable(radosgw rgw_main.cc)
target_link_libraries(radosgw radosgw_a librados
cls_version cls_replica_log cls_user)
install(TARGETS radosgw DESTINATION bin)
+if (WITH_RADOSGW_BEAST_FRONTEND)
+ target_link_libraries(radosgw_a ${OPENSSL_LIBRARIES})
+endif()
+
set(radosgw_admin_srcs
rgw_admin.cc
rgw_orphan.cc)
#include "rgw_auth_s3.h"
#include "rgw_lib.h"
#include "rgw_lib_frontend.h"
+#include "rgw_http_client.h"
+#include "rgw_http_client_curl.h"
#include <errno.h>
#include <chrono>
RGWLibFS* fs = iter->first->ref();
uniq.unlock();
fs->gc();
+ fs->update_user();
fs->rele();
uniq.lock();
if (cur_gen != gen)
rgw_tools_init(g_ceph_context);
rgw_init_resolver();
+ rgw::curl::setup_curl(boost::none);
store = RGWStoreManager::get_storage(g_ceph_context,
g_conf->rgw_enable_gc_threads,
rgw_tools_cleanup();
rgw_shutdown_resolver();
+ rgw::curl::cleanup_curl();
rgw_perf_stop(g_ceph_context);
cout << " reshard list list all bucket resharding or scheduled to be reshared\n";
cout << " reshard process process of scheduled reshard jobs\n";
cout << " reshard cancel cancel resharding a bucket\n";
+ cout << " reshard stale-instances list list stale-instances from bucket resharding\n";
+ cout << " reshard stale-instances rm cleanup stale-instances from bucket resharding\n";
cout << " sync error list list sync error\n";
cout << " sync error trim trim sync error\n";
cout << "options:\n";
OPT_RESHARD_STATUS,
OPT_RESHARD_PROCESS,
OPT_RESHARD_CANCEL,
+ OPT_RESHARD_STALE_INSTANCES_LIST,
+ OPT_RESHARD_STALE_INSTANCES_DELETE
};
static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_cmd, bool *need_more)
strcmp(cmd, "replicalog") == 0 ||
strcmp(cmd, "role") == 0 ||
strcmp(cmd, "role-policy") == 0 ||
+ strcmp(cmd, "stale-instances") == 0 ||
strcmp(cmd, "subuser") == 0 ||
strcmp(cmd, "sync") == 0 ||
strcmp(cmd, "usage") == 0 ||
return OPT_RESHARD_PROCESS;
if (strcmp(cmd, "cancel") == 0)
return OPT_RESHARD_CANCEL;
+ } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "reshard") == 0) &&
+ (strcmp(prev_cmd, "stale-instances") == 0)) {
+ if (strcmp(cmd, "list") == 0)
+ return OPT_RESHARD_STALE_INSTANCES_LIST;
+ if (strcmp(cmd, "rm") == 0 ||
+ strcmp(cmd, "delete") == 0)
+ return OPT_RESHARD_STALE_INSTANCES_DELETE;
}
return -EINVAL;
formatter->flush(cout);
}
+// Dump per-shard resharding status entries to stdout via the given
+// Formatter: one object per entry with its human-readable reshard_status,
+// the new bucket instance id, and the shard count.
+static void show_reshard_status(
+  const list<cls_rgw_bucket_instance_entry>& status, Formatter *formatter)
+{
+  formatter->open_array_section("status");
+  for (const auto& entry : status) {
+    formatter->open_object_section("entry");
+    // to_string() renders the status enum as text rather than a raw number
+    formatter->dump_string("reshard_status", to_string(entry.reshard_status));
+    formatter->dump_string("new_bucket_instance_id",
+                           entry.new_bucket_instance_id);
+    formatter->dump_unsigned("num_shards", entry.num_shards);
+    formatter->close_section();
+  }
+  formatter->close_section();
+  formatter->flush(cout);
+}
+
class StoreDestructor {
RGWRados *store;
public:
int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket, &attrs);
if (ret < 0) {
cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
- return -ret;
+ return ret;
}
int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
return EINVAL;
}
} else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) {
- max_size = strict_iecstrtoll(val.c_str(), &err);
+ max_size = strict_iec_cast<long long>(val.c_str(), &err);
if (!err.empty()) {
cerr << "ERROR: failed to parse max size: " << err << std::endl;
return EINVAL;
for (int i = 0; i < max_shards; i++) {
RGWRados::BucketShard bs(store);
int shard_id = (bucket_info.num_shards > 0 ? i : -1);
- int ret = bs.init(bucket, shard_id);
+ int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
marker.clear();
if (ret < 0) {
for (int i = 0; i < max_shards; i++) {
RGWRados::BucketShard bs(store);
int shard_id = (bucket_info.num_shards > 0 ? i : -1);
- int ret = bs.init(bucket, shard_id);
+ int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
if (ret < 0) {
cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << shard_id << "): " << cpp_strerror(-ret) << std::endl;
return -ret;
return ret;
}
- RGWBucketReshard br(store, bucket_info, attrs);
+ RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */);
#define DEFAULT_RESHARD_MAX_ENTRIES 1000
if (max_entries < 1) {
return 0;
}
-
if (opt_cmd == OPT_RESHARD_STATUS) {
if (bucket_name.empty()) {
cerr << "ERROR: bucket not specified" << std::endl;
return -ret;
}
- RGWBucketReshard br(store, bucket_info, attrs);
+ RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */);
list<cls_rgw_bucket_instance_entry> status;
int r = br.get_status(&status);
if (r < 0) {
- cerr << "ERROR: could not get resharding status for bucket " << bucket_name << std::endl;
+ cerr << "ERROR: could not get resharding status for bucket " <<
+ bucket_name << std::endl;
return -r;
}
- encode_json("status", status, formatter);
- formatter->flush(cout);
+ show_reshard_status(status, formatter);
}
if (opt_cmd == OPT_RESHARD_PROCESS) {
return -ret;
}
- RGWBucketReshard br(store, bucket_info, attrs);
+ RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */);
int ret = br.cancel();
if (ret < 0) {
if (ret == -EBUSY) {
}
}
- return 0;
+ if (opt_cmd == OPT_RESHARD_STALE_INSTANCES_LIST) {
+ ret = RGWBucketAdminOp::list_stale_instances(store, bucket_op,f);
+ if (ret < 0) {
+ cerr << "ERROR: listing stale instances" << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ if (opt_cmd == OPT_RESHARD_STALE_INSTANCES_DELETE) {
+ ret = RGWBucketAdminOp::clear_stale_instances(store, bucket_op,f);
+ if (ret < 0) {
+ cerr << "ERROR: deleting stale instances" << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ return 0;
}
using namespace rgw::asio;
-ClientIO::ClientIO(tcp::socket& socket,
- parser_type& parser,
- beast::flat_buffer& buffer)
- : socket(socket), parser(parser), buffer(buffer), txbuf(*this)
+ClientIO::ClientIO(parser_type& parser, bool is_ssl,
+ const endpoint_type& local_endpoint,
+ const endpoint_type& remote_endpoint)
+ : parser(parser), is_ssl(is_ssl),
+ local_endpoint(local_endpoint),
+ remote_endpoint(remote_endpoint),
+ txbuf(*this)
{
}
env.set("SCRIPT_URI", url.to_string()); /* FIXME */
char port_buf[16];
- snprintf(port_buf, sizeof(port_buf), "%d", socket.local_endpoint().port());
+ snprintf(port_buf, sizeof(port_buf), "%d", local_endpoint.port());
env.set("SERVER_PORT", port_buf);
- env.set("REMOTE_ADDR", socket.remote_endpoint().address().to_string());
- // TODO: set SERVER_PORT_SECURE if using ssl
+ if (is_ssl) {
+ env.set("SERVER_PORT_SECURE", port_buf);
+ }
+ env.set("REMOTE_ADDR", remote_endpoint.address().to_string());
// TODO: set REMOTE_USER if authenticated
return 0;
}
-size_t ClientIO::write_data(const char* buf, size_t len)
-{
- boost::system::error_code ec;
- auto bytes = boost::asio::write(socket, boost::asio::buffer(buf, len), ec);
- if (ec) {
- derr << "write_data failed: " << ec.message() << dendl;
- throw rgw::io::Exception(ec.value(), std::system_category());
- }
- /* According to the documentation of boost::asio::write if there is
- * no error (signalised by ec), then bytes == len. We don't need to
- * take care of partial writes in such situation. */
- return bytes;
-}
-
-size_t ClientIO::read_data(char* buf, size_t max)
-{
- auto& message = parser.get();
- auto& body_remaining = message.body();
- body_remaining.data = buf;
- body_remaining.size = max;
-
- dout(30) << this << " read_data for " << max << " with "
- << buffer.size() << " bytes buffered" << dendl;
-
- while (body_remaining.size && !parser.is_done()) {
- boost::system::error_code ec;
- beast::http::read_some(socket, buffer, parser, ec);
- if (ec == beast::http::error::partial_message ||
- ec == beast::http::error::need_buffer) {
- break;
- }
- if (ec) {
- derr << "failed to read body: " << ec.message() << dendl;
- throw rgw::io::Exception(ec.value(), std::system_category());
- }
- }
- return max - body_remaining.size;
-}
-
size_t ClientIO::complete_request()
{
return 0;
class ClientIO : public io::RestfulClient,
public io::BuffererSink {
- private:
- using tcp = boost::asio::ip::tcp;
- tcp::socket& socket;
+ protected:
parser_type& parser;
- beast::flat_buffer& buffer; //< parse buffer
+ private:
+ const bool is_ssl;
+ using endpoint_type = boost::asio::ip::tcp::endpoint;
+ endpoint_type local_endpoint;
+ endpoint_type remote_endpoint;
RGWEnv env;
rgw::io::StaticOutputBufferer<> txbuf;
- size_t write_data(const char *buf, size_t len) override;
- size_t read_data(char *buf, size_t max);
-
public:
- ClientIO(tcp::socket& socket, parser_type& parser,
- beast::flat_buffer& buffer);
+ ClientIO(parser_type& parser, bool is_ssl,
+ const endpoint_type& local_endpoint,
+ const endpoint_type& remote_endpoint);
~ClientIO() override;
int init_env(CephContext *cct) override;
size_t send_content_length(uint64_t len) override;
size_t complete_header() override;
- size_t recv_body(char* buf, size_t max) override {
- return read_data(buf, max);
- }
-
size_t send_body(const char* buf, size_t len) override {
return write_data(buf, len);
}
#include <vector>
#include <boost/asio.hpp>
+#include <boost/asio/spawn.hpp>
#include "common/errno.h"
#include "rgw_asio_client.h"
#include "rgw_asio_frontend.h"
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+#include <boost/asio/ssl.hpp>
+#endif
+
#define dout_subsys ceph_subsys_rgw
namespace {
using tcp = boost::asio::ip::tcp;
namespace beast = boost::beast;
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+namespace ssl = boost::asio::ssl;
+#endif
+
+template <typename Stream>
+class StreamIO : public rgw::asio::ClientIO {
+ Stream& stream;
+ beast::flat_buffer& buffer;
+ public:
+ StreamIO(Stream& stream, rgw::asio::parser_type& parser,
+ beast::flat_buffer& buffer, bool is_ssl,
+ const tcp::endpoint& local_endpoint,
+ const tcp::endpoint& remote_endpoint)
+ : ClientIO(parser, is_ssl, local_endpoint, remote_endpoint),
+ stream(stream), buffer(buffer)
+ {}
+
+ size_t write_data(const char* buf, size_t len) override {
+ boost::system::error_code ec;
+ auto bytes = boost::asio::write(stream, boost::asio::buffer(buf, len), ec);
+ if (ec) {
+ derr << "write_data failed: " << ec.message() << dendl;
+ throw rgw::io::Exception(ec.value(), std::system_category());
+ }
+ return bytes;
+ }
-class Connection {
- RGWProcessEnv& env;
- boost::asio::io_service::strand strand;
- tcp::socket socket;
-
- // references are bound to callbacks for async operations. if a callback
- // function returns without issuing another operation, the reference is
- // dropped and the Connection is deleted/closed
- std::atomic<int> nref{0};
- using Ref = boost::intrusive_ptr<Connection>;
+ size_t recv_body(char* buf, size_t max) override {
+ auto& message = parser.get();
+ auto& body_remaining = message.body();
+ body_remaining.data = buf;
+ body_remaining.size = max;
+
+ while (body_remaining.size && !parser.is_done()) {
+ boost::system::error_code ec;
+ beast::http::read_some(stream, buffer, parser, ec);
+ if (ec == beast::http::error::partial_message ||
+ ec == beast::http::error::need_buffer) {
+ break;
+ }
+ if (ec) {
+ derr << "failed to read body: " << ec.message() << dendl;
+ throw rgw::io::Exception(ec.value(), std::system_category());
+ }
+ }
+ return max - body_remaining.size;
+ }
+};
+template <typename Stream>
+void handle_connection(RGWProcessEnv& env, Stream& stream,
+ beast::flat_buffer& buffer, bool is_ssl,
+ boost::system::error_code& ec,
+ boost::asio::yield_context yield)
+{
// limit header to 4k, since we read it all into a single flat_buffer
static constexpr size_t header_limit = 4096;
// don't impose a limit on the body, since we read it in pieces
static constexpr size_t body_limit = std::numeric_limits<size_t>::max();
- beast::flat_buffer buffer;
- boost::optional<rgw::asio::parser_type> parser;
-
- using bad_response_type = beast::http::response<beast::http::empty_body>;
- boost::optional<bad_response_type> response;
+ auto cct = env.store->ctx();
- CephContext* ctx() const { return env.store->ctx(); }
-
- void read_header() {
+ // read messages from the stream until eof
+ for (;;) {
// configure the parser
- parser.emplace();
- parser->header_limit(header_limit);
- parser->body_limit(body_limit);
+ rgw::asio::parser_type parser;
+ parser.header_limit(header_limit);
+ parser.body_limit(body_limit);
// parse the header
- beast::http::async_read_header(socket, buffer, *parser, strand.wrap(
- std::bind(&Connection::on_header, Ref{this},
- std::placeholders::_1)));
- }
-
- void discard_unread_message() {
- if (parser->is_done()) {
- // nothing left to discard, start reading the next message
- read_header();
- return;
- }
-
- // read the rest of the request into a static buffer. multiple clients could
- // write at the same time, but this is okay because we never read it back
- static std::array<char, 1024> discard_buffer;
-
- auto& body = parser->get().body();
- body.size = discard_buffer.size();
- body.data = discard_buffer.data();
-
- beast::http::async_read_some(socket, buffer, *parser, strand.wrap(
- std::bind(&Connection::on_discard_unread, Ref{this},
- std::placeholders::_1)));
- }
-
- void on_discard_unread(boost::system::error_code ec) {
- if (ec == boost::asio::error::connection_reset) {
- return;
- }
- if (ec) {
- ldout(ctx(), 5) << "discard_unread_message failed: "
- << ec.message() << dendl;
- return;
- }
- discard_unread_message();
- }
-
- void on_write_error(boost::system::error_code ec) {
- if (ec) {
- ldout(ctx(), 5) << "failed to write response: " << ec.message() << dendl;
- }
- }
-
- void on_header(boost::system::error_code ec) {
+ beast::http::async_read_header(stream, buffer, parser, yield[ec]);
if (ec == boost::asio::error::connection_reset ||
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+ ec == ssl::error::stream_truncated ||
+#endif
ec == beast::http::error::end_of_stream) {
return;
}
if (ec) {
- auto& message = parser->get();
- ldout(ctx(), 1) << "failed to read header: " << ec.message() << dendl;
- ldout(ctx(), 1) << "====== req done http_status=400 ======" << dendl;
- response.emplace();
- response->result(beast::http::status::bad_request);
- response->version(message.version() == 10 ? 10 : 11);
- response->prepare_payload();
- beast::http::async_write(socket, *response, strand.wrap(
- std::bind(&Connection::on_write_error, Ref{this},
- std::placeholders::_1)));
+ ldout(cct, 1) << "failed to read header: " << ec.message() << dendl;
+ auto& message = parser.get();
+ beast::http::response<beast::http::empty_body> response;
+ response.result(beast::http::status::bad_request);
+ response.version(message.version() == 10 ? 10 : 11);
+ response.prepare_payload();
+ beast::http::async_write(stream, response, yield[ec]);
+ if (ec) {
+ ldout(cct, 5) << "failed to write response: " << ec.message() << dendl;
+ }
+ ldout(cct, 1) << "====== req done http_status=400 ======" << dendl;
return;
}
// process the request
RGWRequest req{env.store->get_new_req_id()};
- rgw::asio::ClientIO real_client{socket, *parser, buffer};
+ auto& socket = stream.lowest_layer();
+ StreamIO<Stream> real_client{stream, parser, buffer, is_ssl,
+ socket.local_endpoint(),
+ socket.remote_endpoint()};
auto real_client_io = rgw::io::add_reordering(
- rgw::io::add_buffering(ctx(),
+ rgw::io::add_buffering(cct,
rgw::io::add_chunking(
rgw::io::add_conlen_controlling(
&real_client))));
- RGWRestfulIO client(ctx(), &real_client_io);
+ RGWRestfulIO client(cct, &real_client_io);
process_request(env.store, env.rest, &req, env.uri_prefix,
*env.auth_registry, &client, env.olog);
- if (parser->keep_alive()) {
- // parse any unread bytes from the previous message (in case we replied
- // before reading the entire body) before reading the next
- discard_unread_message();
+ if (!parser.keep_alive()) {
+ return;
}
- }
- public:
- Connection(RGWProcessEnv& env, tcp::socket&& socket)
- : env(env), strand(socket.get_io_service()), socket(std::move(socket)) {}
-
- void on_connect() {
- read_header();
- }
+ // if we failed before reading the entire message, discard any remaining
+ // bytes before reading the next
+ while (!parser.is_done()) {
+ static std::array<char, 1024> discard_buffer;
- void get() { ++nref; }
- void put() { if (nref.fetch_sub(1) == 1) { delete this; } }
+ auto& body = parser.get().body();
+ body.size = discard_buffer.size();
+ body.data = discard_buffer.data();
- friend void intrusive_ptr_add_ref(Connection *c) { c->get(); }
- friend void intrusive_ptr_release(Connection *c) { c->put(); }
-};
+ beast::http::async_read_some(stream, buffer, parser, yield[ec]);
+ if (ec == boost::asio::error::connection_reset) {
+ return;
+ }
+ if (ec) {
+ ldout(cct, 5) << "failed to discard unread message: "
+ << ec.message() << dendl;
+ return;
+ }
+ }
+ }
+}
class AsioFrontend {
RGWProcessEnv env;
RGWFrontendConfig* conf;
boost::asio::io_service service;
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+ boost::optional<ssl::context> ssl_context;
+ int init_ssl();
+#endif
struct Listener {
tcp::endpoint endpoint;
tcp::acceptor acceptor;
tcp::socket socket;
+ bool use_ssl = false;
Listener(boost::asio::io_service& service)
: acceptor(service), socket(service) {}
}
tcp::endpoint parse_endpoint(BOOST_ASIO_STRING_VIEW_PARAM input,
+ unsigned short default_port,
boost::system::error_code& ec)
{
tcp::endpoint endpoint;
- auto colon = input.find(':');
- if (colon != input.npos) {
- auto port_str = input.substr(colon + 1);
- endpoint.port(parse_port(port_str.data(), ec));
- } else {
- endpoint.port(80);
+ if (input.empty()) {
+ ec = boost::asio::error::invalid_argument;
+ return endpoint;
}
- if (!ec) {
+
+ if (input[0] == '[') { // ipv6
+ const size_t addr_begin = 1;
+ const size_t addr_end = input.find(']');
+ if (addr_end == input.npos) { // no matching ]
+ ec = boost::asio::error::invalid_argument;
+ return endpoint;
+ }
+ if (addr_end + 1 < input.size()) {
+      // :port must follow [ipv6]
+ if (input[addr_end + 1] != ':') {
+ ec = boost::asio::error::invalid_argument;
+ return endpoint;
+ } else {
+ auto port_str = input.substr(addr_end + 2);
+ endpoint.port(parse_port(port_str.data(), ec));
+ }
+ }
+ auto addr = input.substr(addr_begin, addr_end - addr_begin);
+ endpoint.address(boost::asio::ip::make_address_v6(addr, ec));
+ } else { // ipv4
+ auto colon = input.find(':');
+ if (colon != input.npos) {
+ auto port_str = input.substr(colon + 1);
+ endpoint.port(parse_port(port_str.data(), ec));
+ if (ec) {
+ return endpoint;
+ }
+ }
auto addr = input.substr(0, colon);
- endpoint.address(boost::asio::ip::make_address(addr, ec));
+ endpoint.address(boost::asio::ip::make_address_v4(addr, ec));
}
return endpoint;
}
boost::system::error_code ec;
auto& config = conf->get_config_map();
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+ int r = init_ssl();
+ if (r < 0) {
+ return r;
+ }
+#endif
+
// parse endpoints
- auto range = config.equal_range("port");
- for (auto i = range.first; i != range.second; ++i) {
+ auto ports = config.equal_range("port");
+ for (auto i = ports.first; i != ports.second; ++i) {
auto port = parse_port(i->second.c_str(), ec);
if (ec) {
lderr(ctx()) << "failed to parse port=" << i->second << dendl;
listeners.back().endpoint.port(port);
}
- range = config.equal_range("endpoint");
- for (auto i = range.first; i != range.second; ++i) {
- auto endpoint = parse_endpoint(i->second, ec);
+ auto endpoints = config.equal_range("endpoint");
+ for (auto i = endpoints.first; i != endpoints.second; ++i) {
+ auto endpoint = parse_endpoint(i->second, 80, ec);
if (ec) {
lderr(ctx()) << "failed to parse endpoint=" << i->second << dendl;
return -ec.value();
return drop_privileges(ctx());
}
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+int AsioFrontend::init_ssl()
+{
+ boost::system::error_code ec;
+ auto& config = conf->get_config_map();
+
+ // ssl configuration
+ auto cert = config.find("ssl_certificate");
+ const bool have_cert = cert != config.end();
+ if (have_cert) {
+ // only initialize the ssl context if it's going to be used
+ ssl_context = boost::in_place(ssl::context::tls);
+ }
+
+ auto key = config.find("ssl_private_key");
+ const bool have_private_key = key != config.end();
+ if (have_private_key) {
+ if (!have_cert) {
+ lderr(ctx()) << "no ssl_certificate configured for ssl_private_key" << dendl;
+ return -EINVAL;
+ }
+ ssl_context->use_private_key_file(key->second, ssl::context::pem, ec);
+ if (ec) {
+ lderr(ctx()) << "failed to add ssl_private_key=" << key->second
+ << ": " << ec.message() << dendl;
+ return -ec.value();
+ }
+ }
+ if (have_cert) {
+ ssl_context->use_certificate_chain_file(cert->second, ec);
+ if (ec) {
+ lderr(ctx()) << "failed to use ssl_certificate=" << cert->second
+ << ": " << ec.message() << dendl;
+ return -ec.value();
+ }
+ if (!have_private_key) {
+ // attempt to use it as a private key if a separate one wasn't provided
+ ssl_context->use_private_key_file(cert->second, ssl::context::pem, ec);
+ if (ec) {
+ lderr(ctx()) << "failed to use ssl_certificate=" << cert->second
+ << " as a private key: " << ec.message() << dendl;
+ return -ec.value();
+ }
+ }
+ }
+
+ // parse ssl endpoints
+ auto ports = config.equal_range("ssl_port");
+ for (auto i = ports.first; i != ports.second; ++i) {
+ if (!have_cert) {
+ lderr(ctx()) << "no ssl_certificate configured for ssl_port" << dendl;
+ return -EINVAL;
+ }
+ auto port = parse_port(i->second.c_str(), ec);
+ if (ec) {
+ lderr(ctx()) << "failed to parse ssl_port=" << i->second << dendl;
+ return -ec.value();
+ }
+ listeners.emplace_back(service);
+ listeners.back().endpoint.port(port);
+ listeners.back().use_ssl = true;
+ }
+
+ auto endpoints = config.equal_range("ssl_endpoint");
+ for (auto i = endpoints.first; i != endpoints.second; ++i) {
+ if (!have_cert) {
+ lderr(ctx()) << "no ssl_certificate configured for ssl_endpoint" << dendl;
+ return -EINVAL;
+ }
+ auto endpoint = parse_endpoint(i->second, 443, ec);
+ if (ec) {
+ lderr(ctx()) << "failed to parse ssl_endpoint=" << i->second << dendl;
+ return -ec.value();
+ }
+ listeners.emplace_back(service);
+ listeners.back().endpoint = endpoint;
+ listeners.back().use_ssl = true;
+ }
+ return 0;
+}
+#endif // WITH_RADOSGW_BEAST_OPENSSL
+
void AsioFrontend::accept(Listener& l, boost::system::error_code ec)
{
if (!l.acceptor.is_open()) {
accept(l, ec);
});
- boost::intrusive_ptr<Connection> conn{new Connection(env, std::move(socket))};
- conn->on_connect();
- // reference drops here, but on_connect() takes another
+ // spawn a coroutine to handle the connection
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+ if (l.use_ssl) {
+ boost::asio::spawn(service, std::bind(
+ [this] (boost::asio::yield_context yield, tcp::socket& s) mutable {
+ // wrap the socket in an ssl stream
+ ssl::stream<tcp::socket&> stream{s, *ssl_context};
+ beast::flat_buffer buffer;
+ // do ssl handshake
+ boost::system::error_code ec;
+ auto bytes = stream.async_handshake(ssl::stream_base::server,
+ buffer.data(), yield[ec]);
+ if (ec) {
+ ldout(ctx(), 1) << "ssl handshake failed: " << ec.message() << dendl;
+ return;
+ }
+ buffer.consume(bytes);
+ handle_connection(env, stream, buffer, true, ec, yield);
+ if (!ec) {
+ // ssl shutdown (ignoring errors)
+ stream.async_shutdown(yield[ec]);
+ }
+ s.shutdown(tcp::socket::shutdown_both, ec);
+ }, std::placeholders::_1, std::move(socket)));
+ } else {
+#else
+ {
+#endif // WITH_RADOSGW_BEAST_OPENSSL
+ boost::asio::spawn(service, std::bind(
+ [this] (boost::asio::yield_context yield, tcp::socket& s) mutable {
+ beast::flat_buffer buffer;
+ boost::system::error_code ec;
+ handle_connection(env, s, buffer, false, ec, yield);
+ s.shutdown(tcp::socket::shutdown_both, ec);
+ }, std::placeholders::_1, std::move(socket)));
+ }
}
int AsioFrontend::run()
#include "rgw_common.h"
#include "rgw_auth.h"
+#include "rgw_quota.h"
#include "rgw_user.h"
#include "rgw_http_client.h"
#include "rgw_keystone.h"
user_info.user_id = new_acct_user;
user_info.display_name = info.acct_name;
+ user_info.max_buckets = cct->_conf->rgw_user_max_buckets;
+ rgw_apply_default_bucket_quota(user_info.bucket_quota, cct);
+ rgw_apply_default_user_quota(user_info.user_quota, cct);
+
int ret = rgw_store_user_info(store, user_info, nullptr, nullptr,
real_time(), true);
if (ret < 0) {
const auto canonical_req_hash = calc_hash_sha256(canonical_req);
- ldout(cct, 10) << "canonical request = " << canonical_req << dendl;
+ using sanitize = rgw::crypt_sanitize::log_content;
+ ldout(cct, 10) << "canonical request = " << sanitize{canonical_req} << dendl;
ldout(cct, 10) << "canonical request hash = "
<< buf_to_hex(canonical_req_hash).data() << dendl;
#include "common/errno.h"
#include "common/ceph_json.h"
#include "common/backport14.h"
+#include "include/scope_guard.h"
#include "rgw_rados.h"
#include "rgw_acl.h"
#include "rgw_acl_s3.h"
#include "include/rados/librados.hpp"
// until everything is moved from rgw_common
#include "rgw_common.h"
-
+#include "rgw_reshard.h"
#include "cls/user/cls_user_types.h"
#define dout_context g_ceph_context
return store->meta_mgr->put_entry(bucket_instance_meta_handler, entry, bl, exclusive, objv_tracker, mtime, pattrs);
}
-int rgw_bucket_instance_remove_entry(RGWRados *store, string& entry, RGWObjVersionTracker *objv_tracker) {
+int rgw_bucket_instance_remove_entry(RGWRados *store, const string& entry,
+ RGWObjVersionTracker *objv_tracker) {
return store->meta_mgr->remove_entry(bucket_instance_meta_handler, entry, objv_tracker);
}
return bucket.set_quota(op_state);
}
+static int purge_bucket_instance(RGWRados *store, const RGWBucketInfo& bucket_info)
+{
+ int max_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
+ for (int i = 0; i < max_shards; i++) {
+ RGWRados::BucketShard bs(store);
+ int shard_id = (bucket_info.num_shards > 0 ? i : -1);
+ int ret = bs.init(bucket_info.bucket, shard_id, nullptr);
+ if (ret < 0) {
+ cerr << "ERROR: bs.init(bucket=" << bucket_info.bucket << ", shard=" << shard_id
+ << "): " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ ret = store->bi_remove(bs);
+ if (ret < 0) {
+ cerr << "ERROR: failed to remove bucket index object: "
+ << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+inline auto split_tenant(const std::string& bucket_name) ->
+ std::pair<std::string,std::string>
+{
+ auto p = bucket_name.find('/');
+ if(p != std::string::npos) {
+ return std::make_pair(bucket_name.substr(0,p), bucket_name.substr(p+1));
+ }
+ return std::make_pair(std::string(), bucket_name);
+}
+
+using bucket_instance_ls = std::vector<RGWBucketInfo>;
+void get_stale_instances(RGWRados *store, const std::string& bucket_name,
+ const vector<std::string>& lst,
+ bucket_instance_ls& stale_instances)
+{
+
+ RGWObjectCtx obj_ctx(store);
+
+ bucket_instance_ls other_instances;
+  // first iterate over the entries and pick out the instances whose reshard
+  // status is DONE; these are guaranteed to be stale
+ for (const auto& bucket_instance : lst){
+ RGWBucketInfo binfo;
+ int r = store->get_bucket_instance_info(obj_ctx, bucket_instance,
+ binfo, nullptr,nullptr);
+ if (r < 0){
+ // this can only happen if someone deletes us right when we're processing
+ lderr(store->ctx()) << "Bucket instance is invalid: " << bucket_instance
+ << cpp_strerror(-r) << dendl;
+ continue;
+ }
+ if (binfo.reshard_status == CLS_RGW_RESHARD_DONE)
+ stale_instances.emplace_back(std::move(binfo));
+ else {
+ other_instances.emplace_back(std::move(binfo));
+ }
+ }
+
+ // Read the cur bucket info, if the bucket doesn't exist we can simply return
+ // all the instances
+ std::string tenant,bucket;
+ std::tie(tenant, bucket) = split_tenant(bucket_name);
+ RGWBucketInfo cur_bucket_info;
+ int r = store->get_bucket_info(obj_ctx, tenant, bucket, cur_bucket_info, nullptr);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ // bucket doesn't exist, everything is stale then
+ stale_instances.insert(std::end(stale_instances),
+ std::make_move_iterator(other_instances.begin()),
+ std::make_move_iterator(other_instances.end()));
+ } else {
+      // all bets are off if we can't read the bucket; just return the instances known to be stale
+ lderr(store->ctx()) << "error: reading bucket info for bucket: "
+ << bucket << cpp_strerror(-r) << dendl;
+ }
+ return;
+ }
+
+ // Don't process further in this round if bucket is resharding
+ if (cur_bucket_info.reshard_status == CLS_RGW_RESHARD_IN_PROGRESS)
+ return;
+
+ other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(),
+ [&cur_bucket_info](const RGWBucketInfo& b){
+ return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id ||
+ b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id);
+ }),
+ other_instances.end());
+
+ // check if there are still instances left
+ if (other_instances.empty()) {
+ return;
+ }
+
+ // Now we have a bucket with instances where the reshard status is none, this
+ // usually happens when the reshard process couldn't complete, lockdown the
+ // bucket and walk through these instances to make sure no one else interferes
+ // with these
+ {
+ RGWBucketReshardLock reshard_lock(store, cur_bucket_info, true);
+ r = reshard_lock.lock();
+ if (r < 0) {
+      // most likely the bucket is being resharded; return only the instances known to be stale
+ ldout(store->ctx(), 5) << __func__
+ << "failed to take reshard lock; reshard underway likey" << dendl;
+ return;
+ }
+ auto sg = make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} );
+    // this should be fast enough that we don't need to renew the lock or
+    // re-check its status; TODO: should we re-read the instance info here?
+ stale_instances.insert(std::end(stale_instances),
+ std::make_move_iterator(other_instances.begin()),
+ std::make_move_iterator(other_instances.end()));
+ }
+
+ return;
+}
+
+static int process_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ std::function<void(const bucket_instance_ls&,
+ Formatter *,
+ RGWRados*)> process_f)
+{
+ std::string marker;
+ void *handle;
+ Formatter *formatter = flusher.get_formatter();
+ static constexpr auto default_max_keys = 1000;
+
+ int ret = store->meta_mgr->list_keys_init("bucket.instance", marker, &handle);
+ if (ret < 0) {
+ cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ bool truncated;
+
+ formatter->open_array_section("keys");
+
+ do {
+ list<std::string> keys;
+
+ ret = store->meta_mgr->list_keys_next(handle, default_max_keys, keys, &truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ } if (ret != -ENOENT) {
+      // partition the keys by bucket name, since the listing is unsorted;
+      // this minimizes the number of bucket_info reads per bucket
+ std::unordered_map<std::string, std::vector<std::string>> bucket_instance_map;
+ for (auto &key: keys) {
+ auto pos = key.find(':');
+ if(pos != std::string::npos)
+ bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key));
+ }
+ for (const auto& kv: bucket_instance_map) {
+ bucket_instance_ls stale_lst;
+ get_stale_instances(store, kv.first, kv.second, stale_lst);
+ process_f(stale_lst, formatter, store);
+ }
+ }
+ } while (truncated);
+
+ formatter->close_section(); // keys
+ formatter->flush(cout);
+ return 0;
+}
+
+int RGWBucketAdminOp::list_stale_instances(RGWRados *store,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher)
+{
+ auto process_f = [](const bucket_instance_ls& lst,
+ Formatter *formatter,
+ RGWRados*){
+ for (const auto& binfo: lst)
+ formatter->dump_string("key", binfo.bucket.get_key());
+ };
+ return process_stale_instances(store, op_state, flusher, process_f);
+}
+
+
+int RGWBucketAdminOp::clear_stale_instances(RGWRados *store,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher)
+{
+ auto process_f = [](const bucket_instance_ls& lst,
+ Formatter *formatter,
+ RGWRados *store){
+ for (const auto &binfo: lst) {
+ int ret = purge_bucket_instance(store, binfo);
+ if (ret == 0){
+ auto md_key = "bucket.instance:" + binfo.bucket.get_key();
+ ret = store->meta_mgr->remove(md_key);
+ }
+ formatter->open_object_section("delete_status");
+ formatter->dump_string("bucket_instance", binfo.bucket.get_key());
+ formatter->dump_int("status", -ret);
+ formatter->close_section();
+ }
+ };
+
+ return process_stale_instances(store, op_state, flusher, process_f);
+}
+
void rgw_data_change::dump(Formatter *f) const
{
string type;
RGWListRawObjsCtx ctx;
};
- int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override {
+ int remove(RGWRados *store, string& entry,
+ RGWObjVersionTracker& objv_tracker) override {
RGWBucketInfo info;
RGWObjectCtx obj_ctx(store);
if (ret < 0 && ret != -ENOENT)
return ret;
- return rgw_bucket_instance_remove_entry(store, entry, &info.objv_tracker);
+ return rgw_bucket_instance_remove_entry(store, entry,
+ &info.objv_tracker);
}
void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) override {
extern int rgw_bucket_parse_bucket_key(CephContext *cct, const string& key,
rgw_bucket* bucket, int *shard_id);
-extern int rgw_bucket_instance_remove_entry(RGWRados *store, string& entry, RGWObjVersionTracker *objv_tracker);
+extern int rgw_bucket_instance_remove_entry(RGWRados *store, const string& entry,
+ RGWObjVersionTracker *objv_tracker);
extern void rgw_bucket_instance_key_to_oid(string& key);
extern void rgw_bucket_instance_oid_to_key(string& oid);
/**
* Remove a bucket from the user's list by name.
*/
- void remove(string& name) {
+ void remove(const string& name) {
map<string, RGWBucketEnt>::iterator iter;
iter = buckets.find(name);
if (iter != buckets.end()) {
RGWFormatterFlusher& flusher,
bool warnings_only = false);
static int set_quota(RGWRados *store, RGWBucketAdminOpState& op_state);
+
+ static int list_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher);
+
+ static int clear_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher);
};
return e ? iter->second : string();
}
+bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env)
+{
+ const auto& m = env.get_map();
+ // frontend connected with ssl
+ if (m.count("SERVER_PORT_SECURE")) {
+ return true;
+ }
+ // ignore proxy headers unless explicitly enabled
+ if (!cct->_conf->rgw_trust_forwarded_https) {
+ return false;
+ }
+ // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Forwarded
+ // Forwarded: by=<identifier>; for=<identifier>; host=<host>; proto=<http|https>
+ auto i = m.find("HTTP_FORWARDED");
+ if (i != m.end() && i->second.find("proto=https") != std::string::npos) {
+ return true;
+ }
+ // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Proto
+ i = m.find("HTTP_X_FORWARDED_PROTO");
+ if (i != m.end() && i->second == "https") {
+ return true;
+ }
+ return false;
+}
+
bool verify_user_permission(struct req_state * const s,
RGWAccessControlPolicy * const user_acl,
const int perm)
{
const uint32_t flag2 = flag & (MATCH_POLICY_ACTION|MATCH_POLICY_ARN) ?
MATCH_CASE_INSENSITIVE : 0;
+ const bool colonblocks = !(flag & (MATCH_POLICY_RESOURCE |
+ MATCH_POLICY_STRING));
const auto npos = boost::string_view::npos;
boost::string_view::size_type last_pos_input = 0, last_pos_pattern = 0;
while (true) {
- auto cur_pos_input = input.find(":", last_pos_input);
- auto cur_pos_pattern = pattern.find(":", last_pos_pattern);
+ auto cur_pos_input = colonblocks ? input.find(":", last_pos_input) : npos;
+ auto cur_pos_pattern =
+ colonblocks ? pattern.find(":", last_pos_pattern) : npos;
auto substr_input = input.substr(last_pos_input, cur_pos_input);
auto substr_pattern = pattern.substr(last_pos_pattern, cur_pos_pattern);
const std::map<string, string, ltstr_nocase>& get_map() const { return env_map; }
};
+// return true if the connection is secure. this either means that the
+// connection arrived via ssl, or was forwarded as https by a trusted proxy
+bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env);
+
enum http_op {
OP_GET,
OP_PUT,
utime_t duration(duration_secs, 0);
l.set_duration(duration);
l.set_cookie(cookie);
- l.set_renew(true);
+ l.set_may_renew(true);
return l.lock_exclusive(&ref.ioctx, ref.oid);
}
return -ERR_INVALID_ENCRYPTION_ALGORITHM;
}
if (s->cct->_conf->rgw_crypt_require_ssl &&
- !s->info.env->exists("SERVER_PORT_SECURE")) {
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
return -ERR_INVALID_REQUEST;
}
return -EINVAL;
}
if (s->cct->_conf->rgw_crypt_require_ssl &&
- !s->info.env->exists("SERVER_PORT_SECURE")) {
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
ldout(s->cct, 5) << "ERROR: insecure request, rgw_crypt_require_ssl is set" << dendl;
return -ERR_INVALID_REQUEST;
}
if (stored_mode == "SSE-C-AES256") {
if (s->cct->_conf->rgw_crypt_require_ssl &&
- !s->info.env->exists("SERVER_PORT_SECURE")) {
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
return -ERR_INVALID_REQUEST;
}
if (stored_mode == "SSE-KMS") {
if (s->cct->_conf->rgw_crypt_require_ssl &&
- !s->info.env->exists("SERVER_PORT_SECURE")) {
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
return -ERR_INVALID_REQUEST;
}
}
logger.log("remove");
call(data_sync_module->remove_object(sync_env, *bucket_info, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace));
+ // our copy of the object is more recent, continue as if it succeeded
+ if (retcode == -ERR_PRECONDITION_FAILED) {
+ retcode = 0;
+ }
} else if (op == CLS_RGW_OP_LINK_OLH_DM) {
logger.log("creating delete marker");
set_status("creating delete marker");
RGWUserInfo* get_user() { return &user; }
+ void update_user() {
+ RGWUserInfo _user = user;
+ int ret = rgw_get_user_info_by_access_key(rgwlib.get_store(), key.id, user);
+ if (ret != 0)
+ user = _user;
+ }
+
void close();
void gc();
}; /* RGWLibFS */
return false;
}
- if (!match_policy(resource, candidate.resource, MATCH_POLICY_ARN)) {
+ if (!match_policy(resource, candidate.resource, MATCH_POLICY_RESOURCE)) {
return false;
}
return 0;
}
-int RGWMetadataManager::remove_entry(RGWMetadataHandler *handler, string& key, RGWObjVersionTracker *objv_tracker)
+int RGWMetadataManager::remove_entry(RGWMetadataHandler *handler,
+ const string& key,
+ RGWObjVersionTracker *objv_tracker)
{
string section;
RGWMetadataLogData log_data;
int put_entry(RGWMetadataHandler *handler, const string& key, bufferlist& bl, bool exclusive,
RGWObjVersionTracker *objv_tracker, real_time mtime, map<string, bufferlist> *pattrs = NULL);
- int remove_entry(RGWMetadataHandler *handler, string& key, RGWObjVersionTracker *objv_tracker);
+ int remove_entry(RGWMetadataHandler *handler,
+ const string& key,
+ RGWObjVersionTracker *objv_tracker);
int get(string& metadata_key, Formatter *f);
int put(string& metadata_key, bufferlist& bl,
RGWMetadataHandler::sync_type_t sync_mode,
e.emplace("aws:Referer", i->second);
}
- // These seem to be the semantics, judging from rest_rgw_s3.cc
- i = m.find("SERVER_PORT_SECURE");
- if (i != m.end()) {
+ if (rgw_transport_is_secure(s->cct, *s->info.env)) {
e.emplace("aws:SecureTransport", "true");
}
int RGWListBucket::parse_max_keys()
{
- if (!max_keys.empty()) {
- char *endptr;
- max = strtol(max_keys.c_str(), &endptr, 10);
- if (endptr) {
- if (endptr == max_keys.c_str()) return -EINVAL;
- while (*endptr && isspace(*endptr)) // ignore white space
- endptr++;
- if (*endptr) {
- return -EINVAL;
- }
- }
- } else {
- max = default_max;
- }
-
- return 0;
+ // Bound max value of max-keys to configured value for security
+ // Bound min value of max-keys to '0'
+ // Some S3 clients explicitly send max-keys=0 to detect if the bucket is
+ // empty without listing any items.
+ return parse_value_and_bound(max_keys, max, 0,
+ s->cct->_conf->get_val<uint64_t>("rgw_max_listing_results"),
+ default_max);
}
void RGWListBucket::pre_exec()
virtual const string name() { return "get_cluster_stat"; }
};
+static inline int parse_value_and_bound(
+ const string &input,
+ int &output,
+ const long lower_bound,
+ const long upper_bound,
+ const long default_val)
+{
+ if (!input.empty()) {
+ char *endptr;
+ output = strtol(input.c_str(), &endptr, 10);
+ if (endptr) {
+ if (endptr == input.c_str()) return -EINVAL;
+ while (*endptr && isspace(*endptr)) // ignore white space
+ endptr++;
+ if (*endptr) {
+ return -EINVAL;
+ }
+ }
+ if(output > upper_bound) {
+ output = upper_bound;
+ }
+ if(output < lower_bound) {
+ output = lower_bound;
+ }
+ } else {
+ output = default_val;
+ }
+
+ return 0;
+}
#endif /* CEPH_RGW_OP_H */
}
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const CephContext* cct)
+{
+ if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
+ quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
+ quota.enabled = true;
+ }
+ if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
+ quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
+ quota.enabled = true;
+ }
+}
+
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const CephContext* cct)
+{
+ if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
+ quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
+ quota.enabled = true;
+ }
+ if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
+ quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
+ quota.enabled = true;
+ }
+}
static void free_handler(RGWQuotaHandler *handler);
};
+// apply default quotas from configuration
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const CephContext* cct);
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const CephContext* cct);
+
#endif
return -EINVAL;
}
+ if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+ ldout(cct,0) << "ERROR: zonegroup " << zg.get_name()
+ << " has a non existent master zone "<< dendl;
+ return -EINVAL;
+ }
+
if (zg.is_master_zonegroup()) {
master_zonegroup = zg.get_id();
master_zone = zg.master_zone;
ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
RGWRados::BucketShard bs(store);
+ RGWBucketInfo bucket_info;
- int r = bs.init(c->obj.bucket, c->obj);
+ int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
if (r < 0) {
ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
/* not much to do */
continue;
}
- r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
- librados::ObjectWriteOperation o;
- cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
- cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
- c->log_op, c->bilog_op, &c->zones_trace);
-
- return bs->index_ctx.operate(bs->bucket_obj, &o);
+ r = store->guard_reshard(&bs, c->obj, bucket_info,
+ [&](RGWRados::BucketShard *bs) -> int {
+ librados::ObjectWriteOperation o;
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
+ c->log_op, c->bilog_op, &c->zones_trace);
+ return bs->index_ctx.operate(bs->bucket_obj, &o);
});
if (r < 0) {
ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
// use endpoints from the zonegroup's master zone
auto master = zg.zones.find(zg.master_zone);
if (master == zg.zones.end()) {
+ // Check for empty zonegroup which can happen if zone was deleted before removal
+ if (zg.zones.size() == 0)
+ continue;
// fix missing master zone for a single zone zonegroup
if (zg.master_zone.empty() && zg.zones.size() == 1) {
master = zg.zones.begin();
*/
int RGWRados::list_buckets_init(RGWAccessHandle *handle)
{
- librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
- *handle = (RGWAccessHandle)state;
- return 0;
+ try {
+ auto iter = root_pool_ctx.nobjects_begin();
+ librados::NObjectIterator *state = new librados::NObjectIterator(iter);
+ *handle = (RGWAccessHandle)state;
+ return 0;
+ } catch (const std::system_error& e) {
+ int r = -e.code().value();
+ ldout(cct, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldout(cct, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
}
/**
if (obj.key.name[0] == '_') {
obj.key.name = obj.key.name.substr(1);
}
-
- (*state)++;
+ try {
+ (*state)++;
+ } catch (const std::system_error& e) {
+ int r = -e.code().value();
+ ldout(cct, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldout(cct, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
} while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
return 0;
l.set_duration(ut);
l.set_cookie(owner_id);
l.set_tag(zone_id);
- l.set_renew(true);
+ l.set_may_renew(true);
return l.lock_exclusive(&io_ctx, oid);
}
int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
{
- librados::IoCtx index_ctx; // context for new bucket
+ librados::IoCtx index_ctx;
- string dir_oid = dir_oid_prefix;
+ string dir_oid = dir_oid_prefix;
int r = open_bucket_index_ctx(bucket_info, index_ctx);
if (r < 0) {
return r;
map<int, string> bucket_objs;
get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
- return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+ return CLSRGWIssueBucketIndexInit(index_ctx,
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
+{
+ librados::IoCtx index_ctx;
+
+ std::string dir_oid = dir_oid_prefix;
+ int r = open_bucket_index_ctx(bucket_info, index_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ dir_oid.append(bucket_info.bucket.bucket_id);
+
+ std::map<int, std::string> bucket_objs;
+ get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
+
+ return CLSRGWIssueBucketIndexClean(index_ctx,
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
}
void RGWRados::create_bucket_id(string *bucket_id)
/* only remove it if it's a different bucket instance */
if (info.bucket.bucket_id != bucket.bucket_id) {
/* remove bucket meta instance */
- string entry = bucket.get_key();
- r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
+ r = rgw_bucket_instance_remove_entry(this,
+ bucket.get_key(),
+ &instance_ver);
if (r < 0)
return r;
- map<int, string>::const_iterator biter;
- for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
- // Do best effort removal
- index_ctx.remove(biter->second);
- }
+ /* remove bucket index objects asynchronously, on a best-effort basis */
+ (void) CLSRGWIssueBucketIndexClean(index_ctx,
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
}
/* ret == -ENOENT here */
}
return 0;
}
-int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+ const rgw_obj& obj,
+ RGWBucketInfo* bucket_info_out)
{
bucket = _bucket;
RGWObjectCtx obj_ctx(store);
RGWBucketInfo bucket_info;
- int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+ RGWBucketInfo* bucket_info_p =
+ bucket_info_out ? bucket_info_out : &bucket_info;
+
+ int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
if (ret < 0) {
return ret;
}
- ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
+ ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
if (ret < 0) {
ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
return ret;
return 0;
}
-int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+ int sid,
+ RGWBucketInfo* bucket_info_out)
{
bucket = _bucket;
shard_id = sid;
RGWObjectCtx obj_ctx(store);
RGWBucketInfo bucket_info;
- int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+ RGWBucketInfo* bucket_info_p =
+ bucket_info_out ? bucket_info_out : &bucket_info;
+ int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
if (ret < 0) {
return ret;
}
- ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
+ ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj);
if (ret < 0) {
ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
return ret;
/* if the bucket is not synced we can remove the meta file */
if (!is_syncing_bucket_meta(bucket)) {
RGWObjVersionTracker objv_tracker;
- string entry = bucket.get_key();
- r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
+ r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker);
if (r < 0) {
return r;
}
- /* remove bucket index objects*/
- map<int, string>::const_iterator biter;
- for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
- index_ctx.remove(biter->second);
- }
+
+ /* remove bucket index objects asynchronously, on a best-effort basis */
+ (void) CLSRGWIssueBucketIndexClean(index_ctx,
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
}
+
return 0;
}
return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
}
-int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
+int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
{
librados::IoCtx index_ctx;
map<int, string> bucket_objs;
}
ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
string new_bucket_id;
- r = store->block_while_resharding(bs, &new_bucket_id);
+ r = store->block_while_resharding(bs, &new_bucket_id, target->bucket_info);
if (r == -ERR_BUSY_RESHARDING) {
continue;
}
}
}
- int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
- return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
- });
+ int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
+ return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
+ });
if (r < 0) {
return r;
RGWRados *store = target->get_store();
BucketShard *bs;
- int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
- return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
- });
+ int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
+ return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
+ });
/*
* need to update data log anyhow, so that whoever follows needs to update its internal markers
return ret;
}
-int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
+int RGWRados::guard_reshard(BucketShard *bs,
+ const rgw_obj& obj_instance,
+ const RGWBucketInfo& bucket_info,
+ std::function<int(BucketShard *)> call)
{
rgw_obj obj;
const rgw_obj *pobj = &obj_instance;
int r;
for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
- r = bs->init(pobj->bucket, *pobj);
+ r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
if (r < 0) {
ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
return r;
}
ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
string new_bucket_id;
- r = block_while_resharding(bs, &new_bucket_id);
+ r = block_while_resharding(bs, &new_bucket_id, bucket_info);
if (r == -ERR_BUSY_RESHARDING) {
continue;
}
return 0;
}
-int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
+int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
+ string *new_bucket_id,
+ const RGWBucketInfo& bucket_info)
{
std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
- return waiter->block_while_resharding(bs, new_bucket_id);
+ return waiter->block_while_resharding(bs, new_bucket_id, bucket_info);
}
int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
BucketShard bs(this);
cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
- r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
- librados::ObjectWriteOperation op;
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- return cls_rgw_bucket_link_olh(bs->index_ctx, op,
- bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
- unmod_since, high_precision_time,
- get_zone().log_data, zones_trace);
+ r = guard_reshard(&bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ librados::ObjectWriteOperation op;
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ return cls_rgw_bucket_link_olh(bs->index_ctx, op,
+ bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
+ unmod_since, high_precision_time,
+ get_zone().log_data, zones_trace);
});
if (r < 0) {
ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
BucketShard bs(this);
cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
- r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
- librados::ObjectWriteOperation op;
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
- olh_tag, olh_epoch, get_zone().log_data, zones_trace);
+ r = guard_reshard(&bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ librados::ObjectWriteOperation op;
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
+ olh_tag, olh_epoch, get_zone().log_data, zones_trace);
});
if (r < 0) {
ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
}
BucketShard bs(this);
- int ret = bs.init(obj_instance.bucket, obj_instance);
+ int ret =
+ bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
if (ret < 0) {
ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
return ret;
cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
- ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
- ObjectReadOperation op;
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
- key, ver_marker, olh_tag, log, is_truncated);
- });
+ ret = guard_reshard(&bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ ObjectReadOperation op;
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
+ key, ver_marker, olh_tag, log, is_truncated);
+ });
if (ret < 0) {
ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
return ret;
}
BucketShard bs(this);
- int ret = bs.init(obj_instance.bucket, obj_instance);
+ int ret =
+ bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
if (ret < 0) {
ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
return ret;
cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
- ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
- ObjectWriteOperation op;
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- cls_rgw_trim_olh_log(op, key, ver, olh_tag);
- return pbs->index_ctx.operate(pbs->bucket_obj, &op);
+ ret = guard_reshard(&bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_trim_olh_log(op, key, ver, olh_tag);
+ return pbs->index_ctx.operate(pbs->bucket_obj, &op);
});
if (ret < 0) {
ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
- int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
- ObjectWriteOperation op;
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
+ int ret = guard_reshard(&bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
});
if (ret < 0) {
ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
return -EINVAL;
}
- iter = io_ctx.nobjects_begin(oc);
-
- return 0;
+ try {
+ iter = io_ctx.nobjects_begin(oc);
+ return 0;
+ } catch (const std::system_error& e) {
+ r = -e.code().value();
+ ldout(cct, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldout(cct, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
}
string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
return ctx.iter.get_cursor().to_str();
}
-int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
+static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
+ vector<rgw_bucket_dir_entry>& objs,
bool *is_truncated, RGWAccessListFilter *filter)
{
librados::IoCtx& io_ctx = ctx.io_ctx;
}
};
+int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter)
+{
+ // catch exceptions from NObjectIterator::operator++()
+ try {
+ return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
+ } catch (const std::system_error& e) {
+ int r = -e.code().value();
+ ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
{
if (!ctx->initialized) {
int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
{
BucketShard bs(this);
- int ret = bs.init(bucket, obj);
+ int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
if (ret < 0) {
ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
return ret;
int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
{
BucketShard bs(this);
- int ret = bs.init(bucket, obj);
+ int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
if (ret < 0) {
ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
return ret;
{
rgw_obj obj(bucket, obj_name);
BucketShard bs(this);
- int ret = bs.init(bucket, obj);
+ int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
if (ret < 0) {
ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
return ret;
int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
{
BucketShard bs(this);
- int ret = bs.init(bucket, shard_id);
+ int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
if (ret < 0) {
ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
return ret;
return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
}
-int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
-{
- bufferlist in;
- cls_rgw_bucket_init(op);
- return index_ctx.operate(oid, &op);
-}
-
int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
{
friend class RGWReplicaLogger;
friend class RGWReshard;
friend class RGWBucketReshard;
+ friend class RGWBucketReshardLock;
friend class BucketIndexLockGuard;
friend class RGWCompleteMultipart;
int create_pool(const rgw_pool& pool);
int init_bucket_index(RGWBucketInfo& bucket_info, int num_shards);
+ int clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards);
int select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& rule,
string *pselected_rule_name, RGWZonePlacementInfo *rule_info);
int select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info);
string bucket_obj;
explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
- int init(const rgw_bucket& _bucket, const rgw_obj& obj);
- int init(const rgw_bucket& _bucket, int sid);
+ int init(const rgw_bucket& _bucket, const rgw_obj& obj, RGWBucketInfo* out);
+ int init(const rgw_bucket& _bucket, int sid, RGWBucketInfo* out);
int init(const RGWBucketInfo& bucket_info, int sid);
};
int get_bucket_shard(BucketShard **pbs) {
if (!bs_initialized) {
- int r = bs.init(bucket_info.bucket, obj);
+ int r =
+ bs.init(bucket_info.bucket, obj, nullptr /* no RGWBucketInfo */);
if (r < 0) {
return r;
}
rgw_zone_set *zones_trace{nullptr};
int init_bs() {
- int r = bs.init(target->get_bucket(), obj);
+ int r =
+ bs.init(target->get_bucket(), obj, nullptr /* no RGWBucketInfo */);
if (r < 0) {
return r;
}
int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);
- int guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call);
- int block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id);
+ int guard_reshard(BucketShard *bs,
+ const rgw_obj& obj_instance,
+ const RGWBucketInfo& bucket_info,
+ std::function<int(BucketShard *)> call);
+ int block_while_resharding(RGWRados::BucketShard *bs,
+ string *new_bucket_id,
+ const RGWBucketInfo& bucket_info);
void bucket_index_guard_olh_op(RGWObjState& olh_state, librados::ObjectOperation& op);
int olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag);
int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
map<string, bufferlist> *pattrs, bool create_entry_point);
- int cls_rgw_init_index(librados::IoCtx& io_ctx, librados::ObjectWriteOperation& op, string& oid);
int cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag, int64_t pool, uint64_t epoch,
rgw_bucket_dir_entry& ent, RGWObjCategory category, list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
map<RGWObjCategory, RGWStorageStats> *existing_stats,
map<RGWObjCategory, RGWStorageStats> *calculated_stats);
int bucket_rebuild_index(RGWBucketInfo& bucket_info);
- int bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
+ int bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
int remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list);
int move_rados_obj(librados::IoCtx& src_ioctx,
const string& src_oid, const string& src_locator,
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+#include <limits>
+
#include "rgw_rados.h"
#include "rgw_bucket.h"
#include "rgw_reshard.h"
#define RESHARD_SHARD_WINDOW 64
#define RESHARD_MAX_AIO 128
+
class BucketReshardShard {
RGWRados *store;
const RGWBucketInfo& bucket_info;
public:
BucketReshardShard(RGWRados *_store, const RGWBucketInfo& _bucket_info,
int _num_shard,
- deque<librados::AioCompletion *>& _completions) : store(_store), bucket_info(_bucket_info), bs(store),
- aio_completions(_completions) {
+ deque<librados::AioCompletion *>& _completions) :
+ store(_store), bucket_info(_bucket_info), bs(store),
+ aio_completions(_completions)
+ {
num_shard = (bucket_info.num_shards > 0 ? _num_shard : -1);
- bs.init(bucket_info.bucket, num_shard);
+ bs.init(bucket_info.bucket, num_shard, nullptr /* no RGWBucketInfo */);
}
int get_num_shard() {
return ret;
}
}
+
return 0;
}
+
int flush() {
if (entries.size() == 0) {
return 0;
}
return ret;
}
-};
+}; // class BucketReshardShard
+
class BucketReshardManager {
RGWRados *store;
vector<BucketReshardShard *> target_shards;
public:
- BucketReshardManager(RGWRados *_store, const RGWBucketInfo& _target_bucket_info, int _num_target_shards) : store(_store), target_bucket_info(_target_bucket_info),
- num_target_shards(_num_target_shards) {
+ BucketReshardManager(RGWRados *_store,
+ const RGWBucketInfo& _target_bucket_info,
+ int _num_target_shards) :
+ store(_store), target_bucket_info(_target_bucket_info),
+ num_target_shards(_num_target_shards)
+ {
target_shards.resize(num_target_shards);
for (int i = 0; i < num_target_shards; ++i) {
target_shards[i] = new BucketReshardShard(store, target_bucket_info, i, completions);
for (auto& shard : target_shards) {
int ret = shard->wait_all_aio();
if (ret < 0) {
- ldout(store->ctx(), 20) << __func__ << ": shard->wait_all_aio() returned ret=" << ret << dendl;
+ ldout(store->ctx(), 20) << __func__ <<
+ ": shard->wait_all_aio() returned ret=" << ret << dendl;
}
}
}
int add_entry(int shard_index,
rgw_cls_bi_entry& entry, bool account, uint8_t category,
const rgw_bucket_category_stats& entry_stats) {
- int ret = target_shards[shard_index]->add_entry(entry, account, category, entry_stats);
+ int ret = target_shards[shard_index]->add_entry(entry, account, category,
+ entry_stats);
if (ret < 0) {
- derr << "ERROR: target_shards.add_entry(" << entry.idx << ") returned error: " << cpp_strerror(-ret) << dendl;
+ derr << "ERROR: target_shards.add_entry(" << entry.idx <<
+ ") returned error: " << cpp_strerror(-ret) << dendl;
return ret;
}
+
return 0;
}
target_shards.clear();
return ret;
}
-};
-
-RGWBucketReshard::RGWBucketReshard(RGWRados *_store, const RGWBucketInfo& _bucket_info, const map<string, bufferlist>& _bucket_attrs) :
- store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
- reshard_lock(reshard_lock_name) {
- const rgw_bucket& b = bucket_info.bucket;
- reshard_oid = b.tenant + (b.tenant.empty() ? "" : ":") + b.name + ":" + b.bucket_id;
-
- utime_t lock_duration(store->ctx()->_conf->rgw_reshard_bucket_lock_duration, 0);
-#define COOKIE_LEN 16
- char cookie_buf[COOKIE_LEN + 1];
- gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
- cookie_buf[COOKIE_LEN] = '\0';
-
- reshard_lock.set_cookie(cookie_buf);
- reshard_lock.set_duration(lock_duration);
-}
-
-int RGWBucketReshard::lock_bucket()
-{
- int ret = reshard_lock.lock_exclusive(&store->reshard_pool_ctx, reshard_oid);
- if (ret < 0) {
- ldout(store->ctx(), 0) << "RGWReshard::add failed to acquire lock on " << reshard_oid << " ret=" << ret << dendl;
- return ret;
- }
- return 0;
-}
-
-void RGWBucketReshard::unlock_bucket()
-{
- int ret = reshard_lock.unlock(&store->reshard_pool_ctx, reshard_oid);
- if (ret < 0) {
- ldout(store->ctx(), 0) << "WARNING: RGWReshard::add failed to drop lock on " << reshard_oid << " ret=" << ret << dendl;
- }
-}
-
-int RGWBucketReshard::set_resharding_status(const string& new_instance_id, int32_t num_shards, cls_rgw_reshard_status status)
+}; // class BucketReshardManager
+
+RGWBucketReshard::RGWBucketReshard(RGWRados *_store,
+ const RGWBucketInfo& _bucket_info,
+ const map<string, bufferlist>& _bucket_attrs,
+ RGWBucketReshardLock* _outer_reshard_lock) :
+ store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
+ reshard_lock(store, bucket_info, true),
+ outer_reshard_lock(_outer_reshard_lock)
+{ }
+
+int RGWBucketReshard::set_resharding_status(RGWRados* store,
+ const RGWBucketInfo& bucket_info,
+ const string& new_instance_id,
+ int32_t num_shards,
+ cls_rgw_reshard_status status)
{
if (new_instance_id.empty()) {
ldout(store->ctx(), 0) << __func__ << " missing new bucket instance id" << dendl;
return 0;
}
-int RGWBucketReshard::clear_resharding()
+// NB: assumes the caller already holds the reshard lock
+int RGWBucketReshard::clear_resharding(RGWRados* store,
+ const RGWBucketInfo& bucket_info)
{
- cls_rgw_bucket_instance_entry instance_entry;
+ int ret = clear_index_shard_reshard_status(store, bucket_info);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ <<
+ " ERROR: error clearing reshard status from index shard " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
- int ret = store->bucket_set_reshard(bucket_info, instance_entry);
+ cls_rgw_bucket_instance_entry instance_entry;
+ ret = store->bucket_set_reshard(bucket_info, instance_entry);
if (ret < 0) {
- ldout(store->ctx(), 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
- << cpp_strerror(-ret) << dendl;
+ ldout(store->ctx(), 0) << "RGWReshard::" << __func__ <<
+ " ERROR: error setting bucket resharding flag on bucket index: " <<
+ cpp_strerror(-ret) << dendl;
return ret;
}
+
+ return 0;
+}
+
+int RGWBucketReshard::clear_index_shard_reshard_status(RGWRados* store,
+ const RGWBucketInfo& bucket_info)
+{
+ uint32_t num_shards = bucket_info.num_shards;
+
+ if (num_shards < std::numeric_limits<uint32_t>::max()) {
+ int ret = set_resharding_status(store, bucket_info,
+ bucket_info.bucket.bucket_id,
+ (num_shards < 1 ? 1 : num_shards),
+ CLS_RGW_RESHARD_NONE);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ <<
+ " ERROR: error clearing reshard status from index shard " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+
return 0;
}
int ret = store->init_bucket_index(new_bucket_info, new_bucket_info.num_shards);
if (ret < 0) {
cerr << "ERROR: failed to init new bucket indexes: " << cpp_strerror(-ret) << std::endl;
- return -ret;
+ return ret;
}
ret = store->put_bucket_instance_info(new_bucket_info, true, real_time(), &attrs);
if (ret < 0) {
cerr << "ERROR: failed to store new bucket instance info: " << cpp_strerror(-ret) << std::endl;
- return -ret;
+ return ret;
}
return 0;
int RGWBucketReshard::create_new_bucket_instance(int new_num_shards,
RGWBucketInfo& new_bucket_info)
{
- return ::create_new_bucket_instance(store, new_num_shards, bucket_info, bucket_attrs, new_bucket_info);
+ return ::create_new_bucket_instance(store, new_num_shards,
+ bucket_info, bucket_attrs, new_bucket_info);
}
int RGWBucketReshard::cancel()
{
- int ret = lock_bucket();
+ int ret = reshard_lock.lock();
if (ret < 0) {
return ret;
}
ret = clear_resharding();
- unlock_bucket();
- return 0;
+ reshard_lock.unlock();
+ return ret;
}
class BucketInfoReshardUpdate
}
public:
- BucketInfoReshardUpdate(RGWRados *_store, RGWBucketInfo& _bucket_info,
- map<string, bufferlist>& _bucket_attrs, const string& new_bucket_id) : store(_store),
- bucket_info(_bucket_info),
- bucket_attrs(_bucket_attrs) {
+ BucketInfoReshardUpdate(RGWRados *_store,
+ RGWBucketInfo& _bucket_info,
+ map<string, bufferlist>& _bucket_attrs,
+ const string& new_bucket_id) :
+ store(_store),
+ bucket_info(_bucket_info),
+ bucket_attrs(_bucket_attrs)
+ {
bucket_info.new_bucket_instance_id = new_bucket_id;
}
+
~BucketInfoReshardUpdate() {
if (in_progress) {
+ // resharding did not complete successfully, so clean up
+ int ret =
+ RGWBucketReshard::clear_index_shard_reshard_status(store, bucket_info);
+ if (ret < 0) {
+ lderr(store->ctx()) << "Error: " << __func__ <<
+ " clear_index_shard_status returned " << ret << dendl;
+ }
bucket_info.new_bucket_instance_id.clear();
- set_status(CLS_RGW_RESHARD_NONE);
+ set_status(CLS_RGW_RESHARD_NONE); // clears new_bucket_instance as well
}
}
}
};
-int RGWBucketReshard::do_reshard(
- int num_shards,
- RGWBucketInfo& new_bucket_info,
- int max_entries,
- bool verbose,
- ostream *out,
- Formatter *formatter)
+
+RGWBucketReshardLock::RGWBucketReshardLock(RGWRados* _store,
+ const std::string& reshard_lock_oid,
+ bool _ephemeral) :
+ store(_store),
+ lock_oid(reshard_lock_oid),
+ ephemeral(_ephemeral),
+ internal_lock(reshard_lock_name)
+{
+ const int lock_dur_secs = store->ctx()->_conf->rgw_reshard_bucket_lock_duration;
+ duration = std::chrono::seconds(lock_dur_secs);
+
+#define COOKIE_LEN 16
+ char cookie_buf[COOKIE_LEN + 1];
+ gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
+ cookie_buf[COOKIE_LEN] = '\0';
+
+ internal_lock.set_cookie(cookie_buf);
+ internal_lock.set_duration(duration);
+}
+
+int RGWBucketReshardLock::lock() {
+ internal_lock.set_must_renew(false);
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid);
+ }
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "RGWReshardLock::" << __func__ <<
+ " failed to acquire lock on " << lock_oid << " ret=" << ret << dendl;
+ return ret;
+ }
+ reset_time(Clock::now());
+
+ return 0;
+}
+
+void RGWBucketReshardLock::unlock() {
+ int ret = internal_lock.unlock(&store->reshard_pool_ctx, lock_oid);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
+ " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
+ }
+}
+
+int RGWBucketReshardLock::renew(const Clock::time_point& now) {
+ internal_lock.set_must_renew(true);
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid);
+ }
+ if (ret < 0) { /* expired or already locked by another processor */
+ ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
+ lock_oid << " with " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ internal_lock.set_must_renew(false);
+
+ reset_time(now);
+ ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
+ lock_oid << dendl;
+
+ return 0;
+}
+
+
+int RGWBucketReshard::do_reshard(int num_shards,
+ RGWBucketInfo& new_bucket_info,
+ int max_entries,
+ bool verbose,
+ ostream *out,
+ Formatter *formatter)
{
rgw_bucket& bucket = bucket_info.bucket;
int ret = 0;
if (out) {
- (*out) << "*** NOTICE: operation will not remove old bucket index objects ***" << std::endl;
- (*out) << "*** these will need to be removed manually ***" << std::endl;
(*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
(*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
- (*out) << "old bucket instance id: " << bucket_info.bucket.bucket_id << std::endl;
- (*out) << "new bucket instance id: " << new_bucket_info.bucket.bucket_id << std::endl;
+ (*out) << "old bucket instance id: " << bucket_info.bucket.bucket_id <<
+ std::endl;
+ (*out) << "new bucket instance id: " << new_bucket_info.bucket.bucket_id <<
+ std::endl;
}
- /* update bucket info -- in progress*/
+ /* update bucket info -- in progress*/
list<rgw_cls_bi_entry> entries;
if (max_entries < 0) {
- ldout(store->ctx(), 0) << __func__ << ": can't reshard, negative max_entries" << dendl;
+ ldout(store->ctx(), 0) << __func__ <<
+ ": can't reshard, negative max_entries" << dendl;
return -EINVAL;
}
+ // NB: destructor cleans up sharding state if reshard does not
+ // complete successfully
BucketInfoReshardUpdate bucket_info_updater(store, bucket_info, bucket_attrs, new_bucket_info.bucket.bucket_id);
ret = bucket_info_updater.start();
cout << "total entries:";
}
- int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
+ const int num_source_shards =
+ (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
string marker;
for (int i = 0; i < num_source_shards; ++i) {
bool is_truncated = true;
ret = store->bi_list(bucket, i, string(), marker, max_entries, &entries, &is_truncated);
if (ret < 0 && ret != -ENOENT) {
derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
- return -ret;
+ return ret;
}
- list<rgw_cls_bi_entry>::iterator iter;
- for (iter = entries.begin(); iter != entries.end(); ++iter) {
+ for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
rgw_cls_bi_entry& entry = *iter;
if (verbose) {
formatter->open_object_section("entry");
int shard_index = (target_shard_id > 0 ? target_shard_id : 0);
- ret = target_shards_mgr.add_entry(shard_index, entry, account, category, stats);
+ ret = target_shards_mgr.add_entry(shard_index, entry, account,
+ category, stats);
if (ret < 0) {
return ret;
}
+
+ Clock::time_point now = Clock::now();
+ if (reshard_lock.should_renew(now)) {
+ // assume outer locks have timespans at least the size of ours, so
+ // can call inside conditional
+ if (outer_reshard_lock) {
+ ret = outer_reshard_lock->renew(now);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ ret = reshard_lock.renew(now);
+ if (ret < 0) {
+ lderr(store->ctx()) << "Error renewing bucket lock: " << ret << dendl;
+ return ret;
+ }
+ }
+
if (verbose) {
formatter->close_section();
if (out) {
formatter->flush(*out);
- formatter->flush(*out);
}
} else if (out && !(total_entries % 1000)) {
(*out) << " " << total_entries;
}
- }
+ } // entries loop
}
}
+
if (verbose) {
formatter->close_section();
if (out) {
ret = target_shards_mgr.finish();
if (ret < 0) {
lderr(store->ctx()) << "ERROR: failed to reshard" << dendl;
- return EIO;
+ return -EIO;
}
ret = rgw_link_bucket(store, new_bucket_info.owner, new_bucket_info.bucket, bucket_info.creation_time);
if (ret < 0) {
lderr(store->ctx()) << "failed to link new bucket instance (bucket_id=" << new_bucket_info.bucket.bucket_id << ": " << cpp_strerror(-ret) << ")" << dendl;
- return -ret;
+ return ret;
}
ret = bucket_info_updater.complete();
ldout(store->ctx(), 0) << __func__ << ": failed to update bucket info ret=" << ret << dendl;
/* don't error out, reshard process succeeded */
}
+
return 0;
-}
+ // NB: some error clean-up is done by ~BucketInfoReshardUpdate
+} // RGWBucketReshard::do_reshard
int RGWBucketReshard::get_status(list<cls_rgw_bucket_instance_entry> *status)
{
return 0;
}
-int RGWBucketReshard::execute(int num_shards, int max_op_entries,
- bool verbose, ostream *out, Formatter *formatter, RGWReshard* reshard_log)
+int RGWBucketReshard::execute(int num_shards, int max_op_entries,
+ bool verbose, ostream *out, Formatter *formatter,
+ RGWReshard* reshard_log)
{
- int ret = lock_bucket();
+ Clock::time_point now;
+
+ int ret = reshard_lock.lock();
if (ret < 0) {
return ret;
}
RGWBucketInfo new_bucket_info;
ret = create_new_bucket_instance(num_shards, new_bucket_info);
if (ret < 0) {
- unlock_bucket();
- return ret;
+ // shard state is uncertain, but this will attempt to remove them anyway
+ goto error_out;
}
if (reshard_log) {
ret = reshard_log->update(bucket_info, new_bucket_info);
if (ret < 0) {
- unlock_bucket();
- return ret;
+ goto error_out;
}
}
- ret = set_resharding_status(new_bucket_info.bucket.bucket_id, num_shards, CLS_RGW_RESHARD_IN_PROGRESS);
+ // set resharding status of current bucket_info & shards with
+ // information about planned resharding
+ ret = set_resharding_status(new_bucket_info.bucket.bucket_id,
+ num_shards, CLS_RGW_RESHARD_IN_PROGRESS);
if (ret < 0) {
- unlock_bucket();
+ reshard_lock.unlock();
return ret;
}
new_bucket_info,
max_op_entries,
verbose, out, formatter);
-
if (ret < 0) {
- unlock_bucket();
- return ret;
+ goto error_out;
}
- ret = set_resharding_status(new_bucket_info.bucket.bucket_id, num_shards, CLS_RGW_RESHARD_DONE);
+ // at this point we've done the main work; we'll make a best-effort
+ // to clean-up but will not indicate any errors encountered
+
+ reshard_lock.unlock();
+
+ // resharding successful, so remove old bucket index shards; use
+ // best effort and don't report out an error; the lock isn't needed
+  // at this point since we're just making a best effort to remove old
+  // shard objects
+ ret = store->clean_bucket_index(bucket_info, bucket_info.num_shards);
if (ret < 0) {
- unlock_bucket();
- return ret;
+ lderr(store->ctx()) << "Error: " << __func__ <<
+ " failed to clean up old shards; " <<
+ "RGWRados::clean_bucket_index returned " << ret << dendl;
}
- unlock_bucket();
+ ret = rgw_bucket_instance_remove_entry(store,
+ bucket_info.bucket.get_key(),
+ nullptr);
+ if (ret < 0) {
+ lderr(store->ctx()) << "Error: " << __func__ <<
+ " failed to clean old bucket info object \"" <<
+ bucket_info.bucket.get_key() <<
+ "\"created after successful resharding with error " << ret << dendl;
+ }
return 0;
-}
+
+error_out:
+
+ reshard_lock.unlock();
+
+ // since the real problem is the issue that led to this error code
+ // path, we won't touch ret and instead use another variable to
+  // temporarily hold error codes
+ int ret2 = store->clean_bucket_index(new_bucket_info,
+ new_bucket_info.num_shards);
+ if (ret2 < 0) {
+ lderr(store->ctx()) << "Error: " << __func__ <<
+ " failed to clean up shards from failed incomplete resharding; " <<
+ "RGWRados::clean_bucket_index returned " << ret2 << dendl;
+ }
+
+ ret2 = rgw_bucket_instance_remove_entry(store,
+ new_bucket_info.bucket.get_key(),
+ nullptr);
+ if (ret2 < 0) {
+ lderr(store->ctx()) << "Error: " << __func__ <<
+ " failed to clean bucket info object \"" <<
+ new_bucket_info.bucket.get_key() <<
+ "\"created during incomplete resharding with error " << ret2 << dendl;
+ }
+
+ return ret;
+} // execute
RGWReshard::RGWReshard(RGWRados* _store, bool _verbose, ostream *_out,
- Formatter *_formatter) : store(_store), instance_lock(bucket_instance_lock_name),
- verbose(_verbose), out(_out), formatter(_formatter)
+ Formatter *_formatter) :
+ store(_store), instance_lock(bucket_instance_lock_name),
+ verbose(_verbose), out(_out), formatter(_formatter)
{
num_logshards = store->ctx()->_conf->rgw_reshard_num_logs;
}
-string RGWReshard::get_logshard_key(const string& tenant, const string& bucket_name)
+string RGWReshard::get_logshard_key(const string& tenant,
+ const string& bucket_name)
{
return tenant + ":" + bucket_name;
}
return 0;
}
-int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
+int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs,
+ string *new_bucket_id,
+ const RGWBucketInfo& bucket_info)
{
int ret = 0;
cls_rgw_bucket_instance_entry entry;
- for (int i=0; i < num_retries;i++) {
+ for (int i=0; i < num_retries; i++) {
ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry);
if (ret < 0) {
ldout(store->ctx(), 0) << __func__ << " ERROR: failed to get bucket resharding :" <<
return 0;
}
ldout(store->ctx(), 20) << "NOTICE: reshard still in progress; " << (i < num_retries - 1 ? "retrying" : "too many retries") << dendl;
- /* needed to unlock as clear resharding uses the same lock */
if (i == num_retries - 1) {
break;
}
+ // If bucket is erroneously marked as resharding (e.g., crash or
+ // other error) then fix it. If we can take the bucket reshard
+ // lock then it means no other resharding should be taking place,
+ // and we're free to clear the flags.
+ {
+ // since we expect to do this rarely, we'll do our work in a
+ // block and erase our work after each try
+
+ RGWObjectCtx obj_ctx(bs->store);
+ const rgw_bucket& b = bs->bucket;
+ std::string bucket_id = b.get_key();
+ RGWBucketReshardLock reshard_lock(bs->store, bucket_info, true);
+ ret = reshard_lock.lock();
+ if (ret < 0) {
+ ldout(store->ctx(), 20) << __func__ <<
+ " INFO: failed to take reshard lock for bucket " <<
+ bucket_id << "; expected if resharding underway" << dendl;
+ } else {
+ ldout(store->ctx(), 10) << __func__ <<
+ " INFO: was able to take reshard lock for bucket " <<
+ bucket_id << dendl;
+ ret = RGWBucketReshard::clear_resharding(bs->store, bucket_info);
+ if (ret < 0) {
+ reshard_lock.unlock();
+ ldout(store->ctx(), 0) << __func__ <<
+ " ERROR: failed to clear resharding flags for bucket " <<
+ bucket_id << dendl;
+ } else {
+ reshard_lock.unlock();
+ ldout(store->ctx(), 5) << __func__ <<
+ " INFO: apparently successfully cleared resharding flags for "
+ "bucket " << bucket_id << dendl;
+ continue; // if we apparently succeed immediately test again
+ } // if clear resharding succeeded
+ } // if taking of lock succeeded
+ } // block to encapsulate recovery from incomplete reshard
+
ret = do_wait();
if (ret < 0) {
ldout(store->ctx(), 0) << __func__ << " ERROR: bucket is still resharding, please retry" << dendl;
bool truncated = true;
CephContext *cct = store->ctx();
- int max_entries = 1000;
- int max_secs = 60;
-
- rados::cls::lock::Lock l(reshard_lock_name);
-
- utime_t time(max_secs, 0);
- l.set_duration(time);
-
- char cookie_buf[COOKIE_LEN + 1];
- gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
- cookie_buf[COOKIE_LEN] = '\0';
-
- l.set_cookie(cookie_buf);
+ constexpr uint32_t max_entries = 1000;
string logshard_oid;
get_logshard_oid(logshard_num, &logshard_oid);
- int ret = l.lock_exclusive(&store->reshard_pool_ctx, logshard_oid);
+ RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
+
+ int ret = logshard_lock.lock();
if (ret == -EBUSY) { /* already locked by another processor */
- ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " << logshard_oid << dendl;
+ ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " <<
+ logshard_oid << dendl;
return ret;
}
- utime_t lock_start_time = ceph_clock_now();
-
do {
std::list<cls_rgw_reshard_entry> entries;
ret = list(logshard_num, marker, max_entries, entries, &truncated);
if (ret < 0) {
- ldout(cct, 10) << "cannot list all reshards in logshard oid=" << logshard_oid << dendl;
+ ldout(cct, 10) << "cannot list all reshards in logshard oid=" <<
+ logshard_oid << dendl;
continue;
}
- for(auto& entry: entries) {
+ for(auto& entry: entries) { // logshard entries
if(entry.new_instance_id.empty()) {
- ldout(store->ctx(), 20) << __func__ << " resharding " << entry.bucket_name << dendl;
+ ldout(store->ctx(), 20) << __func__ << " resharding " <<
+ entry.bucket_name << dendl;
RGWObjectCtx obj_ctx(store);
rgw_bucket bucket;
RGWBucketInfo bucket_info;
map<string, bufferlist> attrs;
- ret = store->get_bucket_info(obj_ctx, entry.tenant, entry.bucket_name, bucket_info, nullptr,
- &attrs);
+ ret = store->get_bucket_info(obj_ctx, entry.tenant, entry.bucket_name,
+ bucket_info, nullptr, &attrs);
if (ret < 0) {
- ldout(cct, 0) << __func__ << ": Error in get_bucket_info: " << cpp_strerror(-ret) << dendl;
+ ldout(cct, 0) << __func__ << ": Error in get_bucket_info: " <<
+ cpp_strerror(-ret) << dendl;
return -ret;
}
- RGWBucketReshard br(store, bucket_info, attrs);
+ RGWBucketReshard br(store, bucket_info, attrs, nullptr);
Formatter* formatter = new JSONFormatter(false);
auto formatter_ptr = std::unique_ptr<Formatter>(formatter);
- ret = br.execute(entry.new_num_shards, max_entries, true,nullptr, formatter, this);
+ ret = br.execute(entry.new_num_shards, max_entries, true, nullptr,
+ formatter, this);
if (ret < 0) {
- ldout (store->ctx(), 0) << __func__ << "ERROR in reshard_bucket " << entry.bucket_name << ":" <<
+ ldout (store->ctx(), 0) << __func__ <<
+ "ERROR in reshard_bucket " << entry.bucket_name << ":" <<
cpp_strerror(-ret)<< dendl;
return ret;
}
- ldout (store->ctx(), 20) << " removing entry" << entry.bucket_name<< dendl;
+ ldout (store->ctx(), 20) << " removing entry" << entry.bucket_name <<
+ dendl;
ret = remove(entry);
if (ret < 0) {
- ldout(cct, 0)<< __func__ << ":Error removing bucket " << entry.bucket_name << " for resharding queue: "
- << cpp_strerror(-ret) << dendl;
+ ldout(cct, 0)<< __func__ << ":Error removing bucket " <<
+ entry.bucket_name << " for resharding queue: " <<
+ cpp_strerror(-ret) << dendl;
return ret;
}
}
- utime_t now = ceph_clock_now();
-
- if (now > lock_start_time + max_secs / 2) { /* do you need to renew lock? */
- l.set_renew(true);
- ret = l.lock_exclusive(&store->reshard_pool_ctx, logshard_oid);
- if (ret == -EBUSY) { /* already locked by another processor */
- ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " << logshard_oid << dendl;
- return ret;
- }
- lock_start_time = now;
+
+ Clock::time_point now = Clock::now();
+ if (logshard_lock.should_renew(now)) {
+ ret = logshard_lock.renew(now);
+ if (ret < 0) {
+ return ret;
+ }
}
+
entry.get_key(&marker);
}
} while (truncated);
- l.unlock(&store->reshard_pool_ctx, logshard_oid);
+ logshard_lock.unlock();
return 0;
}
#define RGW_RESHARD_H
#include <vector>
+#include <functional>
+
#include "include/rados/librados.hpp"
+#include "common/ceph_time.h"
#include "cls/rgw/cls_rgw_types.h"
#include "cls/lock/cls_lock_client.h"
#include "rgw_bucket.h"
+
class CephContext;
class RGWRados;
+class RGWBucketReshardLock {
+ using Clock = ceph::coarse_mono_clock;
+
+ RGWRados* store;
+ const std::string lock_oid;
+ const bool ephemeral;
+ rados::cls::lock::Lock internal_lock;
+ std::chrono::seconds duration;
+
+ Clock::time_point start_time;
+ Clock::time_point renew_thresh;
+
+ void reset_time(const Clock::time_point& now) {
+ start_time = now;
+ renew_thresh = start_time + duration / 2;
+ }
+
+public:
+ RGWBucketReshardLock(RGWRados* _store,
+ const std::string& reshard_lock_oid,
+ bool _ephemeral);
+ RGWBucketReshardLock(RGWRados* _store,
+ const RGWBucketInfo& bucket_info,
+ bool _ephemeral) :
+ RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
+ {}
+
+ int lock();
+ void unlock();
+ int renew(const Clock::time_point&);
+
+ bool should_renew(const Clock::time_point& now) const {
+ return now >= renew_thresh;
+ }
+}; // class RGWBucketReshardLock
class RGWBucketReshard {
+public:
+
friend class RGWReshard;
+ using Clock = ceph::coarse_mono_clock;
+
+private:
+
RGWRados *store;
RGWBucketInfo bucket_info;
std::map<string, bufferlist> bucket_attrs;
- string reshard_oid;
- rados::cls::lock::Lock reshard_lock;
-
- int lock_bucket();
- void unlock_bucket();
- int set_resharding_status(const string& new_instance_id, int32_t num_shards, cls_rgw_reshard_status status);
- int clear_resharding();
+ RGWBucketReshardLock reshard_lock;
+ RGWBucketReshardLock* outer_reshard_lock;
- int create_new_bucket_instance(int new_num_shards, RGWBucketInfo& new_bucket_info);
+ int create_new_bucket_instance(int new_num_shards,
+ RGWBucketInfo& new_bucket_info);
int do_reshard(int num_shards,
RGWBucketInfo& new_bucket_info,
int max_entries,
ostream *os,
Formatter *formatter);
public:
- RGWBucketReshard(RGWRados *_store, const RGWBucketInfo& _bucket_info,
- const std::map<string, bufferlist>& _bucket_attrs);
+ // pass nullptr for the final parameter if no outer reshard lock to
+ // manage
+ RGWBucketReshard(RGWRados *_store, const RGWBucketInfo& _bucket_info,
+ const std::map<string, bufferlist>& _bucket_attrs,
+ RGWBucketReshardLock* _outer_reshard_lock);
int execute(int num_shards, int max_op_entries,
bool verbose = false, ostream *out = nullptr,
Formatter *formatter = nullptr,
RGWReshard *reshard_log = nullptr);
- int abort();
int get_status(std::list<cls_rgw_bucket_instance_entry> *status);
int cancel();
-};
+ static int clear_resharding(RGWRados* store,
+ const RGWBucketInfo& bucket_info);
+ int clear_resharding() {
+ return clear_resharding(store, bucket_info);
+ }
+ static int clear_index_shard_reshard_status(RGWRados* store,
+ const RGWBucketInfo& bucket_info);
+ int clear_index_shard_reshard_status() {
+ return clear_index_shard_reshard_status(store, bucket_info);
+ }
+ static int set_resharding_status(RGWRados* store,
+ const RGWBucketInfo& bucket_info,
+ const string& new_instance_id,
+ int32_t num_shards,
+ cls_rgw_reshard_status status);
+ int set_resharding_status(const string& new_instance_id,
+ int32_t num_shards,
+ cls_rgw_reshard_status status) {
+ return set_resharding_status(store, bucket_info,
+ new_instance_id, num_shards, status);
+ }
+}; // RGWBucketReshard
class RGWReshard {
+public:
+ using Clock = ceph::coarse_mono_clock;
+
+private:
RGWRados *store;
string lock_name;
rados::cls::lock::Lock instance_lock;
~RGWReshardWait() {
assert(going_down);
}
- int block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id);
+ int block_while_resharding(RGWRados::BucketShard *bs,
+ string *new_bucket_id,
+ const RGWBucketInfo& bucket_info);
void stop() {
Mutex::Locker l(lock);
}
string str = s->info.args.get("max-parts");
- if (!str.empty())
- max_parts = atoi(str.c_str());
+ op_ret = parse_value_and_bound(str, max_parts, 0,
+ g_conf->get_val<uint64_t>("rgw_max_listing_results"),
+ max_parts);
return op_ret;
}
delimiter = s->info.args.get("delimiter");
prefix = s->info.args.get("prefix");
string str = s->info.args.get("max-uploads");
- if (!str.empty())
- max_uploads = atoi(str.c_str());
- else
- max_uploads = default_max;
+ op_ret = parse_value_and_bound(str, max_uploads, 0,
+ g_conf->get_val<uint64_t>("rgw_max_listing_results"),
+ default_max);
+ if (op_ret < 0) {
+ return op_ret;
+ }
string key_marker = s->info.args.get("key-marker");
string upload_id_marker = s->info.args.get("upload-id-marker");
boost::optional<std::string> canonical_headers = \
get_v4_canonical_headers(s->info, signed_hdrs, using_qs);
if (canonical_headers) {
- ldout(s->cct, 10) << "canonical headers format = " << *canonical_headers
- << dendl;
+ using sanitize = rgw::crypt_sanitize::log_content;
+ ldout(s->cct, 10) << "canonical headers format = "
+ << sanitize{*canonical_headers} << dendl;
} else {
throw -EPERM;
}
if (op_ret < 0) {
return op_ret;
}
+ // S3 behavior is to silently cap the max-keys.
+ // Swift behavior is to abort.
if (max > default_max)
return -ERR_PRECONDITION_FAILED;
if (gen_key)
op_state.set_generate_key();
- RGWQuotaInfo bucket_quota;
- RGWQuotaInfo user_quota;
-
- if (s->cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
- bucket_quota.max_objects = s->cct->_conf->rgw_bucket_default_quota_max_objects;
- bucket_quota.enabled = true;
- }
-
- if (s->cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
- bucket_quota.max_size = s->cct->_conf->rgw_bucket_default_quota_max_size;
- bucket_quota.enabled = true;
- }
-
- if (s->cct->_conf->rgw_user_default_quota_max_objects >= 0) {
- user_quota.max_objects = s->cct->_conf->rgw_user_default_quota_max_objects;
- user_quota.enabled = true;
- }
-
- if (s->cct->_conf->rgw_user_default_quota_max_size >= 0) {
- user_quota.max_size = s->cct->_conf->rgw_user_default_quota_max_size;
- user_quota.enabled = true;
- }
-
- if (bucket_quota.enabled) {
- op_state.set_bucket_quota(bucket_quota);
- }
-
- if (user_quota.enabled) {
- op_state.set_user_quota(user_quota);
- }
-
http_ret = RGWUserAdminOp_User::create(store, op_state, flusher);
}
old_quota = &info.bucket_quota;
}
- int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
- int64_t max_size_kb;
RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, "a.max_objects);
- RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb);
- quota.max_size = max_size_kb * 1024;
+ RESTArgs::get_int64(s, "max-size", old_quota->max_size, "a.max_size);
+ int64_t max_size_kb;
+ bool has_max_size_kb = false;
+ RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb);
+ if (has_max_size_kb) {
+ quota.max_size = max_size_kb * 1024;
+ }
RESTArgs::get_bool(s, "enabled", old_quota->enabled, "a.enabled);
}
#include <boost/circular_buffer.hpp>
#include <boost/container/flat_map.hpp>
+#include "include/scope_guard.h"
#include "common/bounded_key_counter.h"
#include "common/errno.h"
#include "rgw_sync_log_trim.h"
const std::string section;
const std::string start_marker;
MetadataListCallback callback;
- void *handle{nullptr};
int _send_request() override;
public:
: RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr),
section(section), start_marker(start_marker), callback(callback)
{}
- ~AsyncMetadataList() override {
- if (handle) {
- mgr->list_keys_complete(handle);
- }
- }
};
int AsyncMetadataList::_send_request()
{
+ void* handle = nullptr;
+ std::list<std::string> keys;
+ bool truncated{false};
+ std::string marker;
+
// start a listing at the given marker
int r = mgr->list_keys_init(section, start_marker, &handle);
- if (r < 0) {
+ if (r == -EINVAL) {
+ // restart with empty marker below
+ } else if (r < 0) {
ldout(cct, 10) << "failed to init metadata listing: "
<< cpp_strerror(r) << dendl;
return r;
- }
- ldout(cct, 20) << "starting metadata listing at " << start_marker << dendl;
-
- std::list<std::string> keys;
- bool truncated{false};
- std::string marker;
-
- do {
- // get the next key and marker
- r = mgr->list_keys_next(handle, 1, keys, &truncated);
- if (r < 0) {
- ldout(cct, 10) << "failed to list metadata: "
- << cpp_strerror(r) << dendl;
- return r;
- }
- marker = mgr->get_marker(handle);
-
- if (!keys.empty()) {
- assert(keys.size() == 1);
- auto& key = keys.front();
- if (!callback(std::move(key), std::move(marker))) {
- return 0;
+ } else {
+ ldout(cct, 20) << "starting metadata listing at " << start_marker << dendl;
+
+ // release the handle when scope exits
+ auto g = make_scope_guard([=] { mgr->list_keys_complete(handle); });
+
+ do {
+ // get the next key and marker
+ r = mgr->list_keys_next(handle, 1, keys, &truncated);
+ if (r < 0) {
+ ldout(cct, 10) << "failed to list metadata: "
+ << cpp_strerror(r) << dendl;
+ return r;
}
- }
- } while (truncated);
+ marker = mgr->get_marker(handle);
+
+ if (!keys.empty()) {
+ ceph_assert(keys.size() == 1);
+ auto& key = keys.front();
+ if (!callback(std::move(key), std::move(marker))) {
+ return 0;
+ }
+ }
+ } while (truncated);
- if (start_marker.empty()) {
- // already listed all keys
- return 0;
+ if (start_marker.empty()) {
+ // already listed all keys
+ return 0;
+ }
}
// restart the listing from the beginning (empty marker)
- mgr->list_keys_complete(handle);
handle = nullptr;
r = mgr->list_keys_init(section, "", &handle);
}
ldout(cct, 20) << "restarting metadata listing" << dendl;
+ // release the handle when scope exits
+ auto g = make_scope_guard([=] { mgr->list_keys_complete(handle); });
do {
// get the next key and marker
r = mgr->list_keys_next(handle, 1, keys, &truncated);
}
};
+static bool is_sys_attr(const std::string& attr_name){
+ static constexpr std::initializer_list<const char*> rgw_sys_attrs = {RGW_ATTR_PG_VER,
+ RGW_ATTR_SOURCE_ZONE,
+ RGW_ATTR_ID_TAG,
+ RGW_ATTR_TEMPURL_KEY1,
+ RGW_ATTR_TEMPURL_KEY2,
+ RGW_ATTR_UNIX1,
+ RGW_ATTR_UNIX_KEY1
+ };
+
+ return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end();
+}
+
struct es_obj_metadata {
CephContext *cct;
ElasticConfigRef es_conf;
for (auto i : attrs) {
const string& attr_name = i.first;
- string name;
bufferlist& val = i.second;
if (attr_name.compare(0, sizeof(RGW_ATTR_PREFIX) - 1, RGW_ATTR_PREFIX) != 0) {
}
if (attr_name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, RGW_ATTR_META_PREFIX) == 0) {
- name = attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1);
- custom_meta[name] = string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0));
+ custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1),
+ string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0)));
continue;
}
- name = attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1);
+ if (attr_name.compare(0, sizeof(RGW_ATTR_CRYPT_PREFIX) -1, RGW_ATTR_CRYPT_PREFIX) == 0) {
+ continue;
+ }
- if (name == "acl") {
+ if (attr_name == RGW_ATTR_ACL) {
try {
auto i = val.begin();
::decode(policy, i);
} catch (buffer::error& err) {
ldout(cct, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl;
+ continue;
}
const RGWAccessControlList& acl = policy.get_acl();
}
}
}
- } else if (name == "x-amz-tagging") {
- auto tags_bl = val.begin();
- ::decode(obj_tags, tags_bl);
- } else if (name == "compression") {
+ } else if (attr_name == RGW_ATTR_TAGS) {
+ try {
+ auto tags_bl = val.begin();
+ ::decode(obj_tags, tags_bl);
+ } catch (buffer::error& err) {
+ ldout(cct,0) << "ERROR: failed to decode obj tags for "
+ << bucket_info.bucket << "/" << key << dendl;
+ continue;
+ }
+ } else if (attr_name == RGW_ATTR_COMPRESSION) {
RGWCompressionInfo cs_info;
- auto vals_bl = val.begin();
- decode(cs_info, vals_bl);
- out_attrs[name] = cs_info.compression_type;
+ try {
+ auto vals_bl = val.begin();
+ ::decode(cs_info, vals_bl);
+ } catch (buffer::error& err) {
+ ldout(cct,0) << "ERROR: failed to decode compression attr for "
+ << bucket_info.bucket << "/" << key << dendl;
+ continue;
+ }
+ out_attrs.emplace("compression",std::move(cs_info.compression_type));
} else {
- if (name != "pg_ver" &&
- name != "source_zone" &&
- name != "idtag") {
- out_attrs[name] = string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0));
+ if (!is_sys_attr(attr_name)) {
+ out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1),
+ std::string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0)));
}
}
}
#include "rgw_common.h"
#include "rgw_bucket.h"
+#include "rgw_quota.h"
#define dout_subsys ceph_subsys_rgw
if (op_state.has_bucket_quota()) {
user_info.bucket_quota = op_state.get_bucket_quota();
} else {
- if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
- user_info.bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
- user_info.bucket_quota.enabled = true;
- }
- if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
- user_info.bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
- user_info.bucket_quota.enabled = true;
- }
+ rgw_apply_default_bucket_quota(user_info.bucket_quota, cct);
}
if (op_state.temp_url_key_specified) {
if (op_state.has_user_quota()) {
user_info.user_quota = op_state.get_user_quota();
} else {
- if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
- user_info.user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
- user_info.user_quota.enabled = true;
- }
- if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
- user_info.user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
- user_info.user_quota.enabled = true;
- }
+ rgw_apply_default_user_quota(user_info.user_quota, cct);
}
// update the request
-i mapfn --move name --loc type name ...
move the given item to specified location
-i mapfn --reweight recalculate all bucket weights
+ -i mapfn --rebuild-class-roots
+ rebuild the per-class shadow trees (normally a no-op)
-i mapfn --create-simple-rule name root type mode
create crush rule <name> to start from <root>,
replicate across buckets of type <type>, using
export select data generated during testing routine
to CSV files for off-line post-processing
use --help-output for more information
+ --reclassify transform legacy CRUSH map buckets and rules
+ by adding classes
+ --reclassify-bucket <bucket-match> <class> <default-parent>
+ --reclassify-root <bucket-name> <class>
+ --set-subtree-class <bucket-name> <class>
+ set class for all items beneath bucket-name
+ --compare <otherfile> compare two maps using --test parameters
Options for the output stage
--- /dev/null
+ $ crushtool -i $TESTDIR/crush-classes/a --set-subtree-class default hdd --reclassify --reclassify-bucket %-ssd ssd default --reclassify-bucket ssd ssd default --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 1
+ renumbering bucket -1 -> -5
+ renumbering bucket -4 -> -6
+ renumbering bucket -3 -> -7
+ renumbering bucket -2 -> -8
+ classify_bucket %-ssd as ssd default bucket default (root)
+ created new class ssd as 2
+ match %-ssd to ttipod001-cephosd-2-ssd basename ttipod001-cephosd-2
+ have base -8
+ match %-ssd to ttipod001-cephosd-1-ssd basename ttipod001-cephosd-1
+ have base -7
+ match %-ssd to ttipod001-cephosd-3-ssd basename ttipod001-cephosd-3
+ have base -6
+ classify_bucket ssd as ssd default bucket default (root)
+ new class ssd exists as 2
+ match ssd to ssd basename default
+ have base -5
+ moving items from -24 (ttipod001-cephosd-3-ssd) to -6 (ttipod001-cephosd-3)
+ moving items from -23 (ttipod001-cephosd-1-ssd) to -7 (ttipod001-cephosd-1)
+ moving items from -22 (ttipod001-cephosd-2-ssd) to -8 (ttipod001-cephosd-2)
+ moving items from -21 (ssd) to -5 (default)
+ $ crushtool -i $TESTDIR/crush-classes/a --compare foo
+ rule 0 had 0/10240 mismatched mappings (0)
+ rule 1 had 0/10240 mismatched mappings (0)
+ maps appear equivalent
+
+ $ crushtool -i $TESTDIR/crush-classes/d --set-subtree-class default hdd --reclassify --reclassify-bucket %-ssd ssd default --reclassify-bucket ssd ssd default --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 1
+ renumbering bucket -1 -> -13
+ renumbering bucket -6 -> -14
+ renumbering bucket -5 -> -15
+ renumbering bucket -4 -> -16
+ renumbering bucket -3 -> -17
+ renumbering bucket -2 -> -18
+ classify_bucket %-ssd as ssd default bucket default (root)
+ created new class ssd as 2
+ match %-ssd to node-20-ssd basename node-20
+ have base -18
+ match %-ssd to node-21-ssd basename node-21
+ created base -25
+ match %-ssd to node-22-ssd basename node-22
+ created base -26
+ match %-ssd to node-23-ssd basename node-23
+ created base -27
+ match %-ssd to node-27-ssd basename node-27
+ created base -28
+ classify_bucket ssd as ssd default bucket default (root)
+ new class ssd exists as 2
+ match ssd to ssd basename default
+ have base -13
+ moving items from -12 (node-27-ssd) to -28 (node-27)
+ moving items from -11 (node-23-ssd) to -27 (node-23)
+ moving items from -10 (node-22-ssd) to -26 (node-22)
+ moving items from -9 (node-21-ssd) to -25 (node-21)
+ moving items from -8 (node-20-ssd) to -18 (node-20)
+ moving items from -7 (ssd) to -13 (default)
+ $ crushtool -i $TESTDIR/crush-classes/d --compare foo
+ rule 0 had 0/10240 mismatched mappings (0)
+ rule 1 had 0/10240 mismatched mappings (0)
+ maps appear equivalent
+
+ $ crushtool -i $TESTDIR/crush-classes/e --reclassify --reclassify-bucket ceph-osd-ssd-% ssd default --reclassify-bucket ssd-root ssd default --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 1
+ renumbering bucket -1 -> -55
+ renumbering bucket -34 -> -56
+ renumbering bucket -20 -> -57
+ renumbering bucket -14 -> -58
+ renumbering bucket -15 -> -59
+ renumbering bucket -16 -> -60
+ renumbering bucket -52 -> -61
+ renumbering bucket -46 -> -62
+ renumbering bucket -40 -> -63
+ classify_bucket ceph-osd-ssd-% as ssd default bucket default (root)
+ new class ssd exists as 0
+ match ceph-osd-ssd-% to ceph-osd-ssd-node4 basename node4
+ have base -57
+ match ceph-osd-ssd-% to ceph-osd-ssd-node3 basename node3
+ have base -58
+ match ceph-osd-ssd-% to ceph-osd-ssd-node1 basename node1
+ have base -60
+ match ceph-osd-ssd-% to ceph-osd-ssd-node2 basename node2
+ have base -59
+ match ceph-osd-ssd-% to ceph-osd-ssd-node5 basename node5
+ have base -56
+ match ceph-osd-ssd-% to ceph-osd-ssd-node6 basename node6
+ have base -63
+ match ceph-osd-ssd-% to ceph-osd-ssd-node7 basename node7
+ have base -62
+ match ceph-osd-ssd-% to ceph-osd-ssd-node8 basename node8
+ have base -61
+ classify_bucket ssd-root as ssd default bucket default (root)
+ new class ssd exists as 0
+ match ssd-root to ssd-root basename default
+ have base -55
+ moving items from -49 (ceph-osd-ssd-node8) to -61 (node8)
+ moving items from -43 (ceph-osd-ssd-node7) to -62 (node7)
+ moving items from -37 (ceph-osd-ssd-node6) to -63 (node6)
+ moving items from -31 (ceph-osd-ssd-node5) to -56 (node5)
+ moving items from -18 (ssd-root) to -55 (default)
+ moving items from -9 (ceph-osd-ssd-node2) to -59 (node2)
+ moving items from -7 (ceph-osd-ssd-node1) to -60 (node1)
+ moving items from -5 (ceph-osd-ssd-node3) to -58 (node3)
+ moving items from -3 (ceph-osd-ssd-node4) to -57 (node4)
+
+this one has weird node weights, so *lots* of mappings change...
+
+ $ crushtool -i $TESTDIR/crush-classes/e --compare foo
+ rule 0 had 6540/10240 mismatched mappings (0.638672)
+ rule 1 had 8417/10240 mismatched mappings (0.821973)
+ warning: maps are NOT equivalent
+ [1]
+
+ $ crushtool -i $TESTDIR/crush-classes/c --reclassify --reclassify-bucket %-SSD ssd default --reclassify-bucket ssd ssd default --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 0
+ renumbering bucket -1 -> -55
+ renumbering bucket -9 -> -56
+ renumbering bucket -8 -> -57
+ renumbering bucket -7 -> -58
+ renumbering bucket -6 -> -59
+ renumbering bucket -5 -> -60
+ renumbering bucket -4 -> -61
+ renumbering bucket -3 -> -62
+ renumbering bucket -2 -> -63
+ classify_bucket %-SSD as ssd default bucket default (root)
+ created new class ssd as 2
+ match %-SSD to Ceph-Stor1-SSD basename Ceph-Stor1
+ have base -63
+ match %-SSD to Ceph-Stor2-SSD basename Ceph-Stor2
+ have base -62
+ match %-SSD to Ceph-Stor3-SSD basename Ceph-Stor3
+ have base -61
+ match %-SSD to Ceph-Stor4-SSD basename Ceph-Stor4
+ have base -60
+ match %-SSD to Ceph-Stor5-SSD basename Ceph-Stor5
+ have base -59
+ match %-SSD to Ceph-Stor6-SSD basename Ceph-Stor6
+ have base -58
+ match %-SSD to Ceph-Stor7-SSD basename Ceph-Stor7
+ have base -57
+ match %-SSD to Ceph-Stor8-SSD basename Ceph-Stor8
+ have base -56
+ classify_bucket ssd as ssd default bucket default (root)
+ new class ssd exists as 2
+ match ssd to ssd basename default
+ have base -55
+ moving items from -18 (ssd) to -55 (default)
+ moving items from -17 (Ceph-Stor8-SSD) to -56 (Ceph-Stor8)
+ moving items from -16 (Ceph-Stor7-SSD) to -57 (Ceph-Stor7)
+ moving items from -15 (Ceph-Stor6-SSD) to -58 (Ceph-Stor6)
+ moving items from -14 (Ceph-Stor5-SSD) to -59 (Ceph-Stor5)
+ moving items from -13 (Ceph-Stor4-SSD) to -60 (Ceph-Stor4)
+ moving items from -12 (Ceph-Stor3-SSD) to -61 (Ceph-Stor3)
+ moving items from -11 (Ceph-Stor2-SSD) to -62 (Ceph-Stor2)
+ moving items from -10 (Ceph-Stor1-SSD) to -63 (Ceph-Stor1)
+
+wonky crush weights on Ceph-Stor1, so a small number of mappings change
+because the new map has a strictly summing hierarchy.
+
+ $ crushtool -i $TESTDIR/crush-classes/c --compare foo
+ rule 0 had 158/10240 mismatched mappings (0.0154297)
+ rule 1 had 62/5120 mismatched mappings (0.0121094)
+ rule 2 had 0/10240 mismatched mappings (0)
+ warning: maps are NOT equivalent
+ [1]
+
+ $ crushtool -i $TESTDIR/crush-classes/beesly --set-subtree-class 0513-R-0060 hdd --set-subtree-class 0513-R-0050 hdd --reclassify --reclassify-root 0513-R-0050 hdd --reclassify-root 0513-R-0060 hdd -o foo
+ classify_root 0513-R-0050 (-2) as hdd
+ new class hdd exists as 0
+ renumbering bucket -2 -> -131
+ renumbering bucket -14 -> -132
+ renumbering bucket -34 -> -133
+ renumbering bucket -33 -> -134
+ renumbering bucket -30 -> -135
+ renumbering bucket -26 -> -136
+ renumbering bucket -22 -> -137
+ renumbering bucket -18 -> -138
+ renumbering bucket -13 -> -139
+ renumbering bucket -9 -> -140
+ renumbering bucket -12 -> -141
+ renumbering bucket -11 -> -142
+ renumbering bucket -32 -> -143
+ renumbering bucket -31 -> -144
+ renumbering bucket -10 -> -145
+ renumbering bucket -8 -> -146
+ renumbering bucket -6 -> -147
+ renumbering bucket -28 -> -148
+ renumbering bucket -27 -> -149
+ renumbering bucket -21 -> -150
+ renumbering bucket -20 -> -151
+ renumbering bucket -19 -> -152
+ renumbering bucket -7 -> -153
+ renumbering bucket -5 -> -154
+ renumbering bucket -4 -> -155
+ renumbering bucket -25 -> -156
+ renumbering bucket -24 -> -157
+ renumbering bucket -23 -> -158
+ renumbering bucket -17 -> -159
+ renumbering bucket -16 -> -160
+ renumbering bucket -15 -> -161
+ renumbering bucket -3 -> -162
+ renumbering bucket -72 -> -163
+ renumbering bucket -98 -> -164
+ renumbering bucket -97 -> -165
+ renumbering bucket -96 -> -166
+ renumbering bucket -95 -> -167
+ renumbering bucket -94 -> -168
+ renumbering bucket -93 -> -169
+ renumbering bucket -68 -> -170
+ classify_root 0513-R-0060 (-65) as hdd
+ new class hdd exists as 0
+ renumbering bucket -65 -> -35
+ renumbering bucket -76 -> -36
+ renumbering bucket -78 -> -37
+ renumbering bucket -87 -> -38
+ renumbering bucket -82 -> -39
+ renumbering bucket -81 -> -40
+ renumbering bucket -77 -> -41
+ renumbering bucket -75 -> -42
+ renumbering bucket -89 -> -43
+ renumbering bucket -85 -> -44
+ renumbering bucket -84 -> -45
+ renumbering bucket -74 -> -46
+ renumbering bucket -71 -> -47
+ renumbering bucket -80 -> -48
+ renumbering bucket -91 -> -49
+ renumbering bucket -90 -> -50
+ renumbering bucket -88 -> -51
+ renumbering bucket -79 -> -52
+ renumbering bucket -70 -> -53
+ renumbering bucket -86 -> -54
+ renumbering bucket -83 -> -55
+ renumbering bucket -73 -> -56
+ renumbering bucket -69 -> -57
+ $ crushtool -i $TESTDIR/crush-classes/beesly --compare foo
+ rule 0 had 0/10240 mismatched mappings (0)
+ rule 1 had 0/10240 mismatched mappings (0)
+ rule 2 had 0/10240 mismatched mappings (0)
+ rule 4 had 0/10240 mismatched mappings (0)
+ maps appear equivalent
+
+ $ crushtool -i $TESTDIR/crush-classes/flax --reclassify --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 0
+ renumbering bucket -1 -> -5
+ renumbering bucket -12 -> -7
+ renumbering bucket -9 -> -8
+ renumbering bucket -6 -> -10
+ renumbering bucket -4 -> -11
+ renumbering bucket -3 -> -13
+ renumbering bucket -2 -> -14
+ $ crushtool -i $TESTDIR/crush-classes/flax --compare foo
+ rule 0 had 0/10240 mismatched mappings (0)
+ maps appear equivalent
+
+ $ crushtool -i $TESTDIR/crush-classes/gabe --reclassify --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 0
+ rule 3 includes take on root default class 0
+ failed to reclassify map
+ [1]
+
+above fails because the ec-rack-by-2-hdd rule also has a 'take default class hdd' step.
+
+below is an adjusted version of the same cluster's map
+
+ $ crushtool -i $TESTDIR/crush-classes/gabe2 --reclassify --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 0
+ renumbering bucket -1 -> -178
+ renumbering bucket -4 -> -179
+ renumbering bucket -25 -> -180
+ renumbering bucket -16 -> -181
+ renumbering bucket -21 -> -182
+ renumbering bucket -19 -> -183
+ renumbering bucket -15 -> -184
+ renumbering bucket -7 -> -185
+ renumbering bucket -47 -> -186
+ renumbering bucket -18 -> -187
+ renumbering bucket -8 -> -188
+ renumbering bucket -6 -> -189
+ renumbering bucket -12 -> -190
+ renumbering bucket -23 -> -191
+ renumbering bucket -22 -> -192
+ renumbering bucket -20 -> -193
+ renumbering bucket -11 -> -194
+ renumbering bucket -10 -> -195
+ renumbering bucket -17 -> -196
+ renumbering bucket -13 -> -197
+ renumbering bucket -9 -> -198
+ renumbering bucket -3 -> -199
+ renumbering bucket -14 -> -200
+ renumbering bucket -5 -> -201
+ renumbering bucket -2 -> -202
+ $ crushtool -i $TESTDIR/crush-classes/gabe2 --compare foo
+ rule 0 had 627/10240 mismatched mappings (0.0612305)
+ rule 1 had 422/6144 mismatched mappings (0.0686849)
+ warning: maps are NOT equivalent
+ [1]
+
+
+
+ $ crushtool -i $TESTDIR/crush-classes/b --reclassify --reclassify-bucket %-hdd hdd default --reclassify-bucket %-ssd ssd default --reclassify-bucket ssd ssd default --reclassify-bucket hdd hdd default -o foo
+ classify_bucket %-hdd as hdd default bucket default (root)
+ new class hdd exists as 0
+ match %-hdd to berta-hdd basename berta
+ have base -37
+ match %-hdd to oelgard-hdd basename oelgard
+ have base -36
+ match %-hdd to leonhard-hdd basename leonhard
+ have base -33
+ match %-hdd to gottlieb-hdd basename gottlieb
+ have base -30
+ match %-hdd to hieronymus-hdd basename hieronymus
+ have base -31
+ match %-hdd to uhu-hdd basename uhu
+ have base -34
+ match %-hdd to euphrosyne-hdd basename euphrosyne
+ have base -35
+ match %-hdd to frauenhaus-hdd basename frauenhaus
+ created base -145
+ match %-hdd to herrenhaus-hdd basename herrenhaus
+ created base -146
+ match %-hdd to zoo-hdd basename zoo
+ created base -147
+ match %-hdd to borkenkaefer-hdd basename borkenkaefer
+ have base -4
+ match %-hdd to hirsch-hdd basename hirsch
+ have base -41
+ match %-hdd to cassowary-hdd basename cassowary
+ created base -148
+ match %-hdd to fuchs-hdd basename fuchs
+ created base -149
+ match %-hdd to analia-hdd basename analia
+ created base -150
+ match %-hdd to gundula-hdd basename gundula
+ created base -151
+ match %-hdd to achim-hdd basename achim
+ created base -152
+ match %-hdd to hugo-hdd basename hugo
+ created base -153
+ match %-hdd to carl-hdd basename carl
+ have base -32
+ classify_bucket %-ssd as ssd default bucket default (root)
+ new class ssd exists as 1
+ match %-ssd to frauenhaus-ssd basename frauenhaus
+ already creating base -145
+ match %-ssd to herrenhaus-ssd basename herrenhaus
+ already creating base -146
+ match %-ssd to zoo-ssd basename zoo
+ already creating base -147
+ match %-ssd to berta-ssd basename berta
+ have base -37
+ match %-ssd to euphrosyne-ssd basename euphrosyne
+ have base -35
+ match %-ssd to oelgard-ssd basename oelgard
+ have base -36
+ match %-ssd to leonhard-ssd basename leonhard
+ have base -33
+ match %-ssd to hieronymus-ssd basename hieronymus
+ have base -31
+ match %-ssd to gottlieb-ssd basename gottlieb
+ have base -30
+ match %-ssd to uhu-ssd basename uhu
+ have base -34
+ match %-ssd to borkenkaefer-ssd basename borkenkaefer
+ have base -4
+ match %-ssd to hirsch-ssd basename hirsch
+ have base -41
+ match %-ssd to phaidon-ssd basename phaidon
+ created base -154
+ match %-ssd to glykera-ssd basename glykera
+ created base -155
+ match %-ssd to bonobo-ssd basename bonobo
+ created base -156
+ classify_bucket hdd as hdd default bucket default (root)
+ new class hdd exists as 0
+ match hdd to hdd basename default
+ have base -1
+ classify_bucket ssd as ssd default bucket default (root)
+ new class ssd exists as 1
+ match ssd to ssd basename default
+ have base -1
+ moving items from -124 (bonobo-ssd) to -156 (bonobo)
+ moving items from -123 (glykera-ssd) to -155 (glykera)
+ moving items from -122 (phaidon-ssd) to -154 (phaidon)
+ moving items from -121 (carl-hdd) to -32 (carl)
+ moving items from -120 (hugo-hdd) to -153 (hugo)
+ moving items from -119 (achim-hdd) to -152 (achim)
+ moving items from -118 (gundula-hdd) to -151 (gundula)
+ moving items from -117 (analia-hdd) to -150 (analia)
+ moving items from -116 (fuchs-hdd) to -149 (fuchs)
+ moving items from -115 (cassowary-hdd) to -148 (cassowary)
+ moving items from -39 (hirsch-ssd) to -41 (hirsch)
+ moving items from -38 (hirsch-hdd) to -41 (hirsch)
+ moving items from -29 (borkenkaefer-ssd) to -4 (borkenkaefer)
+ moving items from -28 (hdd) to -1 (default)
+ moving items from -27 (ssd) to -1 (default)
+ moving items from -26 (uhu-ssd) to -34 (uhu)
+ moving items from -25 (gottlieb-ssd) to -30 (gottlieb)
+ moving items from -24 (hieronymus-ssd) to -31 (hieronymus)
+ moving items from -23 (leonhard-ssd) to -33 (leonhard)
+ moving items from -22 (borkenkaefer-hdd) to -4 (borkenkaefer)
+ moving items from -21 (oelgard-ssd) to -36 (oelgard)
+ moving items from -20 (euphrosyne-ssd) to -35 (euphrosyne)
+ moving items from -19 (berta-ssd) to -37 (berta)
+ moving items from -17 (zoo-ssd) to -147 (zoo)
+ moving items from -16 (herrenhaus-ssd) to -146 (herrenhaus)
+ moving items from -15 (frauenhaus-ssd) to -145 (frauenhaus)
+ moving items from -12 (zoo-hdd) to -147 (zoo)
+ moving items from -11 (herrenhaus-hdd) to -146 (herrenhaus)
+ moving items from -10 (frauenhaus-hdd) to -145 (frauenhaus)
+ moving items from -9 (euphrosyne-hdd) to -35 (euphrosyne)
+ moving items from -8 (uhu-hdd) to -34 (uhu)
+ moving items from -7 (hieronymus-hdd) to -31 (hieronymus)
+ moving items from -6 (gottlieb-hdd) to -30 (gottlieb)
+ moving items from -5 (leonhard-hdd) to -33 (leonhard)
+ moving items from -3 (oelgard-hdd) to -36 (oelgard)
+ moving items from -2 (berta-hdd) to -37 (berta)
+ new bucket -156 missing parent, adding at {root=default}
+ new bucket -155 missing parent, adding at {root=default}
+ new bucket -154 missing parent, adding at {root=default}
+
+ $ crushtool -i $TESTDIR/crush-classes/b --compare foo
+ rule 0 had 0/3072 mismatched mappings (0)
+ rule 1 had 0/4096 mismatched mappings (0)
+ maps appear equivalent
+
+ $ crushtool -i $TESTDIR/crush-classes/f --reclassify --reclassify-root default hdd -o foo
+ classify_root default (-1) as hdd
+ new class hdd exists as 0
+ renumbering bucket -1 -> -178
+ renumbering bucket -4 -> -179
+ renumbering bucket -25 -> -180
+ renumbering bucket -16 -> -181
+ renumbering bucket -21 -> -182
+ renumbering bucket -19 -> -183
+ renumbering bucket -15 -> -184
+ renumbering bucket -7 -> -185
+ renumbering bucket -47 -> -186
+ renumbering bucket -18 -> -187
+ renumbering bucket -8 -> -188
+ renumbering bucket -6 -> -189
+ renumbering bucket -12 -> -190
+ renumbering bucket -23 -> -191
+ renumbering bucket -22 -> -192
+ renumbering bucket -20 -> -193
+ renumbering bucket -11 -> -194
+ renumbering bucket -10 -> -195
+ renumbering bucket -17 -> -196
+ renumbering bucket -13 -> -197
+ renumbering bucket -9 -> -198
+ renumbering bucket -3 -> -199
+ renumbering bucket -14 -> -200
+ renumbering bucket -5 -> -201
+ renumbering bucket -2 -> -202
+
+We expect some mismatches below because there are some ssd-labeled nodes under
+default that we aren't changing the class on.
+
+ $ crushtool -i $TESTDIR/crush-classes/f --compare foo
+ rule 0 had 627/10240 mismatched mappings (0.0612305)
+ rule 1 had 422/6144 mismatched mappings (0.0686849)
+ warning: maps are NOT equivalent
+ [1]
+
+ $ crushtool -i $TESTDIR/crush-classes/g --reclassify --reclassify-bucket sata-% hdd-sata default --reclassify-bucket sas-% hdd-sas default --reclassify-bucket sas hdd-sas default --reclassify-bucket sata hdd-sata default -o foo
+ classify_bucket sas as hdd-sas default bucket default (root)
+ created new class hdd-sas as 1
+ match sas to sas basename default
+ have base -1
+ classify_bucket sas-% as hdd-sas default bucket default (root)
+ new class hdd-sas exists as 1
+ match sas-% to sas-osd01 basename osd01
+ created base -73
+ match sas-% to sas-osd02 basename osd02
+ created base -74
+ match sas-% to sas-osd03 basename osd03
+ created base -75
+ match sas-% to sas-osd04 basename osd04
+ created base -76
+ match sas-% to sas-osd05 basename osd05
+ created base -77
+ match sas-% to sas-osd06 basename osd06
+ created base -78
+ match sas-% to sas-osd07 basename osd07
+ created base -79
+ match sas-% to sas-osd08 basename osd08
+ created base -80
+ match sas-% to sas-osd09 basename osd09
+ created base -81
+ match sas-% to sas-rack1 basename rack1
+ created base -82
+ match sas-% to sas-rack2 basename rack2
+ created base -83
+ match sas-% to sas-rack3 basename rack3
+ created base -84
+ classify_bucket sata as hdd-sata default bucket default (root)
+ created new class hdd-sata as 2
+ match sata to sata basename default
+ have base -1
+ classify_bucket sata-% as hdd-sata default bucket default (root)
+ new class hdd-sata exists as 2
+ match sata-% to sata-osd11 basename osd11
+ created base -85
+ match sata-% to sata-osd10 basename osd10
+ created base -86
+ match sata-% to sata-osd14 basename osd14
+ created base -87
+ match sata-% to sata-osd13 basename osd13
+ created base -88
+ match sata-% to sata-osd12 basename osd12
+ created base -89
+ match sata-% to sata-osd15 basename osd15
+ created base -90
+ match sata-% to sata-osd16 basename osd16
+ created base -91
+ match sata-% to sata-osd18 basename osd18
+ created base -92
+ match sata-% to sata-osd19 basename osd19
+ created base -93
+ match sata-% to sata-osd17 basename osd17
+ created base -94
+ match sata-% to sata-osd20 basename osd20
+ created base -95
+ match sata-% to sata-osd21 basename osd21
+ created base -96
+ match sata-% to sata-osd22 basename osd22
+ created base -97
+ match sata-% to sata-osd23 basename osd23
+ created base -98
+ match sata-% to sata-osd24 basename osd24
+ created base -99
+ match sata-% to sata-osd25 basename osd25
+ created base -100
+ match sata-% to sata-osd26 basename osd26
+ created base -101
+ match sata-% to sata-osd27 basename osd27
+ created base -102
+ match sata-% to sata-rack1 basename rack1
+ already creating base -82
+ match sata-% to sata-rack2 basename rack2
+ already creating base -83
+ match sata-% to sata-rack3 basename rack3
+ already creating base -84
+ moving items from -36 (sas) to -1 (default)
+ moving items from -35 (sata) to -1 (default)
+ moving items from -34 (sas-rack3) to -84 (rack3)
+ moving items from -33 (sas-rack2) to -83 (rack2)
+ moving items from -32 (sas-rack1) to -82 (rack1)
+ moving items from -31 (sata-rack3) to -84 (rack3)
+ moving items from -30 (sata-rack2) to -83 (rack2)
+ moving items from -29 (sata-rack1) to -82 (rack1)
+ moving items from -28 (sas-osd09) to -81 (osd09)
+ moving items from -27 (sas-osd08) to -80 (osd08)
+ moving items from -26 (sas-osd07) to -79 (osd07)
+ moving items from -25 (sas-osd06) to -78 (osd06)
+ moving items from -24 (sas-osd05) to -77 (osd05)
+ moving items from -23 (sas-osd04) to -76 (osd04)
+ moving items from -22 (sas-osd03) to -75 (osd03)
+ moving items from -21 (sas-osd02) to -74 (osd02)
+ moving items from -20 (sata-osd27) to -102 (osd27)
+ moving items from -19 (sas-osd01) to -73 (osd01)
+ moving items from -18 (sata-osd26) to -101 (osd26)
+ moving items from -17 (sata-osd25) to -100 (osd25)
+ moving items from -16 (sata-osd24) to -99 (osd24)
+ moving items from -15 (sata-osd23) to -98 (osd23)
+ moving items from -14 (sata-osd22) to -97 (osd22)
+ moving items from -13 (sata-osd21) to -96 (osd21)
+ moving items from -12 (sata-osd20) to -95 (osd20)
+ moving items from -11 (sata-osd17) to -94 (osd17)
+ moving items from -10 (sata-osd19) to -93 (osd19)
+ moving items from -9 (sata-osd18) to -92 (osd18)
+ moving items from -8 (sata-osd16) to -91 (osd16)
+ moving items from -7 (sata-osd15) to -90 (osd15)
+ moving items from -6 (sata-osd12) to -89 (osd12)
+ moving items from -5 (sata-osd13) to -88 (osd13)
+ moving items from -4 (sata-osd14) to -87 (osd14)
+ moving items from -3 (sata-osd10) to -86 (osd10)
+ moving items from -2 (sata-osd11) to -85 (osd11)
+ $ crushtool -i $TESTDIR/crush-classes/g --compare foo
+ rule 0 had 0/10240 mismatched mappings (0)
+ rule 1 had 0/10240 mismatched mappings (0)
+ maps appear equivalent
reshard list list all bucket resharding or scheduled to be reshared
reshard process process of scheduled reshard jobs
reshard cancel cancel resharding a bucket
+ reshard stale-instances list list stale-instances from bucket resharding
+ reshard stale-instances rm cleanup stale-instances from bucket resharding
sync error list list sync error
sync error trim trim sync error
options:
ASSERT_EQ(-EEXIST, l.lock_exclusive(&ioctx, oid));
/* test idempotency */
- l.set_renew(true);
+ l.set_may_renew(true);
ASSERT_EQ(0, l.lock_exclusive(&ioctx, oid));
- l.set_renew(false);
+ l.set_may_renew(false);
/* test second client */
Lock l2(lock_name);
/* check new tag */
string new_tag = "new_tag";
l.set_tag(new_tag);
- l.set_renew(true);
+ l.set_may_renew(true);
ASSERT_EQ(0, l.lock_exclusive(&ioctx, oid));
lock_info(&ioctx, oid, lock_name, lockers, NULL, &new_tag);
ASSERT_EQ(1, (int)lockers.size());
ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
}
+
+TEST(ClsLock, TestRenew) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+ bufferlist bl;
+
+ string oid1 = "foo1";
+ string lock_name1 = "mylock1";
+
+ ASSERT_EQ(0, ioctx.write(oid1, bl, bl.length(), 0));
+
+ Lock l1(lock_name1);
+ utime_t lock_duration1(5, 0);
+ l1.set_duration(lock_duration1);
+
+ ASSERT_EQ(0, l1.lock_exclusive(&ioctx, oid1));
+ l1.set_may_renew(true);
+ sleep(2);
+ ASSERT_EQ(0, l1.lock_exclusive(&ioctx, oid1));
+ sleep(7);
+ ASSERT_EQ(0, l1.lock_exclusive(&ioctx, oid1)) <<
+ "when a cls_lock is set to may_renew, a relock after expiration "
+ "should still work";
+ ASSERT_EQ(0, l1.unlock(&ioctx, oid1));
+
+ // ***********************************************
+
+ string oid2 = "foo2";
+ string lock_name2 = "mylock2";
+
+ ASSERT_EQ(0, ioctx.write(oid2, bl, bl.length(), 0));
+
+ Lock l2(lock_name2);
+ utime_t lock_duration2(5, 0);
+ l2.set_duration(lock_duration2);
+
+ ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid2));
+ l2.set_must_renew(true);
+ sleep(2);
+ ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid2));
+ sleep(7);
+ ASSERT_EQ(-ENOENT, l2.lock_exclusive(&ioctx, oid2)) <<
+ "when a cls_lock is set to must_renew, a relock after expiration "
+ "should fail";
+ ASSERT_EQ(-ENOENT, l2.unlock(&ioctx, oid2));
+
+ // ***********************************************
+
+ string oid3 = "foo3";
+ string lock_name3 = "mylock3";
+
+ ASSERT_EQ(0, ioctx.write(oid3, bl, bl.length(), 0));
+
+ Lock l3(lock_name3);
+ l3.set_duration(utime_t(5, 0));
+ l3.set_must_renew(true);
+
+ ASSERT_EQ(-ENOENT, l3.lock_exclusive(&ioctx, oid3)) <<
+ "unable to create a lock with must_renew";
+
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+TEST(ClsLock, TestExclusiveEphemeralBasic) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+ bufferlist bl;
+
+ string oid1 = "foo1";
+ string oid2 = "foo2";
+ string lock_name1 = "mylock1";
+ string lock_name2 = "mylock2";
+
+ Lock l1(lock_name1);
+ l1.set_duration(utime_t(5, 0));
+
+ uint64_t size;
+ time_t mod_time;
+
+ l1.set_may_renew(true);
+ ASSERT_EQ(0, l1.lock_exclusive_ephemeral(&ioctx, oid1));
+ ASSERT_EQ(0, ioctx.stat(oid1, &size, &mod_time));
+ sleep(2);
+ ASSERT_EQ(0, l1.unlock(&ioctx, oid1));
+ ASSERT_EQ(-ENOENT, ioctx.stat(oid1, &size, &mod_time));
+
+ // ***********************************************
+
+ Lock l2(lock_name2);
+ utime_t lock_duration2(5, 0);
+ l2.set_duration(utime_t(5, 0));
+
+ ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid2));
+ ASSERT_EQ(0, ioctx.stat(oid2, &size, &mod_time));
+ sleep(2);
+ ASSERT_EQ(0, l2.unlock(&ioctx, oid2));
+ ASSERT_EQ(0, ioctx.stat(oid2, &size, &mod_time));
+
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+
+TEST(ClsLock, TestExclusiveEphemeralStealEphemeral) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+ bufferlist bl;
+
+ string oid1 = "foo1";
+ string lock_name1 = "mylock1";
+
+ Lock l1(lock_name1);
+ l1.set_duration(utime_t(3, 0));
+
+ ASSERT_EQ(0, l1.lock_exclusive_ephemeral(&ioctx, oid1));
+ sleep(4);
+
+ // l1 is expired, l2 can take; l2 is also exclusive_ephemeral
+ Lock l2(lock_name1);
+ l2.set_duration(utime_t(3, 0));
+ ASSERT_EQ(0, l2.lock_exclusive_ephemeral(&ioctx, oid1));
+ sleep(1);
+ ASSERT_EQ(0, l2.unlock(&ioctx, oid1));
+
+ // l2 cannot unlock its expired lock
+ ASSERT_EQ(-ENOENT, l1.unlock(&ioctx, oid1));
+
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+
+TEST(ClsLock, TestExclusiveEphemeralStealExclusive) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+ bufferlist bl;
+
+ string oid1 = "foo1";
+ string lock_name1 = "mylock1";
+
+ Lock l1(lock_name1);
+ l1.set_duration(utime_t(3, 0));
+
+ ASSERT_EQ(0, l1.lock_exclusive_ephemeral(&ioctx, oid1));
+ sleep(4);
+
+ // l1 is expired, l2 can take; l2 is exclusive (but not ephemeral)
+ Lock l2(lock_name1);
+ l2.set_duration(utime_t(3, 0));
+ ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid1));
+ sleep(1);
+ ASSERT_EQ(0, l2.unlock(&ioctx, oid1));
+
+ // l2 cannot unlock its expired lock
+ ASSERT_EQ(-ENOENT, l1.unlock(&ioctx, oid1));
+
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
OpMgr mgr;
ObjectWriteOperation *op = mgr.write_op();
- cls_rgw_bucket_init(*op);
+ cls_rgw_bucket_init_index(*op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
uint64_t epoch = 1;
OpMgr mgr;
ObjectWriteOperation *op = mgr.write_op();
- cls_rgw_bucket_init(*op);
+ cls_rgw_bucket_init_index(*op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
uint64_t obj_size = 1024;
OpMgr mgr;
ObjectWriteOperation *op = mgr.write_op();
- cls_rgw_bucket_init(*op);
+ cls_rgw_bucket_init_index(*op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
uint64_t obj_size = 1024;
OpMgr mgr;
ObjectWriteOperation *op = mgr.write_op();
- cls_rgw_bucket_init(*op);
+ cls_rgw_bucket_init_index(*op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
uint64_t total_size = 0;
OpMgr mgr;
ObjectWriteOperation *op = mgr.write_op();
- cls_rgw_bucket_init(*op);
+ cls_rgw_bucket_init_index(*op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
uint64_t epoch = 1;
OpMgr mgr;
ObjectWriteOperation *op = mgr.write_op();
- cls_rgw_bucket_init(*op);
+ cls_rgw_bucket_init_index(*op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
string name;
$<TARGET_OBJECTS:unit-main>
)
add_ceph_unittest(unittest_compression ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_compression)
-target_link_libraries(unittest_compression global)
+target_link_libraries(unittest_compression global gtest)
add_dependencies(unittest_compression ceph_example)
echo "FAILED $failed / $numtests tests."
exit 1
fi
+
+if [ $numtests -eq 0 ]; then
+ echo "FAILED: no tests found to run!"
+ exit 1
+fi
+
echo "passed $numtests tests."
#include "gtest/gtest.h"
#include <errno.h>
#include <fcntl.h>
-#include <semaphore.h>
#include <sstream>
#include <string>
#include <boost/scoped_ptr.hpp>
AioTestData()
: m_cluster(NULL),
m_ioctx(NULL),
- m_init(false),
- m_complete(false),
- m_safe(false)
+ m_init(false)
{
}
if (m_init) {
rados_ioctx_destroy(m_ioctx);
destroy_one_pool(m_pool_name, &m_cluster);
- sem_close(m_sem);
}
}
std::string init()
{
int ret;
- if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
- int err = errno;
- ostringstream oss;
- oss << "sem_open failed: " << cpp_strerror(err);
- return oss.str();
- }
m_pool_name = get_temp_pool_name();
std::string err = create_one_pool(m_pool_name, &m_cluster);
if (!err.empty()) {
- sem_close(m_sem);
ostringstream oss;
oss << "create_one_pool(" << m_pool_name << ") failed: error " << err;
return oss.str();
}
ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx);
if (ret) {
- sem_close(m_sem);
destroy_one_pool(m_pool_name, &m_cluster);
ostringstream oss;
oss << "rados_ioctx_create failed: error " << ret;
return "";
}
- sem_t *m_sem = nullptr;
rados_t m_cluster;
rados_ioctx_t m_ioctx;
std::string m_pool_name;
bool m_init;
- bool m_complete;
- bool m_safe;
};
class AioTestDataPP
{
public:
AioTestDataPP()
- : m_init(false),
- m_complete(false),
- m_safe(false)
+ : m_init(false)
{
}
if (m_init) {
m_ioctx.close();
destroy_one_pool_pp(m_pool_name, m_cluster);
- sem_close(m_sem);
}
}
{
int ret;
- if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
- int err = errno;
- ostringstream oss;
- oss << "sem_open failed: " << cpp_strerror(err);
- return oss.str();
- }
m_pool_name = get_temp_pool_name();
std::string err = create_one_pool_pp(m_pool_name, m_cluster, config);
if (!err.empty()) {
- sem_close(m_sem);
ostringstream oss;
oss << "create_one_pool(" << m_pool_name << ") failed: error " << err;
return oss.str();
}
ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx);
if (ret) {
- sem_close(m_sem);
destroy_one_pool_pp(m_pool_name, m_cluster);
ostringstream oss;
oss << "rados_ioctx_create failed: error " << ret;
return "";
}
- sem_t *m_sem = nullptr;
Rados m_cluster;
IoCtx m_ioctx;
std::string m_pool_name;
bool m_init;
- bool m_complete;
- bool m_safe;
};
-void set_completion_complete(rados_completion_t cb, void *arg)
-{
- AioTestData *test = static_cast<AioTestData*>(arg);
- test->m_complete = true;
- sem_post(test->m_sem);
-}
-
-void set_completion_safe(rados_completion_t cb, void *arg)
-{
- AioTestData *test = static_cast<AioTestData*>(arg);
- test->m_safe = true;
- sem_post(test->m_sem);
-}
-
-void set_completion_completePP(rados_completion_t cb, void *arg)
-{
- AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
- test->m_complete = true;
- sem_post(test->m_sem);
-}
-
-void set_completion_safePP(rados_completion_t cb, void *arg)
-{
- AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
- test->m_safe = true;
- sem_post(test->m_sem);
-}
-
TEST(LibRadosAio, TooBig) {
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(-E2BIG, rados_aio_write(test_data.m_ioctx, "foo",
bufferlist bl;
AioCompletion *aio_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, NULL, NULL);
+ nullptr, NULL, NULL);
ASSERT_EQ(-E2BIG, test_data.m_ioctx.aio_write("foo", aio_completion, bl, UINT_MAX, 0));
ASSERT_EQ(-E2BIG, test_data.m_ioctx.aio_append("foo", aio_completion, bl, UINT_MAX));
// ioctx.aio_write_full no way to overflow bl.length()
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion2, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
rados_aio_release(my_completion);
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
my_completion, bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
delete my_completion;
ASSERT_EQ("", test_data.init());
test_data.m_ioctx.set_namespace("nspace");
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
my_completion, bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
delete my_completion;
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[256];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
rados_read_op_t op2 = rados_create_read_op();
rados_read_op_read(op2, 0, sizeof(buf2), buf2, NULL, NULL);
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
my_completion2, &bl2, sizeof(buf), 0));
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
my_completion2, &bl2, sizeof(buf), 0));
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
std::map<uint64_t, uint64_t> extents;
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completePP, set_completion_safePP);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo",
my_completion2, &extents, &bl2, sizeof(buf), 0));
AioTestData test_data;
rados_completion_t my_completion, my_completion2, my_completion3;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[128];
memset(buf2, 0xdd, sizeof(buf2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2)));
{
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
char buf3[sizeof(buf) + sizeof(buf2)];
memset(buf3, 0, sizeof(buf3));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion3, buf3, sizeof(buf3), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bufferlist bl2;
bl2.append(buf2, sizeof(buf2));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion2,
bl2, sizeof(buf2)));
ASSERT_EQ(0, my_completion2->get_return_value());
bufferlist bl3;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
my_completion3, &bl3, 2 * sizeof(buf), 0));
rados_completion_t my_completion;
AioTestData test_data;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
memset(buf, 0xaa, sizeof(buf));
ASSERT_EQ(0, rados_append(test_data.m_ioctx, "foo", buf, sizeof(buf)));
ASSERT_EQ(0, rados_aio_remove(test_data.m_ioctx, "foo", my_completion));
ASSERT_EQ(0, test_data.m_ioctx.append("foo", bl1, sizeof(buf)));
boost::scoped_ptr<AioCompletion> my_completion
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion.get()));
{
TestAlarm alarm;
ASSERT_EQ(0, rados_append(test_data.m_ioctx, "foo", buf, sizeof(buf)));
// async getxattr
rados_completion_t my_completion;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
ASSERT_EQ(0, rados_aio_getxattr(test_data.m_ioctx, "foo", my_completion, attr1, buf, sizeof(buf)));
{
TestAlarm alarm;
rados_aio_release(my_completion);
// async setxattr
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_setxattr(test_data.m_ioctx, "foo", my_completion2, attr1, attr1_buf, sizeof(attr1_buf)));
{
TestAlarm alarm;
rados_aio_release(my_completion2);
// async getxattr
rados_completion_t my_completion3;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_getxattr(test_data.m_ioctx, "foo", my_completion3, attr1, buf, sizeof(buf)));
{
TestAlarm alarm;
// async getxattr
boost::scoped_ptr<AioCompletion> my_completion
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_getxattr("foo", my_completion.get(), attr1, bl2));
{
TestAlarm alarm;
ASSERT_EQ("", test_data2.init());
boost::scoped_ptr<AioCompletion> my_completion2
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data2, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_setxattr("foo", my_completion2.get(), attr1, bl3));
{
TestAlarm alarm;
ASSERT_EQ("", test_data3.init());
boost::scoped_ptr<AioCompletion> my_completion3
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data3, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_getxattr("foo", my_completion3.get(), attr1, bl4));
{
TestAlarm alarm;
ASSERT_EQ(0, rados_append(test_data.m_ioctx, "foo", buf, sizeof(buf)));
// async setxattr
rados_completion_t my_completion;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
ASSERT_EQ(0, rados_aio_setxattr(test_data.m_ioctx, "foo", my_completion, attr1, attr1_buf, sizeof(attr1_buf)));
{
TestAlarm alarm;
rados_aio_release(my_completion);
// async rmxattr
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_rmxattr(test_data.m_ioctx, "foo", my_completion2, attr1));
{
TestAlarm alarm;
rados_aio_release(my_completion2);
// async getxattr after deletion
rados_completion_t my_completion3;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_getxattr(test_data.m_ioctx, "foo", my_completion3, attr1, buf, sizeof(buf)));
{
TestAlarm alarm;
ASSERT_EQ(0, rados_write(test_data.m_ioctx, "foo_rmxattr", buf2, sizeof(buf2), 0));
// asynx setxattr
rados_completion_t my_completion4;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion4));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion4));
ASSERT_EQ(0, rados_aio_setxattr(test_data.m_ioctx, "foo_rmxattr", my_completion4, attr2, attr2_buf, sizeof(attr2_buf)));
{
TestAlarm alarm;
ASSERT_EQ(0, rados_remove(test_data.m_ioctx, "foo_rmxattr"));
// async rmxattr on non existing object
rados_completion_t my_completion5;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion5));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion5));
ASSERT_EQ(0, rados_aio_rmxattr(test_data.m_ioctx, "foo_rmxattr", my_completion5, attr2));
{
TestAlarm alarm;
bl2.append(attr1_buf, sizeof(attr1_buf));
boost::scoped_ptr<AioCompletion> my_completion
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_setxattr("foo", my_completion.get(), attr1, bl2));
{
TestAlarm alarm;
ASSERT_EQ("", test_data2.init());
boost::scoped_ptr<AioCompletion> my_completion2
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data2, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_rmxattr("foo", my_completion2.get(), attr1));
{
TestAlarm alarm;
ASSERT_EQ("", test_data3.init());
boost::scoped_ptr<AioCompletion> my_completion3
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data3, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
bufferlist bl3;
ASSERT_EQ(0, test_data.m_ioctx.aio_getxattr("foo", my_completion3.get(), attr1, bl3));
{
ASSERT_EQ("", test_data4.init());
boost::scoped_ptr<AioCompletion> my_completion4
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data4, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_setxattr("foo_rmxattr", my_completion4.get(), attr2, bl22));
{
TestAlarm alarm;
ASSERT_EQ("", test_data5.init());
boost::scoped_ptr<AioCompletion> my_completion5
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data5, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_rmxattr("foo_rmxattr", my_completion5.get(), attr2));
{
TestAlarm alarm;
ASSERT_EQ(0, rados_setxattr(test_data.m_ioctx, "foo", attr2, attr2_buf, sizeof(attr2_buf)));
// call async version of getxattrs and wait for completion
rados_completion_t my_completion;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
rados_xattrs_iter_t iter;
ASSERT_EQ(0, rados_aio_getxattrs(test_data.m_ioctx, "foo", my_completion, &iter));
{
// call async version of getxattrs
boost::scoped_ptr<AioCompletion> my_completion
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
std::map<std::string, bufferlist> attrset;
ASSERT_EQ(0, test_data.m_ioctx.aio_getxattrs("foo", my_completion.get(), attrset));
{
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
&bl2, sizeof(buf), 0));
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
}
ASSERT_EQ(0, my_completion->get_return_value());
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
bufferlist bl2;
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0, sizeof(buf));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent",
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
bufferlist bl1;
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xee, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
&bl2, sizeof(buf), 0));
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
rados_completion_t flush_completion;
ASSERT_EQ(0, rados_aio_create_completion(NULL, NULL, NULL, &flush_completion));
char buf[128];
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *flush_completion =
test_data.m_cluster.aio_create_completion(NULL, NULL, NULL);
AioCompletion *my_completion_null = NULL;
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
&bl2, sizeof(buf), 0));
AioTestData test_data;
rados_completion_t my_completion, my_completion2, my_completion3;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[64];
memset(buf2, 0xdd, sizeof(buf2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_write_full(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2)));
{
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
char buf3[sizeof(buf) + sizeof(buf2)];
memset(buf3, 0, sizeof(buf3));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion3, buf3, sizeof(buf3), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bufferlist bl2;
bl2.append(buf2, sizeof(buf2));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_write_full("foo", my_completion2, bl2));
{
ASSERT_EQ(0, my_completion2->get_return_value());
bufferlist bl3;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
&bl3, sizeof(buf), 0));
AioTestData test_data;
rados_completion_t my_completion, my_completion2, my_completion3;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char full[128];
memset(full, 0xcc, sizeof(full));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
char buf[32];
size_t ws_write_len = sizeof(full);
memset(buf, 0xdd, sizeof(buf));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_writesame(test_data.m_ioctx, "foo",
my_completion2, buf, sizeof(buf),
ws_write_len, 0));
ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion3, full, sizeof(full), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char full[128];
bufferlist bl2;
bl2.append(buf, sizeof(buf));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_writesame("foo", my_completion2, bl2,
ws_write_len, 0));
ASSERT_EQ(0, my_completion2->get_return_value());
bufferlist bl3;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
&bl3, sizeof(full), 0));
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
uint64_t psize;
time_t pmtime;
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion2, &psize, &pmtime));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
uint64_t psize;
time_t pmtime;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
&psize, &pmtime));
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
char buf2[64];
memset(buf2, 0xbb, sizeof(buf2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf2, sizeof(buf2), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
uint64_t psize;
time_t pmtime;
rados_completion_t my_completion2;
rados_ioctx_set_namespace(test_data.m_ioctx, "");
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion2, &psize, &pmtime));
{
rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
rados_completion_t my_completion3;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion3, &psize, &pmtime));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
uint64_t psize;
time_t pmtime;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
&psize, &pmtime));
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
uint64_t psize;
time_t pmtime;
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion2, &psize, &pmtime));
{
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
ASSERT_EQ(sizeof(buf), psize);
rados_completion_t my_completion3;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_remove(test_data.m_ioctx, "foo", my_completion3));
{
TestAlarm alarm;
uint64_t psize2;
time_t pmtime2;
rados_completion_t my_completion4;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion4));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion4));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion4, &psize2, &pmtime2));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
uint64_t psize;
time_t pmtime;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
&psize, &pmtime));
uint64_t psize2;
time_t pmtime2;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion3));
{
ASSERT_EQ(0, my_completion3->get_return_value());
AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion4, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion4,
&psize2, &pmtime2));
AioTestData test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
}
- ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
char out[128];
ASSERT_EQ(0, rados_aio_exec(test_data.m_ioctx, "foo", my_completion2,
"hello", "say_hello", NULL, 0, out, sizeof(out)));
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
bufferlist in, out;
ASSERT_EQ(0, test_data.m_ioctx.aio_exec("foo", my_completion2,
AioTestData test_data;
rados_completion_t my_completion, my_completion2, my_completion3;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
char buf2[64];
memset(buf2, 0xdd, sizeof(buf2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), sizeof(buf)));
{
char buf3[(sizeof(buf) + sizeof(buf2)) * 3];
memset(buf3, 0, sizeof(buf3));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion3, buf3, sizeof(buf3), 0));
{
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bufferlist bl2;
bl2.append(buf2, sizeof(buf2));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion2,
bl2, sizeof(buf2), sizeof(buf)));
bufferlist bl3;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
&bl3, (sizeof(buf) + sizeof(buf2) * 3), 0));
ASSERT_EQ("", test_data.init());
ASSERT_EQ(0, rados_lock_exclusive(test_data.m_ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
rados_completion_t my_completion;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_complete, set_completion_safe, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
ASSERT_EQ(0, rados_aio_unlock(test_data.m_ioctx, "foo", "TestLock", "Cookie", my_completion));
{
TestAlarm alarm;
ASSERT_EQ(0, test_data.m_ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
boost::scoped_ptr<AioCompletion> my_completion
(test_data.m_cluster.aio_create_completion
- ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+ (nullptr, nullptr, nullptr));
ASSERT_EQ(0, test_data.m_ioctx.aio_unlock("foo", "TestLock", "Cookie", my_completion.get()));
{
TestAlarm alarm;
AioTestDataEC()
: m_cluster(NULL),
m_ioctx(NULL),
- m_init(false),
- m_complete(false),
- m_safe(false)
+ m_init(false)
{
}
if (m_init) {
rados_ioctx_destroy(m_ioctx);
destroy_one_ec_pool(m_pool_name, &m_cluster);
- sem_close(m_sem);
}
}
std::string init()
{
int ret;
- if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
- int err = errno;
- ostringstream oss;
- oss << "sem_open failed: " << cpp_strerror(err);
- return oss.str();
- }
m_pool_name = get_temp_pool_name();
std::string err = create_one_ec_pool(m_pool_name, &m_cluster);
if (!err.empty()) {
- sem_close(m_sem);
ostringstream oss;
oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err;
return oss.str();
}
ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx);
if (ret) {
- sem_close(m_sem);
destroy_one_ec_pool(m_pool_name, &m_cluster);
ostringstream oss;
oss << "rados_ioctx_create failed: error " << ret;
return "";
}
- sem_t *m_sem;
rados_t m_cluster;
rados_ioctx_t m_ioctx;
std::string m_pool_name;
bool m_init;
- bool m_complete;
- bool m_safe;
};
class AioTestDataECPP
{
public:
AioTestDataECPP()
- : m_init(false),
- m_complete(false),
- m_safe(false)
+ : m_init(false)
{
}
if (m_init) {
m_ioctx.close();
destroy_one_ec_pool_pp(m_pool_name, m_cluster);
- sem_close(m_sem);
}
}
std::string init()
{
int ret;
- if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
- int err = errno;
- ostringstream oss;
- oss << "sem_open failed: " << cpp_strerror(err);
- return oss.str();
- }
m_pool_name = get_temp_pool_name();
std::string err = create_one_ec_pool_pp(m_pool_name, m_cluster);
if (!err.empty()) {
- sem_close(m_sem);
ostringstream oss;
oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err;
return oss.str();
}
ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx);
if (ret) {
- sem_close(m_sem);
destroy_one_ec_pool_pp(m_pool_name, m_cluster);
ostringstream oss;
oss << "rados_ioctx_create failed: error " << ret;
return "";
}
- sem_t *m_sem = nullptr;
Rados m_cluster;
IoCtx m_ioctx;
std::string m_pool_name;
bool m_init;
- bool m_complete;
- bool m_safe;
};
-void set_completion_completeEC(rados_completion_t cb, void *arg)
-{
- AioTestDataEC *test = static_cast<AioTestDataEC*>(arg);
- test->m_complete = true;
- sem_post(test->m_sem);
-}
-
-void set_completion_safeEC(rados_completion_t cb, void *arg)
-{
- AioTestDataEC *test = static_cast<AioTestDataEC*>(arg);
- test->m_safe = true;
- sem_post(test->m_sem);
-}
-
-void set_completion_completeECPP(rados_completion_t cb, void *arg)
-{
- AioTestDataECPP *test = static_cast<AioTestDataECPP*>(arg);
- test->m_complete = true;
- sem_post(test->m_sem);
-}
-
-void set_completion_safeECPP(rados_completion_t cb, void *arg)
-{
- AioTestDataECPP *test = static_cast<AioTestDataECPP*>(arg);
- test->m_safe = true;
- sem_post(test->m_sem);
-}
-
TEST(LibRadosAioEC, SimpleWrite) {
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion2, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
rados_aio_release(my_completion);
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
my_completion, bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
delete my_completion;
ASSERT_EQ("", test_data.init());
test_data.m_ioctx.set_namespace("nspace");
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
my_completion, bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
delete my_completion;
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[256];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
my_completion2, &bl2, sizeof(buf), 0));
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
my_completion2, &bl2, sizeof(buf), 0));
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
map<uint64_t, uint64_t> extents;
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo",
my_completion2, &extents, &bl2, sizeof(buf), 0));
AioTestDataEC test_data;
rados_completion_t my_completion, my_completion2, my_completion3, my_completion4;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
int requires;
ASSERT_EQ(0, rados_ioctx_pool_requires_alignment2(test_data.m_ioctx, &requires));
ASSERT_NE(0, requires);
int hbsize = bsize / 2;
char *buf2 = (char *)new char[hbsize];
memset(buf2, 0xdd, hbsize);
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
my_completion2, buf2, hbsize));
{
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
my_completion3, buf2, hbsize));
{
int tbsize = bsize + hbsize;
char *buf3 = (char *)new char[tbsize];
memset(buf3, 0, tbsize);
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion4));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion4));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion4, buf3, bsize * 3, 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
bool requires;
bufferlist bl2;
bl2.append(buf2, hbsize);
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion2,
bl2, hbsize));
ASSERT_EQ(0, my_completion2->get_return_value());
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion3,
bl2, hbsize));
bufferlist bl3;
AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion4, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
my_completion4, &bl3, bsize * 3, 0));
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
&bl2, sizeof(buf), 0));
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
}
ASSERT_EQ(0, my_completion->get_return_value());
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
bufferlist bl2;
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0, sizeof(buf));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent",
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
bufferlist bl1;
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xee, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
&bl2, sizeof(buf), 0));
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
rados_completion_t flush_completion;
ASSERT_EQ(0, rados_aio_create_completion(NULL, NULL, NULL, &flush_completion));
char buf[128];
char buf2[128];
memset(buf2, 0, sizeof(buf2));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *flush_completion =
test_data.m_cluster.aio_create_completion(NULL, NULL, NULL);
AioCompletion *my_completion_null = NULL;
ASSERT_EQ(0, my_completion->get_return_value());
bufferlist bl2;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
&bl2, sizeof(buf), 0));
AioTestDataEC test_data;
rados_completion_t my_completion, my_completion2, my_completion3;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
char buf2[64];
memset(buf2, 0xdd, sizeof(buf2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_write_full(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2)));
{
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
char buf3[sizeof(buf) + sizeof(buf2)];
memset(buf3, 0, sizeof(buf3));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion3, buf3, sizeof(buf3), 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bufferlist bl2;
bl2.append(buf2, sizeof(buf2));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_write_full("foo", my_completion2, bl2));
{
ASSERT_EQ(0, my_completion2->get_return_value());
bufferlist bl3;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
&bl3, sizeof(buf), 0));
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
uint64_t psize;
time_t pmtime;
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion2, &psize, &pmtime));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
uint64_t psize;
time_t pmtime;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
&psize, &pmtime));
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
char buf2[64];
memset(buf2, 0xbb, sizeof(buf2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf2, sizeof(buf2), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
uint64_t psize;
time_t pmtime;
rados_completion_t my_completion2;
rados_ioctx_set_namespace(test_data.m_ioctx, "");
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion2, &psize, &pmtime));
{
rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
rados_completion_t my_completion3;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion3, &psize, &pmtime));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
uint64_t psize;
time_t pmtime;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
&psize, &pmtime));
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
uint64_t psize;
time_t pmtime;
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion2, &psize, &pmtime));
{
ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
ASSERT_EQ(sizeof(buf), psize);
rados_completion_t my_completion3;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_remove(test_data.m_ioctx, "foo", my_completion3));
{
TestAlarm alarm;
uint64_t psize2;
time_t pmtime2;
rados_completion_t my_completion4;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion4));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion4));
ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
my_completion4, &psize2, &pmtime2));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
uint64_t psize;
time_t pmtime;
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
&psize, &pmtime));
uint64_t psize2;
time_t pmtime2;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion3));
{
ASSERT_EQ(0, my_completion3->get_return_value());
AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion4, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion4,
&psize2, &pmtime2));
AioTestDataEC test_data;
rados_completion_t my_completion;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion, buf, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
}
ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
rados_completion_t my_completion2;
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
char out[128];
ASSERT_EQ(0, rados_aio_exec(test_data.m_ioctx, "foo", my_completion2,
"hello", "say_hello", NULL, 0, out, sizeof(out)));
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bl1, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
+ ASSERT_EQ(0, my_completion->wait_for_complete());
}
ASSERT_EQ(0, my_completion->get_return_value());
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
bufferlist in, out;
ASSERT_EQ(0, test_data.m_ioctx.aio_exec("foo", my_completion2,
AioTestDataEC test_data;
rados_completion_t my_completion, my_completion2, my_completion3;
ASSERT_EQ("", test_data.init());
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion));
char buf[128];
memset(buf, 0xcc, sizeof(buf));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
char buf2[64];
memset(buf2, 0xdd, sizeof(buf2));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion2));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion2));
ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
my_completion2, buf2, sizeof(buf2), sizeof(buf)));
{
char buf3[(sizeof(buf) + sizeof(buf2)) * 3];
memset(buf3, 0, sizeof(buf3));
- ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
- set_completion_completeEC, set_completion_safeEC, &my_completion3));
+ ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+ nullptr, nullptr, &my_completion3));
ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
my_completion3, buf3, sizeof(buf3), 0));
{
AioTestDataECPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char buf[128];
bufferlist bl2;
bl2.append(buf2, sizeof(buf2));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion2,
bl2, sizeof(buf2), sizeof(buf)));
bufferlist bl3;
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion3, my_completion_null);
ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
&bl3, (sizeof(buf) + sizeof(buf2) * 3), 0));
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init({{"objecter_retry_writes_after_first_reply", "true"}}));
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion, nullptr);
char buf[128];
memset(buf, 0xcc, sizeof(buf));
bufferlist bl;
bl.append(buf, sizeof(buf));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_NE(my_completion2, nullptr);
ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion2));
ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
bl, sizeof(buf), 0));
{
TestAlarm alarm;
- sem_wait(test_data.m_sem);
- sem_wait(test_data.m_sem);
my_completion2->wait_for_complete();
my_completion->wait_for_complete();
}
AioTestDataPP test_data;
ASSERT_EQ("", test_data.init());
AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
AioCompletion *my_completion_null = NULL;
ASSERT_NE(my_completion, my_completion_null);
char full[128];
bufferlist cbl;
cbl.append(full, sizeof(full));
AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_EQ(0, test_data.m_ioctx.aio_cmpext("foo", my_completion2, 0, cbl));
{
cbl.clear();
cbl.append(full, sizeof(full));
AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
- (void*)&test_data, set_completion_complete, set_completion_safe);
+ nullptr, nullptr, nullptr);
ASSERT_EQ(0, test_data.m_ioctx.aio_cmpext("foo", my_completion3, 0, cbl));
{
ASSERT_EQ(0, ioctx.lock_shared("foo", "TestLock", "Cookie", "Tag", "", NULL, 0));
}
-TEST_F(LibRadosLock, LockRenew) {
+TEST_F(LibRadosLock, LockMayRenew) {
ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
ASSERT_EQ(-EEXIST, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
- ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+ ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
}
-TEST_F(LibRadosLockPP, LockRenewPP) {
+TEST_F(LibRadosLockPP, LockMayRenewPP) {
ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
ASSERT_EQ(-EEXIST, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
- ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+ ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
}
TEST_F(LibRadosLock, Unlock) {
ASSERT_EQ(0, ioctx.lock_shared("foo", "TestLock", "Cookie", "Tag", "", NULL, 0));
}
-TEST_F(LibRadosLockEC, LockRenew) {
+TEST_F(LibRadosLockEC, LockMayRenew) {
ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
ASSERT_EQ(-EEXIST, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
- ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+ ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
}
-TEST_F(LibRadosLockECPP, LockRenewPP) {
+TEST_F(LibRadosLockECPP, LockMayRenewPP) {
ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
ASSERT_EQ(-EEXIST, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
- ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+ ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
}
TEST_F(LibRadosLockEC, Unlock) {
return ctx->io_ctx_impl->create(ctx->oid, exclusive);
}
+int cls_cxx_remove(cls_method_context_t hctx) {
+ librados::TestClassHandler::MethodContext *ctx =
+ reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
+ return ctx->io_ctx_impl->remove(ctx->oid, ctx->io_ctx_impl->get_snap_context());
+}
+
int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) {
librados::TestClassHandler::MethodContext *ctx =
reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
g_conf->apply_changes(NULL);
}
+TEST_P(StoreTest, SpuriousReadErrorTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ ObjectStore::Sequencer osr("test");
+ int r;
+ auto logger = store->get_perf_counters();
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = store->queue_transaction(&osr, std::move(t), nullptr);
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist test_data;
+ bufferptr ap(0x2000);
+ memset(ap.c_str(), 'a', 0x2000);
+ test_data.append(ap);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, 0x2000, test_data);
+ r = store->queue_transaction(&osr, std::move(t), nullptr);
+ ASSERT_EQ(r, 0);
+ // force cache clear
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ }
+
+ cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl;
+ g_conf->set_val("bluestore_retry_disk_reads", "0");
+ g_conf->set_val("bluestore_debug_inject_csum_err_probability", "1");
+ g_ceph_context->_conf->apply_changes(nullptr);
+ {
+ bufferlist in;
+ r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(-EIO, r);
+ ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u);
+ }
+
+ cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl;
+ g_conf->set_val("bluestore_retry_disk_reads", "255");
+ g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0.8");
+ /**
+ * Probabilistic test: 25 reads, each has an 80% chance of failing with 255 retries
+ * Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18
+ * Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24
+ */
+ g_ceph_context->_conf->apply_changes(nullptr);
+ {
+ for (int i = 0; i < 25; ++i) {
+ bufferlist in;
+ r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(0x2000, r);
+ ASSERT_TRUE(bl_eq(test_data, in));
+ }
+ ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u);
+ }
+
+ // revert
+ g_conf->set_val("bluestore_retry_disk_reads", "3");
+ g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0");
+ g_ceph_context->_conf->apply_changes(nullptr);
+}
+
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
ASSERT_TRUE(parent_0 != parent_1);
}
+ {
+ // cancel stale upmaps
+ osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+ int from = -1;
+ for (int i = 0; i < (int)get_num_osds(); i++) {
+ if (std::find(up.begin(), up.end(), i) == up.end()) {
+ from = i;
+ break;
+ }
+ }
+ ASSERT_TRUE(from >= 0);
+ int to = -1;
+ for (int i = 0; i < (int)get_num_osds(); i++) {
+ if (std::find(up.begin(), up.end(), i) == up.end() && i != from) {
+ to = i;
+ break;
+ }
+ }
+ ASSERT_TRUE(to >= 0);
+ vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+ new_pg_upmap_items.push_back(make_pair(from, to));
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ pending_inc.new_pg_upmap_items[pgid] =
+ mempool::osdmap::vector<pair<int32_t,int32_t>>(
+ new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+ OSDMap nextmap;
+ nextmap.deepish_copy_from(osdmap);
+ nextmap.apply_incremental(pending_inc);
+ ASSERT_TRUE(nextmap.have_pg_upmaps(pgid));
+ OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1);
+ nextmap.clean_pg_upmaps(g_ceph_context, &new_pending_inc);
+ nextmap.apply_incremental(new_pending_inc);
+ ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid));
+ }
+
+ {
+ // https://tracker.ceph.com/issues/37493
+ pg_t ec_pg(0, my_ec_pool);
+ pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
+ OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
+ int from = -1;
+ int to = -1;
+ {
+ // insert a valid pg_upmap_item
+ vector<int> ec_up;
+ int ec_up_primary;
+ osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
+ ASSERT_TRUE(!ec_up.empty());
+ from = *(ec_up.begin());
+ ASSERT_TRUE(from >= 0);
+ for (int i = 0; i < (int)get_num_osds(); i++) {
+ if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
+ to = i;
+ break;
+ }
+ }
+ ASSERT_TRUE(to >= 0);
+ ASSERT_TRUE(from != to);
+ vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+ new_pg_upmap_items.push_back(make_pair(from, to));
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ pending_inc.new_pg_upmap_items[ec_pgid] =
+ mempool::osdmap::vector<pair<int32_t,int32_t>>(
+ new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+ tmpmap.deepish_copy_from(osdmap);
+ tmpmap.apply_incremental(pending_inc);
+ ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+ }
+ {
+ // mark one of the target OSDs of the above pg_upmap_item as down
+ OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+ pending_inc.new_state[to] = CEPH_OSD_UP;
+ tmpmap.apply_incremental(pending_inc);
+ ASSERT_TRUE(!tmpmap.is_up(to));
+ ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+ }
+ {
+ // confirm *maybe_remove_pg_upmaps* won't do anything bad
+ OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+ tmpmap.maybe_remove_pg_upmaps(g_ceph_context, tmpmap, &pending_inc);
+ tmpmap.apply_incremental(pending_inc);
+ ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+ }
+ }
+
+ {
+ // http://tracker.ceph.com/issues/37501
+ pg_t ec_pg(0, my_ec_pool);
+ pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
+ OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
+ int from = -1;
+ int to = -1;
+ {
+ // insert a valid pg_upmap_item
+ vector<int> ec_up;
+ int ec_up_primary;
+ osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
+ ASSERT_TRUE(!ec_up.empty());
+ from = *(ec_up.begin());
+ ASSERT_TRUE(from >= 0);
+ for (int i = 0; i < (int)get_num_osds(); i++) {
+ if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
+ to = i;
+ break;
+ }
+ }
+ ASSERT_TRUE(to >= 0);
+ ASSERT_TRUE(from != to);
+ vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+ new_pg_upmap_items.push_back(make_pair(from, to));
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ pending_inc.new_pg_upmap_items[ec_pgid] =
+ mempool::osdmap::vector<pair<int32_t,int32_t>>(
+ new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+ tmpmap.deepish_copy_from(osdmap);
+ tmpmap.apply_incremental(pending_inc);
+ ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+ }
+ {
+ // mark one of the target OSDs of the above pg_upmap_item as out
+ OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+ pending_inc.new_weight[to] = CEPH_OSD_OUT;
+ tmpmap.apply_incremental(pending_inc);
+ ASSERT_TRUE(tmpmap.is_out(to));
+ ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+ }
+ {
+ // *maybe_remove_pg_upmaps* should be able to remove the above *bad* mapping
+ OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+ OSDMap nextmap;
+ nextmap.deepish_copy_from(tmpmap);
+ nextmap.maybe_remove_pg_upmaps(g_ceph_context, nextmap, &pending_inc);
+ tmpmap.apply_incremental(pending_inc);
+ ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid));
+ }
+ }
+
{
// TEST pg_upmap
{
from abc import ABCMeta, abstractmethod
-from cStringIO import StringIO
+from six import StringIO
+
import json
-from conn import get_gateway_connection
+from .conn import get_gateway_connection
class Cluster:
""" interface to run commands against a distinct ceph cluster """
import sys
import time
import logging
+import errno
try:
from itertools import izip_longest as zip_longest
except ImportError:
from itertools import zip_longest
from itertools import combinations
-from cStringIO import StringIO
+from six import StringIO
import boto
import boto.s3.connection
from nose.plugins.attrib import attr
from nose.plugins.skip import SkipTest
-from .multisite import Zone
+from .multisite import Zone, ZoneGroup
from .conn import get_gateway_connection
# validate the resulting period
zonegroup.period.update(z1, commit=True)
+
+def test_zg_master_zone_delete():
+
+ master_zg = realm.master_zonegroup()
+ master_zone = master_zg.master_zone
+
+ assert(len(master_zg.zones) >= 1)
+ master_cluster = master_zg.zones[0].cluster
+
+ rm_zg = ZoneGroup('remove_zg')
+ rm_zg.create(master_cluster)
+
+ rm_zone = Zone('remove', rm_zg, master_cluster)
+ rm_zone.create(master_cluster)
+ master_zg.period.update(master_zone, commit=True)
+
+
+ rm_zone.delete(master_cluster)
+ # Period update: this should now fail, as the deleted zone was the master
+ # zone of that zonegroup, leaving the period with a dangling master zone
+ _, retcode = master_zg.period.update(master_zone, check_retcode=False)
+ assert(retcode == errno.EINVAL)
+
+ # Proceed to delete the zonegroup as well; the previous period no longer
+ # contains a dangling master_zone, so this must succeed
+ rm_zg.delete(master_cluster)
+ master_zg.period.update(master_zone, commit=True)
+
def test_set_bucket_website():
buckets, zone_bucket = create_bucket_per_zone_in_realm()
for _, bucket in zone_bucket:
EXPECT_TRUE(match_policy("a:b:c", "a:b:c", flag));
EXPECT_FALSE(match_policy("a:b:c", "A:B:C", flag)); // case sensitive
EXPECT_TRUE(match_policy("a:*:e", "a:bcd:e", flag));
- EXPECT_FALSE(match_policy("a:*", "a:b:c", flag)); // cannot span segments
+ EXPECT_TRUE(match_policy("a:*", "a:b:c", flag)); // can span segments
}
TEST(MatchPolicy, ARN)
EXPECT_TRUE(match_policy("a:b:c", "a:b:c", flag));
EXPECT_FALSE(match_policy("a:b:c", "A:B:C", flag)); // case sensitive
EXPECT_TRUE(match_policy("a:*:e", "a:bcd:e", flag));
- EXPECT_FALSE(match_policy("a:*", "a:b:c", flag)); // cannot span segments
+ EXPECT_TRUE(match_policy("a:*", "a:b:c", flag)); // can span segments
}
<< " <output>: [summary|list|binary|json] [--path <path>]\n"
<< "\n"
<< "General options:\n"
- << " --rank=filesystem:mds-rank Journal rank (required if multiple\n"
- << " file systems, default is rank 0 on\n"
- << " the only filesystem otherwise.\n"
+ << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n"
<< "\n"
<< "Special options\n"
<< " --alternate-pool <name> Alternative metadata pool to target\n"
std::vector<const char*>::iterator arg = argv.begin();
std::string rank_str;
- if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
- // Default: act on rank 0. Will give the user an error if they
- // try invoking this way when they have more than one filesystem.
- rank_str = "0";
+ if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
+ derr << "missing mandatory \"--rank\" argument" << dendl;
+ return -EINVAL;
}
- r = role_selector.parse(*fsmap, rank_str);
+ r = role_selector.parse(*fsmap, rank_str, false);
if (r != 0) {
derr << "Couldn't determine MDS rank." << dendl;
return r;
// Execution
// =========
- for (auto role : role_selector.get_roles()) {
+ auto roles = role_selector.get_roles();
+ if (roles.size() > 1) {
+ const std::string &command = argv[0];
+ bool allowed = can_execute_for_all_ranks(mode, command);
+ if (!allowed) {
+ derr << "operation not allowed for all ranks" << dendl;
+ return -EINVAL;
+ }
+
+ all_ranks = true;
+ }
+ for (auto role : roles) {
rank = role.rank;
+ std::vector<const char *> rank_argv(argv);
dout(4) << "Executing for rank " << rank << dendl;
if (mode == std::string("journal")) {
- r = main_journal(argv);
+ r = main_journal(rank_argv);
} else if (mode == std::string("header")) {
- r = main_header(argv);
+ r = main_header(rank_argv);
} else if (mode == std::string("event")) {
- r = main_event(argv);
+ r = main_event(rank_argv);
} else {
derr << "Bad command '" << mode << "'" << dendl;
usage();
}
+std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
+ if (!all_ranks) {
+ return prefix;
+ }
+
+ return prefix + "." + std::to_string(rank);
+}
+
+bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command) {
+ if (mode == "journal" && command == "import") {
+ return false;
+ }
+
+ return true;
+}
+
/**
* Handle arguments for 'journal' mode
*
}
}
+ const std::string dump_path = gen_dump_file_path(output_path);
+
// Execute command
// ===============
JournalScanner js(input, rank, filter);
// Generate output
// ===============
- EventOutput output(js, output_path);
+ EventOutput output(js, dump_path);
int output_result = 0;
if (output_style == "binary") {
output_result = output.binary();
if (import) {
r = dumper.undump(path.c_str(), force);
} else {
- r = dumper.dump(path.c_str());
+ const std::string ex_path = gen_dump_file_path(path);
+ r = dumper.dump(ex_path.c_str());
}
}
// various main_ functions.
mds_rank_t rank;
+ // when set, generate per rank dump file path
+ bool all_ranks = false;
+
// Entry points
int main_journal(std::vector<const char*> &argv);
int main_header(std::vector<const char*> &argv);
bufferlist *out_bl);
int consume_inos(const std::set<inodeno_t> &inos);
+ // generate output file path for dump/export
+ std::string gen_dump_file_path(const std::string &prefix);
+
+ // check if an operation (mode, command) is safe to be
+ // executed on all ranks.
+ bool can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command);
+
public:
void usage();
JournalTool() :
}
}
-int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str)
+int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank)
{
auto colon_pos = str.find(":");
if (colon_pos == std::string::npos) {
// An unqualified rank. Only valid if there is only one
// namespace.
- if (fsmap.filesystem_count() == 1) {
+ if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) {
fscid = fsmap.get_filesystem()->fscid;
return parse_rank(fsmap, str);
} else {
{
public:
const std::vector<mds_role_t> &get_roles() const {return roles;}
- int parse(const FSMap &fsmap, std::string const &str);
+ int parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank=true);
MDSRoleSelector()
: fscid(FS_CLUSTER_ID_NONE)
{}
cout << " -i mapfn --move name --loc type name ...\n"
<< " move the given item to specified location\n";
cout << " -i mapfn --reweight recalculate all bucket weights\n";
+ cout << " -i mapfn --rebuild-class-roots\n";
+ cout << " rebuild the per-class shadow trees (normally a no-op)\n";
cout << " -i mapfn --create-simple-rule name root type mode\n"
<< " create crush rule <name> to start from <root>,\n"
<< " replicate across buckets of type <type>, using\n"
cout << " export select data generated during testing routine\n";
cout << " to CSV files for off-line post-processing\n";
cout << " use --help-output for more information\n";
+ cout << " --reclassify transform legacy CRUSH map buckets and rules\n";
+ cout << " by adding classes\n";
+ cout << " --reclassify-bucket <bucket-match> <class> <default-parent>\n";
+ cout << " --reclassify-root <bucket-name> <class>\n";
+ cout << " --set-subtree-class <bucket-name> <class>\n";
+ cout << " set class for all items beneath bucket-name\n";
+ cout << " --compare <otherfile> compare two maps using --test parameters\n";
cout << "\n";
cout << "Options for the output stage\n";
cout << "\n";
int verbose = 0;
bool unsafe_tunables = false;
+ bool rebuild_class_roots = false;
+
bool reweight = false;
int add_item = -1;
bool add_bucket = false;
int straw_calc_version = -1;
int allowed_bucket_algs = -1;
+ bool reclassify = false;
+ map<string,pair<string,string>> reclassify_bucket; // %suffix or prefix% -> class, default_root
+ map<string,string> reclassify_root; // bucket -> class
+ map<string,string> set_subtree_class; // bucket -> class
+
+ string compare;
+
CrushWrapper crush;
CrushTester tester(crush, cout);
outfn = val;
} else if (ceph_argparse_flag(args, i, "-v", "--verbose", (char*)NULL)) {
verbose += 1;
+ } else if (ceph_argparse_witharg(args, i, &val, "--compare", (char*)NULL)) {
+ compare = val;
+ } else if (ceph_argparse_flag(args, i, "--reclassify", (char*)NULL)) {
+ reclassify = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-bucket",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ string c = *i;
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reclassify_bucket[val] = make_pair(c, *i);
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-root",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reclassify_root[val] = *i;
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, "--set-subtree-class",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ set_subtree_class[val] = *i;
+ i = args.erase(i);
} else if (ceph_argparse_flag(args, i, "--tree", (char*)NULL)) {
tree = true;
} else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) {
adjust = true;
} else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) {
reweight = true;
+ } else if (ceph_argparse_flag(args, i, "--rebuild-class-roots", (char*)NULL)) {
+ rebuild_class_roots = true;
} else if (ceph_argparse_witharg(args, i, &add_item, err, "--add_item", (char*)NULL)) {
if (!err.str().empty()) {
cerr << err.str() << std::endl;
}
}
- if (test && !check && !display && !write_to_file) {
+ if (test && !check && !display && !write_to_file && compare.empty()) {
cerr << "WARNING: no output selected; use --output-csv or --show-X" << std::endl;
}
}
if (!check && !compile && !decompile && !build && !test && !reweight && !adjust && !tree && !dump &&
add_item < 0 && !add_bucket && !move_item && !add_rule && !del_rule && full_location < 0 &&
+ !reclassify && !rebuild_class_roots &&
+ compare.empty() &&
remove_name.empty() && reweight_name.empty()) {
cerr << "no action specified; -h for help" << std::endl;
return EXIT_FAILURE;
crush.reweight(g_ceph_context);
modified = true;
}
+  if (rebuild_class_roots) {
+    // Rebuild the per-class shadow trees; report failure to the user.
+    int r = crush.rebuild_roots_with_classes();
+    if (r < 0) {
+      cerr << "failed to rebuild roots with classes" << std::endl;
+      return EXIT_FAILURE;
+    }
+    modified = true;
+  }
+ for (auto& i : set_subtree_class) {
+ crush.set_subtree_class(i.first, i.second);
+ modified = true;
+ }
+ if (reclassify) {
+ int r = crush.reclassify(
+ g_ceph_context,
+ cout,
+ reclassify_root,
+ reclassify_bucket);
+ if (r < 0) {
+ cerr << "failed to reclassify map" << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
// display ---
if (full_location >= 0) {
}
if (tree) {
- crush.dump_tree(&cout, NULL);
+ crush.dump_tree(&cout, NULL, {}, true);
}
if (dump) {
return EXIT_FAILURE;
}
+ if (compare.size()) {
+ CrushWrapper crush2;
+ bufferlist in;
+ string error;
+ int r = in.read_file(compare.c_str(), &error);
+ if (r < 0) {
+ cerr << me << ": error reading '" << compare << "': "
+ << error << std::endl;
+ return EXIT_FAILURE;
+ }
+ auto p = in.begin();
+ try {
+ crush2.decode(p);
+ } catch(...) {
+ cerr << me << ": unable to decode " << compare << std::endl;
+ return EXIT_FAILURE;
+ }
+ r = tester.compare(crush2);
+ if (r < 0)
+ return EXIT_FAILURE;
+ }
+
// output ---
if (modified) {
crush.finalize();
}
}
}
- catch (const std::runtime_error& e) {
+ catch (const std::exception& e) {
cerr << e.what() << std::endl;
return -1;
}
}
}
}
- catch (const std::runtime_error& e) {
+ catch (const std::exception& e) {
cerr << e.what() << std::endl;
ret = -1;
goto out;
const string & oid = *iter;
if (use_striper) {
if (forcefull) {
- ret = striper.remove(oid, CEPH_OSD_FLAG_FULL_FORCE);
+ ret = striper.remove(oid, (CEPH_OSD_FLAG_FULL_FORCE | CEPH_OSD_FLAG_FULL_TRY));
} else {
ret = striper.remove(oid);
}
} else {
if (forcefull) {
- ret = io_ctx.remove(oid, CEPH_OSD_FLAG_FULL_FORCE);
+ ret = io_ctx.remove(oid, (CEPH_OSD_FLAG_FULL_FORCE | CEPH_OSD_FLAG_FULL_TRY));
} else {
ret = io_ctx.remove(oid);
}
m_update_status_interval = new_interval;
}
- bool restarting = (new_interval == 0 || canceled_task);
if (new_interval >= 0 && is_running_() &&
- start_mirror_image_status_update(false, restarting)) {
+ start_mirror_image_status_update(true, false)) {
m_update_status_task = new FunctionContext(
[this](int r) {
assert(m_threads->timer_lock.is_locked());
if (canceled_task) {
dout(20) << "canceled task" << dendl;
+ // decrement in-flight status update counter for canceled task
finish_mirror_image_status_update();
}
}