-RELEASE=5.1
+RELEASE=5.2
PACKAGE=ceph
-VER=12.2.5
+VER=12.2.7
DEBREL=pve1
SRCDIR=ceph
cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.2.5)
+set(VERSION 12.2.7)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
-pkgver=12.2.5
+pkgver=12.2.7
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
xmlstarlet
yasm
"
-source="ceph-12.2.5.tar.bz2"
+source="ceph-12.2.7.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
-builddir=$srcdir/ceph-12.2.5
+builddir=$srcdir/ceph-12.2.7
build() {
export CEPH_BUILD_VIRTUALENV=$builddir
# main package definition
#################################################################################
Name: ceph
-Version: 12.2.5
+Version: 12.2.7
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
Group: System/Filesystems
%endif
URL: http://ceph.com/
-Source0: http://ceph.com/download/ceph-12.2.5.tar.bz2
+Source0: http://ceph.com/download/ceph-12.2.7.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
# common
#################################################################################
%prep
-%autosetup -p1 -n ceph-12.2.5
+%autosetup -p1 -n ceph-12.2.7
%build
%if 0%{with cephfs_java}
%else
/usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
%endif
+# work around https://tracker.ceph.com/issues/24903
+chown -h ceph:ceph /var/lib/ceph/osd/*/block* 2>&1 > /dev/null || :
%preun osd
%if 0%{?suse_version}
%else
/usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
%endif
+# work around https://tracker.ceph.com/issues/24903
+chown -h ceph:ceph /var/lib/ceph/osd/*/block* 2>&1 > /dev/null || :
%preun osd
%if 0%{?suse_version}
configure)
[ -x /etc/init.d/procps ] && invoke-rc.d procps restart || :
[ -x /sbin/start ] && start ceph-osd-all || :
+ # work around https://tracker.ceph.com/issues/24903
+ chown -h ceph:ceph /var/lib/ceph/osd/*/block* 2>&1 > /dev/null || :
;;
abort-upgrade|abort-remove|abort-deconfigure)
:
+ceph (12.2.7-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Mon, 16 Jul 2018 16:00:29 +0000
+
+ceph (12.2.6-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Mon, 09 Jul 2018 16:18:46 +0000
+
ceph (12.2.5-1) stable; urgency=medium
* New upstream release
#}
<h3><a href="{{ pathto(master_doc) }}">{{ _('Table Of Contents') }}</a></h3>
-{{ toctree(maxdepth=-1) }}
+{{ toctree(maxdepth=-1, includehidden=True) }}
<!-- ugly kludge to make genindex look like it's part of the toc -->
<ul style="margin-top: -10px"><li class="toctree-l1"><a class="reference internal" href="{{ pathto('genindex') }}">Index</a></li></ul>
---------
Starting on Ceph version 12.2.2, ``ceph-disk`` is deprecated. Deprecation
warnings will show up that will link to this page. It is strongly suggested
-that users start consuming ``ceph-volume``.
+that users start consuming ``ceph-volume``. There are two paths for migrating:
+
+#. Keep OSDs deployed with ``ceph-disk``: The :ref:`ceph-volume-simple` command
+ provides a way to take over the management while disabling ``ceph-disk``
+ triggers.
+#. Redeploy existing OSDs with ``ceph-volume``: This is covered in depth on
+ :ref:`rados-replacing-an-osd`
New deployments
^^^^^^^^^^^^^^^
.. note:: The UUID is stored in the ``fsid`` file in the OSD path, which is
generated when :ref:`ceph-volume-lvm-prepare` is used.
+Activating all OSDs
+-------------------
+It is possible to activate all existing OSDs at once by using the ``--all``
+flag. For example::
+
+ ceph-volume lvm activate --all
+
+This call will inspect all the OSDs created by ceph-volume that are inactive
+and will activate them one by one. If any of the OSDs are already running, it
+will report them in the command output and skip them, making it safe to rerun
+(idempotent).
+
requiring uuids
^^^^^^^^^^^^^^^
The :term:`OSD uuid` is being required as an extra step to ensure that the
the same id exists and would end up activating the incorrect one.
+dmcrypt
+^^^^^^^
+If the OSD was prepared with dmcrypt by ceph-volume, there is no need to
+specify ``--dmcrypt`` on the command line again (that flag is not available for
+the ``activate`` subcommand). An encrypted OSD will be automatically detected.
+
+
Discovery
---------
With either existing OSDs or new ones being activated, a *discovery* process is
Existing OSDs
-------------
-For exsiting OSDs that have been deployed with different tooling, the only way
-to port them over to the new mechanism is to prepare them again (losing data).
-See :ref:`ceph-volume-lvm-existing-osds` for details on how to proceed.
+For existing OSDs that have been deployed with ``ceph-disk``, they need to be
+scanned and activated :ref:`using the simple sub-command <ceph-volume-simple>`.
+If a different tooling was used then the only way to port them over to the new
+mechanism is to prepare them again (losing data). See
+:ref:`ceph-volume-lvm-existing-osds` for details on how to proceed.
Summary
-------
Encryption
==========
-Logical volumes can be encrypted using ``dmcrypt``. Encryption can be done in
-different ways, specially with LVM. ``ceph-volume`` is somewhat opinionated
-with the way it sets up encryption with logical volumes so that the process is
-consistent and robust.
+Logical volumes can be encrypted using ``dmcrypt`` by specifying the
+``--dmcrypt`` flag when creating OSDs. Encryption can be done in different ways,
+especially with LVM. ``ceph-volume`` is somewhat opinionated with the way it
+sets up encryption with logical volumes so that the process is consistent and
+robust.
In this case, ``ceph-volume lvm`` follows these constraints:
data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
journal device /dev/journals/journal1
data device /dev/test_group/data-lv2
+ devices /dev/sda
[data] /dev/test_group/data-lv2
data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
journal device /dev/journals/journal1
data device /dev/test_group/data-lv2
+ devices /dev/sdb
====== osd.0 =======
data uuid TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00
journal device /dev/sdd1
data device /dev/test_group/data-lv1
+ devices /dev/sdc
[journal] /dev/sdd1
PARTUUID cd72bd28-002a-48da-bdf6-d5b993e84f3f
+
+For logical volumes the ``devices`` key is populated with the physical devices
+associated with the logical volume. Since LVM allows multiple physical devices
+to be part of a logical volume, the value will be comma separated when using
+``pretty``, but an array when using ``json``.
+
.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
as a ``ceph.osd_id`` tag. For more information on lvm tag conventions
see :ref:`ceph-volume-lvm-tag-api`
data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
journal device /dev/journals/journal1
data device /dev/test_group/data-lv2
+ devices /dev/sdc
.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
{
"0": [
{
+ "devices": ["/dev/sda"],
"lv_name": "data-lv1",
"lv_path": "/dev/test_group/data-lv1",
"lv_tags": "ceph.cluster_fsid=ce454d91-d748-4751-a318-ff7f7aa18ffd,ceph.data_device=/dev/test_group/data-lv1,ceph.data_uuid=TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00,ceph.journal_device=/dev/sdd1,ceph.journal_uuid=cd72bd28-002a-48da-bdf6-d5b993e84f3f,ceph.osd_fsid=943949f0-ce37-47ca-a33c-3413d46ee9ec,ceph.osd_id=0,ceph.type=data",
ceph-volume prepare --filestore --data data --journal journal
+For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
+
+ ceph-volume lvm prepare --filestore --dmcrypt --data volume_group/lv_name --journal journal
+
There is flexibility to use a raw device or partition as well for ``--data``
that will be converted to a logical volume. This is not ideal in all situations
since ``ceph-volume`` is just going to create a unique volume group and
ceph-volume lvm prepare --bluestore --data /path/to/device
+For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
+
+ ceph-volume lvm prepare --bluestore --dmcrypt --data vg/lv
If a ``block.db`` or a ``block.wal`` is needed (they are optional for
bluestore) they can be specified with ``--block.db`` and ``--block.wal``
By *taking over* management, it disables all ``ceph-disk`` systemd units used
to trigger devices at startup, relying on basic (customizable) JSON
configuration and systemd for starting up OSDs.
+
+This process involves two steps:
+
+#. :ref:`Scan <ceph-volume-simple-scan>` the running OSD or the data device
+#. :ref:`Activate <ceph-volume-simple-activate>` the scanned OSD
+
+The scanning will infer everything that ``ceph-volume`` needs to start the OSD,
+so that when activation is needed, the OSD can start normally without getting
+interference from ``ceph-disk``.
+
+As part of the activation process the systemd units for ``ceph-disk`` in charge
+of reacting to ``udev`` events, are linked to ``/dev/null`` so that they are
+fully inactive.
to enumerate the objects during operations like stats or deletes.
+Taking the cluster down
+-----------------------
+
+Taking a CephFS cluster down is done by reducing the number of ranks to 1,
+setting the cluster_down flag, and then failing the last rank. For example:
+
+::
+
+ ceph fs set <fs_name> max_mds 1
+ ceph mds deactivate <fs_name>:1 # rank 2 of 2
+ ceph status # wait for rank 1 to finish stopping
+ ceph fs set <fs_name> cluster_down true
+ ceph mds fail <fs_name>:0
+
+Setting the ``cluster_down`` flag prevents standbys from taking over the failed
+rank.
+
Daemons
-------
- le64: length of appending data (8)
- le64: image striping count
+ImageMeta Key and Value
+-----------------------
+
+- u8: 'M'
+- le64: length of appending data (length of key + length of value + 4 * 2)
+- string: image-meta key
+- string: image-meta value
+
Final Record
~~~~~~~~~~~~
Diffs records
-~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~
+
Record the all snapshots and the HEAD in this section.
+Snap Protection status
+----------------------
+
+Record the snapshot's protection status if ``--export-format=2``.
+
+- u8: 'p'
+- le64: length of appending data (8)
+- u8: snap protection status (0 for false, 1 for true)
+
+Others
+------
+
- le64: number of diffs
- Diffs ...
--- /dev/null
+Testing notes
+=============
+
+
+build-integration-branch
+------------------------
+
+Setup
+^^^^^
+
+#. Create a github token at `<https://github.com/settings/tokens>`_
+ and put it in ``~/.github_token``. Note that only the
+ ``public_repo`` under the ``repo`` section needs to be checked.
+
+#. Create a ceph repo label `wip-yourname-testing` if you don't
+ already have one at `<https://github.com/ceph/ceph/labels>`_.
+
+#. Create the ``ci`` remote::
+
+ git remote add ci git@github.com:ceph/ceph-ci
+
+Using
+^^^^^
+
+#. Tag some subset of `needs-qa` commits with your label (usually `wip-yourname-testing`).
+
+#. Create the integration branch::
+
+ git checkout master
+ git pull
+ ../src/script/build-integration-branch wip-yourname-testing
+
+#. Smoke test::
+
+ make && ctest -j12
+
+#. Push to ceph-ci::
+
+ git push ci $(git rev-parse --abbrev-ref HEAD)
+
Synopsis
========
-| **ceph-fuse** [ -m *monaddr*:*port* ] *mountpoint* [ *fuse options* ]
+| **ceph-fuse** [-n *client.username*] [ -m *monaddr*:*port* ] *mountpoint* [ *fuse options* ]
Description
Any options not recognized by ceph-fuse will be passed on to libfuse.
+.. option:: -o opt,[opt...]
+
+ mount options.
+
.. option:: -d
- Detach from console and daemonize after startup.
+ Run in foreground, send all log output to stderr and enable FUSE debugging (-o debug).
.. option:: -c ceph.conf, --conf=ceph.conf
Connect to specified monitor (instead of looking through ceph.conf).
-.. option:: -r root_directory
+.. option:: --client_mountpoint/-r root_directory
Use root_directory as the mounted root, rather than the full Ceph tree.
+.. option:: -f
+
+ Foreground: do not daemonize after startup (run in foreground). Do not generate a pid file.
+
+.. option:: -s
+
+ Disable multi-threaded operation.
Availability
============
* [--no-systemd] Skip creating and enabling systemd units and starting of OSD
services
+Multiple OSDs can be activated at once by using the (idempotent) ``--all`` flag::
+
+ ceph-volume lvm activate --all
+
**prepare**
Prepares a logical volume to be used as an OSD and journal using a ``filestore``
* --data A logical group name or a path to a logical volume
+For encrypting an OSD, the ``--dmcrypt`` flag must be added when preparing
+(also supported in the ``create`` sub-command).
+
+
**create**
Wraps the two-step process to provision a new osd (calling ``prepare`` first
and then ``activate``) into a single one. The reason to prefer ``prepare`` and
.. option:: --shard-id=<shard-id>
- Optional for mdlog list. Required for ``mdlog trim``,
+ Optional for mdlog list, data sync status. Required for ``mdlog trim``,
``replica mdlog get/delete``, ``replica datalog get/delete``.
+.. option:: --max-entries=<entries>
+
+  Optional for listing operations to specify the max entries
+
.. option:: --auth-uid=auid
The librados auid.
--- /dev/null
+Balancer plugin
+===============
+
+The *balancer* plugin can optimize the placement of PGs across OSDs in
+order to achieve a balanced distribution, either automatically or in a
+supervised fashion.
+
+Enabling
+--------
+
+The *balancer* module is enabled with::
+
+ ceph mgr module enable balancer
+
+(It is enabled by default.)
+
+Status
+------
+
+The current status of the balancer can be checked at any time with::
+
+ ceph balancer status
+
+
+Automatic balancing
+-------------------
+
+The automatic balancing can be enabled, using the default settings, with::
+
+ ceph balancer on
+
+The balancer can be turned back off again with::
+
+ ceph balancer off
+
+This will use the ``crush-compat`` mode, which is backward compatible
+with older clients, and will make small changes to the data
+distribution over time to ensure that OSDs are equally utilized.
+
+
+Throttling
+----------
+
+No adjustments will be made to the PG distribution if the cluster is
+degraded (e.g., because an OSD has failed and the system has not yet
+healed itself).
+
+When the cluster is healthy, the balancer will throttle its changes
+such that the percentage of PGs that are misplaced (i.e., that need to
+be moved) is below a threshold of (by default) 5%. The
+``max_misplaced`` threshold can be adjusted with::
+
+ ceph config-key set mgr/balancer/max_misplaced .07 # 7%
+
+
+Modes
+-----
+
+There are currently two supported balancer modes:
+
+#. **crush-compat**. The CRUSH compat mode uses the compat weight-set
+ feature (introduced in Luminous) to manage an alternative set of
+ weights for devices in the CRUSH hierarchy. The normal weights
+ should remain set to the size of the device to reflect the target
+ amount of data that we want to store on the device. The balancer
+ then optimizes the weight-set values, adjusting them up or down in
+ small increments, in order to achieve a distribution that matches
+ the target distribution as closely as possible. (Because PG
+ placement is a pseudorandom process, there is a natural amount of
+ variation in the placement; by optimizing the weights we
+ counter-act that natural variation.)
+
+ Notably, this mode is *fully backwards compatible* with older
+ clients: when an OSDMap and CRUSH map is shared with older clients,
+ we present the optimized weights as the "real" weights.
+
+ The primary restriction of this mode is that the balancer cannot
+ handle multiple CRUSH hierarchies with different placement rules if
+ the subtrees of the hierarchy share any OSDs. (This is normally
+ not the case, and is generally not a recommended configuration
+ because it is hard to manage the space utilization on the shared
+ OSDs.)
+
+#. **upmap**. Starting with Luminous, the OSDMap can store explicit
+ mappings for individual OSDs as exceptions to the normal CRUSH
+ placement calculation. These `upmap` entries provide fine-grained
+ control over the PG mapping. This CRUSH mode will optimize the
+ placement of individual PGs in order to achieve a balanced
+   distribution. In most cases, this distribution is "perfect," with
+ an equal number of PGs on each OSD (+/-1 PG, since they might not
+ divide evenly).
+
+ Note that using upmap requires that all clients be Luminous or newer.
+
+The default mode is ``crush-compat``. The mode can be adjusted with::
+
+ ceph balancer mode upmap
+
+or::
+
+ ceph balancer mode crush-compat
+
+Supervised optimization
+-----------------------
+
+The balancer operation is broken into a few distinct phases:
+
+#. building a *plan*
+#. evaluating the quality of the data distribution, either for the current PG distribution, or the PG distribution that would result after executing a *plan*
+#. executing the *plan*
+
+To evaluate and score the current distribution::
+
+ ceph balancer eval
+
+You can also evaluate the distribution for a single pool with::
+
+ ceph balancer eval <pool-name>
+
+Greater detail for the evaluation can be seen with::
+
+ ceph balancer eval-verbose ...
+
+The balancer can generate a plan, using the currently configured mode, with::
+
+ ceph balancer optimize <plan-name>
+
+The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with::
+
+ ceph balancer show <plan-name>
+
+Old plans can be discarded with::
+
+ ceph balancer rm <plan-name>
+
+Currently recorded plans are shown as part of the status command::
+
+ ceph balancer status
+
+The quality of the distribution that would result after executing a plan can be calculated with::
+
+ ceph balancer eval <plan-name>
+
+Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with::
+
+ ceph balancer execute <plan-name>
Installation and Configuration <administrator>
Writing plugins <plugins>
+ Balancer plugin <balancer>
Dashboard plugin <dashboard>
Local pool plugin <localpool>
RESTful plugin <restful>
:interval: Time between reports to InfluxDB. Default 5 seconds.
:database: InfluxDB database name. Default "ceph". You will need to create this database and grant write privileges to the configured username or the username must have admin privileges to create it.
:port: InfluxDB server port. Default 8086
-
+:ssl: Use https connection for InfluxDB server. Use "true" or "false". Default false
+:verify_ssl: Verify https cert for InfluxDB server. Use "true" or "false". Default true
---------
Debugging
::
- ceph_pool_metadata{pool_id="2",name="cephfs_metadata_a"} 0.0
+ ceph_pool_metadata{pool_id="2",name="cephfs_metadata_a"} 1.0
OSDs have a ``ceph_osd_metadata`` field like this:
::
- ceph_osd_metadata{cluster_addr="172.21.9.34:6802/19096",device_class="ssd",id="0",public_addr="172.21.9.34:6801/19096",weight="1.0"} 0.0
+ ceph_osd_metadata{cluster_addr="172.21.9.34:6802/19096",device_class="ssd",ceph_daemon="osd.0",public_addr="172.21.9.34:6801/19096",weight="1.0"} 1.0
Correlating drive statistics with node_exporter
::
- ceph_disk_occupation{ceph_daemon="osd.0",device="sdd",instance="myhost",job="ceph"}
+ ceph_disk_occupation{ceph_daemon="osd.0",device="sdd", exported_instance="myhost"}
-To use this to get disk statistics by OSD ID, use the ``and on`` syntax
-in your prometheus query like this:
+To use this to get disk statistics by OSD ID, use either the ``and`` operator or
+the ``*`` operator in your prometheus query. All metadata metrics (like
+``ceph_disk_occupation``) have the value 1 so they act neutral with ``*``. Using ``*``
+allows the use of ``group_left`` and ``group_right`` grouping modifiers, so that
+the resulting metric has additional labels from one side of the query.
+
+See the
+`prometheus documentation`__ for more information about constructing queries.
+
+__ https://prometheus.io/docs/prometheus/latest/querying/basics
+
+The goal is to run a query like
::
rate(node_disk_bytes_written[30s]) and on (device,instance) ceph_disk_occupation{ceph_daemon="osd.0"}
-See the prometheus documentation for more information about constructing
-queries.
+Out of the box the above query will not return any metrics since the ``instance`` labels of
+both metrics don't match. The ``instance`` label of ``ceph_disk_occupation``
+will be the currently active MGR node.
-Note that for this mechanism to work, Ceph and node_exporter must agree
-about the values of the ``instance`` label. See the following section
-for guidance about to to set up Prometheus in a way that sets
-``instance`` properly.
+The following two sections outline two approaches to remedy this.
-Configuring Prometheus server
-=============================
+Use label_replace
+=================
+
+The ``label_replace`` function (cp.
+`label_replace documentation <https://prometheus.io/docs/prometheus/latest/querying/functions/#label_replace>`_)
+can add a label to, or alter a label of, a metric within a query.
-See the prometheus documentation for full details of how to add
-scrape endpoints: the notes
-in this section are tips on how to configure Prometheus to capture
-the Ceph statistics in the most usefully-labelled form.
+To correlate an OSD and its disks write rate, the following query can be used:
+
+::
-This configuration is necessary because Ceph is reporting metrics
-from many hosts and services via a single endpoint, and some
-metrics that relate to no physical host (such as pool statistics).
+ label_replace(rate(node_disk_bytes_written[30s]), "exported_instance", "$1", "instance", "(.*):.*") and on (device,exported_instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+
+Configuring Prometheus server
+=============================
honor_labels
------------
use the ``honor_labels`` setting when adding the ceph-mgr endpoints
to your prometheus configuration.
-Without this setting, any ``instance`` labels that Ceph outputs, such
-as those in ``ceph_disk_occupation`` series, will be overridden
-by Prometheus.
-
-Ceph instance label
--------------------
+This allows Ceph to export the proper ``instance`` label without prometheus
+overwriting it. Without this setting, Prometheus applies an ``instance`` label
+that includes the hostname and port of the endpoint that the series came from.
+Because Ceph clusters have multiple manager daemons, this results in an
+``instance`` label that changes spuriously when the active manager daemon
+changes.
-By default, Prometheus applies an ``instance`` label that includes
-the hostname and port of the endpoint that the series game from. Because
-Ceph clusters have multiple manager daemons, this results in an ``instance``
-label that changes spuriously when the active manager daemon changes.
-
-Set a custom ``instance`` label in your Prometheus target configuration:
-you might wish to set it to the hostname of your first monitor, or something
-completely arbitrary like "ceph_cluster".
-
-node_exporter instance labels
+node_exporter hostname labels
-----------------------------
Set your ``instance`` labels to match what appears in Ceph's OSD metadata
-in the ``hostname`` field. This is generally the short hostname of the node.
+in the ``instance`` field. This is generally the short hostname of the node.
This is only necessary if you want to correlate Ceph stats with host stats,
but you may find it useful to do it in all cases in case you want to do
---------------------
This example shows a single node configuration running ceph-mgr and
-node_exporter on a server called ``senta04``.
+node_exporter on a server called ``senta04``. Note that this requires adding the
+appropriate instance label to every ``node_exporter`` target individually.
This is just an example: there are other ways to configure prometheus
scrape targets and label rewrite rules.
[
{
"targets": [ "senta04.mydomain.com:9283" ],
- "labels": {
- "instance": "ceph_cluster"
- }
+ "labels": {}
}
]
Signatures
----------
-In Ceph Bobtail and subsequent versions, we prefer that Ceph authenticate all
-ongoing messages between the entities using the session key set up for that
-initial authentication. However, Argonaut and earlier Ceph daemons do not know
-how to perform ongoing message authentication. To maintain backward
-compatibility (e.g., running both Botbail and Argonaut daemons in the same
-cluster), message signing is **off** by default. If you are running Bobtail or
-later daemons exclusively, configure Ceph to require signatures.
+Ceph performs a signature check that provides some limited protection
+against messages being tampered with in flight (e.g., by a "man in the
+middle" attack).
Like other parts of Ceph authentication, Ceph provides fine-grained control so
you can enable/disable signatures for service messages between the client and
Ceph, and you can enable/disable signatures for messages between Ceph daemons.
+Note that even with signatures enabled data is not encrypted in
+flight.
``cephx require signatures``
traffic between the Ceph Client and the Ceph Storage Cluster, and
between daemons comprising the Ceph Storage Cluster.
+ Ceph Argonaut and Linux kernel versions prior to 3.19 do
+ not support signatures; if such clients are in use this
+ option can be turned off to allow them to connect.
+
:Type: Boolean
:Required: No
:Default: ``false``
``cephx sign messages``
:Description: If the Ceph version supports message signing, Ceph will sign
- all messages so they cannot be spoofed.
+ all messages so they are more difficult to spoof.
:Type: Boolean
:Default: ``true``
just add or remove one or more metadata servers on the command line with one
command.
-.. important:: You must deploy at least one metadata server to use CephFS.
- There is experimental support for running multiple metadata servers.
- Do not run multiple active metadata servers in production.
-
See `MDS Config Reference`_ for details on configuring metadata servers.
Note that this practice will no longer be necessary in Bobtail and
subsequent releases.
+.. _rados-replacing-an-osd:
Replacing an OSD
----------------
:Type: Double
+``allow_ec_overwrites``
+
+:Description: see allow_ec_overwrites_
+
+:Type: Boolean
+
+
Set the Number of Object Replicas
=================================
``port``
-:Description: Sets the listening port number.
+:Description: Sets the listening port number. Can be specified multiple
+ times as in ``port=80 port=8000``.
:Type: Integer
:Default: ``80``
+``endpoint``
+
+:Description: Sets the listening address in the form ``address[:port]``,
+ where the address is an IPv4 address string in dotted decimal
+ form, or an IPv6 address in hexadecimal notation. The
+ optional port defaults to 80. Can be specified multiple times
+ as in ``endpoint=::1 endpoint=192.168.0.100:8000``.
+
+:Type: Integer
+:Default: None
+
+
Civetweb
========
:Description: Sets the listening port number. For SSL-enabled ports, add an
``s`` suffix like ``443s``. To bind a specific IPv4 or IPv6
address, use the form ``address:port``. Multiple endpoints
- can be separated by ``+`` as in ``127.0.0.1:8000+443s``.
+ can either be separated by ``+`` as in ``127.0.0.1:8000+443s``,
+ or by providing multiple options as in ``port=8000 port=443s``.
:Type: String
:Default: ``7480``
rgw keystone admin domain = {keystone admin domain name}
rgw keystone admin project = {keystone admin project name}
+For compatibility with previous versions of ceph, it is also
+possible to set ``rgw keystone implicit tenants`` to either
+``s3`` or ``swift``. This has the effect of splitting
+the identity space such that the indicated protocol will
+only use implicit tenants, and the other protocol will
+never use implicit tenants. Some older versions of ceph
+only supported implicit tenants with swift.
Prior to Kilo
-------------
TBD -- don't forget to explain the function of
rgw keystone implicit tenants = true
in commit e9259486decab52a362443d3fd3dec33b0ec654f
+ [ There is a description of this in keystone.rst ]
Notes and known issues
----------------------
roles:
- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
- [mon.b, mgr.x, mds.b, mds.c, osd.4, osd.5, osd.6, osd.7]
-- [client.0]
+- [client.0, client.1]
openstack:
- volumes: # attached to each instance
count: 4
roles:
- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2, osd.3]
- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.4, osd.5, osd.6, osd.7]
-- [client.0]
+- [client.0, client.1]
openstack:
- volumes: # attached to each instance
count: 4
--- /dev/null
+os_type: rhel
+os_version: "7.5"
--- /dev/null
+os_type: ubuntu
+os_version: "18.04"
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_recovery_max_active: 10
+ osd_recovery_max_single_start: 10
sleep 1
}
+function delete_pool() {
+ local poolname=$1
+ ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
+}
+
#######################################################################
function run_mgr() {
wait_for_clean || return 1
}
-function delete_pool() {
- local poolname=$1
-
- ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
-}
-
function rados_put_get() {
local dir=$1
local poolname=$2
wait_for_clean || return 1
}
-function delete_pool() {
+function delete_erasure_coded_pool() {
local poolname=$1
-
ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
ceph osd erasure-code-profile rm myprofile
}
# inject eio on primary OSD (0) and replica OSD (1)
local shard_id=0
rados_put_get_data eio $dir $shard_id || return 1
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
function TEST_rados_get_subread_eio_shard_1() {
# inject eio into replicas OSD (1) and OSD (2)
local shard_id=1
rados_put_get_data eio $dir $shard_id || return 1
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
# We don't remove the object from the primary because
# inject remove into replicas OSD (1) and OSD (2)
local shard_id=1
rados_put_get_data remove $dir $shard_id || return 1
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
#
rados_get_data_bad_size $dir $shard_id 10 || return 1
rados_get_data_bad_size $dir $shard_id 0 || return 1
rados_get_data_bad_size $dir $shard_id 256 add || return 1
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
function TEST_rados_get_bad_size_shard_1() {
rados_get_data_bad_size $dir $shard_id 10 || return 1
rados_get_data_bad_size $dir $shard_id 0 || return 1
rados_get_data_bad_size $dir $shard_id 256 add || return 1
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
function TEST_rados_get_with_subreadall_eio_shard_0() {
# inject eio on primary OSD (0)
rados_put_get_data eio $dir $shard_id recovery || return 1
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
function TEST_rados_get_with_subreadall_eio_shard_1() {
# inject eio on replica OSD (1)
rados_put_get_data eio $dir $shard_id recovery || return 1
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
# Test recovery the first k copies aren't all available
-function TEST_ec_recovery_errors() {
+function TEST_ec_single_recovery_error() {
local dir=$1
local objname=myobject
# Cluster should recover this object
wait_for_clean || return 1
- delete_pool $poolname
+ rados_get $dir $poolname myobject || return 1
+
+ delete_erasure_coded_pool $poolname
+}
+
+# Test recovery when repeated reads are needed due to EIO
+function TEST_ec_recovery_multiple_errors() {
+ local dir=$1
+ local objname=myobject
+
+ setup_osds 9 || return 1
+
+ local poolname=pool-jerasure
+ create_erasure_coded_pool $poolname 4 4 || return 1
+
+ rados_put $dir $poolname $objname || return 1
+ inject_eio ec data $poolname $objname $dir 0 || return 1
+    # The first read will try shards 0,1,2; when shard 0 gets EIO, shard 3
+    # gets tried as well. Make that fail too, to test multiple-EIO handling.
+ inject_eio ec data $poolname $objname $dir 3 || return 1
+ inject_eio ec data $poolname $objname $dir 4 || return 1
+
+ local -a initial_osds=($(get_osds $poolname $objname))
+ local last_osd=${initial_osds[-1]}
+ # Kill OSD
+ kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1
+ ceph osd down ${last_osd} || return 1
+ ceph osd out ${last_osd} || return 1
+
+ # Cluster should recover this object
+ wait_for_clean || return 1
+
+ rados_get $dir $poolname myobject || return 1
+
+ delete_erasure_coded_pool $poolname
+}
+
+# Test recovery when there's only one shard to recover, but multiple
+# objects are recovered in one RecoveryOp
+function TEST_ec_recovery_multiple_objects() {
+ local dir=$1
+ local objname=myobject
+
+ ORIG_ARGS=$CEPH_ARGS
+ CEPH_ARGS+=' --osd-recovery-max-single-start 3 --osd-recovery-max-active 3 '
+ setup_osds 7 || return 1
+ CEPH_ARGS=$ORIG_ARGS
+
+ local poolname=pool-jerasure
+ create_erasure_coded_pool $poolname 3 2 || return 1
+
+ rados_put $dir $poolname test1
+ rados_put $dir $poolname test2
+ rados_put $dir $poolname test3
+
+ ceph osd out 0 || return 1
+
+ # Cluster should recover these objects all at once
+ wait_for_clean || return 1
+
+ rados_get $dir $poolname test1
+ rados_get $dir $poolname test2
+ rados_get $dir $poolname test3
+
+ delete_erasure_coded_pool $poolname
+}
+
+# test multi-object recovery when the one missing shard gets EIO
+function TEST_ec_recovery_multiple_objects_eio() {
+ local dir=$1
+ local objname=myobject
+
+ ORIG_ARGS=$CEPH_ARGS
+ CEPH_ARGS+=' --osd-recovery-max-single-start 3 --osd-recovery-max-active 3 '
+ setup_osds 7 || return 1
+ CEPH_ARGS=$ORIG_ARGS
+
+ local poolname=pool-jerasure
+ create_erasure_coded_pool $poolname 3 2 || return 1
+
+ rados_put $dir $poolname test1
+ rados_put $dir $poolname test2
+ rados_put $dir $poolname test3
+
+ # can't read from this shard anymore
+ inject_eio ec data $poolname $objname $dir 0 || return 1
+ ceph osd out 0 || return 1
+
+ # Cluster should recover these objects all at once
+ wait_for_clean || return 1
+
+ rados_get $dir $poolname test1
+ rados_get $dir $poolname test2
+ rados_get $dir $poolname test3
+
+ delete_erasure_coded_pool $poolname
}
# Test backfill with unfound object
# Must be between 1 and $lastobj
local testobj=obj250
- export CEPH_ARGS
+ ORIG_ARGS=$CEPH_ARGS
CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10'
setup_osds 5 || return 1
+ CEPH_ARGS=$ORIG_ARGS
local poolname=pool-jerasure
create_erasure_coded_pool $poolname 3 2 || return 1
rm -f ${dir}/ORIGINAL ${dir}/CHECK
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
# Test recovery with unfound object
# Must be between 1 and $lastobj
local testobj=obj75
+ ORIG_ARGS=$CEPH_ARGS
+ CEPH_ARGS+=' --osd-recovery-max-single-start 3 --osd-recovery-max-active 3 '
+ CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10'
setup_osds 5 || return 1
+ CEPH_ARGS=$ORIG_ARGS
local poolname=pool-jerasure
create_erasure_coded_pool $poolname 3 2 || return 1
rm -f ${dir}/ORIGINAL ${dir}/CHECK
- delete_pool $poolname
+ delete_erasure_coded_pool $poolname
}
main test-erasure-eio "$@"
--- /dev/null
+#!/usr/bin/env bash
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+ local dir=$1
+ shift
+
+ # Fix port????
+ export CEPH_MON="127.0.0.1:7132" # git grep '\<7132\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON --osd-objectstore filestore"
+ export margin=10
+ export objects=200
+ export poolname=test
+
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ setup $dir || return 1
+ $func $dir || return 1
+ teardown $dir || return 1
+ done
+}
+
+function TEST_ec_error_rollforward() {
+ local dir=$1
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ run_osd $dir 2 || return 1
+ run_osd $dir 3 || return 1
+
+ ceph osd erasure-code-profile set ec-profile m=2 k=2 crush-failure-domain=osd
+ ceph osd pool create ec 1 1 erasure ec-profile
+
+ rados -p ec put foo /etc/passwd
+
+ kill -STOP `cat $dir/osd.2.pid`
+
+ rados -p ec rm foo &
+ sleep 1
+ rados -p ec rm a &
+ rados -p ec rm b &
+ rados -p ec rm c &
+ sleep 1
+ kill -9 `cat $dir/osd.?.pid`
+ kill %1 %2 %3 %4
+
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ run_osd $dir 2 || return 1
+ run_osd $dir 3 || return 1
+
+ wait_for_clean || return 1
+}
+
+main ec-error-rollforward "$@"
jq -r ".[] | select(.pgid==\"$pgid\") | .$sname"
}
-function delete_pool() {
- local poolname=$1
-
- ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
-}
-
function rados_put() {
local dir=$1
local poolname=$2
rados -p test rm foo
test_log_size $PGID 3
rados -p test rm foo
- test_log_size $PGID 4
-
+ test_log_size $PGID 3
rados -p test rm foo
- test_log_size $PGID 2
+ test_log_size $PGID 3
+ rados -p test rm foo
+ test_log_size $PGID 3
}
main repro-long-log "$@"
# Local Variables:
# compile-command: "cd build ; make -j4 && \
# ../qa/run-standalone.sh osd-recovery-scrub.sh"
+# End:
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
+ CEPH_ARGS+="--osd-skip-data-digest=false "
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"lost": 0,
"flags": [
"dirty",
- "data_digest",
- "omap_digest"
+ "data_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
main osd-scrub-repair "$@"
# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-# test/osd/osd-scrub-repair.sh # TEST_corrupt_and_repair_replicated"
+# compile-command: "cd build ; make -j4 && \
+# ../qa/run-standalone.sh osd-scrub-repair.sh"
# End:
# Set to "yes" in order to ignore diff errors and save results to update test
getjson="no"
+jqfilter='.inconsistents'
+sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
+
function run() {
local dir=$1
shift
done
}
-function TEST_scrub_snaps() {
+function create_scenario() {
local dir=$1
- local poolname=test
- local OBJS=15
- local OSDS=1
-
- TESTDATA="testdata.$$"
+ local poolname=$2
+ local TESTDATA=$3
+ local osd=$4
- run_mon $dir a --osd_pool_default_size=$OSDS || return 1
- run_mgr $dir x || return 1
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- run_osd $dir $osd || return 1
- done
-
- # Create a pool with a single pg
- create_pool $poolname 1 1
- wait_for_clean || return 1
- poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
-
- dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
- for i in `seq 1 $OBJS`
- do
- rados -p $poolname put obj${i} $TESTDATA
- done
-
- local primary=$(get_primary $poolname obj1)
SNAP=1
rados -p $poolname mksnap snap${SNAP}
dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
# Don't need to use ceph_objectstore_tool() function because osd stopped
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj1)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" --force remove
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj1)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" --force remove
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":2)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":2)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":1)"
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":1)"
OBJ5SAVE="$JSON"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":4)"
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":4)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=18
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj3)"
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj3)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=15
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj4 | grep \"snapid\":7)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj4 | grep \"snapid\":7)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj2)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" rm-attr snapset
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj2)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" rm-attr snapset
# Create a clone which isn't in snapset and doesn't have object info
JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=7
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA
- rm -f $TESTDATA
-
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj6)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj7)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset corrupt
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj8)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset seq
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj9)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clone_size
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj10)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clone_overlap
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj11)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clones
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj12)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset head
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj13)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset snaps
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj14)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset size
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj6)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj7)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset corrupt
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj8)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset seq
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj9)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_size
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj10)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_overlap
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj11)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clones
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj12)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset head
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj13)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset snaps
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj14)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset size
echo "garbage" > $dir/bad
- JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj15)"
- ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-attr snapset $dir/bad
+ JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj15)"
+ ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-attr snapset $dir/bad
rm -f $dir/bad
+}
+
+function TEST_scrub_snaps() {
+ local dir=$1
+ local poolname=test
+ local OBJS=15
+ local OSDS=1
+
+ TESTDATA="testdata.$$"
+
+ run_mon $dir a --osd_pool_default_size=$OSDS || return 1
+ run_mgr $dir x || return 1
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd || return 1
+ done
+
+ # Create a pool with a single pg
+ create_pool $poolname 1 1
+ wait_for_clean || return 1
+ poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+ dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+ for i in `seq 1 $OBJS`
+ do
+ rados -p $poolname put obj${i} $TESTDATA
+ done
+
+ local primary=$(get_primary $poolname obj1)
+
+ create_scenario $dir $poolname $TESTDATA $primary
+
+ rm -f $TESTDATA
for osd in $(seq 0 $(expr $OSDS - 1))
do
cat $dir/osd.0.log
return 1
fi
- grep 'log_channel' $dir/osd.0.log
+
+ test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" = "2" || return 1
rados list-inconsistent-pg $poolname > $dir/json || return 1
# Check pg count
# Check pgid
test $(jq -r '.[0]' $dir/json) = $pgid || return 1
- rados list-inconsistent-snapset $pgid > $dir/json || return 1
+ rados list-inconsistent-obj $pgid > $dir/json || return 1
- local jqfilter='.inconsistents'
- local sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
+    # With a single-copy pool the injected snapshot errors don't
+    # show up as object errors, because all such issues are detected
+    # by comparing copies.
+ jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
+{
+ "epoch": 17,
+ "inconsistents": []
+}
+EOF
+
+ jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+
+ rados list-inconsistent-snapset $pgid > $dir/json || return 1
jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
{
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
fi
+ pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
+ pids=""
+ for pidfile in ${pidfiles}
+ do
+ pids+="$(cat $pidfile) "
+ done
+
for i in `seq 1 7`
do
rados -p $poolname rmsnap snap$i
ERRORS=0
- pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
- pid=$(cat $pidfile)
- if ! kill -0 $pid
- then
- echo "OSD crash occurred"
- tail -100 $dir/osd.0.log
- ERRORS=$(expr $ERRORS + 1)
- fi
+ for pid in $pids
+ do
+ if ! kill -0 $pid
+ then
+ echo "OSD Crash occurred"
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+ done
kill_daemons $dir || return 1
return 0
}
+function _scrub_snaps_multi() {
+ local dir=$1
+ local poolname=test
+ local OBJS=15
+ local OSDS=2
+ local which=$2
+
+ TESTDATA="testdata.$$"
+
+ run_mon $dir a --osd_pool_default_size=$OSDS || return 1
+ run_mgr $dir x || return 1
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd || return 1
+ done
+
+ # Create a pool with a single pg
+ create_pool $poolname 1 1
+ wait_for_clean || return 1
+ poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+ dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+ for i in `seq 1 $OBJS`
+ do
+ rados -p $poolname put obj${i} $TESTDATA
+ done
+
+ local primary=$(get_primary $poolname obj1)
+ local replica=$(get_not_primary $poolname obj1)
+
+ eval create_scenario $dir $poolname $TESTDATA \$$which
+
+ rm -f $TESTDATA
+
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd || return 1
+ done
+
+ local pgid="${poolid}.0"
+ if ! pg_scrub "$pgid" ; then
+ cat $dir/osd.0.log
+ return 1
+ fi
+
+ test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" -gt "3" || return 1
+ test "$(grep "_scan_snaps start" $dir/osd.${replica}.log | wc -l)" -gt "3" || return 1
+
+ rados list-inconsistent-pg $poolname > $dir/json || return 1
+ # Check pg count
+ test $(jq '. | length' $dir/json) = "1" || return 1
+ # Check pgid
+ test $(jq -r '.[0]' $dir/json) = $pgid || return 1
+
+ rados list-inconsistent-obj $pgid --format=json-pretty
+
+ rados list-inconsistent-snapset $pgid > $dir/json || return 1
+
+    # Since all of the snapshots on the primary are consistent, there are no errors here
+ if [ $which = "replica" ];
+ then
+ scruberrors="21"
+ jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
+{
+ "epoch": 23,
+ "inconsistents": []
+}
+EOF
+
+else
+ scruberrors="33"
+ jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
+{
+ "epoch": 23,
+ "inconsistents": [
+ {
+ "name": "obj10",
+ "nspace": "",
+ "locator": "",
+ "snap": 1,
+ "errors": [
+ "size_mismatch"
+ ]
+ },
+ {
+ "name": "obj11",
+ "nspace": "",
+ "locator": "",
+ "snap": 1,
+ "errors": [
+ "headless"
+ ]
+ },
+ {
+ "name": "obj14",
+ "nspace": "",
+ "locator": "",
+ "snap": 1,
+ "errors": [
+ "size_mismatch"
+ ]
+ },
+ {
+ "name": "obj6",
+ "nspace": "",
+ "locator": "",
+ "snap": 1,
+ "errors": [
+ "headless"
+ ]
+ },
+ {
+ "name": "obj7",
+ "nspace": "",
+ "locator": "",
+ "snap": 1,
+ "errors": [
+ "headless"
+ ]
+ },
+ {
+ "name": "obj9",
+ "nspace": "",
+ "locator": "",
+ "snap": 1,
+ "errors": [
+ "size_mismatch"
+ ]
+ },
+ {
+ "name": "obj5",
+ "nspace": "",
+ "locator": "",
+ "snap": 7,
+ "errors": [
+ "info_missing",
+ "headless"
+ ]
+ },
+ {
+ "name": "obj10",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "????",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ },
+ "errors": []
+ },
+ {
+ "name": "obj11",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": []
+ },
+ "errors": [
+ "extra_clones"
+ ],
+ "extra clones": [
+ 1
+ ]
+ },
+ {
+ "errors": [
+ "head_mismatch"
+ ],
+ "locator": "",
+ "name": "obj12",
+ "nspace": "",
+ "snap": "head",
+ "snapset": {
+ "clones": [
+ {
+ "overlap": "[]",
+ "size": 1032,
+ "snap": 1,
+ "snaps": [
+ 1
+ ]
+ }
+ ],
+ "head_exists": 0,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ }
+ }
+ },
+ {
+ "name": "obj14",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1033,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ },
+ "errors": []
+ },
+ {
+ "name": "obj5",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 6,
+ "snaps": [
+ 6,
+ 5,
+ 4,
+ 3,
+ 2,
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ },
+ {
+ "snap": 2,
+ "size": 256,
+ "overlap": "[]",
+ "snaps": [
+ 2
+ ]
+ },
+ {
+ "snap": 4,
+ "size": 512,
+ "overlap": "[]",
+ "snaps": [
+ 4,
+ 3
+ ]
+ },
+ {
+ "snap": 6,
+ "size": 1024,
+ "overlap": "[]",
+ "snaps": [
+ 6,
+ 5
+ ]
+ }
+ ]
+ },
+ "errors": [
+ "extra_clones"
+ ],
+ "extra clones": [
+ 7
+ ]
+ },
+ {
+ "name": "obj6",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": []
+ },
+ "errors": [
+ "extra_clones"
+ ],
+ "extra clones": [
+ 1
+ ]
+ },
+ {
+ "name": "obj7",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 0,
+ "snap_context": {
+ "seq": 0,
+ "snaps": []
+ },
+ "clones": []
+ },
+ "errors": [
+ "head_mismatch",
+ "extra_clones"
+ ],
+ "extra clones": [
+ 1
+ ]
+ },
+ {
+ "name": "obj8",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 0,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ },
+ "errors": [
+ "snapset_error"
+ ]
+ },
+ {
+ "name": "obj9",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": "????",
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ },
+ "errors": []
+ }
+ ]
+}
+EOF
+fi
+
+ jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ jq '.' $dir/json > save1.json
+ fi
+
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
+ fi
+
+ pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
+ pids=""
+ for pidfile in ${pidfiles}
+ do
+ pids+="$(cat $pidfile) "
+ done
+
+    # When removing snapshots with a corrupt replica, the OSD crashes.
+ # See http://tracker.ceph.com/issues/23875
+ if [ $which = "primary" ];
+ then
+ for i in `seq 1 7`
+ do
+ rados -p $poolname rmsnap snap$i
+ done
+ fi
+
+ ERRORS=0
+
+ for pid in $pids
+ do
+ if ! kill -0 $pid
+ then
+ echo "OSD Crash occurred"
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+ done
+
+ kill_daemons $dir || return 1
+
+ declare -a err_strings
+ err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj4:7"
+ err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1]: soid .*:::obj3:head size 3840 != size 768 from auth oi"
+ err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj5:1"
+ err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj5:2"
+ err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1]: soid .*:::obj5:4 size 4608 != size 512 from auth oi"
+ err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid .*:::obj5:7: failed to pick suitable object info"
+ err_strings[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj1:head"
+ err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub ${scruberrors} errors"
+
+ for err_string in "${err_strings[@]}"
+ do
+ if ! grep "$err_string" $dir/osd.${primary}.log > /dev/null;
+ then
+ echo "Missing log message '$err_string'"
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+ done
+
+ if [ $ERRORS != "0" ];
+ then
+ echo "TEST FAILED WITH $ERRORS ERRORS"
+ return 1
+ fi
+
+ echo "TEST PASSED"
+ return 0
+}
+
+function TEST_scrub_snaps_replica() {
+ local dir=$1
+ ORIG_ARGS=$CEPH_ARGS
+ CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=3"
+ _scrub_snaps_multi $dir replica
+ err=$?
+ CEPH_ARGS=$ORIG_ARGS
+ return $err
+}
+
+function TEST_scrub_snaps_primary() {
+ local dir=$1
+ ORIG_ARGS=$CEPH_ARGS
+ CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=3"
+ _scrub_snaps_multi $dir primary
+ err=$?
+ CEPH_ARGS=$ORIG_ARGS
+ return $err
+}
+
main osd-scrub-snaps "$@"
# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-# test/osd/osd-scrub-snaps.sh"
+# compile-command: "cd build ; make -j4 && \
+# ../qa/run-standalone.sh osd-scrub-snaps.sh"
+# End:
main osd-scrub-test "$@"
# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-# qa/standalone/scrub/osd-scrub-test.sh"
+# compile-command: "cd build ; make -j4 && \
+# ../qa/run-standalone.sh osd-scrub-test.sh"
+# End:
+openstack:
+ - machine:
+ disk: 10
+ volumes:
+ count: 2
+ size: 20
roles:
- - mon.a
- mgr.x
- - osd.1
- mon.b
- client.0
-openstack:
- - machine:
- disk: 10 # GB
- ram: 2000 # MB
- cpus: 1
- volumes: # attached to each instance
- count: 2
- size: 10 # GB
tasks:
- ssh_keys:
- print: "**** done ssh_keys"
ansible.cephlab:
vars:
quick_lvs_to_create: 4
+openstack:
+ - machine:
+ disk: 10
+ volumes:
+ count: 4
+ size: 20
roles:
- [mon.a, mgr.y, osd.0, osd.1]
- [mon.b, osd.2, osd.3]
- Metadata damage detected
- bad backtrace on inode
- overall HEALTH_
- - (MDS_TRIM)
+ - \(MDS_TRIM\)
conf:
mds:
mds log max segments: 1
tasks:
- cephfs_test_runner:
+ fail_on_skip: false
modules:
- tasks.cephfs.test_client_recovery
--- /dev/null
+../../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+roles:
+- [mon.a, mgr.x, osd.0, osd.1, osd.2, mds.a, mds.b, client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+../../../../../cephfs/objectstore-ec/bluestore.yaml
\ No newline at end of file
--- /dev/null
+../../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../../overrides/no_client_pidfile.yaml
\ No newline at end of file
--- /dev/null
+../../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
--- /dev/null
+../../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+# Note this test is unlikely to exercise the code as expected in the future:
+# "It's too tricky to arrange inodes in session->caps. we don't know if it
+# still works in the future." -Zheng
+
+overrides:
+ ceph:
+ log-whitelist:
+ - MDS cache is too large
+ - \(MDS_CACHE_OVERSIZED\)
+tasks:
+- exec:
+ mon.a:
+ - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
+ - "ceph tell mds.* config set mds_min_caps_per_client 1"
+- background_exec:
+ mon.a:
+ - "sleep 30 && ceph tell mds.* config set mds_cache_memory_limit 1"
+- exec:
+ client.0:
+ - ceph_test_trim_caps
tasks:
- cephfs_test_runner:
+ fail_on_skip: false
modules:
- tasks.cephfs.test_client_recovery
log-whitelist:
- reached quota
- \(POOL_APP_NOT_ENABLED\)
+ - \(PG_AVAILABILITY\)
tasks:
- ceph-fuse:
- workunit:
- 'deep-scrub 1 missing, 0 inconsistent objects'
- 'failed to pick suitable auth object'
- overall HEALTH_
- - (OSDMAP_FLAGS)
- - (OSD_
- - (PG_
- - (OSD_SCRUB_ERRORS)
- - (TOO_FEW_PGS)
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_
+ - \(PG_
+ - \(OSD_SCRUB_ERRORS\)
+ - \(TOO_FEW_PGS\)
conf:
osd:
osd deep scrub update digest min age: 0
+ osd skip data digest: false
tasks:
- scrub_test:
- reached quota
- but it is still running
- overall HEALTH_
- - (POOL_FULL)
- - (SMALLER_PGP_NUM)
- - (CACHE_POOL_NO_HIT_SET)
- - (CACHE_POOL_NEAR_FULL)
- - (POOL_APP_NOT_ENABLED)
+ - \(POOL_FULL\)
+ - \(SMALLER_PGP_NUM\)
+ - \(CACHE_POOL_NO_HIT_SET\)
+ - \(CACHE_POOL_NEAR_FULL\)
+ - \(POOL_APP_NOT_ENABLED\)
+ - \(PG_AVAILABILITY\)
tasks:
- workunit:
clients:
- attr name mismatch
- Regular scrub request, deep-scrub details will be lost
- overall HEALTH_
- - (OSDMAP_FLAGS)
- - (OSD_
- - (PG_
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_
+ - \(PG_
conf:
osd:
filestore debug inject read err: true
paxos service trim min: 5
# thrashing monitors may make mgr have trouble w/ its keepalive
log-whitelist:
- - daemon x is unresponsive
- overall HEALTH_
- \(MGR_DOWN\)
tasks:
- .*clock.*skew.*
- clocks not synchronized
- overall HEALTH_
- - (MON_CLOCK_SKEW)
+ - \(MON_CLOCK_SKEW\)
- mon_clock_skew_check:
expect-skew: false
- install:
- exec:
client.0:
- - mkdir $TESTDIR/ostest && cd $TESTDIR/ostest && ulimit -c 0 && ulimit -Sn 4096 && ceph_test_objectstore --gtest_filter=-*/3
- - rm -rf $TESTDIR/ostest
+ - mkdir $TESTDIR/archive/ostest && cd $TESTDIR/archive/ostest && ulimit -Sn 16384 && CEPH_ARGS="--no-log-to-stderr --log-file $TESTDIR/archive/ceph_test_objectstore.log --debug-filestore 20 --debug-bluestore 20" ceph_test_objectstore --gtest_filter=-*/3 --gtest_catch_exceptions=0
+ - rm -rf $TESTDIR/archive/ostest
- failsafe engaged, dropping updates
- failsafe disengaged, no longer dropping updates
- overall HEALTH_
- - (OSDMAP_FLAGS)
- - (OSD_
- - (PG_
- - (SMALLER_PG_NUM)
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_
+ - \(PG_
+ - \(SMALLER_PG_NUM\)
+ - \(CACHE_POOL_NO_HIT_SET\)
+ - erasure code profile property .ruleset-failure-domain. is no longer supported
+ - \(CACHE_POOL_NEAR_FULL\)
+ - \(FS_WITH_FAILED_MDS\)
+ - \(FS_DEGRADED\)
+ - \(POOL_BACKFILLFULL\)
+ - \(POOL_FULL\)
+ - \(SMALLER_PGP_NUM\)
+ - \(POOL_NEARFULL\)
+ - \(POOL_APP_NOT_ENABLED\)
- workunit:
clients:
all:
- MDS in read-only mode
- force file system read-only
- overall HEALTH_
- - (OSDMAP_FLAGS)
- - (OSD_FULL)
- - (MDS_READ_ONLY)
- - (POOL_FULL)
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_FULL\)
+ - \(MDS_READ_ONLY\)
+ - \(POOL_FULL\)
tasks:
- install:
- ceph:
--- /dev/null
+roles:
+- [mon.a, mgr.x, osd.0, osd.1, client.0]
+overrides:
+ ceph:
+ log-whitelist:
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_FULL\)
+ - \(MDS_READ_ONLY\)
+ - large omap objects
+ - Large omap object found
+ - application not enabled
+ conf:
+ osd:
+ osd deep scrub large omap object value sum threshold: 8800000
+ osd deep scrub large omap object key threshold: 20000
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - rados/test_large_omap_detection.py
- \(PG_
- \(OBJECT_
- \(POOL_APP_NOT_ENABLED\)
- conf:
- osd:
- debug osd: 5
tasks:
- install:
- \(PG_
- \(OBJECT_
- \(POOL_APP_NOT_ENABLED\)
- conf:
- osd:
- debug osd: 5
tasks:
- install:
- \(OSD_
- \(PG_
- \(OBJECT_
+ - \(REQUEST_SLOW\)
conf:
osd:
osd min pg log entries: 5
- \(PG_
- \(OBJECT_DEGRADED\)
- \(SLOW_OPS\)
+ - \(REQUEST_SLOW\)
conf:
osd:
osd min pg log entries: 5
- but it is still running
- slow request
- overall HEALTH_
- - (OSDMAP_FLAGS)
- - (OSD_
- - (PG_
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_
+ - \(PG_
- exec:
client.0:
- sudo ceph osd pool create foo 128 128
- missing primary copy of
- objects unfound and apparently lost
- overall HEALTH_
- - (POOL_APP_NOT_ENABLED)
- - (PG_DEGRADED)
+ - \(POOL_APP_NOT_ENABLED\)
+ - \(PG_DEGRADED\)
- full_sequential:
- exec:
client.0:
- missing primary copy of
- objects unfound and apparently lost
- overall HEALTH_
- - (OSDMAP_FLAGS)
- - (REQUEST_SLOW)
- - (PG_
+ - \(OSDMAP_FLAGS\)
+ - \(REQUEST_SLOW\)
+ - \(PG_
- \(OBJECT_MISPLACED\)
- - (OSD_
+ - \(OSD_
- thrashosds:
op_delay: 30
clean_interval: 120
- but it is still running
- slow request
- overall HEALTH_
- - (CACHE_POOL_
+ - \(CACHE_POOL_
- exec:
client.0:
- sudo ceph osd pool create base 4
--- /dev/null
+../thrash/2-recovery-overrides
\ No newline at end of file
--- /dev/null
+../thrash/2-recovery-overrides
\ No newline at end of file
--- /dev/null
+../thrash/2-recovery-overrides
\ No newline at end of file
--- /dev/null
+../thrash/2-recovery-overrides
\ No newline at end of file
--- /dev/null
+../thrash/2-recovery-overrides
\ No newline at end of file
--- /dev/null
+../../../../overrides/more-active-recovery.yaml
\ No newline at end of file
log-whitelist:
- reached quota
- \(POOL_APP_NOT_ENABLED\)
+ - \(PG_AVAILABILITY\)
crush_tunables: hammer
conf:
client:
- \(REQUEST_SLOW\)
- \(CACHE_POOL_NEAR_FULL\)
- \(POOL_APP_NOT_ENABLED\)
+ - \(PG_AVAILABILITY\)
conf:
client:
debug ms: 1
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
overrides:
ceph:
log-whitelist:
- - (POOL_APP_NOT_ENABLED)
+ - \(POOL_APP_NOT_ENABLED\)
- but it is still running
- objects unfound and apparently lost
- overall HEALTH_
- - (CACHE_POOL_NEAR_FULL)
- - (CACHE_POOL_NO_HIT_SET)
+ - \(CACHE_POOL_NEAR_FULL\)
+ - \(CACHE_POOL_NO_HIT_SET\)
tasks:
- exec:
client.0:
- reached quota
- but it is still running
- objects unfound and apparently lost
- - (POOL_APP_NOT_ENABLED)
+ - \(POOL_APP_NOT_ENABLED\)
- thrashosds:
chance_pgnum_grow: 2
chance_pgpnum_fix: 1
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 30 # GB
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - mgr.x
+- - client.0
+overrides:
+ ceph:
+ log-whitelist:
+ - failed to encode map
+ - CACHE_POOL_NO_HIT_SET
+ fs: xfs
--- /dev/null
+tasks:
+- install:
+ branch: luminous
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install luminous"
+upgrade_workload:
+ sequential:
+ - install.upgrade:
+ exclude_packages: ['ceph-test', 'ceph-test-dbg','libcephfs1']
+ client.0:
+ - print: "**** done install.upgrade to -x on client.0"
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - "cp $(which ceph_test_librbd_api) $TESTDIR/ceph_test_librbd_api"
+- sequential:
+ - upgrade_workload
+- ceph:
+- print: "**** done ceph"
+- exec:
+ client.0:
+ - "cp --force $TESTDIR/ceph_test_librbd_api $(which ceph_test_librbd_api)"
+ - "rm -rf $TESTDIR/ceph_test_librbd_api"
+- print: "**** done reverting to luminous ceph_test_librbd_api"
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/test_librbd_api.sh
+ env:
+ RBD_FEATURES: "61"
+- print: "**** done rbd/test_librbd_api.sh"
--- /dev/null
+tasks:
+- sequential:
+ - upgrade_workload
+- ceph:
+- print: "**** done ceph"
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/import_export.sh
+- print: "**** done rbd/import_export.sh"
--- /dev/null
+../../../../../../distros/all/centos_7.4.yaml
\ No newline at end of file
--- /dev/null
+../../../../../../distros/all/rhel_7.5.yaml
\ No newline at end of file
--- /dev/null
+../../../../../../distros/all/ubuntu_16.04.yaml
\ No newline at end of file
--- /dev/null
+../../../../../../distros/all/ubuntu_18.04.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 30 # GB
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+ - mgr.x
+- - client.1
+overrides:
+ ceph:
+ log-whitelist:
+ - failed to encode map
+ fs: xfs
--- /dev/null
+tasks:
+- install:
+ branch: luminous
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install luminous"
+- install.upgrade:
+ exclude_packages: ['ceph-test', 'ceph-test-dbg','libcephfs1']
+ client.1:
+- print: "**** done install.upgrade to -x on client.0"
+- ceph:
+- print: "**** done ceph task"
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default features: 61
+
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default features: 1
+
--- /dev/null
+tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/notify_master.sh
+ client.1:
+ - rbd/notify_slave.sh
+ env:
+ RBD_FEATURES: "61"
+- print: "**** done rbd: old librbd -> new librbd"
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/notify_slave.sh
+ client.1:
+ - rbd/notify_master.sh
+ env:
+ RBD_FEATURES: "61"
+- print: "**** done rbd: new librbd -> old librbd"
--- /dev/null
+../../../../../../distros/all/centos_7.4.yaml
\ No newline at end of file
--- /dev/null
+../../../../../../distros/all/rhel_7.5.yaml
\ No newline at end of file
--- /dev/null
+../../../../../../distros/all/ubuntu_16.04.yaml
\ No newline at end of file
--- /dev/null
+../../../../../../distros/all/ubuntu_18.04.yaml
\ No newline at end of file
- \(PG_
- Monitor daemon marked osd
- Behind on trimming
- - is unresponsive
conf:
global:
mon warn on pool no app: false
- scrub mismatch
- ScrubResult
- wrongly marked
- - (POOL_APP_NOT_ENABLED)
+ - \(POOL_APP_NOT_ENABLED\)
- overall HEALTH_
conf:
global:
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes, using one of them as a client,
+ with a separate client-only node.
+ Use xfs beneath the osds.
+ install ceph/luminous v12.2.2 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/luminous v12.2.5 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/luminous latest version
+ run workload and upgrade-sequence in parallel
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ - scrub
+ - osd_map_max_advance
+ - wrongly marked
+ - FS_DEGRADED
+ - POOL_APP_NOT_ENABLED
+ - CACHE_POOL_NO_HIT_SET
+ - POOL_FULL
+ - SMALLER_PG
+ - pool\(s\) full
+ - OSD_DOWN
+ - missing hit_sets
+ - CACHE_POOL_NEAR_FULL
+ - PG_AVAILABILITY
+ - PG_DEGRADED
+ - application not enabled
+ fs: xfs
+ conf:
+ mon:
+ mon debug unsafe allow tier with nonempty snaps: true
+ mon warn on pool no app: false
+ osd:
+ osd map max advance: 1000
+ osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ client:
+ rgw_crypt_require_ssl: false
+ rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
+roles:
+- - mon.a
+ - mds.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - mgr.x
+- - mon.b
+ - mon.c
+ - osd.3
+ - osd.4
+ - osd.5
+ - client.0
+- - client.1
+openstack:
+- volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
+tasks:
+- print: "**** v12.2.2 about to install"
+- install:
+ tag: v12.2.2
+  # line below can be removed; it's left over from the jewel test
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
+- print: "**** done v12.2.2 install"
+- ceph:
+ fs: xfs
+ add_osds_to_crush: true
+- print: "**** done ceph xfs"
+- sequential:
+ - workload
+- print: "**** done workload v12.2.2"
+
+#### upgrade to v12.2.5
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ tag: v12.2.5
+ mon.b:
+ tag: v12.2.5
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.5"
+#### upgrade to latest luminous
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ branch: luminous
+ mon.b:
+ branch: luminous
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous branch"
+
+#######################
+workload:
+ sequential:
+ - workunit:
+ clients:
+ client.0:
+ - suites/blogbench.sh
+workload_luminous:
+ full_sequential:
+ - workunit:
+ tag: v12.2.2
+ clients:
+ client.1:
+ - rados/test.sh
+ - cls
+ env:
+ CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
+ - print: "**** done rados/test.sh & cls workload_luminous"
+ - sequential:
+ - rgw: [client.0]
+ - print: "**** done rgw workload_luminous"
+ - s3tests:
+ client.0:
+ force-branch: ceph-luminous
+ rgw_server: client.0
+ scan_for_encryption_keys: false
+ - print: "**** done s3tests workload_luminous"
+upgrade-sequence_luminous:
+ sequential:
+ - print: "**** done branch: luminous install.upgrade"
+ - ceph.restart: [mds.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [osd.0]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.1]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.2]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.3]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.4]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.5]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.b]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.c]
+ - sleep:
+ duration: 60
+ - print: "**** done ceph.restart all luminous branch mds/osd/mon"
--- /dev/null
+../../../distros/supported/
\ No newline at end of file
- scrub mismatch
- ScrubResult
- wrongly marked
- - (POOL_APP_NOT_ENABLED)
+ - \(POOL_APP_NOT_ENABLED\)
- overall HEALTH_
conf:
global:
+++ /dev/null
-../../../../../distros/supported/centos_latest.yaml
\ No newline at end of file
+++ /dev/null
-../../../../../distros/supported/ubuntu_latest.yaml
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes, using one of them as a client,
- with a separate client-only node.
- Use xfs beneath the osds.
- install ceph/luminous v12.2.2 point version
- run workload and upgrade-sequence in parallel
- install ceph/luminous latest version
- run workload and upgrade-sequence in parallel
- install ceph/-x version (luminous or master/mimic)
- run workload and upgrade-sequence in parallel
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- - scrub
- - osd_map_max_advance
- - wrongly marked
- fs: xfs
- conf:
- mon:
- mon debug unsafe allow tier with nonempty snaps: true
- mon warn on pool no app: false
- osd:
- osd map max advance: 1000
- osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
- replica_log rgw sdk statelog timeindex user version"
- osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
- replica_log rgw sdk statelog timeindex user version"
- client:
- rgw_crypt_require_ssl: false
- rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
-roles:
-- - mon.a
- - mds.a
- - osd.0
- - osd.1
- - osd.2
- - mgr.x
-- - mon.b
- - mon.c
- - osd.3
- - osd.4
- - osd.5
- - client.0
-- - client.1
-openstack:
-- volumes: # attached to each instance
- count: 3
- size: 30 # GB
-tasks:
-- print: "**** v12.2.2 about to install"
-- install:
- tag: v12.2.2
- # line below can be removed its from jewel test
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
-- print: "**** done v12.2.2 install"
-- ceph:
- fs: xfs
- add_osds_to_crush: true
-- print: "**** done ceph xfs"
-- sequential:
- - workload
-- print: "**** done workload"
-- install.upgrade:
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- mon.a:
- branch: luminous
- mon.b:
- branch: luminous
- # Note that client.a IS NOT upgraded at this point
-- parallel:
- - workload_luminous
- - upgrade-sequence_luminous
-- print: "**** done parallel luminous branch"
-- install.upgrade:
- #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- client.1:
- branch: luminous
-- print: "**** done branch: luminous install.upgrade on client.1"
-- install.upgrade:
- mon.a:
- mon.b:
-- print: "**** done branch: -x install.upgrade on mon.a and mon.b"
-- parallel:
- - workload_x
- - upgrade-sequence_x
-- print: "**** done parallel -x branch"
-- exec:
- osd.0:
- - ceph osd set-require-min-compat-client luminous
-# Run librados tests on the -x upgraded cluster
-- install.upgrade:
- client.1:
-- workunit:
- branch: luminous
- clients:
- client.1:
- - rados/test.sh
- - cls
-- print: "**** done final test on -x cluster"
-#######################
-workload:
- sequential:
- - workunit:
- clients:
- client.0:
- - suites/blogbench.sh
-workload_luminous:
- full_sequential:
- - workunit:
- tag: v12.2.2
- clients:
- client.1:
- - rados/test.sh
- - cls
- env:
- CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
- - print: "**** done rados/test.sh & cls workload_luminous"
- - sequential:
- - rgw: [client.0]
- - print: "**** done rgw workload_luminous"
- - s3tests:
- client.0:
- force-branch: ceph-luminous
- rgw_server: client.0
- scan_for_encryption_keys: false
- - print: "**** done s3tests workload_luminous"
-upgrade-sequence_luminous:
- sequential:
- - print: "**** done branch: luminous install.upgrade"
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - ceph.restart: [osd.0]
- - sleep:
- duration: 30
- - ceph.restart: [osd.1]
- - sleep:
- duration: 30
- - ceph.restart: [osd.2]
- - sleep:
- duration: 30
- - ceph.restart: [osd.3]
- - sleep:
- duration: 30
- - ceph.restart: [osd.4]
- - sleep:
- duration: 30
- - ceph.restart: [osd.5]
- - sleep:
- duration: 60
- - ceph.restart: [mon.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.b]
- - sleep:
- duration: 60
- - ceph.restart: [mon.c]
- - sleep:
- duration: 60
- - print: "**** done ceph.restart all luminous branch mds/osd/mon"
-workload_x:
- sequential:
- - workunit:
- branch: luminous
- clients:
- client.1:
- - rados/test-upgrade-to-mimic.sh
- - cls
- - print: "**** done rados/test-upgrade-to-mimic.sh & cls workload_x NOT upgraded client"
- - workunit:
- branch: luminous
- clients:
- client.0:
- - rados/test-upgrade-to-mimic.sh
- - cls
- - print: "**** done rados/test.sh & cls workload_x upgraded client"
- - rgw: [client.1]
- - print: "**** done rgw workload_x"
- - s3tests:
- client.1:
- force-branch: ceph-luminous
- rgw_server: client.1
- scan_for_encryption_keys: false
- - print: "**** done s3tests workload_x"
-upgrade-sequence_x:
- sequential:
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.b]
- - sleep:
- duration: 60
- - ceph.restart: [mon.c]
- - sleep:
- duration: 60
- - ceph.restart: [osd.0]
- - sleep:
- duration: 30
- - ceph.restart: [osd.1]
- - sleep:
- duration: 30
- - ceph.restart: [osd.2]
- - sleep:
- duration: 30
- - ceph.restart: [osd.3]
- - sleep:
- duration: 30
- - ceph.restart: [osd.4]
- - sleep:
- duration: 30
- - ceph.restart:
- daemons: [osd.5]
- wait-for-healthy: false
- wait-for-up-osds: true
- - ceph.restart:
- daemons: [mgr.x]
- wait-for-healthy: false
- - exec:
- osd.0:
- - ceph osd require-osd-release luminous
- - ceph.healthy:
- - print: "**** done ceph.restart all -x branch mds/osd/mon"
# allow this to fail; in certain cases the OSD might not be up
# at this point. we will catch all pgs below.
try:
+ manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
+ 'osd_debug_deep_scrub_sleep', '0');
manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
except run.CommandFailedError:
pass
elif not config.get('only_mon'):
raise RuntimeError(
"The cluster is NOT operational due to insufficient OSDs")
+ # create rbd pool
+ ceph_admin.run(
+ args=[
+ 'sudo', 'ceph', '--cluster', 'ceph',
+ 'osd', 'pool', 'create', 'rbd', '128', '128'],
+ check_status=False)
+ ceph_admin.run(
+ args=[
+ 'sudo', 'ceph', '--cluster', 'ceph',
+ 'osd', 'pool', 'application', 'enable',
+ 'rbd', 'rbd', '--yes-i-really-mean-it'
+ ],
+ check_status=False)
yield
except Exception:
# both osd_mon_report_interval_min and mgr_stats_period are 5 seconds
# by default, and take the faulty injection in ms into consideration,
# 12 seconds are more than enough
- delays = [1, 1, 2, 3, 5, 8, 13]
+ delays = [1, 1, 2, 3, 5, 8, 13, 0]
@wraps(func)
def wrapper(self, *args, **kwargs):
exc = None
return result
- def get_rank_names(self):
+ def get_rank(self, rank=0, status=None):
+ if status is None:
+ status = self.getinfo()
+ return status.get_rank(self.id, rank)
+
+ def get_ranks(self, status=None):
+ if status is None:
+ status = self.getinfo()
+ return status.get_ranks(self.id)
+
+ def get_rank_names(self, status=None):
"""
Return MDS daemon names of those daemons holding a rank,
sorted by rank. This includes e.g. up:replay/reconnect
return self.json_asok(command, 'mds', mds_id)
+ def rank_asok(self, command, rank=0):
+ info = self.get_rank(rank=rank)
+ return self.json_asok(command, 'mds', info['name'])
+
def read_cache(self, path, depth=None):
cmd = ["dump", "tree", path]
if depth is not None:
assert not self.is_mounted()
self._fuse_conn = None
- def umount_wait(self, force=False, require_clean=False):
+ def umount_wait(self, force=False, require_clean=False, timeout=900):
"""
:param force: Complete cleanly even if the MDS is offline
"""
try:
if self.fuse_daemon:
# Permit a timeout, so that we do not block forever
- run.wait([self.fuse_daemon], 900)
+ run.wait([self.fuse_daemon], timeout)
except MaxWhileTries:
log.error("process failed to terminate after unmount. This probably"
"indicates a bug within ceph-fuse.")
"""
Look up the CephFS client ID for this mount
"""
-
return self.admin_socket(['mds_sessions'])['id']
+ def get_client_pid(self):
+ """
+ return pid of ceph-fuse process
+ """
+ status = self.admin_socket(['status'])
+ return status['metadata']['pid']
+
def get_osd_epoch(self):
"""
Return 2-tuple of osd_epoch, osd_epoch_barrier
def cleanup(self):
pass
- def umount_wait(self, force=False, require_clean=False):
+ def umount_wait(self, force=False, require_clean=False, timeout=900):
"""
Unlike the fuse client, the kernel client's umount is immediate
"""
import os
from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.packaging import get_package_version
+from unittest import SkipTest
log = logging.getLogger(__name__)
self.mount_b.mount()
self.mount_b.wait_until_mounted()
self.mount_b.run_shell(["ls", "subdir/childfile"])
+
+ def test_stale_renew(self):
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Require FUSE client to handle signal STOP/CONT")
+
+ self.mount_a.run_shell(["mkdir", "testdir"])
+ self.mount_a.run_shell(["touch", "testdir/file1"])
+ # populate readdir cache
+ self.mount_a.run_shell(["ls", "testdir"])
+ self.mount_b.run_shell(["ls", "testdir"])
+
+ # check if readdir cache is effective
+ initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+ self.mount_b.run_shell(["ls", "testdir"])
+ current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+ self.assertEqual(current_readdirs, initial_readdirs);
+
+ mount_b_gid = self.mount_b.get_global_id()
+ mount_b_pid = self.mount_b.get_client_pid()
+ # stop ceph-fuse process of mount_b
+ self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])
+
+ self.assert_session_state(mount_b_gid, "open")
+ time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale
+ self.assert_session_state(mount_b_gid, "stale")
+
+ self.mount_a.run_shell(["touch", "testdir/file2"])
+
+ # resume ceph-fuse process of mount_b
+ self.mount_b.client_remote.run(args=["sudo", "kill", "-CONT", mount_b_pid])
+ # Is the new file visible from mount_b? (caps become invalid after session stale)
+ self.mount_b.run_shell(["ls", "testdir/file2"])
+
+ def test_unmount_for_evicted_client(self):
+ """Test if client hangs on unmount after evicting the client."""
+ mount_a_client_id = self.mount_a.get_global_id()
+ self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+ self.mount_a.umount_wait(require_clean=True, timeout=30)
class TestExports(CephFSTestCase):
MDSS_REQUIRED = 2
+ CLIENTS_REQUIRED = 2
def _wait_subtrees(self, status, rank, test):
timeout = 30
self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/aa/bb', 0)])
self.mount_a.run_shell(["mv", "aa", "a/b/"])
self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/a/b/aa/bb', 0)])
+
+ def test_session_race(self):
+ """
+ Test session creation race.
+
+ See: https://tracker.ceph.com/issues/24072#change-113056
+ """
+
+ self.fs.set_max_mds(2)
+ self.fs.wait_for_daemons()
+
+ status = self.fs.status()
+ rank1 = self.fs.get_rank(rank=1, status=status)
+ name1 = 'mds.'+rank1['name']
+
+ # Create a directory that is pre-exported to rank 1
+ self.mount_a.run_shell(["mkdir", "-p", "a/aa"])
+ self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+ self._wait_subtrees(status, 1, [('/a', 1)])
+
+ # Now set the mds config to allow the race
+ self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "true"], rank=1)
+
+ # Now create another directory and try to export it
+ self.mount_b.run_shell(["mkdir", "-p", "b/bb"])
+ self.mount_b.setfattr("b", "ceph.dir.pin", "1")
+
+ time.sleep(5)
+
+ # Now turn off the race so that it doesn't wait again
+ self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "false"], rank=1)
+
+ # Now try to create a session with rank 1 by accessing a dir known to
+ # be there, if buggy, this should cause the rank 1 to crash:
+ self.mount_b.run_shell(["ls", "a"])
+
+ # Check if rank1 changed (standby tookover?)
+ new_rank1 = self.fs.get_rank(rank=1)
+ self.assertEqual(rank1['gid'], new_rank1['gid'])
self.mds_cluster.mds_restart()
self.fs.wait_for_daemons()
- def test_purge_queue_op_rate(self):
- """
- A busy purge queue is meant to aggregate operations sufficiently
- that our RADOS ops to the metadata pool are not O(files). Check
- that that is so.
- :return:
- """
-
- # For low rates of deletion, the rate of metadata ops actually
- # will be o(files), so to see the desired behaviour we have to give
- # the system a significant quantity, i.e. an order of magnitude
- # more than the number of files it will purge at one time.
-
- max_purge_files = 2
-
- self.set_conf('mds', 'mds_bal_frag', 'false')
- self.set_conf('mds', 'mds_max_purge_files', "%d" % max_purge_files)
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- phase_1_files = 256
- phase_2_files = 512
-
- self.mount_a.run_shell(["mkdir", "phase1"])
- self.mount_a.create_n_files("phase1/file", phase_1_files)
-
- self.mount_a.run_shell(["mkdir", "phase2"])
- self.mount_a.create_n_files("phase2/file", phase_2_files)
-
- def unlink_and_count_ops(path, expected_deletions):
- initial_ops = self.get_stat("objecter", "op")
- initial_pq_executed = self.get_stat("purge_queue", "pq_executed")
-
- self.mount_a.run_shell(["rm", "-rf", path])
-
- self._wait_for_counter(
- "purge_queue", "pq_executed", initial_pq_executed + expected_deletions
- )
-
- final_ops = self.get_stat("objecter", "op")
-
- # Calculation of the *overhead* operations, i.e. do not include
- # the operations where we actually delete files.
- return final_ops - initial_ops - expected_deletions
-
- self.fs.mds_asok(['flush', 'journal'])
- phase1_ops = unlink_and_count_ops("phase1/", phase_1_files + 1)
-
- self.fs.mds_asok(['flush', 'journal'])
- phase2_ops = unlink_and_count_ops("phase2/", phase_2_files + 1)
-
- log.info("Phase 1: {0}".format(phase1_ops))
- log.info("Phase 2: {0}".format(phase2_ops))
-
- # The success criterion is that deleting double the number
- # of files doesn't generate double the number of overhead ops
- # -- this comparison is a rough approximation of that rule.
- self.assertTrue(phase2_ops < phase1_ops * 1.25)
-
- # Finally, check that our activity did include properly quiescing
- # the queue (i.e. call to Journaler::write_head in the right place),
- # by restarting the MDS and checking that it doesn't try re-executing
- # any of the work we did.
- self.fs.mds_asok(['flush', 'journal']) # flush to ensure no strays
- # hanging around
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
- time.sleep(10)
- self.assertEqual(self.get_stat("purge_queue", "pq_executed"), 0)
-
def test_replicated_delete_speed(self):
"""
That deletions of replicated metadata are not pathologically slow
# it has lost network, because there is nothing to tell it that is messages
# are being dropped because it's identity is gone)
background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False)
- time.sleep(10) # Approximate check for 'stuck' as 'still running after 10s'
- self.assertFalse(background.finished)
+ try:
+ background.wait()
+ except CommandFailedError:
+ # command failed with EBLACKLISTED?
+ if "transport endpoint shutdown" in background.stderr.getvalue():
+ pass
+ else:
+ raise
# After deauthorisation, the client ID should be gone (this was the only
# volume it was authorised for)
self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()])
# Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined)
- self.mounts[2].kill()
- self.mounts[2].kill_cleanup()
- try:
- background.wait()
- except CommandFailedError:
- # We killed the mount out from under you
- pass
+ self.mounts[2].umount_wait()
self._volume_client_python(self.mount_b, dedent("""
vp = VolumePath("{group_id}", "{volume_id}")
vc.delete_volume(vp, data_isolated=True)
vc.purge_volume(vp, data_isolated=True)
vc.purge_volume(vp, data_isolated=True)
+
+ vc.create_volume(vp, 10, namespace_isolated=False)
+ vc.create_volume(vp, 10, namespace_isolated=False)
+ vc.authorize(vp, "{guest_entity}")
+ vc.authorize(vp, "{guest_entity}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.evict("{guest_entity}")
+ vc.evict("{guest_entity}")
+ vc.delete_volume(vp)
+ vc.delete_volume(vp)
+ vc.purge_volume(vp)
+ vc.purge_volume(vp)
""".format(
group_id=group_id,
volume_id=volume_id,
# auth ID belongs to, the auth ID's authorized access levels
# for different volumes, versioning details, etc.
expected_auth_metadata = {
- u"version": 2,
- u"compat_version": 1,
- u"dirty": False,
- u"tenant_id": u"tenant1",
- u"volumes": {
- u"groupid/volumeid": {
- u"dirty": False,
- u"access_level": u"rw",
+ "version": 2,
+ "compat_version": 1,
+ "dirty": False,
+ "tenant_id": u"tenant1",
+ "volumes": {
+ "groupid/volumeid": {
+ "dirty": False,
+ "access_level": u"rw",
}
}
}
auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+ import json
vp = VolumePath("{group_id}", "{volume_id}")
auth_metadata = vc._auth_metadata_get("{auth_id}")
- print auth_metadata
+ print(json.dumps(auth_metadata))
""".format(
group_id=group_id,
volume_id=volume_id,
auth_id=guestclient_1["auth_id"],
)))
+ auth_metadata = json.loads(auth_metadata)
- self.assertItemsEqual(str(expected_auth_metadata), auth_metadata)
+ self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+ del expected_auth_metadata["version"]
+ del auth_metadata["version"]
+ self.assertEqual(expected_auth_metadata, auth_metadata)
# Verify that the volume metadata file stores info about auth IDs
# and their access levels to the volume, versioning details, etc.
expected_vol_metadata = {
- u"version": 2,
- u"compat_version": 1,
- u"auths": {
- u"guest": {
- u"dirty": False,
- u"access_level": u"rw"
+ "version": 2,
+ "compat_version": 1,
+ "auths": {
+ "guest": {
+ "dirty": False,
+ "access_level": u"rw"
}
}
}
vol_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+ import json
vp = VolumePath("{group_id}", "{volume_id}")
volume_metadata = vc._volume_metadata_get(vp)
- print volume_metadata
+ print(json.dumps(volume_metadata))
""".format(
group_id=group_id,
volume_id=volume_id,
)))
- self.assertItemsEqual(str(expected_vol_metadata), vol_metadata)
+ vol_metadata = json.loads(vol_metadata)
+
+ self.assertGreaterEqual(vol_metadata["version"], expected_vol_metadata["version"])
+ del expected_vol_metadata["version"]
+ del vol_metadata["version"]
+ self.assertEqual(expected_vol_metadata, vol_metadata)
# Cannot authorize 'guestclient_2' to access the volume.
# It uses auth ID 'guest', which has already been used by a
# Mount the volume in the guest using the auth ID to assert that the
# auth caps are valid
guest_mount.mount(mount_path=mount_path)
+
+ def test_volume_without_namespace_isolation(self):
+ """
+ That volume client can create volumes that do not have separate RADOS
+ namespace layouts.
+ """
+ vc_mount = self.mounts[1]
+ vc_mount.umount_wait()
+
+ # Configure vc_mount as the handle for driving volumeclient
+ self._configure_vc_auth(vc_mount, "manila")
+
+ # Create a volume
+ volume_prefix = "/myprefix"
+ group_id = "grpid"
+ volume_id = "volid"
+ mount_path = self._volume_client_python(vc_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ create_result = vc.create_volume(vp, 1024*1024*10, namespace_isolated=False)
+ print create_result['mount_path']
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id
+ )), volume_prefix)
+
+ # The CephFS volume should be created
+ self.mounts[0].stat(os.path.join("myprefix", group_id, volume_id))
+ vol_namespace = self.mounts[0].getfattr(
+ os.path.join("myprefix", group_id, volume_id),
+ "ceph.dir.layout.pool_namespace")
+ assert not vol_namespace
+
+ self._volume_client_python(vc_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.delete_volume(vp)
+ vc.purge_volume(vp)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )), volume_prefix)
dir = '%s/ceph.data/test.%s' % (testdir, client)
- seed = str(int(random.uniform(1,100)))
+ seed = int(random.uniform(1,100))
+ start = 800 + random.randint(800,1200)
+ end = start + 150
try:
log.info('creating a working dir')
args=[
'cd', dir,
run.Raw('&&'),
- './run_seed_to_range.sh', seed, '50', '300',
+ './run_seed_to_range.sh', str(seed), str(start), str(end),
],
wait=False,
check_status=False)
manager = ctx.managers['ceph']
log.info('1. creating pool.a')
pool_a = manager.create_pool_with_unique_name(pg_num)
- manager.wait_for_clean()
- assert manager.get_num_active_clean() == pg_num
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created == pg_num
log.info('2. creating pool.b')
pool_b = manager.create_pool_with_unique_name(pg_num)
manager = ctx.managers['ceph']
log.info('1. creating pool.a')
pool_a = manager.create_pool_with_unique_name(pg_num)
- manager.wait_for_clean()
- assert manager.get_num_active_clean() == pg_num
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created == pg_num
log.info('2. creating pool.b')
while True:
(ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
assert ret == 200
assert out['suspended']
+ assert out['email'] == email
# TESTCASE 're-enable','user','enable','suspended user','succeeds'
(ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
#list pools
rados lspools
#list rbd images
-ceph osd pool create rbd 128 128
rbd ls
#check that the monitors work
ceph osd set nodown
--- /dev/null
+#!/usr/bin/env bash
+# Verify that CephFS recursive statistics (the ceph.dir.rfiles/rsubdirs/
+# rbytes/rctime virtual xattrs) propagate up the directory tree after
+# file and directory operations.
+
+set -x
+
+# Seconds to poll for an rstat to propagate before giving up.
+timeout=30
+old_value=""
+new_value=""
+
+# Poll ceph.dir.$1 on the cwd until it differs from $old_value.
+# Returns 0 on change (value left in $new_value), 1 on timeout.
+wait_until_changed() {
+    name=$1
+    wait=0
+    while [ $wait -lt $timeout ]; do
+        new_value=`getfattr --only-value -n ceph.dir.$name .`
+        [ $new_value == $old_value ] || return 0
+        sleep 1
+        wait=$(($wait + 1))
+    done
+    return 1
+}
+
+# Compare $old_value/$new_value as "sec.nsec" rctime stamps.
+# Returns 0 iff $new_value is strictly newer than $old_value.
+check_rctime() {
+    old_sec=$(echo $old_value | cut -d. -f1)
+    old_nsec=$(echo $old_value | cut -d. -f2)
+    new_sec=$(echo $new_value | cut -d. -f1)
+    new_nsec=$(echo $new_value | cut -d. -f2)
+    [ "$old_sec" -lt "$new_sec" ] && return 0
+    [ "$old_sec" -gt "$new_sec" ] && return 1
+    [ "$old_nsec" -lt "$new_nsec" ] && return 0
+    return 1
+}
+
+# sync(3) does not make ceph-fuse flush dirty caps, because fuse kernel module
+# does not notify ceph-fuse about it. Use fsync(3) instead.
+fsync_path() {
+    cmd="import os; fd=os.open(\"$1\", os.O_RDONLY); os.fsync(fd); os.close(fd)"
+    python -c "$cmd"
+}
+
+set -e
+
+mkdir -p rstats_testdir/d1/d2
+cd rstats_testdir
+
+# rfiles: recursive file count must grow by one when a file is created.
+old_value=`getfattr --only-value -n ceph.dir.rfiles .`
+[ $old_value == 0 ] || false
+touch d1/d2/f1
+wait_until_changed rfiles
+[ $new_value == $(($old_value + 1)) ] || false
+
+# rsubdirs: recursive dir count (".", d1, d2 => 3) must grow on mkdir.
+old_value=`getfattr --only-value -n ceph.dir.rsubdirs .`
+[ $old_value == 3 ] || false
+mkdir d1/d2/d3
+wait_until_changed rsubdirs
+[ $new_value == $(($old_value + 1)) ] || false
+
+# rbytes: recursive byte count must grow by the 6 bytes of "hello\n".
+old_value=`getfattr --only-value -n ceph.dir.rbytes .`
+[ $old_value == 0 ] || false
+echo hello > d1/d2/f2
+fsync_path d1/d2/f2
+wait_until_changed rbytes
+[ $new_value == $(($old_value + 6)) ] || false
+
+# rctime: recursive ctime must advance on a metadata change.
+old_value=`getfattr --only-value -n ceph.dir.rctime .`
+touch d1/d2/d3 # touch existing file
+fsync_path d1/d2/d3
+wait_until_changed rctime
+check_rctime
+
+old_value=`getfattr --only-value -n ceph.dir.rctime .`
+touch d1/d2/f3 # create new file
+wait_until_changed rctime
+check_rctime
+
+cd ..
+rm -rf rstats_testdir
+echo OK
ceph osd pool rm cold cold --yes-i-really-really-mean-it
ceph osd crush weight-set rm-compat
+# weight set vs device classes vs move
+ceph osd crush weight-set create-compat
+ceph osd crush add-bucket fooo host
+ceph osd crush move fooo root=default
+ceph osd crush add-bucket barr rack
+ceph osd crush move barr root=default
+ceph osd crush move fooo rack=barr
+ceph osd crush rm fooo
+ceph osd crush rm barr
+ceph osd crush weight-set rm-compat
+
echo OK
# compile code
cd rocksdb
-make env_librados_test ROCKSDB_USE_LIBRADOS=1 -j8
+make env_librados_test ROCKSDB_USE_LIBRADOS=1 DISABLE_WARNING_AS_ERROR=1 -j8
echo "Copy ceph.conf"
# prepare ceph.conf
--- /dev/null
+#!/usr/bin/python
+# -*- mode:python -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Copyright (C) 2017 Red Hat <contact@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+import json
+import rados
+import shlex
+import subprocess
+import time
+
+def cleanup(cluster):
+ cluster.delete_pool('large-omap-test-pool')
+ cluster.shutdown()
+
+def init():
+ # For local testing
+ #cluster = rados.Rados(conffile='./ceph.conf')
+ cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
+ cluster.connect()
+ print("\nCluster ID: " + cluster.get_fsid())
+ cluster.create_pool('large-omap-test-pool')
+ ioctx = cluster.open_ioctx('large-omap-test-pool')
+ ioctx.write_full('large-omap-test-object1', "Lorem ipsum")
+ op = ioctx.create_write_op()
+
+ keys = []
+ values = []
+ for x in range(20001):
+ keys.append(str(x))
+ values.append("X")
+
+ ioctx.set_omap(op, tuple(keys), tuple(values))
+ ioctx.operate_write_op(op, 'large-omap-test-object1', 0)
+ ioctx.release_write_op(op)
+
+ ioctx.write_full('large-omap-test-object2', "Lorem ipsum dolor")
+ op = ioctx.create_write_op()
+
+ buffer = ("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do "
+ "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut "
+ "enim ad minim veniam, quis nostrud exercitation ullamco laboris "
+ "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in "
+ "reprehenderit in voluptate velit esse cillum dolore eu fugiat "
+ "nulla pariatur. Excepteur sint occaecat cupidatat non proident, "
+ "sunt in culpa qui officia deserunt mollit anim id est laborum.")
+
+ keys = []
+ values = []
+ for x in xrange(20000):
+ keys.append(str(x))
+ values.append(buffer)
+
+ ioctx.set_omap(op, tuple(keys), tuple(values))
+ ioctx.operate_write_op(op, 'large-omap-test-object2', 0)
+ ioctx.release_write_op(op)
+ ioctx.close()
+ return cluster
+
+def get_deep_scrub_timestamp(pgid):
+ cmd = ['ceph', 'pg', 'dump', '--format=json-pretty']
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+ out = proc.communicate()[0]
+ for stat in json.loads(out)['pg_stats']:
+ if stat['pgid'] == pgid:
+ return stat['last_deep_scrub_stamp']
+
+def wait_for_scrub():
+ osds = set();
+ pgs = dict();
+ cmd = ['ceph', 'osd', 'map', 'large-omap-test-pool',
+ 'large-omap-test-object1', '--format=json-pretty']
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+ out = proc.communicate()[0]
+ osds.add(json.loads(out)['acting_primary'])
+ pgs[json.loads(out)['pgid']] = get_deep_scrub_timestamp(json.loads(out)['pgid'])
+ cmd = ['ceph', 'osd', 'map', 'large-omap-test-pool',
+ 'large-omap-test-object2', '--format=json-pretty']
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+ out = proc.communicate()[0]
+ osds.add(json.loads(out)['acting_primary'])
+ pgs[json.loads(out)['pgid']] = get_deep_scrub_timestamp(json.loads(out)['pgid'])
+
+ for pg in pgs:
+ command = "ceph pg deep-scrub " + str(pg)
+ subprocess.check_call(shlex.split(command))
+
+ for pg in pgs:
+ RETRIES = 0
+ while RETRIES < 60 and pgs[pg] == get_deep_scrub_timestamp(pg):
+ time.sleep(10)
+ RETRIES += 1
+
+def check_health_output():
+ RETRIES = 0
+ result = 0
+ while RETRIES < 6 and result != 2:
+ result = 0
+ RETRIES += 1
+ output = subprocess.check_output(["ceph", "health", "detail"])
+ for line in output.splitlines():
+ result += int(line.find('2 large omap objects') != -1)
+ time.sleep(10)
+
+ if result != 2:
+ print("Error, got invalid output:")
+ print(output)
+ raise Exception
+
+def main():
+ cluster = init()
+ wait_for_scrub()
+ check_health_output()
+
+ cleanup(cluster)
+
+if __name__ == '__main__':
+ main()
set -e
-expect_1()
+KEYRING=$(mktemp)
+trap cleanup EXIT ERR HUP INT QUIT
+
+cleanup() {
+ (ceph auth del client.mon_read || true) >/dev/null 2>&1
+ (ceph auth del client.mon_write || true) >/dev/null 2>&1
+
+ rm -f $KEYRING
+}
+
+expect_false()
{
- set -x
- set +e
- "$@"
- if [ $? == 1 ]; then return 0; else return 1; fi
+ set -x
+ if "$@"; then return 1; else return 0; fi
+}
+
+create_pool_op() {
+ ID=$1
+ POOL=$2
+
+ cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+cluster.create_pool("${POOL}")
+EOF
}
+delete_pool_op() {
+ ID=$1
+ POOL=$2
+
+ cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+cluster.delete_pool("${POOL}")
+EOF
+}
+
+create_pool_snap_op() {
+ ID=$1
+ POOL=$2
+ SNAP=$3
+
+ cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+ioctx = cluster.open_ioctx("${POOL}")
+
+ioctx.create_snap("${SNAP}")
+EOF
+}
+
+remove_pool_snap_op() {
+ ID=$1
+ POOL=$2
+ SNAP=$3
+
+ cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+ioctx = cluster.open_ioctx("${POOL}")
+
+ioctx.remove_snap("${SNAP}")
+EOF
+}
+
+test_pool_op()
+{
+ ceph auth get-or-create client.mon_read mon 'allow r' >> $KEYRING
+ ceph auth get-or-create client.mon_write mon 'allow *' >> $KEYRING
+
+ expect_false create_pool_op mon_read pool1
+ create_pool_op mon_write pool1
+
+ expect_false create_pool_snap_op mon_read pool1 snap1
+ create_pool_snap_op mon_write pool1 snap1
+
+ expect_false remove_pool_snap_op mon_read pool1 snap1
+ remove_pool_snap_op mon_write pool1 snap1
+
+ expect_false delete_pool_op mon_read pool1
+ delete_pool_op mon_write pool1
+}
key=`ceph auth get-or-create-key client.poolaccess1 mon 'allow r' osd 'allow *'`
rados --id poolaccess1 --key $key -p rbd ls
key=`ceph auth get-or-create-key client.poolaccess2 mon 'allow r' osd 'allow * pool=nopool'`
-expect_1 rados --id poolaccess2 --key $key -p rbd ls
+expect_false rados --id poolaccess2 --key $key -p rbd ls
key=`ceph auth get-or-create-key client.poolaccess3 mon 'allow r' osd 'allow rw pool=nopool'`
-expect_1 rados --id poolaccess3 --key $key -p rbd ls
+expect_false rados --id poolaccess3 --key $key -p rbd ls
+
+test_pool_op
echo OK
dd if=/bin/dd of=${TMPDIR}/img bs=1k count=10 seek=100
rbd import $RBD_CREATE_ARGS ${TMPDIR}/img testimg
rbd snap create testimg@snap
+ rbd image-meta set testimg key1 value1
+ IMAGEMETA_BEFORE=`rbd image-meta list testimg`
rbd export --export-format 2 testimg ${TMPDIR}/img_v2
rbd import --export-format 2 ${TMPDIR}/img_v2 testimg_import
rbd info testimg_import
rbd info testimg_import@snap
+ IMAGEMETA_AFTER=`rbd image-meta list testimg_import`
+ [ "$IMAGEMETA_BEFORE" = "$IMAGEMETA_AFTER" ]
# compare the contents between testimg and testimg_import
rbd export testimg_import ${TMPDIR}/img_import
rbd import --stripe-count 1000 --stripe-unit 4096 ${TMPDIR}/img testimg
rbd export --export-format 2 testimg ${TMPDIR}/img_v2
rbd import --export-format 2 ${TMPDIR}/img_v2 testimg_import
- rbd info testimg_import|grep "stripe unit"|awk '{print $3}'|grep 4096
+ rbd info testimg_import|grep "stripe unit"|grep -Ei '(4 KiB|4K|4096)'
rbd info testimg_import|grep "stripe count"|awk '{print $3}'|grep 1000
rm ${TMPDIR}/img_v2
# 1M sparse, 1M data
rbd rm sparse1 || true
rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -Ei '(2M|2048k)'
+rbd ls -l | grep sparse1 | grep -Ei '(2 MiB|2M|2048k)'
[ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
# export, compare contents and on-disk size
# 1M data, 1M sparse
rbd rm sparse2 || true
rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -Ei '(2M|2048k)'
+rbd ls -l | grep sparse2 | grep -Ei '(2 MiB|2M|2048k)'
[ $tiered -eq 1 -o "$(objects sparse2)" = '0' ]
rbd export sparse2 ${TMPDIR}/sparse2.out
compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
truncate ${TMPDIR}/sparse1 -s 10M
# import from stdin just for fun, verify still sparse
rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -Ei '(10M|10240k)'
+rbd ls -l | grep sparse1 | grep -Ei '(10 MiB|10M|10240k)'
[ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
rbd export sparse1 ${TMPDIR}/sparse1.out
compare_files_and_ondisk_sizes ${TMPDIR}/sparse1 ${TMPDIR}/sparse1.out
dd if=/dev/urandom bs=2M count=1 of=${TMPDIR}/sparse2 oflag=append conv=notrunc
# again from stdin
rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -Ei '(4M|4096k)'
+rbd ls -l | grep sparse2 | grep -Ei '(4 MiB|4M|4096k)'
[ $tiered -eq 1 -o "$(objects sparse2)" = '0 2 3' ]
rbd export sparse2 ${TMPDIR}/sparse2.out
compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
delete_users() {
(ceph auth del client.volumes || true) >/dev/null 2>&1
(ceph auth del client.images || true) >/dev/null 2>&1
+
+ (ceph auth del client.snap_none || true) >/dev/null 2>&1
+ (ceph auth del client.snap_all || true) >/dev/null 2>&1
+ (ceph auth del client.snap_pool || true) >/dev/null 2>&1
+ (ceph auth del client.snap_profile_all || true) >/dev/null 2>&1
+ (ceph auth del client.snap_profile_pool || true) >/dev/null 2>&1
+
+ (ceph auth del client.mon_write || true) >/dev/null 2>&1
}
create_users() {
ceph auth get-or-create client.volumes mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow r class-read pool images, allow rwx pool volumes' >> $KEYRING
ceph auth get-or-create client.images mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool images' >> $KEYRING
+
+ ceph auth get-or-create client.snap_none mon 'allow r' >> $KEYRING
+ ceph auth get-or-create client.snap_all mon 'allow r' osd 'allow w' >> $KEYRING
+ ceph auth get-or-create client.snap_pool mon 'allow r' osd 'allow w pool=images' >> $KEYRING
+ ceph auth get-or-create client.snap_profile_all mon 'allow r' osd 'profile rbd' >> $KEYRING
+ ceph auth get-or-create client.snap_profile_pool mon 'allow r' osd 'profile rbd pool=images' >> $KEYRING
+
+ ceph auth get-or-create client.mon_write mon 'allow *' >> $KEYRING
}
expect() {
rbd -k $KEYRING --id volumes rm volumes/child
}
+# Create a self-managed snapshot in pool $2 via the rados python bindings,
+# authenticating as client.$1 with the key read from $KEYRING. The python
+# process's exit status propagates through the pipeline for expect() checks.
+create_self_managed_snapshot() {
+    ID=$1
+    POOL=$2
+
+    cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+ioctx = cluster.open_ioctx("${POOL}")
+
+snap_id = ioctx.create_self_managed_snap()
+print ("Created snap id {}".format(snap_id))
+EOF
+}
+
+remove_self_managed_snapshot() {
+ ID=$1
+ POOL=$2
+
+ cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster1 = rados.Rados(conffile="", rados_id="mon_write")
+cluster1.connect()
+ioctx1 = cluster1.open_ioctx("${POOL}")
+
+snap_id = ioctx1.create_self_managed_snap()
+print ("Created snap id {}".format(snap_id))
+
+cluster2 = rados.Rados(conffile="", rados_id="${ID}")
+cluster2.connect()
+ioctx2 = cluster2.open_ioctx("${POOL}")
+
+ioctx2.remove_self_managed_snap(snap_id)
+print ("Removed snap id {}".format(snap_id))
+EOF
+}
+
+test_remove_self_managed_snapshots() {
+ # Ensure users cannot create self-managed snapshots w/o permissions
+ expect 1 create_self_managed_snapshot snap_none images
+ expect 1 create_self_managed_snapshot snap_none volumes
+
+ create_self_managed_snapshot snap_all images
+ create_self_managed_snapshot snap_all volumes
+
+ create_self_managed_snapshot snap_pool images
+ expect 1 create_self_managed_snapshot snap_pool volumes
+
+ create_self_managed_snapshot snap_profile_all images
+ create_self_managed_snapshot snap_profile_all volumes
+
+ create_self_managed_snapshot snap_profile_pool images
+ expect 1 create_self_managed_snapshot snap_profile_pool volumes
+
+ # Ensure users cannot delete self-managed snapshots w/o permissions
+ expect 1 remove_self_managed_snapshot snap_none images
+ expect 1 remove_self_managed_snapshot snap_none volumes
+
+ remove_self_managed_snapshot snap_all images
+ remove_self_managed_snapshot snap_all volumes
+
+ remove_self_managed_snapshot snap_pool images
+ expect 1 remove_self_managed_snapshot snap_pool volumes
+
+ remove_self_managed_snapshot snap_profile_all images
+ remove_self_managed_snapshot snap_profile_all volumes
+
+ remove_self_managed_snapshot snap_profile_pool images
+ expect 1 remove_self_managed_snapshot snap_profile_pool volumes
+}
+
cleanup() {
rm -f $KEYRING
}
+
KEYRING=$(mktemp)
trap cleanup EXIT ERR HUP INT QUIT
recreate_pools
test_volumes_access
+test_remove_self_managed_snapshots
+
delete_pools
delete_users
set_image_meta ${CLUSTER2} ${POOL} ${image} \
conf_rbd_mirroring_resync_after_disconnect true
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
-image_id=$(get_image_id ${CLUSTER1} ${pool} ${image})
+image_id=$(get_image_id ${CLUSTER1} ${POOL} ${image})
disconnect_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
('get', '/request?page=0', {}),
('delete', '/request', {}),
('get', '/request', {}),
+ ('patch', '/pool/1', {'pg_num': 128}),
+ ('patch', '/pool/1', {'pgp_num': 128}),
]
for method, endpoint, args in screenplay:
class dir read;
class file { getattr read open };
class blk_file { getattr ioctl open read write };
+ class capability2 block_suspend;
}
########################################
allow ceph_t self:fifo_file rw_fifo_file_perms;
allow ceph_t self:unix_stream_socket create_stream_socket_perms;
allow ceph_t self:capability { setuid setgid dac_override };
+allow ceph_t self:capability2 block_suspend;
manage_dirs_pattern(ceph_t, ceph_log_t, ceph_log_t)
manage_files_pattern(ceph_t, ceph_log_t, ceph_log_t)
nis_use_ypbind_uncond(ceph_t)
storage_raw_rw_fixed_disk(ceph_t)
files_manage_generic_locks(ceph_t)
+libs_exec_ldconfig(ceph_t)
allow ceph_t sysfs_t:dir read;
allow ceph_t sysfs_t:file { read getattr open };
-cad919881333ac92274171586c827e01f554a70a
-v12.2.5
+3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5
+v12.2.7
common/bit_str.cc
osdc/Striper.cc
osdc/Objecter.cc
+ common/compat.cc
common/Graylog.cc
common/fs_types.cc
common/dns_resolve.cc
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})
endif(WITH_CCACHE AND CCACHE_FOUND)
+ list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_SNAPPY=${SNAPPY_FOUND})
+ list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_LZ4=${LZ4_FOUND})
+ list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_ZLIB=${ZLIB_FOUND})
+
# SSE 4.2 is enabled by default in rocksdb's crc32c. For details refer to
# rocksdb/util/crc32c.cc.
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_AR=${CMAKE_AR})
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
-
- if (CMAKE_CXX_COMPILER_ID STREQUAL Clang)
- list(APPEND ROCKSDB_CMAKE_ARGS -DFAIL_ON_WARNINGS=OFF)
- endif()
+ list(APPEND ROCKSDB_CMAKE_ARGS -DFAIL_ON_WARNINGS=OFF)
# we use an external project and copy the sources to bin directory to ensure
# that object files are built outside of the source tree.
/* flags we export */
int ceph_arch_neon = 0;
int ceph_arch_aarch64_crc32 = 0;
+int ceph_arch_aarch64_pmull = 0;
#include <stdio.h>
#include <elf.h>
#include <link.h> // ElfW macro
+#include <sys/auxv.h>
#if __arm__ || __aarch64__
#include <asm/hwcap.h>
#endif // __arm__
-static unsigned long get_auxval(unsigned long type)
-{
- unsigned long result = 0;
- FILE *f = fopen("/proc/self/auxv", "r");
- if (f) {
- ElfW(auxv_t) entry;
- while (fread(&entry, sizeof(entry), 1, f) == 1) {
- if (entry.a_type == type) {
- result = entry.a_un.a_val;
- break;
- }
- }
- fclose(f);
- }
- return result;
-}
-
-static unsigned long get_hwcap(void)
-{
- return get_auxval(AT_HWCAP);
-}
-
#endif // __linux__
int ceph_arch_arm_probe(void)
{
-#if __arm__ && __linux__
- ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
-#elif __aarch64__ && __linux__
- ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
-# if defined(HAVE_ARMV8_CRC) && defined(HWCAP_CRC32)
- ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
-# endif
-#else
- if (0)
- get_hwcap(); // make compiler shut up
+#if __linux__
+ unsigned long hwcap = getauxval(AT_HWCAP);
+#if __arm__
+ ceph_arch_neon = (hwcap & HWCAP_NEON) == HWCAP_NEON;
+#elif __aarch64__
+ ceph_arch_neon = (hwcap & HWCAP_ASIMD) == HWCAP_ASIMD;
+ ceph_arch_aarch64_crc32 = (hwcap & HWCAP_CRC32) == HWCAP_CRC32;
+ ceph_arch_aarch64_pmull = (hwcap & HWCAP_PMULL) == HWCAP_PMULL;
#endif
+#endif // __linux__
return 0;
}
extern int ceph_arch_neon; /* true if we have ARM NEON or ASIMD abilities */
extern int ceph_arch_aarch64_crc32; /* true if we have AArch64 CRC32/CRC32C abilities */
+extern int ceph_arch_aarch64_pmull; /* true if we have AArch64 PMULL abilities */
extern int ceph_arch_arm_probe(void);
explicit AuthAuthorizer(__u32 p) : protocol(p) {}
virtual ~AuthAuthorizer() {}
virtual bool verify_reply(bufferlist::iterator& reply) = 0;
+ virtual bool add_challenge(CephContext *cct, bufferlist& challenge) = 0;
+};
+
+struct AuthAuthorizerChallenge {
+ virtual ~AuthAuthorizerChallenge() {}
};
virtual bool verify_authorizer(CephContext *cct, KeyStore *keys,
bufferlist& authorizer_data, bufferlist& authorizer_reply,
EntityName& entity_name, uint64_t& global_id,
- AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid = NULL) = 0;
+ AuthCapsInfo& caps_info, CryptoKey& session_key,
+ uint64_t *auid,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) = 0;
virtual int authorizer_session_crypto() = 0;
};
-bool CephxAuthorizeHandler::verify_authorizer(CephContext *cct, KeyStore *keys,
- bufferlist& authorizer_data, bufferlist& authorizer_reply,
- EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid)
+bool CephxAuthorizeHandler::verify_authorizer(
+ CephContext *cct, KeyStore *keys,
+ bufferlist& authorizer_data, bufferlist& authorizer_reply,
+ EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info,
+ CryptoKey& session_key, uint64_t *auid,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
bufferlist::iterator iter = authorizer_data.begin();
CephXServiceTicketInfo auth_ticket_info;
- bool isvalid = cephx_verify_authorizer(cct, keys, iter, auth_ticket_info, authorizer_reply);
+ bool isvalid = cephx_verify_authorizer(cct, keys, iter, auth_ticket_info, challenge,
+ authorizer_reply);
if (isvalid) {
caps_info = auth_ticket_info.ticket.caps;
bool verify_authorizer(CephContext *cct, KeyStore *keys,
bufferlist& authorizer_data, bufferlist& authorizer_reply,
EntityName& entity_name, uint64_t& global_id,
- AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid = NULL) override;
+ AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
int authorizer_session_crypto() override;
};
{
CephXAuthorizer *a = new CephXAuthorizer(cct);
a->session_key = session_key;
- a->nonce = ((uint64_t)rand() << 32) + rand();
+ get_random_bytes((char*)&a->nonce, sizeof(a->nonce));
__u8 authorizer_v = 1;
::encode(authorizer_v, a->bl);
::encode(service_id, a->bl);
::encode(ticket, a->bl);
+ a->base_bl = a->bl;
CephXAuthorize msg;
msg.nonce = a->nonce;
*/
bool cephx_verify_authorizer(CephContext *cct, KeyStore *keys,
bufferlist::iterator& indata,
- CephXServiceTicketInfo& ticket_info, bufferlist& reply_bl)
+ CephXServiceTicketInfo& ticket_info,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge,
+ bufferlist& reply_bl)
{
__u8 authorizer_v;
uint32_t service_id;
return false;
}
+ if (challenge) {
+ auto *c = static_cast<CephXAuthorizeChallenge*>(challenge->get());
+ if (!auth_msg.have_challenge || !c) {
+ c = new CephXAuthorizeChallenge;
+ challenge->reset(c);
+ get_random_bytes((char*)&c->server_challenge, sizeof(c->server_challenge));
+ ldout(cct,10) << __func__ << " adding server_challenge " << c->server_challenge
+ << dendl;
+
+ encode_encrypt_enc_bl(cct, *c, ticket_info.session_key, reply_bl, error);
+ if (!error.empty()) {
+ ldout(cct, 10) << "verify_authorizer: encode_encrypt error: " << error << dendl;
+ return false;
+ }
+ return false;
+ }
+ ldout(cct, 10) << __func__ << " got server_challenge+1 "
+ << auth_msg.server_challenge_plus_one
+ << " expecting " << c->server_challenge + 1 << dendl;
+ if (c->server_challenge + 1 != auth_msg.server_challenge_plus_one) {
+ return false;
+ }
+ }
+
/*
* Reply authorizer:
* {timestamp + 1}^session_key
return true;
}
+// Rebuild this authorizer's payload, incorporating the server's challenge
+// if one was provided. Returns false if the challenge cannot be decrypted
+// or the new CephXAuthorize message cannot be encrypted.
+bool CephXAuthorizer::add_challenge(CephContext *cct, bufferlist& challenge)
+{
+  // Start from the unencrypted prefix (service id + ticket); the encrypted
+  // CephXAuthorize message is re-appended below.
+  bl = base_bl;
+
+  CephXAuthorize msg;
+  msg.nonce = nonce;
+
+  auto p = challenge.begin();
+  if (!p.end()) {
+    // Server sent a challenge: decrypt it with the session key and echo
+    // back server_challenge + 1 to prove possession of the key.
+    std::string error;
+    CephXAuthorizeChallenge ch;
+    decode_decrypt_enc_bl(cct, ch, session_key, challenge, error);
+    if (!error.empty()) {
+      ldout(cct, 0) << "failed to decrypt challenge (" << challenge.length() << " bytes): "
+                    << error << dendl;
+      return false;
+    }
+    msg.have_challenge = true;
+    msg.server_challenge_plus_one = ch.server_challenge + 1;
+  }
+
+  std::string error;
+  if (encode_encrypt(cct, msg, session_key, bl, error)) {
+    ldout(cct, 0) << __func__ << " failed to encrypt authorizer: " << error << dendl;
+    return false;
+  }
+  return true;
+}
CephContext *cct;
public:
uint64_t nonce;
+ bufferlist base_bl;
explicit CephXAuthorizer(CephContext *cct_)
: AuthAuthorizer(CEPH_AUTH_CEPHX), cct(cct_), nonce(0) {}
bool build_authorizer();
bool verify_reply(bufferlist::iterator& reply) override;
+ bool add_challenge(CephContext *cct, bufferlist& challenge) override;
};
};
WRITE_CLASS_ENCODER(CephXServiceTicketInfo)
+// Random challenge generated by the server during authorizer verification;
+// the client must echo server_challenge + 1 (encrypted with the session key)
+// in its resent authorizer, preventing replay of captured authorizers.
+struct CephXAuthorizeChallenge : public AuthAuthorizerChallenge {
+  uint64_t server_challenge;
+  void encode(bufferlist& bl) const {
+    __u8 struct_v = 1;
+    ::encode(struct_v, bl);
+    ::encode(server_challenge, bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    __u8 struct_v;
+    ::decode(struct_v, bl);
+    ::decode(server_challenge, bl);
+  }
+};
+WRITE_CLASS_ENCODER(CephXAuthorizeChallenge)
+
struct CephXAuthorize {
uint64_t nonce;
+ bool have_challenge = false;
+ uint64_t server_challenge_plus_one = 0;
void encode(bufferlist& bl) const {
- __u8 struct_v = 1;
+ __u8 struct_v = 2;
::encode(struct_v, bl);
::encode(nonce, bl);
+ ::encode(have_challenge, bl);
+ ::encode(server_challenge_plus_one, bl);
}
void decode(bufferlist::iterator& bl) {
__u8 struct_v;
::decode(struct_v, bl);
::decode(nonce, bl);
+ if (struct_v >= 2) {
+ ::decode(have_challenge, bl);
+ ::decode(server_challenge_plus_one, bl);
+ }
+
}
};
WRITE_CLASS_ENCODER(CephXAuthorize)
/*
* Verify authorizer and generate reply authorizer
*/
-extern bool cephx_verify_authorizer(CephContext *cct, KeyStore *keys,
- bufferlist::iterator& indata,
- CephXServiceTicketInfo& ticket_info, bufferlist& reply_bl);
+extern bool cephx_verify_authorizer(
+ CephContext *cct, KeyStore *keys,
+ bufferlist::iterator& indata,
+ CephXServiceTicketInfo& ticket_info,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge,
+ bufferlist& reply_bl);
bufferlist tmp_bl;
CephXServiceTicketInfo auth_ticket_info;
- if (!cephx_verify_authorizer(cct, key_server, indata, auth_ticket_info, tmp_bl)) {
+ // note: no challenge here.
+ if (!cephx_verify_authorizer(cct, key_server, indata, auth_ticket_info, nullptr,
+ tmp_bl)) {
ret = -EPERM;
break;
}
const ceph_msg_header& header = m->get_header();
const ceph_msg_footer& footer = m->get_footer();
- // optimized signature calculation
- // - avoid temporary allocated buffers from encode_encrypt[_enc_bl]
- // - skip the leading 4 byte wrapper from encode_encrypt
- struct {
- __u8 v;
- __le64 magic;
- __le32 len;
- __le32 header_crc;
- __le32 front_crc;
- __le32 middle_crc;
- __le32 data_crc;
- } __attribute__ ((packed)) sigblock = {
- 1, mswab(AUTH_ENC_MAGIC), mswab<uint32_t>(4*4),
- mswab<uint32_t>(header.crc), mswab<uint32_t>(footer.front_crc),
- mswab<uint32_t>(footer.middle_crc), mswab<uint32_t>(footer.data_crc)
- };
- bufferlist bl_plaintext;
- bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock));
-
- bufferlist bl_ciphertext;
- if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
- lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
- return -1;
- }
+ if (!HAVE_FEATURE(features, CEPHX_V2)) {
+ // legacy pre-mimic behavior for compatibility
+
+ // optimized signature calculation
+ // - avoid temporary allocated buffers from encode_encrypt[_enc_bl]
+ // - skip the leading 4 byte wrapper from encode_encrypt
+ struct {
+ __u8 v;
+ __le64 magic;
+ __le32 len;
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 middle_crc;
+ __le32 data_crc;
+ } __attribute__ ((packed)) sigblock = {
+ 1, mswab(AUTH_ENC_MAGIC), mswab<uint32_t>(4*4),
+ mswab<uint32_t>(header.crc), mswab<uint32_t>(footer.front_crc),
+ mswab<uint32_t>(footer.middle_crc), mswab<uint32_t>(footer.data_crc)
+ };
+
+ bufferlist bl_plaintext;
+ bl_plaintext.append(buffer::create_static(sizeof(sigblock),
+ (char*)&sigblock));
+
+ bufferlist bl_ciphertext;
+ if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
+ lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
+ return -1;
+ }
- bufferlist::iterator ci = bl_ciphertext.begin();
- ::decode(*psig, ci);
+ bufferlist::iterator ci = bl_ciphertext.begin();
+ ::decode(*psig, ci);
+ } else {
+ // newer mimic+ signatures
+ struct {
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 front_len;
+ __le32 middle_crc;
+ __le32 middle_len;
+ __le32 data_crc;
+ __le32 data_len;
+ __le32 seq_lower_word;
+ } __attribute__ ((packed)) sigblock = {
+ mswab<uint32_t>(header.crc),
+ mswab<uint32_t>(footer.front_crc),
+ mswab<uint32_t>(header.front_len),
+ mswab<uint32_t>(footer.middle_crc),
+ mswab<uint32_t>(header.middle_len),
+ mswab<uint32_t>(footer.data_crc),
+ mswab<uint32_t>(header.data_len),
+ mswab<uint32_t>(header.seq)
+ };
+
+ bufferlist bl_plaintext;
+ bl_plaintext.append(buffer::create_static(sizeof(sigblock),
+ (char*)&sigblock));
+
+ bufferlist bl_ciphertext;
+ if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
+ lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
+ return -1;
+ }
+
+ struct enc {
+ __le64 a, b, c, d;
+ } *penc = reinterpret_cast<enc*>(bl_ciphertext.c_str());
+ *psig = penc->a ^ penc->b ^ penc->c ^ penc->d;
+ }
ldout(cct, 10) << __func__ << " seq " << m->get_seq()
<< " front_crc_ = " << footer.front_crc
#define dout_subsys ceph_subsys_auth
-bool AuthNoneAuthorizeHandler::verify_authorizer(CephContext *cct, KeyStore *keys,
- bufferlist& authorizer_data, bufferlist& authorizer_reply,
- EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info, CryptoKey& session_key,
-uint64_t *auid)
+bool AuthNoneAuthorizeHandler::verify_authorizer(
+ CephContext *cct, KeyStore *keys,
+ bufferlist& authorizer_data, bufferlist& authorizer_reply,
+ EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info,
+ CryptoKey& session_key,
+ uint64_t *auid,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
bufferlist::iterator iter = authorizer_data.begin();
bool verify_authorizer(CephContext *cct, KeyStore *keys,
bufferlist& authorizer_data, bufferlist& authorizer_reply,
EntityName& entity_name, uint64_t& global_id,
- AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid=NULL) override;
+ AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
int authorizer_session_crypto() override;
};
#include "auth/Auth.h"
+class CephContext;
+
struct AuthNoneAuthorizer : public AuthAuthorizer {
AuthNoneAuthorizer() : AuthAuthorizer(CEPH_AUTH_NONE) { }
bool build_authorizer(const EntityName &ename, uint64_t global_id) {
return 0;
}
bool verify_reply(bufferlist::iterator& reply) override { return true; }
+ bool add_challenge(CephContext *cct, bufferlist& ch) override { return true; }
};
#endif
#include "AuthUnknownAuthorizeHandler.h"
-bool AuthUnknownAuthorizeHandler::verify_authorizer(CephContext *cct, KeyStore *keys,
- bufferlist& authorizer_data, bufferlist& authorizer_reply,
- EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info, CryptoKey& session_key,
-uint64_t *auid)
+bool AuthUnknownAuthorizeHandler::verify_authorizer(
+ CephContext *cct, KeyStore *keys,
+ bufferlist& authorizer_data, bufferlist& authorizer_reply,
+ EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info,
+ CryptoKey& session_key,
+ uint64_t *auid,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
// For unknown authorizers, there's nothing to verify. They're "OK" by definition. PLR
bool verify_authorizer(CephContext *cct, KeyStore *keys,
bufferlist& authorizer_data, bufferlist& authorizer_reply,
EntityName& entity_name, uint64_t& global_id,
- AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid=NULL) override;
+ AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
int authorizer_session_crypto() override;
};
def check_journal_reqs(args):
+ log_file = "/var/log/ceph/$cluster-osd-check.log"
_, _, allows_journal = command([
'ceph-osd', '--check-allows-journal',
'-i', '0',
- '--log-file', '$run_dir/$cluster-osd-check.log',
+ '--log-file', log_file,
'--cluster', args.cluster,
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
_, _, wants_journal = command([
'ceph-osd', '--check-wants-journal',
'-i', '0',
- '--log-file', '$run_dir/$cluster-osd-check.log',
+ '--log-file', log_file,
'--cluster', args.cluster,
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
_, _, needs_journal = command([
'ceph-osd', '--check-needs-journal',
'-i', '0',
- '--log-file', '$run_dir/$cluster-osd-check.log',
+ '--log-file', log_file,
'--cluster', args.cluster,
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
from collections import namedtuple
+
+class UnloadedConfig(object):
+ """
+ This class is used as the default value for conf.ceph so that if
+ a configuration file is not successfully loaded then it will give
+ a nice error message when values from the config are used.
+ """
+ def __getattr__(self, *a):
+ raise RuntimeError("No valid ceph configuration file was loaded.")
+
conf = namedtuple('config', ['ceph', 'cluster', 'verbosity', 'path', 'log_path'])
+conf.ceph = UnloadedConfig()
__version__ = "1.0.0"
/dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
"""
- fields = 'pv_name,pv_tags,pv_uuid,vg_name'
+ fields = 'pv_name,pv_tags,pv_uuid,vg_name,lv_uuid'
stdout, stderr, returncode = process.call(
['pvs', '--no-heading', '--readonly', '--separator=";"', '-o', fields]
destination = os.path.join(osd_path, 'block.db')
process.run(['ln', '-snf', db_device_path, destination])
system.chown(db_device_path)
+ system.chown(destination)
if wal_device_path:
destination = os.path.join(osd_path, 'block.wal')
process.run(['ln', '-snf', wal_device_path, destination])
system.chown(wal_device_path)
+ system.chown(destination)
if no_systemd is False:
# enable the ceph-volume unit for this OSD
# activate, which would never need to be rolled back.
Activate([]).activate(args)
except Exception:
- logger.error('lvm activate was unable to complete, while creating the OSD')
+ logger.exception('lvm activate was unable to complete, while creating the OSD')
logger.info('will rollback OSD ID creation')
rollback_osd(args, osd_id)
raise
value=value
)
)
+ output.append(
+ device_metadata_item_template.format(tag_name='devices', value=','.join(device['devices'])))
+
print(''.join(output))
def __init__(self, argv):
self.argv = argv
+ @property
+ def pvs(self):
+ """
+ To avoid having to make an LVM API call for every single item being
+ reported, the call gets set only once, using that stored call for
+ subsequent calls
+ """
+ if getattr(self, '_pvs', None) is not None:
+ return self._pvs
+ self._pvs = api.get_api_pvs()
+ return self._pvs
+
+ def match_devices(self, lv_uuid):
+ """
+ It is possible to have more than one PV reported *with the same name*,
+ to avoid incorrect or duplicate contents we correlated the lv uuid to
+ the one on the physical device.
+ """
+ devices = []
+ for device in self.pvs:
+ if device.get('lv_uuid') == lv_uuid:
+ devices.append(device['pv_name'])
+ return devices
+
@decorators.needs_root
def list(self, args):
# ensure everything is up to date before calling out
return self.full_report(lvs=lvs)
if lv:
+
try:
_id = lv.tags['ceph.osd_id']
except KeyError:
return report
report.setdefault(_id, [])
- report[_id].append(
- lv.as_dict()
- )
+ lv_report = lv.as_dict()
+ lv_report['devices'] = self.match_devices(lv.lv_uuid)
+ report[_id].append(lv_report)
else:
# this has to be a journal/wal/db device (not a logical volume) so try
continue
report.setdefault(_id, [])
- report[_id].append(
- lv.as_dict()
- )
+ lv_report = lv.as_dict()
+ lv_report['devices'] = self.match_devices(lv.lv_uuid)
+ report[_id].append(lv_report)
for device_type in ['journal', 'block', 'wal', 'db']:
device_uuid = lv.tags.get('ceph.%s_uuid' % device_type)
try:
self.prepare(args)
except Exception:
- logger.error('lvm prepare was unable to complete')
+ logger.exception('lvm prepare was unable to complete')
logger.info('will rollback OSD ID creation')
rollback_osd(args, self.osd_id)
raise
self.a = a
self.kw = kw
self.calls = []
+ self.return_values = kw.get('return_values', False)
+ self.always_returns = kw.get('always_returns', False)
def __call__(self, *a, **kw):
self.calls.append({'args': a, 'kwargs': kw})
+ if self.always_returns:
+ return self.always_returns
+ if self.return_values:
+ return self.return_values.pop()
class Factory(object):
@pytest.fixture
def fake_call(monkeypatch):
- fake_call = Capture()
+ fake_call = Capture(always_returns=([], [], 0))
monkeypatch.setattr('ceph_volume.process.call', fake_call)
return fake_call
"""
Monkeypatches process.call, so that a caller can add behavior to the response
"""
- def apply(return_value):
- monkeypatch.setattr(
- 'ceph_volume.process.call',
- lambda *a, **kw: return_value)
+ def apply(return_values):
+ if isinstance(return_values, tuple):
+ return_values = [return_values]
+ stubbed_call = Capture(return_values=return_values)
+ monkeypatch.setattr('ceph_volume.process.call', stubbed_call)
+ return stubbed_call
return apply
assert stdout == '\n'
def test_type_and_path_are_reported(self, capsys):
- lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+ lvm.listing.pretty_report({0: [
+ {'type': 'data', 'path': '/dev/sda1', 'devices': ['/dev/sda']}
+ ]})
stdout, stderr = capsys.readouterr()
assert '[data] /dev/sda1' in stdout
def test_osd_id_header_is_reported(self, capsys):
- lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+ lvm.listing.pretty_report({0: [
+ {'type': 'data', 'path': '/dev/sda1', 'devices': ['/dev/sda']}
+ ]})
stdout, stderr = capsys.readouterr()
assert '====== osd.0 =======' in stdout
{0: [{
'type': 'data',
'path': '/dev/sda1',
- 'tags': {'ceph.osd_id': '0'}
+ 'tags': {'ceph.osd_id': '0'},
+ 'devices': ['/dev/sda'],
}]}
)
stdout, stderr = capsys.readouterr()
assert 'osd id' in stdout
+ def test_devices_are_comma_separated(self, capsys):
+ lvm.listing.pretty_report({0: [
+ {'type': 'data', 'path': '/dev/sda1', 'devices': ['/dev/sda', '/dev/sdb1']}
+ ]})
+ stdout, stderr = capsys.readouterr()
+ assert '/dev/sda,/dev/sdb1' in stdout
+
class TestList(object):
# ceph lvs are detected by looking into its tags
tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
lv = api.Volume(
- lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ lv_name='lv', vg_name='VolGroup',
+ lv_uuid='aaaa', lv_path='/dev/VolGroup/lv', lv_tags=tags
+ )
volumes.append(lv)
monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
result = lvm.listing.List([]).single_report('VolGroup/lv')
assert result['0'][0]['name'] == 'lv'
assert result['0'][0]['lv_tags'] == tags
assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+ assert result['0'][0]['devices'] == []
def test_report_a_ceph_journal_device(self, volumes, monkeypatch):
# ceph lvs are detected by looking into its tags
tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data,ceph.journal_device=/dev/sda1'
lv = api.Volume(
- lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv',
+ lv_uuid='aaa', lv_tags=tags)
volumes.append(lv)
monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
result = lvm.listing.List([]).single_report('/dev/sda1')
assert result['0'][0]['tags'] == {'PARTUUID': 'x'}
assert result['0'][0]['type'] == 'journal'
assert result['0'][0]['path'] == '/dev/sda1'
+
+ def test_report_a_ceph_lv_with_devices(self, volumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ lv = api.Volume(
+ lv_name='lv', vg_name='VolGroup',
+ lv_uuid='aaaa', lv_path='/dev/VolGroup/lv', lv_tags=tags
+ )
+ volumes.append(lv)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ listing = lvm.listing.List([])
+ listing._pvs = [
+ {'lv_uuid': 'aaaa', 'pv_name': '/dev/sda1', 'pv_tags': '', 'pv_uuid': ''},
+ {'lv_uuid': 'aaaa', 'pv_name': '/dev/sdb1', 'pv_tags': '', 'pv_uuid': ''},
+ ]
+ result = listing.single_report('VolGroup/lv')
+ assert result['0'][0]['name'] == 'lv'
+ assert result['0'][0]['lv_tags'] == tags
+ assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+ assert result['0'][0]['devices'] == ['/dev/sda1', '/dev/sdb1']
+
+ def test_report_a_ceph_lv_with_no_matching_devices(self, volumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ lv = api.Volume(
+ lv_name='lv', vg_name='VolGroup',
+ lv_uuid='aaaa', lv_path='/dev/VolGroup/lv', lv_tags=tags
+ )
+ volumes.append(lv)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ listing = lvm.listing.List([])
+ listing._pvs = [
+ {'lv_uuid': 'ffff', 'pv_name': '/dev/sda1', 'pv_tags': '', 'pv_uuid': ''},
+ {'lv_uuid': 'ffff', 'pv_name': '/dev/sdb1', 'pv_tags': '', 'pv_uuid': ''},
+ ]
+ result = listing.single_report('VolGroup/lv')
+ assert result['0'][0]['name'] == 'lv'
+ assert result['0'][0]['lv_tags'] == tags
+ assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+ assert result['0'][0]['devices'] == []
+
+
+class TestListingPVs(object):
+
+ def setup(self):
+ self.default_pvs = [
+ {'lv_uuid': 'ffff', 'pv_name': '/dev/sda1', 'pv_tags': '', 'pv_uuid': ''},
+ {'lv_uuid': 'ffff', 'pv_name': '/dev/sdb1', 'pv_tags': '', 'pv_uuid': ''},
+ ]
+
+ def test_pvs_is_unset(self, monkeypatch):
+ monkeypatch.setattr(lvm.listing.api, 'get_api_pvs', lambda: self.default_pvs)
+ listing = lvm.listing.List([])
+ assert listing.pvs == self.default_pvs
+
+ def test_pvs_is_set(self, monkeypatch):
+ # keep it patched so that we can fail if this gets returned
+ monkeypatch.setattr(lvm.listing.api, 'get_api_pvs', lambda: self.default_pvs)
+ listing = lvm.listing.List([])
+ listing._pvs = []
+ assert listing.pvs == []
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
lvm_volumes:
- data: data-lv1
data_vg: test_group
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
lvm_volumes:
- data: data-lv1
data_vg: test_group
-
- hosts: osds
become: yes
tasks:
name: ceph-osd@2
state: stopped
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+- hosts: mons
+ become: yes
+ tasks:
+
- name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ # osd.2 device
- name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
- - name: stop ceph-osd@0 daemon
- service:
- name: ceph-osd@0
- state: stopped
-
- - name: destroy osd.0
- command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+ # osd.0 lv
- name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
name: ceph-osd@0
state: stopped
+
+- hosts: mons
+ become: yes
+ tasks:
+
- name: destroy osd.0
command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+- hosts: osds
+ become: yes
+ tasks:
+
+
- name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
- data: data-lv1
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
- data: data-lv1
name: ceph-osd@2
state: stopped
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
- name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ # osd.2 device
- name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
- - name: stop ceph-osd@0 daemon
- service:
- name: ceph-osd@0
- state: stopped
-
- - name: destroy osd.0
- command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+ # osd.0 lv
- name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
name: ceph-osd@2
state: stopped
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
- name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ # osd.2 device
- name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
- - name: stop ceph-osd@0 daemon
- service:
- name: ceph-osd@0
- state: stopped
-
- - name: destroy osd.0
- command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+ # osd.0 device
- name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
name: ceph-osd@2
state: stopped
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
- name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ # osd.2 device
- name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
+ # osd.2 journal
- name: zap /dev/sdd2
command: "ceph-volume lvm zap /dev/sdd2 --destroy"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
- - name: stop ceph-osd@0 daemon
- service:
- name: ceph-osd@0
- state: stopped
-
- - name: destroy osd.0
- command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+ # osd.0 data lv
- name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
CEPH_VOLUME_DEBUG: 1
+ # osd.0 journal device
- name: zap /dev/sdc1
command: "ceph-volume lvm zap /dev/sdc1 --destroy"
environment:
ansible==2.4.1
testinfra==1.7.1
pytest-xdist
+ notario>=0.0.13
changedir=
# plain/unencrypted
centos7-filestore-create: {toxinidir}/centos7/filestore/create
centos7-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate
commands=
git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+ # XXX Ideally we should be able to consume the requirements for ceph-ansible directly,
+ # but the master branch doesn't pin dependencies so we can't guarantee to work correctly
+ #pip install -r {envdir}/tmp/ceph-ansible/requirements.txt
vagrant up --no-provision {posargs:--provider=virtualbox}
bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
lvm_volumes:
- data: data-lv1
data_vg: test_group
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
lvm_volumes:
- data: data-lv1
data_vg: test_group
-
- hosts: osds
become: yes
tasks:
name: ceph-osd@2
state: stopped
- - name: destroy osd.2
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+- hosts: mons
+ become: yes
+ tasks:
+
+ - name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
- - name: zap /dev/sdd1
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ # osd.2 device
+ - name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.2 using /dev/sdd1
+ - name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume lvm create --bluestore --data /dev/sdd1 --osd-id 2"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: stop ceph-osd@0 daemon
- service:
- name: ceph-osd@0
- state: stopped
-
- - name: destroy osd.0
- command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
- - name: zap test_group/data-lv1
+ # osd.0 lv
+ - name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.0 using test_group/data-lv1
+ - name: redeploy osd.0 using test_group/data-lv1
command: "ceph-volume lvm create --bluestore --data test_group/data-lv1 --osd-id 0"
environment:
CEPH_VOLUME_DEBUG: 1
name: ceph-osd@0
state: stopped
+
+- hosts: mons
+ become: yes
+ tasks:
+
- name: destroy osd.0
command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+- hosts: osds
+ become: yes
+ tasks:
+
- name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
- data: data-lv1
osd_scenario: lvm
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
- data: data-lv1
name: ceph-osd@2
state: stopped
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
- name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ # osd.2 device
- name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
environment:
CEPH_VOLUME_DEBUG: 1
- - name: stop ceph-osd@0 daemon
- service:
- name: ceph-osd@0
- state: stopped
-
- - name: destroy osd.0
- command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+ # osd.0 lv
- name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
osd_objectstore: "bluestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "bluestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "bluestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "filestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "filestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "filestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
ansible==2.4.1
testinfra==1.7.1
pytest-xdist
+ notario>=0.0.13
changedir=
centos7-filestore-activate: {toxinidir}/centos7/filestore/activate
centos7-bluestore-activate: {toxinidir}/centos7/bluestore/activate
centos7-filestore-dmcrypt_luks: {toxinidir}/centos7/filestore/dmcrypt-luks
commands=
git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+ # XXX Ideally we should be able to consume the requirements for ceph-ansible directly,
+ # but the master branch doesn't pin dependencies so we can't guarantee to work correctly
+ #pip install -r {envdir}/tmp/ceph-ansible/requirements.txt
vagrant up --no-provision {posargs:--provider=virtualbox}
bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
osd_objectstore: "bluestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "bluestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "bluestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "filestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "filestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
osd_objectstore: "filestore"
ceph_origin: 'repository'
ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
def test_normalize_strings_duplicate_flags(self, flags):
result = sorted(prepare._normalize_mount_flags(flags, extras=['discard','rw']).split(','))
assert ','.join(result) == 'auto,discard,exec,rw'
+
+
+class TestMkfsBluestore(object):
+
+ def test_non_zero_exit_status(self, stub_call, monkeypatch):
+ conf.cluster = 'ceph'
+ monkeypatch.setattr('ceph_volume.util.prepare.system.chown', lambda x: True)
+ stub_call(([], [], 1))
+ with pytest.raises(RuntimeError) as error:
+ prepare.osd_mkfs_bluestore('1', 'asdf-1234', keyring='keyring')
+ assert "Command failed with exit code 1" in str(error)
+
+ def test_non_zero_exit_formats_command_correctly(self, stub_call, monkeypatch):
+ conf.cluster = 'ceph'
+ monkeypatch.setattr('ceph_volume.util.prepare.system.chown', lambda x: True)
+ stub_call(([], [], 1))
+ with pytest.raises(RuntimeError) as error:
+ prepare.osd_mkfs_bluestore('1', 'asdf-1234', keyring='keyring')
+ expected = ' '.join([
+ 'ceph-osd',
+ '--cluster',
+ 'ceph',
+ '--osd-objectstore', 'bluestore', '--mkfs',
+ '-i', '1', '--monmap', '/var/lib/ceph/osd/ceph-1/activate.monmap',
+ '--keyfile', '-', '--osd-data', '/var/lib/ceph/osd/ceph-1/',
+ '--osd-uuid', 'asdf-1234',
+ '--setuser', 'ceph', '--setgroup', 'ceph'])
+ assert expected in str(error)
command = base_command + supplementary_command
- process.call(command, stdin=keyring, show_command=True)
+ _, _, returncode = process.call(command, stdin=keyring, show_command=True)
+ if returncode != 0:
+ raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command)))
def osd_mkfs_filestore(osd_id, fsid):
"""
uid, gid = get_ceph_user_ids()
if os.path.islink(path):
+ process.run(['chown', '-h', 'ceph:ceph', path])
path = os.path.realpath(path)
if recursive:
process.run(['chown', '-R', 'ceph:ceph', path])
filer_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
} else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
usage();
+ } else if (ceph_argparse_flag(args, i, "-V", (char*)nullptr)) {
+ const char* tmpargv[] = {
+ "ceph-fuse",
+ "-V"
+ };
+
+ struct fuse_args fargs = FUSE_ARGS_INIT(2, (char**)tmpargv);
+ if (fuse_parse_cmdline(&fargs, nullptr, nullptr, nullptr) == -1) {
+ derr << "fuse_parse_cmdline failed." << dendl;
+ }
+ assert(fargs.allocated);
+ fuse_opt_free_args(&fargs);
+ exit(0);
} else {
++i;
}
r = client->mount(g_conf->client_mountpoint.c_str(), perms,
g_ceph_context->_conf->fuse_require_active_mds);
if (r < 0) {
- if (r == CEPH_FUSE_NO_MDS_UP)
+ if (r == CEPH_FUSE_NO_MDS_UP) {
cerr << "ceph-fuse[" << getpid() << "]: probably no MDS server is up?" << std::endl;
- cerr << "ceph-fuse[" << getpid() << "]: ceph mount failed with " << cpp_strerror(-r) << std::endl;
+ }
cerr << "ceph-fuse[" << getpid() << "]: ceph mount failed with " << cpp_strerror(-r) << std::endl;
+ r = EXIT_FAILURE;
goto out_shutdown;
}
remount_cb(NULL),
ino_invalidate_cb(NULL),
dentry_invalidate_cb(NULL),
- getgroups_cb(NULL),
umask_cb(NULL),
can_invalidate_dentries(false),
async_ino_invalidator(m->cct),
add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
request_perms);
- if (in->auth_cap && in->auth_cap->session == session)
+ if (in->auth_cap && in->auth_cap->session == session) {
in->max_size = st->max_size;
+ in->rstat = st->rstat;
+ }
} else
in->snap_caps |= st->cap.caps;
break;
case CEPH_SESSION_STALE:
+ // invalidate session caps/leases
+ session->cap_gen++;
+ session->cap_ttl = ceph_clock_now();
+ session->cap_ttl -= 1;
renew_caps(session);
break;
ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
in->hold_caps_until = ceph_clock_now();
in->hold_caps_until += cct->_conf->client_caps_release_delay;
- delayed_caps.push_back(&in->cap_item);
+ delayed_list.push_back(&in->delay_cap_item);
}
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
if (cct->_conf->client_oc) {
vector<ObjectExtent> ls;
Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
- objectcacher->discard_set(&in->oset, ls);
+ objectcacher->discard_writeback(&in->oset, ls, nullptr);
}
_schedule_invalidate_callback(in, off, len);
cap->session = mds_session;
cap->inode = in;
cap->gen = mds_session->cap_gen;
- cap_list.push_back(&in->cap_item);
}
check_cap_issue(in, cap, issued);
cap->seq = seq;
cap->issue_seq = seq;
cap->mseq = mseq;
+ cap->gen = mds_session->cap_gen;
cap->latest_perms = cap_perms;
ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
<< " from mds." << mds
in->flushing_cap_tids.clear();
}
in->flushing_caps = 0;
- in->dirty_caps = 0;
+ in->mark_caps_clean();
put_inode(in);
}
}
}
}
-void Client::trim_caps(MetaSession *s, int max)
+void Client::trim_caps(MetaSession *s, uint64_t max)
{
mds_rank_t mds = s->mds_num;
- int caps_size = s->caps.size();
+ size_t caps_size = s->caps.size();
ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
<< " caps " << caps_size << dendl;
- int trimmed = 0;
- xlist<Cap*>::iterator p = s->caps.begin();
- std::set<InodeRef> anchor; /* prevent put_inode from deleting all caps during traversal */
+ uint64_t trimmed = 0;
+ auto p = s->caps.begin();
+ std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
+ * looking at from getting deleted during traversal. */
while ((caps_size - trimmed) > max && !p.end()) {
Cap *cap = *p;
InodeRef in(cap->inode);
// disposable non-auth cap
if (!(get_caps_used(in.get()) & ~oissued & mine)) {
ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
- remove_cap(cap, true);
- /* N.B. no need to push onto anchor, as we are only removing one cap */
+ cap = (remove_cap(cap, true), nullptr);
trimmed++;
}
} else {
// the end of this function.
_schedule_invalidate_dentry_callback(dn, true);
}
- ldout(cct, 20) << " anchoring inode: " << in->ino << dendl;
- anchor.insert(in);
- trim_dentry(dn);
+ ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
+ to_trim.insert(dn);
} else {
ldout(cct, 20) << " not expirable: " << dn->name << dendl;
all = false;
}
}
}
- ldout(cct, 20) << " clearing anchored inodes" << dendl;
- anchor.clear();
+ ldout(cct, 20) << " trimming queued dentries: " << dendl;
+ for (const auto &dn : to_trim) {
+ trim_dentry(dn);
+ }
+ to_trim.clear();
caps_size = s->caps.size();
if (caps_size > max)
}
}
-void Client::mark_caps_dirty(Inode *in, int caps)
-{
- ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
- << ccap_string(in->dirty_caps | caps) << dendl;
- if (caps && !in->caps_dirty())
- in->get();
- in->dirty_caps |= caps;
-}
-
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
MetaSession *session = in->auth_cap->session;
}
in->flushing_caps |= flushing;
- in->dirty_caps = 0;
+ in->mark_caps_clean();
if (!in->flushing_cap_item.is_on_list())
session->flushing_caps.push_back(&in->flushing_cap_item);
void Client::flush_caps_sync()
{
ldout(cct, 10) << __func__ << dendl;
- xlist<Inode*>::iterator p = delayed_caps.begin();
+ xlist<Inode*>::iterator p = delayed_list.begin();
while (!p.end()) {
unsigned flags = CHECK_CAPS_NODELAY;
Inode *in = *p;
++p;
- delayed_caps.pop_front();
- if (p.end() && cap_list.empty())
+ delayed_list.pop_front();
+ if (p.end() && dirty_list.empty())
flags |= CHECK_CAPS_SYNCHRONOUS;
check_caps(in, flags);
}
// other caps, too
- p = cap_list.begin();
+ p = dirty_list.begin();
while (!p.end()) {
unsigned flags = CHECK_CAPS_NODELAY;
Inode *in = *p;
<< " caps now " << ccap_string(new_caps)
<< " was " << ccap_string(old_caps) << dendl;
cap->seq = m->get_seq();
+ cap->gen = session->cap_gen;
in->layout = m->get_layout();
::decode(in->xattrs, p);
in->xattr_version = m->head.xattr_version;
}
+
+ if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
+ in->dirstat.nfiles = m->get_nfiles();
+ in->dirstat.nsubdirs = m->get_nsubdirs();
+ }
+
update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
m->get_mtime(), m->get_atime(),
else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
in->recall_deleg(true);
- if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
- && !_flush(in, new C_Client_FlushComplete(this, in))) {
+ if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
+ !_flush(in, new C_Client_FlushComplete(this, in))) {
// waitin' for flush
- } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
+ } else if (revoked & CEPH_CAP_FILE_CACHE) {
if (_release(in))
check = true;
} else {
cap->wanted = 0; // don't let check_caps skip sending a response to MDS
check = true;
}
-
} else if (old_caps == new_caps) {
ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
} else {
m->put();
}
-int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
-{
- // cppcheck-suppress variableScope
- int sgid_count;
- gid_t *sgid_buf;
-
- if (getgroups_cb) {
- sgid_count = getgroups_cb(callback_handle, &sgid_buf);
- if (sgid_count > 0) {
- *sgids = sgid_buf;
- return sgid_count;
- }
- }
-
-#if HAVE_GETGROUPLIST
- struct passwd *pw;
- pw = getpwuid(uid);
- if (pw == NULL) {
- ldout(cct, 3) << "getting user entry failed" << dendl;
- return -errno;
- }
- //use PAM to get the group list
- // initial number of group entries, defaults to posix standard of 16
- // PAM implementations may provide more than 16 groups....
- sgid_count = 16;
- sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
- if (sgid_buf == NULL) {
- ldout(cct, 3) << "allocating group memory failed" << dendl;
- return -ENOMEM;
- }
-
- while (1) {
-#if defined(__APPLE__)
- if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
-#else
- if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
-#endif
- // we need to resize the group list and try again
- void *_realloc = NULL;
- if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
- ldout(cct, 3) << "allocating group memory failed" << dendl;
- free(sgid_buf);
- return -ENOMEM;
- }
- sgid_buf = (gid_t*)_realloc;
- continue;
- }
- // list was successfully retrieved
- break;
- }
- *sgids = sgid_buf;
- return sgid_count;
-#else
- return 0;
-#endif
-}
-
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
if (perms.uid() == 0)
}
// delayed caps
- xlist<Inode*>::iterator p = delayed_caps.begin();
+ xlist<Inode*>::iterator p = delayed_list.begin();
while (!p.end()) {
Inode *in = *p;
++p;
if (in->hold_caps_until > now)
break;
- delayed_caps.pop_front();
- cap_list.push_back(&in->cap_item);
+ delayed_list.pop_front();
check_caps(in, CHECK_CAPS_NODELAY);
}
in->cap_dirtier_uid = perms.uid();
in->cap_dirtier_gid = perms.gid();
if (issued & CEPH_CAP_AUTH_EXCL)
- mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
else if (issued & CEPH_CAP_FILE_EXCL)
- mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
else if (issued & CEPH_CAP_XATTR_EXCL)
- mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
else
mask |= CEPH_SETATTR_CTIME;
}
in->cap_dirtier_uid = perms.uid();
in->cap_dirtier_gid = perms.gid();
in->uid = stx->stx_uid;
- mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
mask &= ~CEPH_SETATTR_UID;
kill_sguid = true;
ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
in->cap_dirtier_uid = perms.uid();
in->cap_dirtier_gid = perms.gid();
in->gid = stx->stx_gid;
- mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
mask &= ~CEPH_SETATTR_GID;
kill_sguid = true;
ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
in->cap_dirtier_uid = perms.uid();
in->cap_dirtier_gid = perms.gid();
in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
- mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
mask &= ~CEPH_SETATTR_MODE;
ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
} else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
/* Must squash the any setuid/setgid bits with an ownership change */
in->mode &= ~(S_ISUID|S_ISGID);
- mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
}
if (mask & CEPH_SETATTR_BTIME) {
in->cap_dirtier_uid = perms.uid();
in->cap_dirtier_gid = perms.gid();
in->btime = utime_t(stx->stx_btime);
- mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
mask &= ~CEPH_SETATTR_BTIME;
ldout(cct,10) << "changing btime to " << in->btime << dendl;
}
in->cap_dirtier_uid = perms.uid();
in->cap_dirtier_gid = perms.gid();
in->time_warp_seq++;
- mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
+ in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
}
}
st->st_dev = in->snapid;
st->st_mode = in->mode;
st->st_rdev = in->rdev;
- st->st_nlink = in->nlink;
+ if (in->is_dir()) {
+ switch (in->nlink) {
+ case 0:
+ st->st_nlink = 0; /* dir is unlinked */
+ break;
+ case 1:
+ st->st_nlink = 1 /* parent dentry */
+ + 1 /* <dir>/. */
+ + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
+ break;
+ default:
+ ceph_abort();
+ }
+ } else {
+ st->st_nlink = in->nlink;
+ }
st->st_uid = in->uid;
st->st_gid = in->gid;
if (in->ctime > in->mtime) {
}
if (mask & CEPH_CAP_LINK_SHARED) {
- stx->stx_nlink = in->nlink;
+ if (in->is_dir()) {
+ switch (in->nlink) {
+ case 0:
+ stx->stx_nlink = 0; /* dir is unlinked */
+ break;
+ case 1:
+ stx->stx_nlink = 1 /* parent dentry */
+ + 1 /* <dir>/. */
+ + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
+ break;
+ default:
+ ceph_abort();
+ }
+ } else {
+ stx->stx_nlink = in->nlink;
+ }
stx->stx_mask |= CEPH_STATX_NLINK;
}
else
dirp->next_offset = dirp->offset_low();
dirp->last_name = dn_name; // we successfully returned this one; update!
+ dirp->release_count = 0; // last_name no longer match cache index
if (r > 0)
return r;
}
if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
in->inline_data.clear();
in->inline_version = CEPH_INLINE_NONE;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
check_caps(in, 0);
} else
r = uninline_ret;
// check quota
uint64_t endoff = offset + size;
- if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
- f->actor_perms)) {
+ std::list<InodeRef> quota_roots;
+ if (endoff > in->size &&
+ is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
return -EDQUOT;
}
// extend file?
if (totalwritten + offset > in->size) {
in->size = totalwritten + offset;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
- if (is_quota_bytes_approaching(in, f->actor_perms)) {
+ if (is_quota_bytes_approaching(in, quota_roots)) {
check_caps(in, CHECK_CAPS_NODELAY);
} else if (is_max_size_approaching(in)) {
check_caps(in, 0);
// mtime
in->mtime = ceph_clock_now();
in->change_attr++;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
done:
if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
in->inline_data.clear();
in->inline_version = CEPH_INLINE_NONE;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
check_caps(in, 0);
} else
r = uninline_ret;
} else ldout(cct, 10) << "no metadata needs to commit" << dendl;
if (!syncdataonly && !in->unsafe_ops.empty()) {
+ flush_mdlog_sync();
+
MetaRequest *req = in->unsafe_ops.back();
ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
<< " invalidate_ino_cb " << args->ino_cb
<< " invalidate_dentry_cb " << args->dentry_cb
- << " getgroups_cb" << args->getgroups_cb
<< " switch_interrupt_cb " << args->switch_intr_cb
<< " remount_cb " << args->remount_cb
<< dendl;
remount_cb = args->remount_cb;
remount_finisher.start();
}
- getgroups_cb = args->getgroups_cb;
umask_cb = args->umask_cb;
}
// Do a force getattr to get the latest quota before returning
// a value to userspace.
- r = _getattr(in, 0, perms, true);
+ int flags = 0;
+ if (vxattr->flags & VXATTR_RSTAT) {
+ flags |= CEPH_STAT_RSTAT;
+ }
+ r = _getattr(in, flags, perms, true);
if (r != 0) {
// Error from getattr!
return r;
readonly: true, \
hidden: false, \
exists_cb: NULL, \
+ flags: 0, \
+}
+#define XATTR_NAME_CEPH2(_type, _name, _flags) \
+{ \
+ name: CEPH_XATTR_NAME(_type, _name), \
+ getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
+ readonly: true, \
+ hidden: false, \
+ exists_cb: NULL, \
+ flags: _flags, \
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
readonly: false, \
hidden: true, \
exists_cb: &Client::_vxattrcb_layout_exists, \
+ flags: 0, \
}
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
readonly: false, \
hidden: true, \
exists_cb: &Client::_vxattrcb_quota_exists, \
+ flags: 0, \
}
const Client::VXattr Client::_dir_vxattrs[] = {
readonly: false,
hidden: true,
exists_cb: &Client::_vxattrcb_layout_exists,
+ flags: 0,
},
XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs),
- XATTR_NAME_CEPH(dir, rentries),
- XATTR_NAME_CEPH(dir, rfiles),
- XATTR_NAME_CEPH(dir, rsubdirs),
- XATTR_NAME_CEPH(dir, rbytes),
- XATTR_NAME_CEPH(dir, rctime),
+ XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
+ XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
+ XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
+ XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
+ XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
{
name: "ceph.quota",
getxattr_cb: &Client::_vxattrcb_quota,
readonly: false,
hidden: true,
exists_cb: &Client::_vxattrcb_quota_exists,
+ flags: 0,
},
XATTR_QUOTA_FIELD(quota, max_bytes),
XATTR_QUOTA_FIELD(quota, max_files),
readonly: false,
hidden: true,
exists_cb: &Client::_vxattrcb_layout_exists,
+ flags: 0,
},
XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
XATTR_LAYOUT_FIELD(file, layout, stripe_count),
{
Mutex::Locker lock(client_lock);
- inodeno_t ino = ll_get_inodeno(in);
+ inodeno_t ino = in->ino;
uint32_t object_size = layout->object_size;
uint32_t su = layout->stripe_unit;
uint32_t stripe_count = layout->stripe_count;
return r;
}
+int Client::ll_sync_inode(Inode *in, bool syncdataonly)
+{
+ Mutex::Locker lock(client_lock);
+ ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
+ tout(cct) << "ll_sync_inode" << std::endl;
+ tout(cct) << (unsigned long)in << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
+ return _fsync(in, syncdataonly);
+}
+
#ifdef FALLOC_FL_PUNCH_HOLE
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
return -EBADF;
uint64_t size = offset + length;
+ std::list<InodeRef> quota_roots;
if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
size > in->size &&
- is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
+ is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
return -EDQUOT;
}
}
in->mtime = ceph_clock_now();
in->change_attr++;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
} else {
if (in->inline_version < CEPH_INLINE_NONE) {
onuninline = new C_SafeCond(&uninline_flock,
0, true, onfinish);
in->mtime = ceph_clock_now();
in->change_attr++;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
client_lock.Unlock();
flock.Lock();
in->size = size;
in->mtime = ceph_clock_now();
in->change_attr++;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
- if (is_quota_bytes_approaching(in, fh->actor_perms)) {
+ if (is_quota_bytes_approaching(in, quota_roots)) {
check_caps(in, CHECK_CAPS_NODELAY);
} else if (is_max_size_approaching(in)) {
check_caps(in, 0);
if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
in->inline_data.clear();
in->inline_version = CEPH_INLINE_NONE;
- mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+ in->mark_caps_dirty(CEPH_CAP_FILE_WR);
check_caps(in, 0);
} else
r = uninline_ret;
case MetaSession::STATE_OPEN:
{
+ objecter->maybe_request_map(); /* to check if we are blacklisted */
const md_config_t *conf = cct->_conf;
if (conf->client_reconnect_stale) {
ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
}
bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
- const UserPerm& perms)
+ const UserPerm& perms,
+ std::list<InodeRef>* quota_roots)
{
return check_quota_condition(in, perms,
- [&new_bytes](const Inode &in) {
+ [&new_bytes, quota_roots](const Inode &in) {
+ if (quota_roots)
+ quota_roots->emplace_back(const_cast<Inode*>(&in));
return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
> in.quota.max_bytes;
});
}
-bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
+bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
{
- return check_quota_condition(in, perms,
- [](const Inode &in) {
- if (in.quota.max_bytes) {
- if (in.rstat.rbytes >= in.quota.max_bytes) {
- return true;
- }
-
- assert(in.size >= in.reported_size);
- const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
- const uint64_t size = in.size - in.reported_size;
- return (space >> 4) < size;
- } else {
- return false;
- }
- });
+ assert(in->size >= in->reported_size);
+ const uint64_t size = in->size - in->reported_size;
+
+ for (auto& diri : quota_roots) {
+ if (diri->quota.max_bytes) {
+ if (diri->rstat.rbytes >= diri->quota.max_bytes)
+ return true;
+
+ uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
+ if ((space >> 4) < size)
+ return true;
+ }
+ }
+ return false;
}
enum {
}
}
-void Client::init_groups(UserPerm *perms)
-{
- gid_t *sgids;
- int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
- perms->init_gids(sgids, count);
-}
-
void intrusive_ptr_add_ref(Inode *in)
{
in->get();
};
/* error code for ceph_fuse */
-#define CEPH_FUSE_NO_MDS_UP -(1<<2) /* no mds up deteced in ceph_fuse */
+#define CEPH_FUSE_NO_MDS_UP -((1<<16)+0) /* no mds up detected in ceph_fuse */
+#define CEPH_FUSE_LAST -((1<<16)+1) /* (unused) */
// ============================================
// types for my local metadata cache
vinodeno_t ino, string& name);
typedef int (*client_remount_callback_t)(void *handle);
-typedef int (*client_getgroups_callback_t)(void *handle, gid_t **sgids);
typedef void(*client_switch_interrupt_callback_t)(void *handle, void *data);
typedef mode_t (*client_umask_callback_t)(void *handle);
client_dentry_callback_t dentry_cb;
client_switch_interrupt_callback_t switch_intr_cb;
client_remount_callback_t remount_cb;
- client_getgroups_callback_t getgroups_cb;
client_umask_callback_t umask_cb;
};
client_remount_callback_t remount_cb;
client_ino_callback_t ino_invalidate_cb;
client_dentry_callback_t dentry_invalidate_cb;
- client_getgroups_callback_t getgroups_cb;
client_umask_callback_t umask_cb;
bool can_invalidate_dentries;
Inode* root_ancestor;
LRU lru; // lru list of Dentry's in our local metadata cache.
- // all inodes with caps sit on either cap_list or delayed_caps.
- xlist<Inode*> delayed_caps, cap_list;
+ // dirty_list keeps all the dirty inodes before flushing.
+ xlist<Inode*> delayed_list, dirty_list;
int num_flushing_caps;
ceph::unordered_map<inodeno_t,SnapRealm*> snap_realms;
void trim_cache(bool trim_kernel_dcache=false);
void trim_cache_for_reconnect(MetaSession *s);
void trim_dentry(Dentry *dn);
- void trim_caps(MetaSession *s, int max);
+ void trim_caps(MetaSession *s, uint64_t max);
void _invalidate_kernel_dcache();
void dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected);
std::function<bool (const Inode &)> test);
bool is_quota_files_exceeded(Inode *in, const UserPerm& perms);
bool is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
- const UserPerm& perms);
- bool is_quota_bytes_approaching(Inode *in, const UserPerm& perms);
+ const UserPerm& perms,
+ std::list<InodeRef>* quota_roots=nullptr);
+ bool is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots);
std::map<std::pair<int64_t,std::string>, int> pool_perms;
list<Cond*> waiting_for_pool_perm;
void remove_cap(Cap *cap, bool queue_release);
void remove_all_caps(Inode *in);
void remove_session_caps(MetaSession *session);
- void mark_caps_dirty(Inode *in, int caps);
int mark_caps_flushing(Inode *in, ceph_tid_t *ptid);
void adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s);
void flush_caps_sync();
void flush_mdlog_sync();
void flush_mdlog(MetaSession *session);
+ xlist<Inode*> &get_dirty_list() { return dirty_list; }
// ----------------------
// fs ops.
private:
MAY_READ = 4,
};
- void init_groups(UserPerm *groups);
-
- int inode_permission(Inode *in, const UserPerm& perms, unsigned want);
int xattr_permission(Inode *in, const char *name, unsigned want,
const UserPerm& perms);
int may_setattr(Inode *in, struct ceph_statx *stx, int mask,
int may_hardlink(Inode *in, const UserPerm& perms);
int _getattr_for_perm(Inode *in, const UserPerm& perms);
- int _getgrouplist(gid_t **sgids, uid_t uid, gid_t gid);
vinodeno_t _get_vino(Inode *in);
inodeno_t _get_inodeno(Inode *in);
size_t (Client::*getxattr_cb)(Inode *in, char *val, size_t size);
bool readonly, hidden;
bool (Client::*exists_cb)(Inode *in);
+ int flags;
};
+/* Flags for VXattr */
+#define VXATTR_RSTAT 0x1
+
bool _vxattrcb_quota_exists(Inode *in);
size_t _vxattrcb_quota(Inode *in, char *val, size_t size);
size_t _vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size);
int mksnap(const char *path, const char *name, const UserPerm& perm);
int rmsnap(const char *path, const char *name, const UserPerm& perm);
+ // Inode permission checking
+ int inode_permission(Inode *in, const UserPerm& perms, unsigned want);
+
// expose caps
int get_caps_issued(int fd);
int get_caps_issued(const char *path, const UserPerm& perms);
- // low-level interface v2
- inodeno_t ll_get_inodeno(Inode *in) {
- Mutex::Locker lock(client_lock);
- return _get_inodeno(in);
- }
snapid_t ll_get_snapid(Inode *in);
vinodeno_t ll_get_vino(Inode *in) {
Mutex::Locker lock(client_lock);
loff_t ll_lseek(Fh *fh, loff_t offset, int whence);
int ll_flush(Fh *fh);
int ll_fsync(Fh *fh, bool syncdataonly);
+ int ll_sync_inode(Inode *in, bool syncdataonly);
int ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length);
int ll_release(Fh *fh);
int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner);
Inode::~Inode()
{
- cap_item.remove_myself();
+ delay_cap_item.remove_myself();
+ dirty_cap_item.remove_myself();
snaprealm_item.remove_myself();
if (snapdir_parent) {
<< " open=" << in.open_by_mode
<< " mode=" << oct << in.mode << dec
<< " size=" << in.size << "/" << in.max_size
+ << " nlink=" << in.nlink
<< " mtime=" << in.mtime
<< " caps=" << ccap_string(in.caps_issued());
if (!in.caps.empty()) {
}
}
}
+
+/**
+* mark_caps_dirty - mark some caps dirty
+* @caps: the dirty caps
+*
+* note that if there are no dirty or flushing caps beforehand, we need to pin this inode.
+* it will be unpinned by handle_cap_flush_ack when there are no dirty and flushing caps.
+*/
+void Inode::mark_caps_dirty(int caps)
+{
+ lsubdout(client->cct, client, 10) << __func__ << " " << *this << " " << ccap_string(dirty_caps) << " -> "
+ << ccap_string(dirty_caps | caps) << dendl;
+ if (caps && !caps_dirty())
+ get();
+ dirty_caps |= caps;
+ client->get_dirty_list().push_back(&dirty_cap_item);
+}
+
+/**
+* mark_caps_clean - only clean the dirty_caps and caller should start flushing the dirty caps.
+*/
+void Inode::mark_caps_clean()
+{
+ lsubdout(client->cct, client, 10) << __func__ << " " << *this << dendl;
+ dirty_caps = 0;
+ dirty_cap_item.remove_myself();
+}
+
+
int shared_gen, cache_gen;
int snap_caps, snap_cap_refs;
utime_t hold_caps_until;
- xlist<Inode*>::item cap_item, flushing_cap_item;
+ xlist<Inode*>::item delay_cap_item, dirty_cap_item, flushing_cap_item;
SnapRealm *snaprealm;
xlist<Inode*>::item snaprealm_item;
cap_dirtier_uid(-1), cap_dirtier_gid(-1),
dirty_caps(0), flushing_caps(0), shared_gen(0), cache_gen(0),
snap_caps(0), snap_cap_refs(0),
- cap_item(this), flushing_cap_item(this),
+ delay_cap_item(this), dirty_cap_item(this), flushing_cap_item(this),
snaprealm(0), snaprealm_item(this),
oset((void *)this, newlayout->pool_id, this->ino),
reported_size(0), wanted_max_size(0), requested_max_size(0),
int set_deleg(Fh *fh, unsigned type, ceph_deleg_cb_t cb, void *priv);
void unset_deleg(Fh *fh);
+ void mark_caps_dirty(int caps);
+ void mark_caps_clean();
private:
// how many opens for write on this Inode?
long open_count_for_write()
}
bool auth_is_best() {
if ((head.op & CEPH_MDS_OP_WRITE) || head.op == CEPH_MDS_OP_OPEN ||
- head.op == CEPH_MDS_OP_READDIR)
+ head.op == CEPH_MDS_OP_READDIR || send_to_auth)
return true;
return false;
}
m_uid = b.m_uid;
m_gid = b.m_gid;
gid_count = b.gid_count;
- if (gid_count) {
+ if (gid_count > 0) {
gids = new gid_t[gid_count];
alloced_gids = true;
for (int i = 0; i < gid_count; ++i) {
void init_gids(gid_t* _gids, int count) {
gids = _gids;
gid_count = count;
+ alloced_gids = true;
}
- void take_gids() { alloced_gids = true; }
void shallow_copy(const UserPerm& o) {
m_uid = o.m_uid;
m_gid = o.m_gid;
return -ENOSYS;
}
-static int getgroups_cb(void *handle, gid_t **sgids)
+static void get_fuse_groups(UserPerm& perms, fuse_req_t req)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *) handle;
- fuse_req_t req = cfuse->get_fuse_req();
- return getgroups(req, sgids);
-}
+ if (g_conf->get_val<bool>("fuse_set_user_groups")) {
+ gid_t *gids = NULL;
+ int count = getgroups(req, &gids);
-#define GET_GROUPS(perms, req) { \
- if (g_conf->get_val<bool>("fuse_set_user_groups")) { \
- gid_t *gids = NULL; \
- int count = getgroups(req, &gids); \
- perms.init_gids(gids, count); \
- perms.take_gids(); \
- } }
+ if (count > 0) {
+ perms.init_gids(gids, count);
+ } else if (count < 0) {
+ derr << __func__ << ": getgroups failed: " << cpp_strerror(-count)
+ << dendl;
+ }
+ }
+}
static CephFuse::Handle *fuse_ll_req_prepare(fuse_req_t req)
Inode *i2, *i1 = cfuse->iget(parent); // see below
int r;
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
memset(&fe, 0, sizeof(fe));
r = cfuse->client->ll_lookup(i1, name, &fe.attr, &i2, perms);
Inode *in = cfuse->iget(ino);
struct stat stbuf;
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
(void) fi; // XXX
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int mask = 0;
if (to_set & FUSE_SET_ATTR_MODE) mask |= CEPH_SETATTR_MODE;
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_setxattr(in, name, value, size, flags, perms);
fuse_reply_err(req, -r);
Inode *in = cfuse->iget(ino);
char buf[size];
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_listxattr(in, buf, size, perms);
if (size == 0 && r >= 0)
Inode *in = cfuse->iget(ino);
char buf[size];
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_getxattr(in, name, buf, size, perms);
if (size == 0 && r >= 0)
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_removexattr(in, name, perms);
fuse_reply_err(req, -r);
void *dirp;
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_opendir(in, fi->flags, (dir_result_t **)&dirp,
perms);
Inode *in = cfuse->iget(ino);
char buf[PATH_MAX + 1]; // leave room for a null terminator
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_readlink(in, buf, sizeof(buf) - 1, perms);
if (r >= 0) {
Inode *i2, *i1 = cfuse->iget(parent);
struct fuse_entry_param fe;
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
memset(&fe, 0, sizeof(fe));
memset(&fe, 0, sizeof(fe));
UserPerm perm(ctx->uid, ctx->gid);
- GET_GROUPS(perm, req);
+ get_fuse_groups(perm, req);
#ifdef HAVE_SYS_SYNCFS
if (cfuse->fino_snap(parent) == CEPH_SNAPDIR &&
cfuse->client->cct->_conf->fuse_multithreaded &&
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(parent);
UserPerm perm(ctx->uid, ctx->gid);
- GET_GROUPS(perm, req);
+ get_fuse_groups(perm, req);
int r = cfuse->client->ll_unlink(in, name, perm);
fuse_reply_err(req, -r);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(parent);
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_rmdir(in, name, perms);
fuse_reply_err(req, -r);
Inode *i2, *i1 = cfuse->iget(parent);
struct fuse_entry_param fe;
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
memset(&fe, 0, sizeof(fe));
Inode *in = cfuse->iget(parent);
Inode *nin = cfuse->iget(newparent);
UserPerm perm(ctx->uid, ctx->gid);
- GET_GROUPS(perm, req);
+ get_fuse_groups(perm, req);
int r = cfuse->client->ll_rename(in, name, nin, newname, perm);
fuse_reply_err(req, -r);
memset(&fe, 0, sizeof(fe));
UserPerm perm(ctx->uid, ctx->gid);
- GET_GROUPS(perm, req);
+ get_fuse_groups(perm, req);
/*
* Note that we could successfully link, but then fail the subsequent
Inode *in = cfuse->iget(ino);
Fh *fh = NULL;
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_open(in, fi->flags, &fh, perms);
if (r == 0) {
static void fuse_ll_access(fuse_req_t req, fuse_ino_t ino, int mask)
{
- fuse_reply_err(req, 0);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
+ const struct fuse_ctx *ctx = fuse_req_ctx(req);
+ Inode *in = cfuse->iget(ino);
+ UserPerm perms(ctx->uid, ctx->gid);
+ get_fuse_groups(perms, req);
+
+ int r = cfuse->client->inode_permission(in, perms, mask);
+ fuse_reply_err(req, -r);
+ cfuse->iput(in);
}
static void fuse_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name,
struct fuse_entry_param fe;
Fh *fh = NULL;
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
memset(&fe, 0, sizeof(fe));
Inode *in = cfuse->iget(ino);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
UserPerm perms(ctx->uid, ctx->gid);
- GET_GROUPS(perms, req);
+ get_fuse_groups(perms, req);
int r = cfuse->client->ll_statfs(in, &stbuf, perms);
if (r == 0)
#if defined(__linux__)
remount_cb: remount_cb,
#endif
- getgroups_cb: getgroups_cb,
#if !defined(DARWIN)
umask_cb: umask_cb,
#endif
for (iter = keys.begin(); iter != keys.end(); ++iter) {
if (iter->first >= end_key) {
/* past the end of plain namespace */
+ if (pmore) {
+ *pmore = false;
+ }
return count;
}
CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str());
if (!name.empty() && e.key.name != name) {
+ /* we are skipping the rest of the entries */
+ if (pmore) {
+ *pmore = false;
+ }
return count;
}
entry.data = iter->second;
if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
+ /* we are skipping the rest of the entries */
+ if (pmore) {
+ *pmore = false;
+ }
return count;
}
}
if (!name.empty() && e.key.name != name) {
+ /* we are skipping the rest of the entries */
+ if (pmore) {
+ *pmore = false;
+ }
return count;
}
entry.data = iter->second;
if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
+ /* we are skipping the rest of the entries */
+ if (pmore) {
+ *pmore = false;
+ }
return count;
}
}
if (!name.empty() && e.key.name != name) {
+ /* we are skipping the rest of the entries */
+ if (pmore) {
+ *pmore = false;
+ }
return count;
}
int32_t max = (op.max < MAX_BI_LIST_ENTRIES ? op.max : MAX_BI_LIST_ENTRIES);
string start_key = op.marker;
bool more;
- int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries, &more);
+ int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries, &more);
if (ret < 0) {
CLS_LOG(0, "ERROR: %s(): list_plain_entries retured ret=%d", __func__, ret);
return ret;
void DecayCounter::decay(utime_t now, const DecayRate &rate)
{
- utime_t el = now;
- el -= last_decay;
+ if (now >= last_decay) {
+ double el = (double)(now - last_decay);
+ if (el >= 1.0) {
+ // calculate new value
+ double newval = (val+delta) * exp(el * rate.k);
+ if (newval < .01)
+ newval = 0.0;
- if (el.sec() >= 1) {
- // calculate new value
- double newval = (val+delta) * exp((double)el * rate.k);
- if (newval < .01)
- newval = 0.0;
+ // calculate velocity approx
+ vel += (newval - val) * el;
+ vel *= exp(el * rate.k);
- // calculate velocity approx
- vel += (newval - val) * (double)el;
- vel *= exp((double)el * rate.k);
-
- val = newval;
- delta = 0;
- last_decay = now;
+ val = newval;
+ delta = 0;
+ last_decay = now;
+ }
+ } else {
+ last_decay = now;
}
}
return val+delta;
}
- double get_last() {
+ double get_last() const {
return val;
}
- double get_last_vel() {
+ double get_last_vel() const {
return vel;
}
- utime_t get_last_decay() {
+ utime_t get_last_decay() const {
return last_decay;
}
int signal_exit(int r) {
if (forked) {
- // tell parent. this shouldn't fail, but if it does, pass the
- // error back to the parent.
- int ret = safe_write(fd[1], &r, sizeof(r));
- if (ret <= 0)
- return ret;
+ /* If we get an error here, it's too late to do anything reasonable about it. */
+ (void)safe_write(fd[1], &r, sizeof(r));
}
return r;
}
struct const_pointer_iterator : public map_type::const_iterator {
const_pointer_iterator(typename map_type::const_iterator i)
: map_type::const_iterator(i) {}
- const value_type* operator*() const {
+
+ using value_type = typename map_type::const_iterator::value_type*;
+ using reference = const typename map_type::const_iterator::value_type*;
+
+ reference operator*() const {
return &map_type::const_iterator::operator*();
}
};
#include <iostream>
+#include "acconfig.h"
+
#ifdef HAVE_SYS_PRCTL_H
#include <sys/prctl.h>
#endif
+#include <string.h>
+
code_environment_t g_code_env = CODE_ENVIRONMENT_UTILITY;
extern "C" const char *code_environment_to_str(enum code_environment_t e)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sys/mount.h>
+#if defined(__linux__)
+#include <sys/vfs.h>
+#endif
+
+#include "include/compat.h"
+#include "common/safe_io.h"
+
+// The type-value for a ZFS FS in fstatfs.
+#define FS_ZFS_TYPE 0xde
+
+// On FreeBSD, ZFS fallocate always fails since it is considered impossible to
+// reserve space on a COW filesystem. posix_fallocate() returns EINVAL
+// Linux in this case already emulates the reservation in glibc
+// In which case it is allocated manually, and still that is not a real guarantee
+// that a full buffer is allocated on disk, since it could be compressed.
+// To prevent this the written buffer needs to be loaded with random data.
+int manual_fallocate(int fd, off_t offset, off_t len) {
+ int r = lseek(fd, offset, SEEK_SET);
+ if (r == -1)
+ return errno;
+ char data[1024*128];
+ // TODO: compressing filesystems would require random data
+ memset(data, 0x42, sizeof(data));
+ for (off_t off = 0; off < len; off += sizeof(data)) {
+ if (off + sizeof(data) > len)
+ r = safe_write(fd, data, len - off);
+ else
+ r = safe_write(fd, data, sizeof(data));
+ if (r == -1) {
+ return errno;
+ }
+ }
+ return 0;
+}
+
+int on_zfs(int basedir_fd) {
+ struct statfs basefs;
+ (void)fstatfs(basedir_fd, &basefs);
+ return (basefs.f_type == FS_ZFS_TYPE);
+}
+
+int ceph_posix_fallocate(int fd, off_t offset, off_t len) {
+ // Return 0 if ok, otherwise errno > 0
+
+#ifdef HAVE_POSIX_FALLOCATE
+ if (on_zfs(fd)) {
+ return manual_fallocate(fd, offset, len);
+ } else {
+ return posix_fallocate(fd, offset, len);
+ }
+#elif defined(__APPLE__)
+ fstore_t store;
+ store.fst_flags = F_ALLOCATECONTIG;
+ store.fst_posmode = F_PEOFPOSMODE;
+ store.fst_offset = offset;
+ store.fst_length = len;
+
+ int ret = fcntl(fd, F_PREALLOCATE, &store);
+ if (ret == -1) {
+ ret = errno;
+ }
+ return ret;
+#else
+ return manual_fallocate(fd, offset, len);
+#endif
+}
+
#include "acconfig.h"
#include "include/int_types.h"
#include "common/crc32c_aarch64.h"
+#include "arch/arm.h"
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
/* Request crc extension capabilities from the assembler */
if (buffer) {
#ifdef HAVE_ARMV8_CRYPTO
+ if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
/* Calculate reflected crc with PMULL Instruction */
const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
if(!(length += 1024))
return crc;
-
+ }
#endif /* HAVE_ARMV8_CRYPTO */
while ((length -= sizeof(uint64_t)) >= 0) {
CRC32CX(crc, *(uint64_t *)buffer);
CRC32CB(crc, *buffer);
} else {
#ifdef HAVE_ARMV8_CRYPTO
+ if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1 = 0xe417f38a;
uint64_t t0;
if(!(length += 1024))
return crc;
-
+ }
#endif /* HAVE_ARMV8_CRYPTO */
while ((length -= sizeof(uint64_t)) >= 0)
CRC32CX(crc, 0);
OPTION(auth_client_required, OPT_STR) // what clients require of daemons
OPTION(auth_supported, OPT_STR) // deprecated; default value for above if they are not defined.
OPTION(max_rotating_auth_attempts, OPT_INT)
-OPTION(cephx_require_signatures, OPT_BOOL) // If true, don't talk to Cephx partners if they don't support message signing; off by default
+OPTION(cephx_require_signatures, OPT_BOOL)
OPTION(cephx_cluster_require_signatures, OPT_BOOL)
OPTION(cephx_service_require_signatures, OPT_BOOL)
+OPTION(cephx_require_version, OPT_INT)
+OPTION(cephx_cluster_require_version, OPT_INT)
+OPTION(cephx_service_require_version, OPT_INT)
OPTION(cephx_sign_messages, OPT_BOOL) // Default to signing session messages if supported
OPTION(auth_mon_ticket_ttl, OPT_DOUBLE)
OPTION(auth_service_ticket_ttl, OPT_DOUBLE)
OPTION(osd_scrub_during_recovery, OPT_BOOL) // Allow new scrubs to start while recovery is active on the OSD
OPTION(osd_scrub_begin_hour, OPT_INT)
OPTION(osd_scrub_end_hour, OPT_INT)
+OPTION(osd_scrub_begin_week_day, OPT_INT)
+OPTION(osd_scrub_end_week_day, OPT_INT)
OPTION(osd_scrub_load_threshold, OPT_FLOAT)
OPTION(osd_scrub_min_interval, OPT_FLOAT) // if load is low
OPTION(osd_scrub_max_interval, OPT_FLOAT) // regardless of load
OPTION(osd_deep_scrub_interval, OPT_FLOAT) // once a week
OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)
OPTION(osd_deep_scrub_stride, OPT_INT)
+OPTION(osd_deep_scrub_keys, OPT_INT)
OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT) // objects must be this old (seconds) before we update the whole-object digest on scrub
+OPTION(osd_skip_data_digest, OPT_BOOL)
+OPTION(osd_distrust_data_digest, OPT_BOOL)
+OPTION(osd_deep_scrub_large_omap_object_key_threshold, OPT_U64)
+OPTION(osd_deep_scrub_large_omap_object_value_sum_threshold, OPT_U64)
OPTION(osd_class_dir, OPT_STR) // where rados plugins are stored
OPTION(osd_open_classes_on_start, OPT_BOOL)
OPTION(osd_class_load_list, OPT_STR) // list of object classes allowed to be loaded (allow all: *)
OPTION(osd_debug_drop_ping_duration, OPT_INT)
OPTION(osd_debug_op_order, OPT_BOOL)
OPTION(osd_debug_verify_missing_on_start, OPT_BOOL)
-OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64)
OPTION(osd_debug_verify_snaps, OPT_BOOL)
OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL)
OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL)
OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL)
OPTION(osd_debug_random_push_read_error, OPT_DOUBLE)
OPTION(osd_debug_verify_cached_snaps, OPT_BOOL)
+OPTION(osd_debug_deep_scrub_sleep, OPT_FLOAT)
OPTION(osd_enable_op_tracker, OPT_BOOL) // enable/disable OSD op tracking
OPTION(osd_num_op_tracker_shard, OPT_U32) // The number of shards for holding the ops
OPTION(osd_op_history_size, OPT_U32) // Max number of completed ops to track
OPTION(rgw_lc_debug_interval, OPT_INT) // Debug run interval, in seconds
OPTION(rgw_script_uri, OPT_STR) // alternative value for SCRIPT_URI if not set in request
OPTION(rgw_request_uri, OPT_STR) // alternative value for REQUEST_URI if not set in request
+OPTION(rgw_ignore_get_invalid_range, OPT_BOOL) // treat invalid (e.g., negative) range requests as full
OPTION(rgw_swift_url, OPT_STR) // the swift url, being published by the internal swift auth
OPTION(rgw_swift_url_prefix, OPT_STR) // entry point for which a url is considered a swift url
OPTION(rgw_swift_auth_url, OPT_STR) // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
OPTION(rgw_keystone_token_cache_size, OPT_INT) // max number of entries in keystone token cache
OPTION(rgw_keystone_revocation_interval, OPT_INT) // seconds between tokens revocation check
OPTION(rgw_keystone_verify_ssl, OPT_BOOL) // should we try to verify keystone's ssl
-OPTION(rgw_keystone_implicit_tenants, OPT_BOOL) // create new users in their own tenants of the same name
OPTION(rgw_cross_domain_policy, OPT_STR)
OPTION(rgw_healthcheck_disabling_path, OPT_STR) // path that existence causes the healthcheck to respond 503
OPTION(rgw_s3_auth_use_rados, OPT_BOOL) // should we try to use the internal credentials for s3?
formatter->dump_format("min_iops", "%d", data.idata.min_iops);
formatter->dump_format("average_latency", "%f", data.avg_latency);
formatter->dump_format("stddev_latency", "%f", vec_stddev(data.history.latency));
- formatter->dump_format("max_latency:", "%f", data.max_latency);
+ formatter->dump_format("max_latency", "%f", data.max_latency);
formatter->dump_format("min_latency", "%f", data.min_latency);
}
//write object size/number data for read benchmarks
.set_default(false)
.set_description(""),
+ Option("cephx_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description("Cephx version required (1 = pre-mimic, 2 = mimic+)"),
+
Option("cephx_cluster_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description(""),
+ Option("cephx_cluster_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description("Cephx version required by the cluster from clients (1 = pre-mimic, 2 = mimic+)"),
+
Option("cephx_service_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description(""),
+ Option("cephx_service_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description("Cephx version required from ceph services (1 = pre-mimic, 2 = mimic+)"),
+
Option("cephx_sign_messages", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_default(8)
.set_description(""),
+ Option("osd_skip_data_digest", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description("Do not store full-object checksums if the backend (bluestore) does its own checksums. Do not ever turn this off if it has ever been turned on."),
+
+ Option("osd_distrust_data_digest", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description("Do not trust stored data_digest (due to previous bug or corruption)"),
+
Option("osd_op_queue", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("wpq")
.set_enum_allowed( { "wpq", "prioritized", "mclock_opclass", "mclock_client", "debug_random" } )
Option("osd_max_scrubs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(1)
- .set_description(""),
+ .set_description("Maximum concurrent scrubs on a single OSD"),
Option("osd_scrub_during_recovery", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("Allow scrubbing when PGs on the OSD are undergoing recovery"),
Option("osd_scrub_begin_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("Restrict scrubbing to this hour of the day or later")
+ .add_see_also("osd_scrub_end_hour"),
Option("osd_scrub_end_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(24)
- .set_description(""),
+ .set_description("Restrict scrubbing to hours of the day earlier than this")
+ .add_see_also("osd_scrub_begin_hour"),
+
+ Option("osd_scrub_begin_week_day", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description("Restrict scrubbing to this day of the week or later")
+ .set_long_description("0 or 7 = Sunday, 1 = Monday, etc.")
+ .add_see_also("osd_scrub_end_week_day"),
+
+ Option("osd_scrub_end_week_day", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(7)
+ .set_description("Restrict scrubbing to days of the week earlier than this")
+ .set_long_description("0 or 7 = Sunday, 1 = Monday, etc.")
+ .add_see_also("osd_scrub_begin_week_day"),
Option("osd_scrub_load_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0.5)
- .set_description(""),
+ .set_description("Allow scrubbing when system load divided by number of CPUs is below this value"),
Option("osd_scrub_min_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(1_day)
- .set_description(""),
+ .set_description("Scrub each PG no more often than this interval")
+ .add_see_also("osd_scrub_max_interval"),
Option("osd_scrub_max_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(7_day)
- .set_description(""),
+ .set_description("Scrub each PG no less often than this interval")
+ .add_see_also("osd_scrub_min_interval"),
Option("osd_scrub_interval_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0.5)
- .set_description(""),
+ .set_description("Ratio of scrub interval to randomly vary")
+ .set_long_description("This prevents a scrub 'stampede' by randomly varying the scrub intervals so that they are soon uniformly distributed over the week")
+ .add_see_also("osd_scrub_min_interval"),
- Option("osd_scrub_backoff_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ Option("osd_scrub_backoff_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.66)
- .set_description(""),
+ .set_description("Backoff ratio after a failed scrub scheduling attempt"),
Option("osd_scrub_chunk_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(5)
- .set_description(""),
+ .set_description("Minimum number of objects to scrub in a single chunk")
+ .add_see_also("osd_scrub_chunk_max"),
Option("osd_scrub_chunk_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(25)
- .set_description(""),
+ .set_description("Maximum number of object to scrub in a single chunk")
+ .add_see_also("osd_scrub_chunk_min"),
Option("osd_scrub_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("Duration to inject a delay during scrubbing"),
Option("osd_scrub_auto_repair", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("Automatically repair damaged objects detected during scrub"),
Option("osd_scrub_auto_repair_num_errors", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(5)
- .set_description(""),
+ .set_description("Maximum number of detected errors to automatically repair")
+ .add_see_also("osd_scrub_auto_repair"),
+
+ Option("osd_scrub_max_preemptions", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description("Set the maximum number of times we will preempt a deep scrub due to a client operation before blocking client IO to complete the scrub"),
Option("osd_deep_scrub_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(7_day)
- .set_description(""),
+ .set_description("Deep scrub each PG (i.e., verify data checksums) at least this often"),
Option("osd_deep_scrub_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0.15)
- .set_description(""),
+ .set_description("Ratio of deep scrub interval to randomly vary")
+ .set_long_description("This prevents a deep scrub 'stampede' by randomly varying the scrub intervals so that they are soon uniformly distributed over the week")
+ .add_see_also("osd_deep_scrub_interval"),
Option("osd_deep_scrub_stride", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(524288)
- .set_description(""),
+ .set_description("Number of bytes to read from an object at a time during deep scrub"),
+
+ Option("osd_deep_scrub_keys", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description("Number of keys to read from an object at a time during deep scrub"),
Option("osd_deep_scrub_update_digest_min_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(2_hr)
- .set_description(""),
+ .set_description("Update overall object digest only if object was last modified longer ago than this"),
+
+ Option("osd_deep_scrub_large_omap_object_key_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(2000000)
+ .set_description("Warn when we encounter an object with more omap keys than this")
+ .add_service("osd")
+ .add_see_also("osd_deep_scrub_large_omap_object_value_sum_threshold"),
+
+ Option("osd_deep_scrub_large_omap_object_value_sum_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1_G)
+ .set_description("Warn when we encounter an object with more omap key bytes than this")
+ .add_service("osd")
+ .add_see_also("osd_deep_scrub_large_omap_object_key_threshold"),
Option("osd_class_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default(CEPH_LIBDIR "/rados-classes")
.set_description(""),
Option("osd_max_pg_per_osd_hard_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(2)
+ .set_default(3)
.set_min(1)
.set_description("Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'")
.set_long_description("OSD will refuse to instantiate PG if the number of PG it serves exceeds this number.")
.set_default(false)
.set_description(""),
- Option("osd_debug_scrub_chance_rewrite_digest", Option::TYPE_UINT, Option::LEVEL_DEV)
- .set_default(0)
- .set_description(""),
-
Option("osd_debug_verify_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
.set_default(false)
.set_description(""),
+ Option("osd_debug_deep_scrub_sleep", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description("Inject an expensive sleep during deep scrub IO to make it easier to induce preemption"),
+
Option("osd_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
.set_description("The block size for index partitions. (0 = rocksdb default)"),
Option("mon_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
- .set_default("write_buffer_size=33554432,compression=kNoCompression")
+ .set_default("write_buffer_size=33554432,"
+ "compression=kNoCompression,"
+ "level_compaction_dynamic_level_bytes=true")
.set_description(""),
Option("osd_client_op_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("osd_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(5)
- .set_description(""),
+ .set_description("Priority for scrub operations in work queue"),
Option("osd_scrub_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(50<<20)
- .set_description(""),
+ .set_description("Cost for scrub operations in work queue"),
Option("osd_requested_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(120)
.set_default("")
.set_description(""),
+ Option("rgw_ignore_get_invalid_range", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description("Treat invalid (e.g., negative) range request as full")
+ .set_long_description("Treat invalid (e.g., negative) range request "
+ "as request for the full object (AWS compatibility)"),
+
Option("rgw_swift_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
.set_description("Swift-auth storage URL")
.set_default(true)
.set_description("Should RGW verify the Keystone server SSL certificate."),
- Option("rgw_keystone_implicit_tenants", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
- .set_default(false)
+ Option("rgw_keystone_implicit_tenants", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("false")
+ .set_enum_allowed( { "false", "true", "swift", "s3", "both", "0", "1", "none" } )
.set_description("RGW Keystone implicit tenants creation")
.set_long_description(
"Implicitly create new users in their own tenant with the same name when "
- "authenticating via Keystone."),
+ "authenticating via Keystone. Can be limited to s3 or swift only."),
Option("rgw_cross_domain_policy", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("<allow-access-from domain=\"*\" secure=\"false\" />")
Option("rgw_torrent_flag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description("Produce torrent function flag"),
+ .set_description("When true, uploaded objects will calculate and store "
+ "a SHA256 hash of object data so the object can be "
+ "retrieved as a torrent file"),
Option("rgw_torrent_tracker", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. Allows testing repair tools."),
+
+ Option("mds_inject_migrator_session_race", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false),
});
}
std::vector<Option> get_mds_client_options() {
return std::vector<Option>({
- Option("client_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ Option("client_cache_size", Option::TYPE_INT, Option::LEVEL_BASIC)
.set_default(16384)
- .set_description(""),
+ .set_description("soft maximum number of directory entries in client cache"),
Option("client_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.75)
- .set_description(""),
+ .set_description("mid-point of client cache LRU"),
- Option("client_use_random_mds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ Option("client_use_random_mds", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
- .set_description(""),
+ .set_description("issue new requests to a random active MDS"),
Option("client_mount_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(300.0)
- .set_description(""),
+ .set_description("timeout for mounting CephFS (seconds)"),
- Option("client_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ Option("client_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(1.0)
- .set_description(""),
+ .set_description("seconds between client upkeep ticks"),
- Option("client_trace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ Option("client_trace", Option::TYPE_STR, Option::LEVEL_DEV)
.set_default("")
- .set_description(""),
+ .set_description("file containing trace of client operations"),
Option("client_readahead_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(128*1024)
- .set_description(""),
+ .set_description("minimum bytes to readahead in a file"),
Option("client_readahead_max_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("maximum bytes to readahead in a file (zero is unlimited)"),
Option("client_readahead_max_periods", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(4)
- .set_description(""),
+ .set_description("maximum stripe periods to readahead in a file"),
Option("client_reconnect_stale", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("reconnect when the session becomes stale"),
Option("client_snapdir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default(".snap")
- .set_description(""),
+ .set_description("pseudo directory for snapshot access to a directory"),
Option("client_mountpoint", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("/")
- .set_description(""),
+ .set_description("default mount-point"),
Option("client_mount_uid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(-1)
- .set_description(""),
+ .set_description("uid to mount as"),
Option("client_mount_gid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(-1)
- .set_description(""),
+ .set_description("gid to mount as"),
- Option("client_notify_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ /* RADOS client option */
+ Option("client_notify_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(10)
.set_description(""),
- Option("osd_client_watch_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ /* RADOS client option */
+ Option("osd_client_watch_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(30)
.set_description(""),
- Option("client_caps_release_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ Option("client_caps_release_delay", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(5)
.set_description(""),
Option("client_quota_df", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("show quota usage for statfs (df)"),
Option("client_oc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("enable object caching"),
Option("client_oc_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(200_M)
- .set_description(""),
+ .set_description("maximum size of object cache"),
Option("client_oc_max_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(100_M)
- .set_description(""),
+ .set_description("maximum size of dirty pages in object cache"),
Option("client_oc_target_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(8_M)
- .set_description(""),
+ .set_description("target size of dirty pages object cache"),
Option("client_oc_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5.0)
- .set_description(""),
+ .set_description("maximum age of dirty pages in object cache (seconds)"),
Option("client_oc_max_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(1000)
- .set_description(""),
+ .set_description("maximum number of objects in cache"),
Option("client_debug_getattr_caps", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_default(0)
.set_description(""),
- Option("client_max_inline_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("client_max_inline_size", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(4_K)
.set_description(""),
Option("client_metadata", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description(""),
+ .set_description("metadata key=value comma-delimited pairs appended to session metadata"),
Option("client_acl_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description(""),
+ .set_description("ACL type to enforce (none or \"posix_acl\")"),
Option("client_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("client-enforced permission checking"),
Option("client_dirsize_rbytes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("set the directory size as the number of file bytes recursively used")
+ .set_long_description("This option enables a CephFS feature that stores the recursive directory size (the bytes used by files in the directory and its descendents) in the st_size field of the stat structure."),
Option("fuse_use_invalidate_cb", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
Option("fuse_disable_pagecache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("disable page caching in the kernel for this FUSE mount"),
Option("fuse_allow_other", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("pass allow_other to FUSE on mount"),
Option("fuse_default_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("pass default_permisions to FUSE on mount"),
Option("fuse_big_writes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
Option("fuse_atomic_o_trunc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("pass atomic_o_trunc flag to FUSE on mount"),
- Option("fuse_debug", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ Option("fuse_debug", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
Option("fuse_multithreaded", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("allow parallel processing through FUSE library"),
Option("fuse_require_active_mds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("require active MDSs in the file system when mounting"),
Option("fuse_syncfs_on_mksnap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("synchronize all local metadata/file changes after snapshot"),
Option("fuse_set_user_groups", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description("check for ceph-fuse to consider supplementary groups for permissions"),
- Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
Option("client_check_pool_perm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("confirm access to inode's data pool/namespace described in file layout"),
- Option("client_use_faked_inos", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ Option("client_use_faked_inos", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
Option("client_mds_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description(""),
+ .set_description("CephFS file system name to mount"),
});
}
int r;
out << " {\n";
out << " bucket_id " << bucket_id << "\n";
- if (arg->weight_set_size > 0) {
- r = decompile_weight_set(arg->weight_set, arg->weight_set_size, out);
+ if (arg->weight_set_positions > 0) {
+ r = decompile_weight_set(arg->weight_set, arg->weight_set_positions, out);
if (r < 0)
return r;
}
{
for (__u32 i = 0; i < arg_map.size; i++) {
if ((arg_map.args[i].ids_size == 0) &&
- (arg_map.args[i].weight_set_size == 0))
+ (arg_map.args[i].weight_set_positions == 0))
continue;
int r = decompile_choose_arg(&arg_map.args[i], -1-i, out);
if (r < 0)
int CrushCompiler::parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg)
{
// -3 stands for the leading "weight_set" keyword and the enclosing [ ]
- arg->weight_set_size = i->children.size() - 3;
- arg->weight_set = (crush_weight_set *)calloc(arg->weight_set_size, sizeof(crush_weight_set));
+ arg->weight_set_positions = i->children.size() - 3;
+ arg->weight_set = (crush_weight_set *)calloc(arg->weight_set_positions, sizeof(crush_weight_set));
__u32 pos = 0;
for (iter_t p = i->children.begin(); p != i->children.end(); p++) {
int r = 0;
switch((int)p->value.id().to_long()) {
case crush_grammar::_weight_set_weights:
- if (pos < arg->weight_set_size) {
+ if (pos < arg->weight_set_positions) {
r = parse_weight_set_weights(p, bucket_id, &arg->weight_set[pos]);
pos++;
} else {
if (b &&
bidx < (int)cmap.size &&
cmap.args[bidx].weight_set &&
- cmap.args[bidx].weight_set_size >= 1) {
+ cmap.args[bidx].weight_set_positions >= 1) {
int bpos;
for (bpos = 0;
bpos < (int)cmap.args[bidx].weight_set[0].size &&
}
f->open_array_section(name.c_str());
for (unsigned opos = 0;
- opos < cmap.args[bidx].weight_set_size;
+ opos < cmap.args[bidx].weight_set_positions;
++opos) {
float w = (float)cmap.args[bidx].weight_set[opos].weights[bpos] /
(float)0x10000;
crush_choose_arg_map arg_map = choose_args.begin()->second;
for (__u32 i = 0; i < arg_map.size; i++) {
crush_choose_arg *arg = &arg_map.args[i];
- if (arg->weight_set_size == 0 &&
+ if (arg->weight_set_positions == 0 &&
arg->ids_size == 0)
continue;
- if (arg->weight_set_size != 1)
+ if (arg->weight_set_positions != 1)
return true;
if (arg->ids_size != 0)
return true;
}
}
+void CrushWrapper::find_takes_by_rule(int rule, set<int> *roots) const
+{
+ if (rule < 0 || rule >= (int)crush->max_rules)
+ return;
+ crush_rule *r = crush->rules[rule];
+ if (!r)
+ return;
+ for (unsigned i = 0; i < r->len; i++) {
+ if (r->steps[i].op == CRUSH_RULE_TAKE)
+ roots->insert(r->steps[i].arg1);
+ }
+}
+
void CrushWrapper::find_roots(set<int> *roots) const
{
for (int i = 0; i < crush->max_buckets; i++) {
if (class_bucket.count(item) != 0)
class_bucket.erase(item);
class_remove_item(item);
+ update_choose_args(cct);
}
if ((item >= 0 || !unlink_only) && name_map.count(item)) {
ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
if (class_bucket.count(item) != 0)
class_bucket.erase(item);
class_remove_item(item);
+ update_choose_args(nullptr);
return 0;
}
+void CrushWrapper::update_choose_args(CephContext *cct)
+{
+ for (auto& i : choose_args) {
+ crush_choose_arg_map &arg_map = i.second;
+ unsigned positions = get_choose_args_positions(arg_map);
+ for (int j = 0; j < crush->max_buckets; ++j) {
+ crush_bucket *b = crush->buckets[j];
+ auto& carg = arg_map.args[j];
+ // strip out choose_args for any buckets that no longer exist
+ if (!b || b->alg != CRUSH_BUCKET_STRAW2) {
+ if (carg.ids) {
+ if (cct)
+ ldout(cct,0) << __func__ << " removing " << i.first << " bucket "
+ << (-1-j) << " ids" << dendl;
+ free(carg.ids);
+ carg.ids = 0;
+ carg.ids_size = 0;
+ }
+ if (carg.weight_set) {
+ if (cct)
+ ldout(cct,0) << __func__ << " removing " << i.first << " bucket "
+ << (-1-j) << " weight_sets" << dendl;
+ for (unsigned p = 0; p < carg.weight_set_positions; ++p) {
+ free(carg.weight_set[p].weights);
+ }
+ free(carg.weight_set);
+ carg.weight_set = 0;
+ carg.weight_set_positions = 0;
+ }
+ continue;
+ }
+ if (carg.weight_set_positions == 0) {
+ continue; // skip it
+ }
+ if (carg.weight_set_positions != positions) {
+ if (cct)
+ lderr(cct) << __func__ << " " << i.first << " bucket "
+ << (-1-j) << " positions " << carg.weight_set_positions
+ << " -> " << positions << dendl;
+ continue; // wth... skip!
+ }
+ // mis-sized weight_sets? this shouldn't ever happen.
+ for (unsigned p = 0; p < positions; ++p) {
+ if (carg.weight_set[p].size != b->size) {
+ if (cct)
+ lderr(cct) << __func__ << " fixing " << i.first << " bucket "
+ << (-1-j) << " position " << p
+ << " size " << carg.weight_set[p].size << " -> "
+ << b->size << dendl;
+ auto old_ws = carg.weight_set[p];
+ carg.weight_set[p].size = b->size;
+ carg.weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
+ auto max = std::min<unsigned>(old_ws.size, b->size);
+ for (unsigned k = 0; k < max; ++k) {
+ carg.weight_set[p].weights[k] = old_ws.weights[k];
+ }
+ free(old_ws.weights);
+ }
+ }
+ }
+ }
+}
+
int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
{
ldout(cct, 5) << "remove_item " << item
return b->size;
}
+void CrushWrapper::get_children_of_type(int id,
+ int type,
+ set<int> *children,
+ bool exclude_shadow) const
+{
+ if (id >= 0) {
+ if (type == 0) {
+ // want leaf?
+ children->insert(id);
+ }
+ return;
+ }
+ auto b = get_bucket(id);
+ if (IS_ERR(b)) {
+ return;
+ }
+ if (b->type < type) {
+ // give up
+ return;
+ } else if (b->type == type) {
+ if (!is_shadow_item(b->id) || !exclude_shadow) {
+ children->insert(b->id);
+ }
+ return;
+ }
+ for (unsigned n = 0; n < b->size; n++) {
+ get_children_of_type(b->items[n], type, children, exclude_shadow);
+ }
+}
+
int CrushWrapper::get_rule_failure_domain(int rule_id)
{
crush_rule *rule = get_rule(rule_id);
return -ENOENT;
}
-int CrushWrapper::get_parent_of_type(int item, int type) const
+int CrushWrapper::get_parent_of_type(int item, int type, int rule) const
{
- do {
- int r = get_immediate_parent_id(item, &item);
- if (r < 0) {
- return 0;
+ if (rule < 0) {
+ // no rule specified
+ do {
+ int r = get_immediate_parent_id(item, &item);
+ if (r < 0) {
+ return 0;
+ }
+ } while (get_bucket_type(item) != type);
+ return item;
+ }
+ set<int> roots;
+ find_takes_by_rule(rule, &roots);
+ for (auto root : roots) {
+ set<int> candidates;
+ get_children_of_type(root, type, &candidates, false);
+ for (auto candidate : candidates) {
+ if (subtree_contains(candidate, item)) {
+ // note that here we assure that no two different buckets
+        // from a single crush rule will share the same device,
+ // which should generally be true.
+ return candidate;
+ }
}
- } while (get_bucket_type(item) != type);
- return item;
+ }
+ return 0; // not found
}
int CrushWrapper::rename_class(const string& srcname, const string& dstname)
// accumulate weight values for each carg and bucket as we go. because it is
// depth first, we will have the nested bucket weights we need when we
// finish constructing the containing buckets.
- map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> weights
+ map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> [bucket weight for each position]
set<int> roots;
find_nonshadow_roots(&roots);
for (auto &r : roots) {
for (auto &w : choose_args) {
crush_choose_arg_map &arg_map = w.second;
crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
- for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ for (__u32 j = 0; j < arg->weight_set_positions; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
weight_set->weights[position] = weight;
}
crush_choose_arg& carg = cmap.args[pos];
carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
size);
- carg.weight_set_size = positions;
+ carg.weight_set_positions = positions;
for (int ppos = 0; ppos < positions; ++ppos) {
carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
carg.weight_set[ppos].size = size;
for (auto &w : choose_args) {
crush_choose_arg_map &arg_map = w.second;
crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
- for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ for (__u32 j = 0; j < arg->weight_set_positions; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
weight_set->weights = (__u32*)realloc(weight_set->weights,
new_size * sizeof(__u32));
for (auto &w : choose_args) {
crush_choose_arg_map &arg_map = w.second;
crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
- for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ for (__u32 j = 0; j < arg->weight_set_positions; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
assert(weight_set->size - 1 == new_size);
for (__u32 k = position; k < new_size; k++)
auto& o = cmap.args[-1-original_id];
auto& n = cmap.args[-1-bno];
n.ids_size = 0; // FIXME: implement me someday
- n.weight_set_size = o.weight_set_size;
+ n.weight_set_positions = o.weight_set_positions;
n.weight_set = (crush_weight_set*)calloc(
- n.weight_set_size, sizeof(crush_weight_set));
- for (size_t s = 0; s < n.weight_set_size; ++s) {
+ n.weight_set_positions, sizeof(crush_weight_set));
+ for (size_t s = 0; s < n.weight_set_positions; ++s) {
n.weight_set[s].size = copy->size;
n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
}
- for (size_t s = 0; s < n.weight_set_size; ++s) {
- vector<int> bucket_weights(n.weight_set_size);
+ for (size_t s = 0; s < n.weight_set_positions; ++s) {
+ vector<int> bucket_weights(n.weight_set_positions);
for (size_t i = 0; i < copy->size; ++i) {
int item = copy->items[i];
if (item >= 0) {
n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
- } else {
+ } else if ((*cmap_item_weight)[w.first].count(item)) {
n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
+ } else {
+ n.weight_set[s].weights[i] = 0;
}
bucket_weights[s] += n.weight_set[s].weights[i];
}
{
__u32 *weights;
if (encode_compat_choose_args &&
- arg_map.args[i].weight_set_size > 0) {
+ arg_map.args[i].weight_set_positions > 0) {
weights = arg_map.args[i].weight_set[0].weights;
} else {
weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
size = 0;
for (__u32 i = 0; i < arg_map.size; i++) {
crush_choose_arg *arg = &arg_map.args[i];
- if (arg->weight_set_size == 0 &&
+ if (arg->weight_set_positions == 0 &&
arg->ids_size == 0)
continue;
size++;
::encode(size, bl);
for (__u32 i = 0; i < arg_map.size; i++) {
crush_choose_arg *arg = &arg_map.args[i];
- if (arg->weight_set_size == 0 &&
+ if (arg->weight_set_positions == 0 &&
arg->ids_size == 0)
continue;
::encode(i, bl);
- ::encode(arg->weight_set_size, bl);
- for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ ::encode(arg->weight_set_positions, bl);
+ for (__u32 j = 0; j < arg->weight_set_positions; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
::encode(weight_set->size, bl);
for (__u32 k = 0; k < weight_set->size; k++)
::decode(bucket_index, blp);
assert(bucket_index < arg_map.size);
crush_choose_arg *arg = &arg_map.args[bucket_index];
- ::decode(arg->weight_set_size, blp);
- if (arg->weight_set_size) {
+ ::decode(arg->weight_set_positions, blp);
+ if (arg->weight_set_positions) {
arg->weight_set = (crush_weight_set*)calloc(
- arg->weight_set_size, sizeof(crush_weight_set));
- for (__u32 k = 0; k < arg->weight_set_size; k++) {
+ arg->weight_set_positions, sizeof(crush_weight_set));
+ for (__u32 k = 0; k < arg->weight_set_positions; k++) {
crush_weight_set *weight_set = &arg->weight_set[k];
::decode(weight_set->size, blp);
weight_set->weights = (__u32*)calloc(
choose_args[choose_args_index] = arg_map;
}
}
+ update_choose_args(nullptr); // in case we decode a legacy "corrupted" map
finalize();
}
catch (...) {
f->open_array_section(stringify(c.first).c_str());
for (__u32 i = 0; i < arg_map.size; i++) {
crush_choose_arg *arg = &arg_map.args[i];
- if (arg->weight_set_size == 0 &&
+ if (arg->weight_set_positions == 0 &&
arg->ids_size == 0)
continue;
f->open_object_section("choose_args");
int bucket_index = i;
f->dump_int("bucket_id", -1-bucket_index);
- if (arg->weight_set_size > 0) {
+ if (arg->weight_set_positions > 0) {
f->open_array_section("weight_set");
- for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ for (__u32 j = 0; j < arg->weight_set_positions; j++) {
f->open_array_section("weights");
__u32 *weights = arg->weight_set[j].weights;
__u32 size = arg->weight_set[j].size;
if (b &&
bidx < (int)cmap.size &&
cmap.args[bidx].weight_set &&
- cmap.args[bidx].weight_set_size >= 1) {
+ cmap.args[bidx].weight_set_positions >= 1) {
int pos;
for (pos = 0;
pos < (int)cmap.args[bidx].weight_set[0].size &&
<< " w " << w << dendl;
vector<int> o;
auto tmpi = i;
+ if (i == orig.end()) {
+ ldout(cct, 10) << __func__ << " end of orig, break 0" << dendl;
+ break;
+ }
for (auto from : w) {
ldout(cct, 10) << " from " << from << dendl;
// identify leaves under each choice. we use this to check whether any of these
ldout(cct, 10) << __func__ << " pos " << pos << " replace "
<< *i << " -> " << item << dendl;
replaced = true;
+ assert(i != orig.end());
++i;
break;
}
if (!replaced) {
ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
<< dendl;
+ assert(i != orig.end());
o.push_back(*i);
++i;
}
if (numrep <= 0)
numrep += maxout;
type_stack.push_back(make_pair(type, numrep));
- type_stack.push_back(make_pair(0, 1));
+ if (type > 0)
+ type_stack.push_back(make_pair(0, 1));
int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
i, used, &w);
if (r < 0)
}
crush_choose_arg *carg = &cmap.args[bidx];
if (carg->weight_set == NULL) {
- if (ss)
- *ss << "no weight-set for bucket " << b->id;
- ldout(cct, 10) << __func__ << " no weight_set for bucket " << b->id
- << dendl;
- return 0;
+ // create a weight-set for this bucket and populate it with the
+ // bucket weights
+ unsigned positions = get_choose_args_positions(cmap);
+ carg->weight_set_positions = positions;
+ carg->weight_set = static_cast<crush_weight_set*>(
+ calloc(sizeof(crush_weight_set), positions));
+ for (unsigned p = 0; p < positions; ++p) {
+ carg->weight_set[p].size = b->size;
+ carg->weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
+ for (unsigned i = 0; i < b->size; ++i) {
+ carg->weight_set[p].weights[i] = crush_get_bucket_item_weight(b, i);
+ }
+ }
+ changed++;
}
- if (carg->weight_set_size != weight.size()) {
+ if (carg->weight_set_positions != weight.size()) {
if (ss)
- *ss << "weight_set_size != " << weight.size() << " for bucket " << b->id;
- ldout(cct, 10) << __func__ << " weight_set_size != " << weight.size()
+ *ss << "weight_set_positions != " << weight.size() << " for bucket " << b->id;
+ ldout(cct, 10) << __func__ << " weight_set_positions != " << weight.size()
<< " for bucket " << b->id << dendl;
return 0;
}
std::map<int64_t, crush_choose_arg_map> choose_args;
private:
- struct crush_map *crush;
+ struct crush_map *crush = nullptr;
bool have_uniform_rules = false;
/* reverse maps */
- mutable bool have_rmaps;
+ mutable bool have_rmaps = false;
mutable std::map<string, int> type_rmap, name_rmap, rule_name_rmap;
void build_rmaps() const {
if (have_rmaps) return;
CrushWrapper(const CrushWrapper& other);
const CrushWrapper& operator=(const CrushWrapper& other);
- CrushWrapper() : crush(0), have_rmaps(false) {
+ CrushWrapper() {
create();
}
~CrushWrapper() {
* Note that these may not be parentless roots.
*/
void find_takes(set<int> *roots) const;
+ void find_takes_by_rule(int rule, set<int> *roots) const;
/**
* find tree roots
/**
* return ancestor of the given type, or 0 if none
+ * can pass in a specific crush **rule** to return ancestor from that rule only
* (parent is always a bucket and thus <0)
*/
- int get_parent_of_type(int id, int type) const;
+ int get_parent_of_type(int id, int type, int rule = -1) const;
/**
* get the fully qualified location of a device by successively finding
* @return number of items, or error
*/
int get_children(int id, list<int> *children);
+ void get_children_of_type(int id,
+ int type,
+ set<int> *children,
+ bool exclude_shadow = true) const;
/**
* get failure-domain type of a specific crush rule
void destroy_choose_args(crush_choose_arg_map arg_map) {
for (__u32 i = 0; i < arg_map.size; i++) {
crush_choose_arg *arg = &arg_map.args[i];
- for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ for (__u32 j = 0; j < arg->weight_set_positions; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
free(weight_set->weights);
}
carg.ids_size = 0;
if (b && b->alg == CRUSH_BUCKET_STRAW2) {
crush_bucket_straw2 *sb = (crush_bucket_straw2*)b;
- carg.weight_set_size = positions;
+ carg.weight_set_positions = positions;
carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
- carg.weight_set_size);
+ carg.weight_set_positions);
// initialize with canonical weights
for (int pos = 0; pos < positions; ++pos) {
carg.weight_set[pos].size = b->size;
}
} else {
carg.weight_set = NULL;
- carg.weight_set_size = 0;
+ carg.weight_set_positions = 0;
}
}
}
choose_args.clear();
}
+ // remove choose_args for buckets that no longer exist, create them for new buckets
+ void update_choose_args(CephContext *cct);
+
// adjust choose_args_map weight, preserving the hierarchical summation
// property. used by callers optimizing layouts by tweaking weights.
int _choose_args_adjust_item_weight_in_bucket(
int get_choose_args_positions(crush_choose_arg_map cmap) {
// infer positions from other buckets
for (unsigned j = 0; j < cmap.size; ++j) {
- if (cmap.args[j].weight_set_size) {
- return cmap.args[j].weight_set_size;
+ if (cmap.args[j].weight_set_positions) {
+ return cmap.args[j].weight_set_positions;
}
}
return 1;
weights += bucket->h.size;
}
arg[b].weight_set = weight_set;
- arg[b].weight_set_size = num_positions;
+ arg[b].weight_set_positions = num_positions;
weight_set += position;
memcpy(ids, bucket->h.items, sizeof(__s32) * bucket->h.size);
* When crush_do_rule() chooses the Nth item from a straw2 bucket, the
* replacement weights found at __weight_set[N]__ are used instead of
* the weights from __item_weights__. If __N__ is greater than
- * __weight_set_size__, the weights found at __weight_set_size-1__ are
+ * __weight_set_positions__, the weights found at __weight_set_positions-1__ are
* used instead. For instance if __weight_set__ is:
*
* [ [ 0x10000, 0x20000 ], // position 0
__s32 *ids; /*!< values to use instead of items */
__u32 ids_size; /*!< size of the __ids__ array */
struct crush_weight_set *weight_set; /*!< weight replacements for a given position */
- __u32 weight_set_size; /*!< size of the __weight_set__ array */
+ __u32 weight_set_positions; /*!< size of the __weight_set__ array */
};
/** @ingroup API
if ((arg == NULL) ||
(arg->weight_set == NULL))
return bucket->item_weights;
- if (position >= arg->weight_set_size)
- position = arg->weight_set_size - 1;
+ if (position >= arg->weight_set_positions)
+ position = arg->weight_set_positions - 1;
return arg->weight_set[position].weights;
}
<< err << dendl;
exit(1);
}
- VOID_TEMP_FAILURE_RETRY(close(STDOUT_FILENO));
- if (open("/dev/null", O_RDONLY) < 0) {
- int err = errno;
- derr << "global_init_daemonize: open(/dev/null) failed: error "
- << err << dendl;
- exit(1);
- }
const md_config_t *conf = cct->_conf;
if (pidfile_write(conf) < 0)
void global_init_postfork_finish(CephContext *cct)
{
- /* We only close stderr once the caller decides the daemonization
- * process is finished. This way we can allow error messages to be
+ /* We only close stdout+stderr once the caller decides the daemonization
+ * process is finished. This way we can allow error or other messages to be
* propagated in a manner that the user is able to see.
*/
if (!(cct->get_init_flags() & CINIT_FLAG_NO_CLOSE_STDERR)) {
exit(1);
}
}
+
+ VOID_TEMP_FAILURE_RETRY(close(STDOUT_FILENO));
+ if (open("/dev/null", O_RDONLY) < 0) {
+ int err = errno;
+ derr << "global_init_daemonize: open(/dev/null) failed: error "
+ << err << dendl;
+ exit(1);
+ }
+
ldout(cct, 1) << "finished global_init_daemonize" << dendl;
}
DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
-DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down!
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // *do not share this bit*
DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal
DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
CEPH_FEATURE_RESEND_ON_SPLIT | \
CEPH_FEATURE_RADOS_BACKOFF | \
CEPH_FEATURE_OSD_RECOVERY_DELETES | \
+ CEPH_FEATURE_CEPHX_V2 | \
0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
static inline void ____build_time_check_for_reserved_bits(void) {
CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL &
(CEPH_FEATURE_RESERVED |
- CEPH_FEATURE_RESERVED2 |
DEPRECATED_CEPH_FEATURE_RESERVED_BROKEN)) == 0);
}
CEPH_CAP_XATTR_SHARED)
#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND
#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
CEPH_CAP_LINK_SHARED | \
int64_t off, uint64_t len, char* buf);
int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh,
int syncdataonly);
+int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in,
+ int syncdataonly);
int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle,
int64_t off, uint64_t len, const char *data);
int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
#define CEPH_COMPAT_H
#include "acconfig.h"
+#include <sys/types.h>
#if defined(__linux__)
#define PROCPREFIX
0; })
#endif
+int ceph_posix_fallocate(int fd, off_t offset, off_t len);
+
#endif /* !CEPH_COMPAT_H */
erase(val, 1);
}
- void erase(T start, T len,
- std::function<bool(T, T)> post_process = {}) {
+ void erase(T start, T len,
+ std::function<bool(T, T)> claim = {}) {
typename Map::iterator p = find_inc_m(start);
_size -= len;
T before = start - p->first;
assert(p->second >= before+len);
T after = p->second - before - len;
- if (before && (post_process ? post_process(p->first, before) : true)) {
- p->second = before; // shorten bit before
+ if (before) {
+ if (claim && claim(p->first, before)) {
+ _size -= before;
+ m.erase(p);
+ } else {
+ p->second = before; // shorten bit before
+ }
} else {
m.erase(p);
}
- if (after && (post_process ? post_process(start + len, after) : true)) {
- m[start + len] = after;
+ if (after) {
+ if (claim && claim(start + len, after)) {
+ _size -= after;
+ } else {
+ m[start + len] = after;
+ }
}
}
#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
#define CEPH_MSGR_TAG_KEEPALIVE2 14
#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive reply */
-
+#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* cephx v2 doing server challenge */
/*
* connection negotiation
java/com/ceph/fs/CephStat.java
java/com/ceph/fs/CephStatVFS.java)
-# note: for the -source 1.5 builds, we add
+# note: for the -source 1.7 builds, we add
# -Xlint:-options
# to get rid of the warning
-# warning: [options] bootstrap class path not set in conjunction with -source 1.5
+# warning: [options] bootstrap class path not set in conjunction with -source 1.7
# as per
# https://blogs.oracle.com/darcy/entry/bootclasspath_older_source
-set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.5" "-target" "1.5" "-Xlint:-options")
-add_jar(libcephfs ${java_srcs})
+set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.7" "-target" "1.7" "-Xlint:-options")
+set(jni_header_dir "${CMAKE_CURRENT_BINARY_DIR}/native")
+if(Java_VERSION VERSION_LESS 1.8)
+ add_jar(libcephfs ${java_srcs})
+ get_property(libcephfs_jar TARGET libcephfs PROPERTY JAR_FILE)
+ set(java_h native/com_ceph_fs_CephMount.h)
+ add_custom_command(
+ OUTPUT ${java_h}
+ COMMAND ${Java_JAVAH_EXECUTABLE} -classpath ${libcephfs_jar} -jni -o ${java_h} com.ceph.fs.CephMount
+ COMMENT "Building C header files from classes...")
+ add_custom_target(jni-header
+ DEPENDS ${java_h})
+ add_dependencies(jni-header libcephfs)
+else()
+ if(CMAKE_VERSION VERSION_LESS 3.11)
+ set(CMAKE_JAVA_COMPILE_FLAGS ${CMAKE_JAVA_COMPILE_FLAGS} "-h" ${jni_header_dir})
+ add_jar(libcephfs ${java_srcs})
+ add_custom_target(
+ jni-header
+ DEPENDS libcephfs)
+ add_dependencies(jni-header libcephfs)
+ else()
+ add_jar(libcephfs ${java_srcs}
+ GENERATE_NATIVE_HEADERS jni-header
+ DESTINATION ${jni_header_dir})
+ endif()
+ get_property(libcephfs_jar TARGET libcephfs PROPERTY JAR_FILE)
+endif()
install_jar(libcephfs share/java)
-get_property(libcephfs_jar TARGET libcephfs PROPERTY JAR_FILE)
-
-set(java_h native/com_ceph_fs_CephMount.h)
-add_custom_command(
- OUTPUT ${java_h}
- COMMAND ${Java_JAVAH_EXECUTABLE} -classpath ${libcephfs_jar} -jni -o ${CMAKE_CURRENT_BINARY_DIR}/${java_h} com.ceph.fs.CephMount)
-add_custom_target(
- jni-header
- DEPENDS ${java_h})
-add_dependencies(jni-header libcephfs)
find_jar(JUNIT_JAR
NAMES junit4 junit
return (cmount->get_client()->ll_fsync(fh, syncdataonly));
}
+extern "C" int ceph_ll_sync_inode(class ceph_mount_info *cmount,
+ Inode *in, int syncdataonly)
+{
+ return (cmount->get_client()->ll_sync_inode(in, syncdataonly));
+}
+
extern "C" off_t ceph_ll_lseek(class ceph_mount_info *cmount,
Fh *fh, off_t offset, int whence)
{
}
void complete(int r) override {
- // invoked by C_notify_Finish (or C_aio_notify_Ack on failure)
+ // invoked by C_notify_Finish
lock.Lock();
finished = true;
complete_unlock(r);
ldout(cct, 10) << __func__ << " linger op " << oncomplete->linger_op << " "
<< "acked (" << r << ")" << dendl;
oncomplete->handle_ack(r);
- if (r < 0) {
- // on failure, we won't expect to see a notify_finish callback
- onfinish->complete(r);
- }
}
};
Context *notify_finish = new C_notify_Finish(client->cct, ¬ify_finish_cond,
objecter, linger_op, preply_bl,
preply_buf, preply_buf_len);
+ (void) notify_finish;
uint32_t timeout = notify_timeout;
if (timeout_ms)
} else {
ldout(client->cct, 10) << __func__ << " failed to initiate notify, r = "
<< r << dendl;
- notify_finish->complete(r);
+ notify_finish_cond.wait();
}
objecter->linger_cancel(linger_op);
journal_policy = policy;
}
+ bool ImageCtx::is_writeback_cache_enabled() const {
+ return (cache && cache_max_dirty > 0);
+ }
+
void ImageCtx::get_thread_pool_instance(CephContext *cct,
ThreadPool **thread_pool,
ContextWQ **op_work_queue) {
journal::Policy *get_journal_policy() const;
void set_journal_policy(journal::Policy *policy);
+ bool is_writeback_cache_enabled() const;
+
static void get_thread_pool_instance(CephContext *cct,
ThreadPool **thread_pool,
ContextWQ **op_work_queue);
!m_image_ctx.exclusive_lock->is_lock_owner());
RWLock::RLocker watch_locker(this->m_watch_lock);
- if (this->m_watch_state == Watcher::WATCH_STATE_REGISTERED) {
+ if (this->is_registered(this->m_watch_lock)) {
ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
FunctionContext *ctx = new FunctionContext(
m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
m_watch_lock(util::unique_lock_name("librbd::Watcher::m_watch_lock", this)),
m_watch_handle(0), m_notifier(work_queue, ioctx, oid),
- m_watch_state(WATCH_STATE_UNREGISTERED), m_watch_ctx(*this) {
+ m_watch_state(WATCH_STATE_IDLE), m_watch_ctx(*this) {
}
Watcher::~Watcher() {
RWLock::RLocker l(m_watch_lock);
- assert(m_watch_state != WATCH_STATE_REGISTERED);
+ assert(is_unregistered(m_watch_lock));
}
void Watcher::register_watch(Context *on_finish) {
ldout(m_cct, 10) << dendl;
RWLock::RLocker watch_locker(m_watch_lock);
- assert(m_watch_state == WATCH_STATE_UNREGISTERED);
+ assert(is_unregistered(m_watch_lock));
m_watch_state = WATCH_STATE_REGISTERING;
librados::AioCompletion *aio_comp = create_rados_callback(
- new C_RegisterWatch(this, on_finish));
+ new C_RegisterWatch(this, on_finish));
int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx);
assert(r == 0);
aio_comp->release();
void Watcher::handle_register_watch(int r, Context *on_finish) {
ldout(m_cct, 10) << "r=" << r << dendl;
+
+ bool watch_error = false;
Context *unregister_watch_ctx = nullptr;
{
RWLock::WLocker watch_locker(m_watch_lock);
assert(m_watch_state == WATCH_STATE_REGISTERING);
- std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ m_watch_state = WATCH_STATE_IDLE;
if (r < 0) {
lderr(m_cct) << "failed to register watch: " << cpp_strerror(r)
<< dendl;
m_watch_handle = 0;
- m_watch_state = WATCH_STATE_UNREGISTERED;
- } else if (r >= 0) {
- m_watch_state = WATCH_STATE_REGISTERED;
+ }
+
+ if (m_unregister_watch_ctx != nullptr) {
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == 0 && m_watch_error) {
+ lderr(m_cct) << "re-registering watch after error" << dendl;
+ m_watch_state = WATCH_STATE_REWATCHING;
+ watch_error = true;
}
}
on_finish->complete(r);
- // wake up pending unregister request
if (unregister_watch_ctx != nullptr) {
unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
}
}
{
RWLock::WLocker watch_locker(m_watch_lock);
- if (m_watch_state == WATCH_STATE_REGISTERING ||
- m_watch_state == WATCH_STATE_REWATCHING) {
+ if (m_watch_state != WATCH_STATE_IDLE) {
ldout(m_cct, 10) << "delaying unregister until register completed"
<< dendl;
unregister_watch(on_finish);
});
return;
- }
-
- if (m_watch_state == WATCH_STATE_REGISTERED ||
- m_watch_state == WATCH_STATE_ERROR) {
- m_watch_state = WATCH_STATE_UNREGISTERED;
-
+ } else if (is_registered(m_watch_lock)) {
librados::AioCompletion *aio_comp = create_rados_callback(
new C_UnwatchAndFlush(m_ioctx, on_finish));
int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp);
assert(r == 0);
aio_comp->release();
+ m_watch_handle = 0;
return;
}
}
}
void Watcher::set_oid(const string& oid) {
- RWLock::WLocker l(m_watch_lock);
- assert(m_watch_state == WATCH_STATE_UNREGISTERED);
+ RWLock::WLocker watch_locker(m_watch_lock);
+ assert(is_unregistered(m_watch_lock));
m_oid = oid;
}
void Watcher::handle_error(uint64_t handle, int err) {
lderr(m_cct) << "handle=" << handle << ": " << cpp_strerror(err) << dendl;
- RWLock::WLocker l(m_watch_lock);
- if (m_watch_state == WATCH_STATE_REGISTERED) {
- m_watch_state = WATCH_STATE_ERROR;
+ RWLock::WLocker watch_locker(m_watch_lock);
+ m_watch_error = true;
+
+ if (is_registered(m_watch_lock)) {
+ m_watch_state = WATCH_STATE_REWATCHING;
FunctionContext *ctx = new FunctionContext(
boost::bind(&Watcher::rewatch, this));
void Watcher::rewatch() {
ldout(m_cct, 10) << dendl;
- RWLock::WLocker l(m_watch_lock);
- if (m_watch_state != WATCH_STATE_ERROR) {
- return;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ RWLock::WLocker watch_locker(m_watch_lock);
+ assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else {
+ m_watch_error = false;
+ auto ctx = create_context_callback<
+ Watcher, &Watcher::handle_rewatch>(this);
+ auto req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock,
+ &m_watch_ctx, &m_watch_handle, ctx);
+ req->send();
+ return;
+ }
}
- m_watch_state = WATCH_STATE_REWATCHING;
-
- Context *ctx = create_context_callback<Watcher,
- &Watcher::handle_rewatch>(this);
- RewatchRequest *req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock,
- &m_watch_ctx,
- &m_watch_handle, ctx);
- req->send();
+
+ unregister_watch_ctx->complete(0);
}
void Watcher::handle_rewatch(int r) {
ldout(m_cct, 10) << "r=" << r << dendl;
- WatchState next_watch_state = WATCH_STATE_REGISTERED;
- if (r < 0) {
- // only EBLACKLISTED or ENOENT can be returned
- assert(r == -EBLACKLISTED || r == -ENOENT);
- next_watch_state = WATCH_STATE_UNREGISTERED;
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ RWLock::WLocker watch_locker(m_watch_lock);
+ assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ ldout(m_cct, 10) << "image is closing, skip rewatch" << dendl;
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLACKLISTED) {
+ lderr(m_cct) << "client blacklisted" << dendl;
+ } else if (r == -ENOENT) {
+ ldout(m_cct, 5) << "object does not exist" << dendl;
+ } else if (r < 0) {
+ lderr(m_cct) << "failed to rewatch: " << cpp_strerror(r) << dendl;
+ watch_error = true;
+ } else if (m_watch_error) {
+ lderr(m_cct) << "re-registering watch after error" << dendl;
+ watch_error = true;
+ }
}
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ return;
+ } else if (watch_error) {
+ rewatch();
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ Watcher, &Watcher::handle_rewatch_callback>(this);
+ m_work_queue->queue(ctx, r);
+}
+
+void Watcher::handle_rewatch_callback(int r) {
+ ldout(m_cct, 10) << "r=" << r << dendl;
+ handle_rewatch_complete(r);
+
+ bool watch_error = false;
Context *unregister_watch_ctx = nullptr;
{
RWLock::WLocker watch_locker(m_watch_lock);
assert(m_watch_state == WATCH_STATE_REWATCHING);
- m_watch_state = next_watch_state;
- std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
-
- m_work_queue->queue(
- create_context_callback<Watcher,
- &Watcher::handle_rewatch_complete>(this), r);
+ if (m_unregister_watch_ctx != nullptr) {
+ m_watch_state = WATCH_STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLACKLISTED || r == -ENOENT) {
+ m_watch_state = WATCH_STATE_IDLE;
+ } else if (m_watch_error) {
+ watch_error = true;
+ } else {
+ m_watch_state = WATCH_STATE_IDLE;
+ }
}
- // wake up pending unregister request
if (unregister_watch_ctx != nullptr) {
unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
}
}
bool is_registered() const {
RWLock::RLocker locker(m_watch_lock);
- return m_watch_state == WATCH_STATE_REGISTERED;
+ return is_registered(m_watch_lock);
}
bool is_unregistered() const {
RWLock::RLocker locker(m_watch_lock);
- return m_watch_state == WATCH_STATE_UNREGISTERED;
+ return is_unregistered(m_watch_lock);
}
protected:
enum WatchState {
- WATCH_STATE_UNREGISTERED,
+ WATCH_STATE_IDLE,
WATCH_STATE_REGISTERING,
- WATCH_STATE_REGISTERED,
- WATCH_STATE_ERROR,
WATCH_STATE_REWATCHING
};
WatchState m_watch_state;
AsyncOpTracker m_async_op_tracker;
+ bool is_registered(const RWLock&) const {
+ return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle != 0);
+ }
+ bool is_unregistered(const RWLock&) const {
+ return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle == 0);
+ }
+
void send_notify(bufferlist &payload,
watcher::NotifyResponse *response = nullptr,
Context *on_finish = nullptr);
WatchCtx m_watch_ctx;
Context *m_unregister_watch_ctx = nullptr;
+ bool m_watch_error = false;
+
uint32_t m_blocked_count = 0;
void handle_register_watch(int r, Context *on_finish);
void rewatch();
void handle_rewatch(int r);
+ void handle_rewatch_callback(int r);
};
template <typename I>
void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe,
- int r, std::set<int> &filters) {
+ int r, std::set<int> &filters,
+ bool writeback_cache_enabled) {
Mutex::Locker locker(m_lock);
CephContext *cct = m_image_ctx.cct;
ldout(cct, 20) << ": on_ready=" << on_ready << ", "
return;
}
- // will be completed after next flush operation completes
- m_aio_modify_safe_contexts.insert(on_safe);
+ if (writeback_cache_enabled) {
+ // will be completed after next flush operation completes
+ m_aio_modify_safe_contexts.insert(on_safe);
+ } else {
+ // IO is safely stored on disk
+ assert(m_in_flight_aio_modify > 0);
+ --m_in_flight_aio_modify;
+
+ if (m_on_aio_ready != nullptr) {
+ ldout(cct, 10) << ": resuming paused AIO" << dendl;
+ m_on_aio_ready->complete(0);
+ m_on_aio_ready = nullptr;
+ }
+
+ ldout(cct, 20) << ": completing safe context: " << on_safe << dendl;
+ m_image_ctx.op_work_queue->queue(on_safe, 0);
+ }
}
template <typename I>
}
++m_in_flight_aio_modify;
- m_aio_modify_unsafe_contexts.push_back(on_safe);
+
+ bool writeback_cache_enabled = m_image_ctx.is_writeback_cache_enabled();
+ if (writeback_cache_enabled) {
+ m_aio_modify_unsafe_contexts.push_back(on_safe);
+ }
// FLUSH if we hit the low-water mark -- on_safe contexts are
// completed by flushes-only so that we don't move the journal
// commit position until safely on-disk
- *flush_required = (m_aio_modify_unsafe_contexts.size() ==
+ *flush_required = (writeback_cache_enabled &&
+ m_aio_modify_unsafe_contexts.size() ==
IN_FLIGHT_IO_LOW_WATER_MARK);
if (*flush_required) {
ldout(cct, 10) << ": hit AIO replay low-water mark: scheduling flush"
// event. when flushed, the completion of the next flush will fire the
// on_safe callback
auto aio_comp = io::AioCompletion::create_and_start<Context>(
- new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters)),
+ new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters),
+ writeback_cache_enabled),
util::get_image_ctx(&m_image_ctx), aio_type);
return aio_comp;
}
Context *on_ready;
Context *on_safe;
std::set<int> filters;
+ bool writeback_cache_enabled;
C_AioModifyComplete(Replay *replay, Context *on_ready,
- Context *on_safe, std::set<int> &&filters)
+ Context *on_safe, std::set<int> &&filters,
+ bool writeback_cache_enabled)
: replay(replay), on_ready(on_ready), on_safe(on_safe),
- filters(std::move(filters)) {
+ filters(std::move(filters)),
+ writeback_cache_enabled(writeback_cache_enabled) {
}
void finish(int r) override {
- replay->handle_aio_modify_complete(on_ready, on_safe, r, filters);
+ replay->handle_aio_modify_complete(on_ready, on_safe, r, filters,
+ writeback_cache_enabled);
}
};
Context *on_safe);
void handle_aio_modify_complete(Context *on_ready, Context *on_safe,
- int r, std::set<int> &filters);
+ int r, std::set<int> &filters,
+ bool writeback_cache_enabled);
void handle_aio_flush_complete(Context *on_flush_safe, Contexts &on_safe_ctxs,
int r);
void Log::stop()
{
- assert(is_started());
- pthread_mutex_lock(&m_queue_mutex);
- m_stop = true;
- pthread_cond_signal(&m_cond_flusher);
- pthread_cond_broadcast(&m_cond_loggers);
- pthread_mutex_unlock(&m_queue_mutex);
- join();
+ if (is_started()) {
+ pthread_mutex_lock(&m_queue_mutex);
+ m_stop = true;
+ pthread_cond_signal(&m_cond_flusher);
+ pthread_cond_broadcast(&m_cond_loggers);
+ pthread_mutex_unlock(&m_queue_mutex);
+ join();
+ }
}
void *Log::entry()
dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
<< " seq " << m->get_seq() << " dne" << dendl;
}
+ m->put();
}
pop_nested(ceph_clock_now()),
pop_auth_subtree(ceph_clock_now()),
pop_auth_subtree_nested(ceph_clock_now()),
+ pop_lru_subdirs(member_offset(CInode, item_pop_lru)),
num_dentries_nested(0), num_dentries_auth_subtree(0),
num_dentries_auth_subtree_nested(0),
dir_auth(CDIR_AUTH_DEFAULT)
items[dn->key()] = dn;
dn->get_linkage()->inode = in;
- in->set_primary_parent(dn);
link_inode_work(dn, in);
assert(dn->get_linkage()->is_null());
dn->get_linkage()->inode = in;
- in->set_primary_parent(dn);
link_inode_work(dn, in);
void CDir::link_inode_work( CDentry *dn, CInode *in)
{
assert(dn->get_linkage()->get_inode() == in);
- assert(in->get_parent_dn() == dn);
+ in->set_primary_parent(dn);
// set inode version
//in->inode.version = dn->get_version();
// unlink auth_pin count
if (in->auth_pins + in->nested_auth_pins)
dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins), 0 - in->auth_pins, NULL);
-
+
// detach inode
in->remove_primary_parent(dn);
+ if (in->is_dir())
+ in->item_pop_lru.remove_myself();
dn->get_linkage()->inode = 0;
} else {
assert(!dn->get_linkage()->is_null());
if (dn->get_linkage()->is_primary()) {
CInode *in = dn->get_linkage()->get_inode();
auto pi = in->get_projected_inode();
- if (dn->get_linkage()->get_inode()->is_dir())
+ if (in->is_dir()) {
fnode.fragstat.nsubdirs++;
- else
+ if (in->item_pop_lru.is_on_list())
+ pop_lru_subdirs.push_back(&in->item_pop_lru);
+ } else {
fnode.fragstat.nfiles++;
+ }
fnode.rstat.rbytes += pi->accounted_rstat.rbytes;
fnode.rstat.rfiles += pi->accounted_rstat.rfiles;
fnode.rstat.rsubdirs += pi->accounted_rstat.rsubdirs;
bufferlist &bl,
const int pos,
const std::set<snapid_t> *snaps,
- bool *force_dirty,
- list<CInode*> *undef_inodes)
+ bool *force_dirty)
{
bufferlist::iterator q = bl.begin();
}
if (dn) {
- if (dn->get_linkage()->get_inode() == 0) {
- dout(12) << "_fetched had NEG dentry " << *dn << dendl;
- } else {
- dout(12) << "_fetched had dentry " << *dn << dendl;
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+ if (committed_version == 0 &&
+ dnl->is_remote() &&
+ dn->is_dirty() &&
+ ino == dnl->get_remote_ino() &&
+ d_type == dnl->get_remote_d_type()) {
+ // see comment below
+ dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl;
+ dn->mark_clean();
}
} else {
// (remote) link
bool undef_inode = false;
if (dn) {
- CInode *in = dn->get_linkage()->get_inode();
- if (in) {
- dout(12) << "_fetched had dentry " << *dn << dendl;
- if (in->state_test(CInode::STATE_REJOINUNDEF)) {
- undef_inodes->push_back(in);
- undef_inode = true;
- }
- } else
- dout(12) << "_fetched had NEG dentry " << *dn << dendl;
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ if (in->state_test(CInode::STATE_REJOINUNDEF)) {
+ undef_inode = true;
+ } else if (committed_version == 0 &&
+ dn->is_dirty() &&
+ inode_data.inode.ino == in->ino() &&
+ inode_data.inode.version == in->get_version()) {
+ /* clean underwater item?
+ * Underwater item is something that is dirty in our cache from
+ * journal replay, but was previously flushed to disk before the
+ * mds failed.
+ *
+ * We only do this is committed_version == 0. that implies either
+ * - this is a fetch after from a clean/empty CDir is created
+ * (and has no effect, since the dn won't exist); or
+ * - this is a fetch after _recovery_, which is what we're worried
+ * about. Items that are marked dirty from the journal should be
+ * marked clean if they appear on disk.
+ */
+ dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl;
+ dn->mark_clean();
+ dout(10) << "_fetched had underwater inode " << *dnl->get_inode() << ", marking clean" << dendl;
+ in->mark_clean();
+ }
+ }
}
if (!dn || undef_inode) {
try {
dn = _load_dentry(
p->first, dname, last, p->second, pos, snaps,
- &force_dirty, &undef_inodes);
+ &force_dirty);
} catch (const buffer::error &err) {
cache->mds->clog->warn() << "Corrupt dentry '" << dname << "' in "
"dir frag " << dirfrag() << ": "
continue;
}
- if (dn && (wanted_items.count(mempool::mds_co::string(boost::string_view(dname))) > 0 || !complete)) {
- dout(10) << " touching wanted dn " << *dn << dendl;
- inode->mdcache->touch_dentry(dn);
- }
+ if (!dn)
+ continue;
- /** clean underwater item?
- * Underwater item is something that is dirty in our cache from
- * journal replay, but was previously flushed to disk before the
- * mds failed.
- *
- * We only do this is committed_version == 0. that implies either
- * - this is a fetch after from a clean/empty CDir is created
- * (and has no effect, since the dn won't exist); or
- * - this is a fetch after _recovery_, which is what we're worried
- * about. Items that are marked dirty from the journal should be
- * marked clean if they appear on disk.
- */
- if (committed_version == 0 &&
- dn &&
- dn->get_version() <= got_fnode.version &&
- dn->is_dirty()) {
- dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl;
- dn->mark_clean();
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ if (dnl->is_primary() && dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
+ undef_inodes.push_back(dnl->get_inode());
- if (dn->get_linkage()->is_primary()) {
- assert(dn->get_linkage()->get_inode()->get_version() <= got_fnode.version);
- dout(10) << "_fetched had underwater inode " << *dn->get_linkage()->get_inode() << ", marking clean" << dendl;
- dn->get_linkage()->get_inode()->mark_clean();
- }
+ if (!complete || wanted_items.count(mempool::mds_co::string(boost::string_view(dname))) > 0) {
+ dout(10) << " touching wanted dn " << *dn << dendl;
+ inode->mdcache->touch_dentry(dn);
}
}
MDSCacheObject::dump(f);
}
+void CDir::dump_load(Formatter *f, utime_t now, const DecayRate& rate)
+{
+ f->dump_stream("path") << get_path();
+ f->dump_stream("dirfrag") << dirfrag();
+
+ f->open_object_section("pop_me");
+ pop_me.dump(f, now, rate);
+ f->close_section();
+
+ f->open_object_section("pop_nested");
+ pop_nested.dump(f, now, rate);
+ f->close_section();
+
+ f->open_object_section("pop_auth_subtree");
+ pop_auth_subtree.dump(f, now, rate);
+ f->close_section();
+
+ f->open_object_section("pop_auth_subtree_nested");
+ pop_auth_subtree_nested.dump(f, now, rate);
+ f->close_section();
+}
+
/****** Scrub Stuff *******/
void CDir::scrub_info_create() const
load_spread_t pop_spread;
+ elist<CInode*> pop_lru_subdirs;
+
// and to provide density
int num_dentries_nested;
int num_dentries_auth_subtree;
bufferlist &bl,
int pos,
const std::set<snapid_t> *snaps,
- bool *force_dirty,
- std::list<CInode*> *undef_inodes);
+ bool *force_dirty);
/**
* Mark this fragment as BADFRAG (common part of go_bad and go_bad_dentry)
ostream& print_db_line_prefix(ostream& out) override;
void print(ostream& out) override;
void dump(Formatter *f) const;
+ void dump_load(Formatter *f, utime_t now, const DecayRate& rate);
};
#endif
inode = front.inode;
if (inode.is_backtrace_updated())
- _mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
+ mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
if (front.xattrs) {
--num_projected_xattrs;
mdcache->fetch_backtrace(inode.ino, get_backtrace_pool(), *backtrace, fin);
}
-void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
+void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
{
if (!state_test(STATE_DIRTYPARENT)) {
dout(10) << "mark_dirty_parent" << dendl;
mds->clog->error() << "bad backtrace on directory inode " << ino();
assert(!"bad backtrace" == (g_conf->mds_verify_backtrace > 1));
- _mark_dirty_parent(mds->mdlog->get_current_segment(), false);
+ mark_dirty_parent(mds->mdlog->get_current_segment(), false);
mds->mdlog->flush();
}
}
m->ctime = i->ctime;
m->change_attr = i->change_attr;
m->time_warp_seq = i->time_warp_seq;
+ m->nfiles = i->dirstat.nfiles;
+ m->nsubdirs = i->dirstat.nsubdirs;
if (cap->client_inline_version < i->inline_data.version) {
m->inline_version = cap->client_inline_version = i->inline_data.version;
}
if (is_dirty_parent()) {
get(PIN_DIRTYPARENT);
- _mark_dirty_parent(ls);
+ mark_dirty_parent(ls);
}
::decode(pop, ceph_clock_now(), p);
in->make_path_string(path);
in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
<< "(" << path << "), rewriting it";
- in->_mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
+ in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
false);
// Flag that we repaired this BT so that it won't go into damagetable
results->backtrace.repaired = true;
int auth_pin_freeze_allowance = 0;
inode_load_vec_t pop;
+ elist<CInode*>::item item_pop_lru;
// friends
friend class Server;
*/
int64_t get_backtrace_pool() const;
public:
- void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
+ void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
void clear_dirty_parent();
void verify_diri_backtrace(bufferlist &bl, int err);
bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
Export make_export() {
return Export(cap_id, _wanted, issued(), pending(), client_follows, last_sent, mseq+1, last_issue_stamp);
}
- void merge(Export& other, bool auth_cap) {
+ void merge(const Export& other, bool auth_cap) {
if (!is_stale()) {
// issued + pending
int newpending = other.pending | pending();
uint64_t features)
{
auto fs = std::make_shared<Filesystem>();
+ fs->mds_map.epoch = epoch;
fs->mds_map.fs_name = std::string(name);
fs->mds_map.max_mds = 1;
fs->mds_map.data_pools.push_back(data_pool);
public:
friend class MDSMonitor;
+ friend class PaxosFSMap;
FSMap()
: epoch(0),
#include "MDSRank.h"
#include "MDCache.h"
#include "Locker.h"
+#include "MDBalancer.h"
#include "CInode.h"
#include "CDir.h"
#include "CDentry.h"
}
issue_caps_set(need_issue);
+ utime_t now = ceph_clock_now();
+ mds->balancer->hit_inode(now, in, META_POP_IWR);
+
// auth unpin after issuing caps
mut->cleanup();
}
pi.inode.rstat.rbytes = new_size;
dout(10) << "check_inode_max_size mtime " << pi.inode.mtime << " -> " << new_mtime << dendl;
pi.inode.mtime = new_mtime;
+ if (new_mtime > pi.inode.ctime)
+ pi.inode.ctime = pi.inode.rstat.rctime = new_mtime;
}
// use EOpen if the file is still open; otherwise, use EUpdate.
if (m->get_ctime() > pi->ctime) {
dout(7) << " ctime " << pi->ctime << " -> " << m->get_ctime()
<< " for " << *in << dendl;
- pi->ctime = m->get_ctime();
+ pi->ctime = pi->rstat.rctime = m->get_ctime();
}
if ((features & CEPH_FEATURE_FS_CHANGE_ATTR) &&
}
// balance?
- if (last_heartbeat == utime_t())
- last_heartbeat = now;
if (mds->get_nodeid() == 0 &&
g_conf->mds_bal_interval > 0 &&
(num_bal_times ||
dout(20) << "get_load no root, no load" << dendl;
}
- load.req_rate = mds->get_req_rate();
+ uint64_t num_requests = mds->get_num_requests();
+ bool new_req_rate = false;
+ if (last_get_load != utime_t() &&
+ now > last_get_load &&
+ num_requests >= last_num_requests) {
+ utime_t el = now;
+ el -= last_get_load;
+ if (el.sec() >= 1) {
+ load.req_rate = (num_requests - last_num_requests) / (double)el;
+ new_req_rate = true;
+ }
+ }
+ if (!new_req_rate) {
+ auto p = mds_load.find(mds->get_nodeid());
+ if (p != mds_load.end())
+ load.req_rate = p->second.req_rate;
+ }
+ last_get_load = now;
+ last_num_requests = num_requests;
+
load.queue_len = messenger->get_dispatch_queue_len();
ifstream cpu(PROCPREFIX "/proc/loadavg");
if (mds->get_nodeid() == 0) {
beat_epoch++;
-
mds_load.clear();
}
// my load
mds_load_t load = get_load(now);
- map<mds_rank_t, mds_load_t>::value_type val(mds->get_nodeid(), load);
- mds_load.insert(val);
+ mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
+ mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
+
+ mds_load[mds->get_nodeid()] = load;
// import_map -- how much do i import from whom
map<mds_rank_t, float> import_map;
/* This function DOES put the passed message before returning */
void MDBalancer::handle_heartbeat(MHeartbeat *m)
{
- typedef map<mds_rank_t, mds_load_t> mds_load_map_t;
-
mds_rank_t who = mds_rank_t(m->get_source().num());
dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
if (who == 0) {
dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
if (beat_epoch != m->get_beat()) {
+ beat_epoch = m->get_beat();
mds_load.clear();
}
- beat_epoch = m->get_beat();
+
send_heartbeat();
mds->mdcache->show_subtrees();
- }
-
- {
- // set mds_load[who]
- mds_load_map_t::value_type val(who, m->get_load());
- pair < mds_load_map_t::iterator, bool > rval (mds_load.insert(val));
- if (!rval.second) {
- rval.first->second = val.second;
+ } else if (mds->get_nodeid() == 0) {
+ if (beat_epoch != m->get_beat()) {
+ dout(10) << " old heartbeat epoch, ignoring" << dendl;
+ goto out;
}
}
- mds_import_map[ who ] = m->get_import_map();
+
+ mds_load[who] = m->get_load();
+ mds_import_map[who] = m->get_import_map();
{
unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
m->put();
}
-
-void MDBalancer::export_empties()
-{
- dout(5) << "export_empties checking for empty imports" << dendl;
-
- std::set<CDir *> subtrees;
- mds->mdcache->get_fullauth_subtrees(subtrees);
- for (auto &dir : subtrees) {
- if (dir->is_freezing() || dir->is_frozen())
- continue;
-
- if (!dir->inode->is_base() &&
- !dir->inode->is_stray() &&
- dir->get_num_head_items() == 0)
- mds->mdcache->migrator->export_empty_import(dir);
- }
-}
-
-
-
double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
mds_rank_t im, double& maxim)
{
<< dendl;
}
+ mds_meta_load.clear();
+
double total_load = 0.0;
multimap<double,mds_rank_t> load_map;
for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
- map<mds_rank_t, mds_load_t>::value_type val(i, mds_load_t(ceph_clock_now()));
- std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
- mds_load_t &load(r.first->second);
+ mds_load_t& load = mds_load.at(i);
double l = load.mds_load() * load_fac;
mds_meta_load[i] = l;
<< dendl;
// under or over?
- if (my_load < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
+ for (auto p : load_map) {
+ if (p.first < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
+ dout(5) << " mds." << p.second << " is underloaded or barely overloaded." << dendl;
+ mds_last_epoch_under_map[p.second] = beat_epoch;
+ }
+ }
+
+ int last_epoch_under = mds_last_epoch_under_map[whoami];
+ if (last_epoch_under == beat_epoch) {
dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl;
- last_epoch_under = beat_epoch;
- mds->mdcache->show_subtrees();
return;
}
-
// am i over long enough?
if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
importers.insert(pair<double,mds_rank_t>(it->first,it->second));
importer_set.insert(it->second);
} else {
- dout(15) << " mds." << it->second << " is exporter" << dendl;
- exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
- exporter_set.insert(it->second);
+ int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
+ if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
+ dout(15) << " mds." << it->second << " is exporter" << dendl;
+ exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
+ exporter_set.insert(it->second);
+ }
}
}
/* fill in the metrics for each mds by grabbing load struct */
vector < map<string, double> > metrics (cluster_size);
- for (mds_rank_t i=mds_rank_t(0);
- i < mds_rank_t(cluster_size);
- i++) {
- map<mds_rank_t, mds_load_t>::value_type val(i, mds_load_t(ceph_clock_now()));
- std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
- mds_load_t &load(r.first->second);
+ for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
+ mds_load_t& load = mds_load.at(i);
metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
{"all.meta_load", load.all.meta_load()},
}
// make a sorted list of my imports
- map<double,CDir*> import_pop_map;
- multimap<mds_rank_t,CDir*> import_from_map;
+ multimap<double, CDir*> import_pop_map;
+ multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
set<CDir*> fullauthsubs;
mds->mdcache->get_fullauth_subtrees(fullauthsubs);
- for (set<CDir*>::iterator it = fullauthsubs.begin();
- it != fullauthsubs.end();
- ++it) {
- CDir *im = *it;
- if (im->get_inode()->is_stray()) continue;
+ for (auto dir : fullauthsubs) {
+ CInode *diri = dir->get_inode();
+ if (diri->is_mdsdir())
+ continue;
+ if (diri->get_export_pin(false) != MDS_RANK_NONE)
+ continue;
+ if (dir->is_freezing() || dir->is_frozen())
+ continue; // export pbly already in progress
- double pop = im->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
+ mds_rank_t from = diri->authority().first;
+ double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
if (g_conf->mds_bal_idle_threshold > 0 &&
pop < g_conf->mds_bal_idle_threshold &&
- im->inode != mds->mdcache->get_root() &&
- im->inode->authority().first != mds->get_nodeid()) {
- dout(5) << " exporting idle (" << pop << ") import " << *im
- << " back to mds." << im->inode->authority().first
- << dendl;
- mds->mdcache->migrator->export_dir_nicely(im, im->inode->authority().first);
+ diri != mds->mdcache->get_root() &&
+ from != mds->get_nodeid()) {
+ dout(5) << " exporting idle (" << pop << ") import " << *dir
+ << " back to mds." << from << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, from);
continue;
}
- import_pop_map[ pop ] = im;
- mds_rank_t from = im->inode->authority().first;
- dout(15) << " map: i imported " << *im << " from " << from << dendl;
- import_from_map.insert(pair<mds_rank_t,CDir*>(from, im));
+ dout(15) << " map: i imported " << *dir << " from " << from << dendl;
+ import_pop_map.insert(make_pair(pop, dir));
+ import_from_map.insert(make_pair(from, make_pair(dir, pop)));
}
-
-
// do my exports!
- set<CDir*> already_exporting;
+ map<mds_rank_t, double> export_pop_map;
for (auto &it : state.targets) {
mds_rank_t target = it.first;
double amount = it.second;
- if (amount < MIN_OFFLOAD) continue;
- if (amount / target_load < .2) continue;
+ if (amount / target_load < .2)
+ continue;
+ if (amount < MIN_OFFLOAD)
+ continue;
dout(5) << "want to send " << amount << " to mds." << target
//<< " .. " << (*it).second << " * " << load_fac
<< " -> " << amount
<< dendl;//" .. fudge is " << fudge << dendl;
- double have = 0.0;
+ double& have = export_pop_map[target];
mds->mdcache->show_subtrees();
// search imports from target
if (import_from_map.count(target)) {
dout(5) << " aha, looking through imports from target mds." << target << dendl;
- pair<multimap<mds_rank_t,CDir*>::iterator, multimap<mds_rank_t,CDir*>::iterator> p =
- import_from_map.equal_range(target);
- while (p.first != p.second) {
- CDir *dir = (*p.first).second;
+ for (auto p = import_from_map.equal_range(target);
+ p.first != p.second; ) {
+ CDir *dir = p.first->second.first;
+ double pop = p.first->second.second;
dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
- multimap<mds_rank_t,CDir*>::iterator plast = p.first++;
+ auto plast = p.first++;
- if (dir->inode->is_base() ||
- dir->inode->is_stray())
+ if (dir->inode->is_base())
continue;
- if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress
- double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
if (pop <= amount-have) {
- dout(5) << "reexporting " << *dir
- << " pop " << pop
+ dout(5) << "reexporting " << *dir << " pop " << pop
<< " back to mds." << target << dendl;
mds->mdcache->migrator->export_dir_nicely(dir, target);
have += pop;
import_from_map.erase(plast);
- import_pop_map.erase(pop);
+ for (auto q = import_pop_map.equal_range(pop);
+ q.first != q.second; ) {
+ if (q.first->second == dir) {
+ import_pop_map.erase(q.first);
+ break;
+ }
+ q.first++;
+ }
} else {
dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
}
- if (amount-have < MIN_OFFLOAD) break;
+ if (amount-have < MIN_OFFLOAD)
+ break;
}
}
- if (amount-have < MIN_OFFLOAD) {
+ }
+
+ // any other imports
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
continue;
- }
- // any other imports
- if (false)
- for (map<double,CDir*>::iterator import = import_pop_map.begin();
- import != import_pop_map.end();
- import++) {
- CDir *imp = (*import).second;
- if (imp->inode->is_base() ||
- imp->inode->is_stray())
- continue;
+ for (auto p = import_pop_map.begin();
+ p != import_pop_map.end(); ) {
+ CDir *dir = p->second;
+ if (dir->inode->is_base()) {
+ ++p;
+ continue;
+ }
- double pop = (*import).first;
- if (pop < amount-have || pop < MIN_REEXPORT) {
- dout(5) << "reexporting " << *imp
- << " pop " << pop
- << " back to mds." << imp->inode->authority()
- << dendl;
- have += pop;
- mds->mdcache->migrator->export_dir_nicely(imp, imp->inode->authority().first);
- }
- if (amount-have < MIN_OFFLOAD) break;
+ double pop = p->first;
+ if (pop <= amount-have && pop > MIN_REEXPORT) {
+ dout(0) << "reexporting " << *dir << " pop " << pop
+ << " to mds." << target << dendl;
+ have += pop;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ import_pop_map.erase(p++);
+ } else {
+ ++p;
}
- if (amount-have < MIN_OFFLOAD) {
- //fudge = amount-have;
- continue;
+ if (amount-have < MIN_OFFLOAD)
+ break;
}
+ }
- // okay, search for fragments of my workload
- set<CDir*> candidates;
- mds->mdcache->get_fullauth_subtrees(candidates);
+ set<CDir*> already_exporting;
+
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
+ continue;
+
+ // okay, search for fragments of my workload
list<CDir*> exports;
- for (set<CDir*>::iterator pot = candidates.begin();
- pot != candidates.end();
- ++pot) {
- if ((*pot)->get_inode()->is_stray()) continue;
- find_exports(*pot, amount, exports, have, already_exporting);
- if (have > amount-MIN_OFFLOAD)
+ for (auto p = import_pop_map.rbegin();
+ p != import_pop_map.rend();
+ ++p) {
+ CDir *dir = p->second;
+ find_exports(dir, amount, exports, have, already_exporting);
+ if (amount-have < MIN_OFFLOAD)
break;
}
//fudge = amount - have;
- for (list<CDir*>::iterator it = exports.begin(); it != exports.end(); ++it) {
- dout(5) << " - exporting "
- << (*it)->pop_auth_subtree
- << " "
- << (*it)->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
- << " to mds." << target
- << " " << **it
- << dendl;
- mds->mdcache->migrator->export_dir_nicely(*it, target);
+ for (auto dir : exports) {
+ dout(5) << " - exporting " << dir->pop_auth_subtree
+ << " " << dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
+ << " to mds." << target << " " << *dir << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
}
}
double& have,
set<CDir*>& already_exporting)
{
+ utime_t now = ceph_clock_now();
+ if ((double)(now - rebalance_time) > 0.1) {
+ derr << " balancer runs too long" << dendl_impl;
+ have = amount;
+ return;
+ }
+
+ assert(dir->is_auth());
+
double need = amount - have;
if (need < amount * g_conf->mds_bal_min_start)
return; // good enough!
+
double needmax = need * g_conf->mds_bal_need_max;
double needmin = need * g_conf->mds_bal_need_min;
double midchunk = need * g_conf->mds_bal_midchunk;
dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
double subdir_sum = 0;
- for (auto it = dir->begin(); it != dir->end(); ++it) {
- CInode *in = it->second->get_linkage()->get_inode();
- if (!in) continue;
- if (!in->is_dir()) continue;
+ for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
+ !it.end(); ) {
+ CInode *in = *it;
+ ++it;
+
+ assert(in->is_dir());
+ assert(in->get_parent_dir() == dir);
list<CDir*> dfls;
- in->get_dirfrags(dfls);
+ in->get_nested_dirfrags(dfls);
+
+ size_t num_idle_frags = 0;
for (list<CDir*>::iterator p = dfls.begin();
p != dfls.end();
++p) {
CDir *subdir = *p;
- if (!subdir->is_auth()) continue;
- if (already_exporting.count(subdir)) continue;
+ if (already_exporting.count(subdir))
+ continue;
- if (subdir->is_frozen()) continue; // can't export this right now!
+ // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
+ // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
+ if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
+ subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
+ continue; // can't export this right now!
// how popular?
double pop = subdir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
subdir_sum += pop;
dout(15) << " subdir pop " << pop << " " << *subdir << dendl;
- if (pop < minchunk) continue;
+ if (pop < minchunk) {
+ num_idle_frags++;
+ continue;
+ }
// lucky find?
if (pop > needmin && pop < needmax) {
} else
smaller.insert(pair<double,CDir*>(pop, subdir));
}
+ if (dfls.size() == num_idle_frags)
+ in->item_pop_lru.remove_myself();
}
dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;
if (have > needmin)
return;
}
-
}
-void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who)
+void MDBalancer::hit_inode(const utime_t& now, CInode *in, int type, int who)
{
// hit inode
in->pop.get(type).hit(now, mds->mdcache->decayrate);
}
}
-void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amount)
+void MDBalancer::hit_dir(const utime_t& now, CDir *dir, int type, int who, double amount)
{
// hit me
double v = dir->pop_me.get(type).hit(now, mds->mdcache->decayrate, amount);
bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
while (true) {
+ CDir *pdir = dir->inode->get_parent_dir();
dir->pop_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
if (rd_adj != 0.0)
dir->pop_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
if (hit_subtree) {
dir->pop_auth_subtree.get(type).hit(now, mds->mdcache->decayrate, amount);
+
if (rd_adj != 0.0)
dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
+
+ if (dir->is_subtree_root())
+ hit_subtree = false; // end of auth domain, stop hitting auth counters.
+ else if (pdir)
+ pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
}
if (hit_subtree_nested) {
if (rd_adj != 0.0)
dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
}
-
- if (dir->is_subtree_root())
- hit_subtree = false; // end of auth domain, stop hitting auth counters.
-
- if (dir->inode->get_parent_dn() == 0) break;
- dir = dir->inode->get_parent_dn()->get_dir();
+ if (!pdir) break;
+ dir = pdir;
}
}
}
}
+void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc)
+{
+ DecayRate& rate = mds->mdcache->decayrate;
+
+ bool adjust_subtree_nest = dir->is_auth();
+ bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
+ CDir *cur = dir;
+ while (true) {
+ if (inc) {
+ pdir->pop_nested.add(now, rate, dir->pop_nested);
+ if (adjust_subtree) {
+ pdir->pop_auth_subtree.add(now, rate, dir->pop_auth_subtree);
+ pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
+ }
+
+ if (adjust_subtree_nest)
+ pdir->pop_auth_subtree_nested.add(now, rate, dir->pop_auth_subtree_nested);
+ } else {
+ pdir->pop_nested.sub(now, rate, dir->pop_nested);
+ if (adjust_subtree)
+ pdir->pop_auth_subtree.sub(now, rate, dir->pop_auth_subtree);
+
+ if (adjust_subtree_nest)
+ pdir->pop_auth_subtree_nested.sub(now, rate, dir->pop_auth_subtree_nested);
+ }
+
+ if (pdir->is_subtree_root())
+ adjust_subtree = false;
+ cur = pdir;
+ pdir = pdir->inode->get_parent_dir();
+ if (!pdir) break;
+ }
+}
+
void MDBalancer::handle_mds_failure(mds_rank_t who)
{
if (0 == who) {
- last_epoch_under = 0;
+ mds_last_epoch_under_map.clear();
}
}
+
+int MDBalancer::dump_loads(Formatter *f)
+{
+ utime_t now = ceph_clock_now();
+ DecayRate& decayrate = mds->mdcache->decayrate;
+
+ list<CDir*> dfs;
+ if (mds->mdcache->get_root()) {
+ mds->mdcache->get_root()->get_dirfrags(dfs);
+ } else {
+ dout(5) << "dump_load no root" << dendl;
+ }
+
+ f->open_object_section("loads");
+
+ f->open_array_section("dirfrags");
+ while (!dfs.empty()) {
+ CDir *dir = dfs.front();
+ dfs.pop_front();
+
+ if (f) {
+ f->open_object_section("dir");
+ dir->dump_load(f, now, decayrate);
+ f->close_section();
+ }
+
+ for (auto it = dir->begin(); it != dir->end(); ++it) {
+ CInode *in = it->second->get_linkage()->get_inode();
+ if (!in || !in->is_dir())
+ continue;
+
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (auto subdir : ls) {
+ if (subdir->pop_nested.meta_load() < .001)
+ continue;
+ dfs.push_back(subdir);
+ }
+ }
+ }
+ f->close_section(); // dirfrags array
+
+ f->open_object_section("mds_load");
+ {
+
+ auto dump_mds_load = [f, now](mds_load_t& load) {
+ f->dump_float("request_rate", load.req_rate);
+ f->dump_float("cache_hit_rate", load.cache_hit_rate);
+ f->dump_float("queue_length", load.queue_len);
+ f->dump_float("cpu_load", load.cpu_load_avg);
+ f->dump_float("mds_load", load.mds_load());
+
+ DecayRate rate; // no decay
+ f->open_object_section("auth_dirfrags");
+ load.auth.dump(f, now, rate);
+ f->close_section();
+ f->open_object_section("all_dirfrags");
+ load.all.dump(f, now, rate);
+ f->close_section();
+ };
+
+ for (auto p : mds_load) {
+ stringstream name;
+ name << "mds." << p.first;
+ f->open_object_section(name.str().c_str());
+ dump_mds_load(p.second);
+ f->close_section();
+ }
+ }
+ f->close_section(); // mds_load
+
+ f->open_object_section("mds_meta_load");
+ for (auto p : mds_meta_load) {
+ stringstream name;
+ name << "mds." << p.first;
+ f->dump_float(name.str().c_str(), p.second);
+ }
+ f->close_section(); // mds_meta_load
+
+ f->open_object_section("mds_import_map");
+ for (auto p : mds_import_map) {
+ stringstream name1;
+ name1 << "mds." << p.first;
+ f->open_array_section(name1.str().c_str());
+ for (auto q : p.second) {
+ f->open_object_section("from");
+ stringstream name2;
+ name2 << "mds." << q.first;
+ f->dump_float(name2.str().c_str(), q.second);
+ f->close_section();
+ }
+ f->close_section(); // mds.? array
+ }
+ f->close_section(); // mds_import_map
+
+ f->close_section(); // loads
+ return 0;
+}
friend class C_Bal_SendHeartbeat;
public:
MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
- mds(m),
- messenger(msgr),
- mon_client(monc),
- beat_epoch(0),
- last_epoch_under(0), my_load(0.0), target_load(0.0)
- { }
-
- mds_load_t get_load(utime_t);
+ mds(m), messenger(msgr), mon_client(monc) { }
int proc_message(Message *m);
void subtract_export(CDir *ex, utime_t now);
void add_import(CDir *im, utime_t now);
+ void adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc);
- void hit_inode(utime_t now, CInode *in, int type, int who=-1);
- void hit_dir(utime_t now, CDir *dir, int type, int who=-1, double amount=1.0);
+ void hit_inode(const utime_t& now, CInode *in, int type, int who=-1);
+ void hit_dir(const utime_t& now, CDir *dir, int type, int who=-1, double amount=1.0);
void queue_split(const CDir *dir, bool fast);
void queue_merge(CDir *dir);
void handle_mds_failure(mds_rank_t who);
+ int dump_loads(Formatter *f);
+
private:
typedef struct {
std::map<mds_rank_t, double> targets;
void handle_export_pins(void);
- void export_empties();
+ mds_load_t get_load(utime_t now);
int localize_balancer();
void send_heartbeat();
void handle_heartbeat(MHeartbeat *m);
MDSRank *mds;
Messenger *messenger;
MonClient *mon_client;
- int beat_epoch;
+ int beat_epoch = 0;
- int last_epoch_under;
string bal_code;
string bal_version;
utime_t last_sample;
utime_t rebalance_time; //ensure a consistent view of load for rebalance
+ utime_t last_get_load;
+ uint64_t last_num_requests = 0;
+
// Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
// just as soon as a delayed context comes back and triggers it.
// These sets just prevent us from spawning extra timer contexts for
map<mds_rank_t, mds_load_t> mds_load;
map<mds_rank_t, double> mds_meta_load;
map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
+ map<mds_rank_t, int> mds_last_epoch_under_map;
// per-epoch state
- double my_load, target_load;
+ double my_load = 0;
+ double target_load = 0;
};
#endif
rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
rootdir->commit(0, gather->new_sub());
- root->store(gather->new_sub());
+ root->mark_clean();
+ root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
+ root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
+ root->flush(gather->new_sub());
}
void MDCache::create_mydir_hierarchy(MDSGather *gather)
straydir->mark_complete();
straydir->mark_dirty(straydir->pre_dirty(), ls);
straydir->commit(0, gather->new_sub());
- stray->_mark_dirty_parent(ls, true);
+ stray->mark_dirty_parent(ls, true);
stray->store_backtrace(gather->new_sub());
}
gather.activate();
}
+void MDCache::open_mydir_frag(MDSInternalContextBase *c)
+{
+ open_mydir_inode(
+ new MDSInternalContextWrapper(mds,
+ new FunctionContext([this, c](int r) {
+ if (r < 0) {
+ c->complete(r);
+ return;
+ }
+ CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
+ assert(mydir);
+ adjust_subtree_auth(mydir, mds->get_nodeid());
+ mydir->fetch(c);
+ })
+ )
+ );
+}
+
void MDCache::open_root()
{
dout(10) << "open_root" << dendl;
* merge with parent and/or child subtrees, if is it appropriate.
* merge can ONLY happen if both parent and child have unambiguous auth.
*/
-void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
+void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
<< " on " << *dir << dendl;
root = dir;
// adjust recursive pop counters
- if (dir->is_auth()) {
+ if (adjust_pop && dir->is_auth()) {
utime_t now = ceph_clock_now();
CDir *p = dir->get_parent_dir();
while (p) {
}
};
-void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
+void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
dout(10) << "try_subtree_merge_at " << *dir << dendl;
subtrees[parent].erase(dir);
// adjust popularity?
- if (dir->is_auth()) {
+ if (adjust_pop && dir->is_auth()) {
utime_t now = ceph_clock_now();
+ CDir *cur = dir;
CDir *p = dir->get_parent_dir();
while (p) {
p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
+ p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
if (p->is_subtree_root()) break;
+ cur = p;
p = p->inode->get_parent_dir();
}
}
dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
//show_subtrees();
+ utime_t now = ceph_clock_now();
CDir *newdir = diri->get_parent_dir();
CDir *newparent = get_subtree_root(newdir);
dout(10) << " new parent " << *newparent << dendl;
+ if (olddir != newdir)
+ mds->balancer->adjust_pop_for_rename(olddir, dir, now, false);
+
if (oldparent == newparent) {
dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
- continue;
- }
-
- if (dir->is_subtree_root()) {
+ } else if (dir->is_subtree_root()) {
// children are fine. change parent.
dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
assert(subtrees[oldparent].count(dir));
assert(subtrees.count(newparent));
subtrees[newparent].insert(dir);
// caller is responsible for 'eval diri'
- try_subtree_merge_at(dir, NULL);
+ try_subtree_merge_at(dir, NULL, false);
} else {
// mid-subtree.
// did auth change?
if (oldparent->authority() != newparent->authority()) {
- adjust_subtree_auth(dir, oldparent->authority());
+ adjust_subtree_auth(dir, oldparent->authority(), false);
// caller is responsible for 'eval diri'
- try_subtree_merge_at(dir, NULL);
+ try_subtree_merge_at(dir, NULL, false);
}
}
+
+ if (olddir != newdir)
+ mds->balancer->adjust_pop_for_rename(newdir, dir, now, true);
}
show_subtrees();
}
}
-void MDCache::broadcast_quota_to_client(CInode *in)
+void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct)
{
if (!in->is_auth() || in->is_frozen())
return;
continue;
Capability *cap = it->second;
+
+ if (exclude_ct >= 0 && exclude_ct != it->first)
+ goto update;
+
if (cap->last_rbytes == i->rstat.rbytes &&
cap->last_rsize == i->rstat.rsize())
continue;
im.cap_id = ++last_cap_id; // assign a new cap ID
im.issue_seq = 1;
im.mseq = q->second.mseq;
+
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+ if (session)
+ rejoin_client_map.emplace(q->first, session->info.inst);
}
// will process these caps in rejoin stage
if (mds->is_rejoin()) {
map<client_t, set<mds_rank_t> > client_exports;
for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
- assert(cap_export_targets.count(p->first));
- mds_rank_t target = cap_export_targets[p->first];
+ mds_rank_t target = p->second.first;
if (rejoins.count(target) == 0)
continue;
- rejoins[target]->cap_exports[p->first] = p->second;
- for (auto q = p->second.begin(); q != p->second.end(); ++q)
+ rejoins[target]->cap_exports[p->first] = p->second.second;
+ for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q)
client_exports[q->first].insert(target);
}
for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
rejoins_pending = false;
// nothing?
- if (mds->is_rejoin() && rejoins.empty()) {
+ if (mds->is_rejoin() && rejoin_gather.empty()) {
dout(10) << "nothing to rejoin" << dendl;
rejoin_gather_finish();
}
}
} else {
// done?
- if (rejoin_gather.empty()) {
+ if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
rejoin_gather_finish();
} else {
dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
}
}
-class C_MDC_RejoinGatherFinish : public MDCacheContext {
-public:
- explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
- void finish(int r) override {
- mdcache->rejoin_gather_finish();
- }
-};
-
/*
* rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
*
// done?
assert(rejoin_gather.count(from));
rejoin_gather.erase(from);
- if (rejoin_gather.empty()) {
+ if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
rejoin_gather_finish();
} else {
dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
p != peer_imported.end();
++p) {
- assert(cap_exports.count(p->first));
- assert(cap_export_targets.count(p->first));
- assert(cap_export_targets[p->first] == from);
+ auto& ex = cap_exports.at(p->first);
+ assert(ex.first == from);
for (map<client_t,Capability::Import>::iterator q = p->second.begin();
q != p->second.end();
++q) {
- assert(cap_exports[p->first].count(q->first));
+ auto r = ex.second.find(q->first);
+ assert(r != ex.second.end());
dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
- assert(session);
+ if (!session) {
+ dout(10) << " no session for client." << p->first << dendl;
+ ex.second.erase(r);
+ continue;
+ }
// mark client caps stale.
MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
- cap_exports[p->first][q->first].capinfo.cap_id, 0,
+ r->second.capinfo.cap_id, 0,
mds->get_osd_epoch_barrier());
m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
(q->second.cap_id > 0 ? from : -1), 0);
mds->send_message_client_counted(m, session);
- cap_exports[p->first].erase(q->first);
+ ex.second.erase(r);
}
- assert(cap_exports[p->first].empty());
+ assert(ex.second.empty());
}
// done?
{
dout(10) << "rejoin_gather_finish" << dendl;
assert(mds->is_rejoin());
+ assert(rejoin_ack_gather.count(mds->get_nodeid()));
if (open_undef_inodes_dirfrags())
return;
rejoin_send_acks();
// signal completion of fetches, rejoin_gather_finish, etc.
- assert(rejoin_ack_gather.count(mds->get_nodeid()));
rejoin_ack_gather.erase(mds->get_nodeid());
// did we already get our acks too?
class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
public:
- map<client_t,entity_inst_t> client_map;
- map<client_t,uint64_t> sseqmap;
-
- C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
- MDCacheLogContext(c), client_map(cm) {}
+ map<client_t,pair<Session*,uint64_t> > session_map;
+ C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
void finish(int r) override {
assert(r == 0);
- mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
+ mdcache->rejoin_open_sessions_finish(session_map);
}
};
-void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
- map<client_t,uint64_t>& sseqmap)
+void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
{
dout(10) << "rejoin_open_sessions_finish" << dendl;
- mds->server->finish_force_open_sessions(client_map, sseqmap);
+ mds->server->finish_force_open_sessions(session_map);
+ rejoin_session_map.swap(session_map);
if (rejoin_gather.empty())
rejoin_gather_finish();
}
cap_imports_num_opening++;
dout(10) << " opening missing ino " << p->first << dendl;
open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
+ if (!(cap_imports_num_opening % 1000))
+ mds->heartbeat_reset();
}
if (cap_imports_num_opening > 0)
// called by rejoin_gather_finish() ?
if (rejoin_gather.count(mds->get_nodeid()) == 0) {
- // if sessions for imported caps are all open ?
- for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
- p != rejoin_client_map.end();
- ++p) {
- if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
- C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
- version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
- ESessions *le = new ESessions(pv, rejoin_client_map);
- mds->mdlog->start_submit_entry(le, finish);
- mds->mdlog->flush();
- rejoin_client_map.clear();
- return true;
- }
+ if (!rejoin_client_map.empty() &&
+ rejoin_session_map.empty()) {
+ C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
+ version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
+ finish->session_map);
+ mds->mdlog->start_submit_entry(new ESessions(pv, rejoin_client_map), finish);
+ mds->mdlog->flush();
+ rejoin_client_map.clear();
+ return true;
}
- rejoin_client_map.clear();
// process caps that were exported by slave rename
for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
q != p->second.second.end();
++q) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
- assert(session);
+ auto r = rejoin_session_map.find(q->first);
+ if (r == rejoin_session_map.end())
+ continue;
+ Session *session = r->second.first;
Capability *cap = in->get_client_cap(q->first);
if (!cap)
cap = in->add_client_cap(q->first, session);
}
assert(in->is_auth());
for (auto q = p->second.begin(); q != p->second.end(); ++q) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
- assert(session);
+ Session *session;
+ {
+ auto r = rejoin_session_map.find(q->first);
+ session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
+ }
+
for (auto r = q->second.begin(); r != q->second.end(); ++r) {
+ if (!session) {
+ if (r->first >= 0)
+ (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
+ continue;
+ }
+
Capability *cap = in->reconnect_cap(q->first, r->second, session);
add_reconnected_cap(q->first, in->ino(), r->second);
if (r->first >= 0) {
} else {
trim_non_auth();
+ assert(rejoin_gather.count(mds->get_nodeid()));
rejoin_gather.erase(mds->get_nodeid());
+ assert(!rejoin_ack_gather.count(mds->get_nodeid()));
maybe_send_pending_rejoins();
-
- if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
- rejoin_gather_finish();
}
return false;
}
if (fetch_queue.empty())
return false;
- MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
+ MDSGatherBuilder gather(g_ceph_context,
+ new MDSInternalContextWrapper(mds,
+ new FunctionContext([this](int r) {
+ if (rejoin_gather.empty())
+ rejoin_gather_finish();
+ })
+ )
+ );
+
for (set<CDir*>::iterator p = fetch_queue.begin();
p != fetch_queue.end();
++p) {
// This is because that unconnected replicas are problematic for
// subtree migration.
//
- if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
+ if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
return true;
+ }
// DIR
list<CDir*> dfls;
}
// empty stray dir
- if (!shutdown_export_strays()) {
- dout(7) << "waiting for strays to migrate" << dendl;
- return false;
- }
-
- // drop our reference to our stray dir inode
- for (int i = 0; i < NUM_STRAY; ++i) {
- if (strays[i] &&
- strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
- strays[i]->state_clear(CInode::STATE_STRAYPINNED);
- strays[i]->put(CInode::PIN_STRAY);
- strays[i]->put_stickydirs();
- }
- }
+ bool strays_all_exported = shutdown_export_strays();
// trim cache
trim(UINT64_MAX);
dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
- // SUBTREES
+ // Export all subtrees to another active (usually rank 0) if not rank 0
int num_auth_subtree = 0;
if (!subtrees.empty() &&
- mds->get_nodeid() != 0 &&
- migrator->get_export_queue_size() == 0) {
+ mds->get_nodeid() != 0) {
dout(7) << "looking for subtrees to export to mds0" << dendl;
list<CDir*> ls;
for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
ls.push_back(dir);
}
}
+
+ migrator->clear_export_queue();
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *dir = *p;
mds_rank_t dest = dir->get_inode()->authority().first;
}
}
+ if (!strays_all_exported) {
+ dout(7) << "waiting for strays to migrate" << dendl;
+ return false;
+ }
+
if (num_auth_subtree > 0) {
+ assert(mds->get_nodeid() > 0);
dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
show_subtrees();
return false;
return false;
}
+ // Fully trim the log so that all objects in cache are clean and may be
+ // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
+ // trim the log such that the cache eventually becomes clean.
+ mds->mdlog->trim(0);
+ if (mds->mdlog->get_num_segments() > 1) {
+ dout(7) << "still >1 segments, waiting for log to trim" << dendl;
+ return false;
+ }
+
+ // drop our reference to our stray dir inode
+ for (int i = 0; i < NUM_STRAY; ++i) {
+ if (strays[i] &&
+ strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
+ strays[i]->state_clear(CInode::STATE_STRAYPINNED);
+ strays[i]->put(CInode::PIN_STRAY);
+ strays[i]->put_stickydirs();
+ }
+ }
+
CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
if (mydir && !mydir->is_subtree_root())
mydir = NULL;
assert(!migrator->is_exporting());
assert(!migrator->is_importing());
- // flush what we can from the log
- mds->mdlog->trim(0);
- if (mds->mdlog->get_num_segments() > 1) {
- dout(7) << "still >1 segments, waiting for log to trim" << dendl;
- return false;
- }
-
if ((myin && myin->is_auth_pinned()) ||
(mydir && mydir->is_auth_pinned())) {
dout(7) << "still have auth pinned objects" << dendl;
list<CDir*> dfs;
for (int i = 0; i < NUM_STRAY; ++i) {
- if (!strays[i]) {
+ if (!strays[i] ||
+ !strays[i]->state_test(CInode::STATE_STRAYPINNED))
continue;
- }
strays[i]->get_dirfrags(dfs);
}
for (auto &p : dir->items) {
CDentry *dn = p.second;
- CDentry::linkage_t *dnl = dn->get_linkage();
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
if (dnl->is_null())
continue;
done = false;
// If the scrub did some repair, then flush the journal at the end of
// the scrub. Otherwise in the case of e.g. rewriting a backtrace
// the on disk state will still look damaged.
- auto expiry_fin = new FunctionContext([this, header, fin](int r){
- if (header->get_repaired()) {
- dout(4) << "Flushing journal because scrub did some repairs" << dendl;
- mds->mdlog->start_new_segment();
- mds->mdlog->trim_all();
- if (fin) {
- MDSGatherBuilder expiry_gather(g_ceph_context);
- const std::set<LogSegment*> &expiring_segments = mds->mdlog->get_expiring_segments();
- for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
- i != expiring_segments.end(); ++i) {
- (*i)->wait_for_expiry(expiry_gather.new_sub());
- }
- expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
- expiry_gather.activate();
- }
- } else {
- if (fin) {
- fin->complete(r);
- }
+ auto scrub_finish = new FunctionContext([this, header, fin](int r){
+ if (!header->get_repaired()) {
+ if (fin)
+ fin->complete(r);
+ return;
+ }
+
+ auto flush_finish = new FunctionContext([this, fin](int r){
+ dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
+ mds->mdlog->trim_all();
+
+ if (fin) {
+ MDSGatherBuilder gather(g_ceph_context);
+ auto& expiring_segments = mds->mdlog->get_expiring_segments();
+ for (auto logseg : expiring_segments)
+ logseg->wait_for_expiry(gather.new_sub());
+ assert(gather.has_subs());
+ gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
+ gather.activate();
}
+ });
+
+ dout(4) << "Flushing journal because scrub did some repairs" << dendl;
+ mds->mdlog->start_new_segment();
+ mds->mdlog->flush();
+ mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
});
if (!header->get_recursive()) {
mds->scrubstack->enqueue_inode_top(in, header,
- new MDSInternalContextWrapper(mds,
- expiry_fin));
+ new MDSInternalContextWrapper(mds, scrub_finish));
} else {
mds->scrubstack->enqueue_inode_bottom(in, header,
- new MDSInternalContextWrapper(mds,
- expiry_fin));
+ new MDSInternalContextWrapper(mds, scrub_finish));
}
mds->server->respond_to_request(mdr, 0);
public:
bool is_subtrees() { return !subtrees.empty(); }
void list_subtrees(list<CDir*>& ls);
- void adjust_subtree_auth(CDir *root, mds_authority_t auth);
+ void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
adjust_subtree_auth(root, mds_authority_t(a,b));
}
}
void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
void try_subtree_merge(CDir *root);
- void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval);
+ void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
void eval_subtree_root(CInode *diri);
CDir *get_subtree_root(CDir *dir);
void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
snapid_t ofirst, snapid_t last,
CInode *pin, bool cow_head);
- void broadcast_quota_to_client(CInode *in);
+ void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1);
void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
CInode *in, CDir *parent,
int flags, int linkunlink=0,
map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
map<client_t,entity_inst_t> rejoin_client_map;
+ map<client_t,pair<Session*,uint64_t> > rejoin_session_map;
- map<inodeno_t,map<client_t,cap_reconnect_t> > cap_exports; // ino -> client -> capex
- map<inodeno_t,mds_rank_t> cap_export_targets; // ino -> auth mds
+ map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex
map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex
set<inodeno_t> cap_imports_missing;
void rejoin_send_rejoins();
void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
int target=-1) {
- cap_exports[ino][client] = icr;
- cap_export_targets[ino] = target;
+ auto& ex = cap_exports[ino];
+ ex.first = target;
+ ex.second[client] = icr;
}
void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
mds_rank_t frommds=MDS_RANK_NONE) {
cap_imports[ino][client][frommds] = icr;
}
+ void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
+ rejoin_client_map.emplace(client, inst);
+ }
const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
if (cap_imports.count(ino) &&
cap_imports[ino].count(client) &&
friend class C_MDC_RejoinOpenInoFinish;
friend class C_MDC_RejoinSessionsOpened;
void rejoin_open_ino_finish(inodeno_t ino, int ret);
- void rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
- map<client_t,uint64_t>& sseqmap);
+ void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
bool process_imported_caps();
void choose_lock_states_and_reconnect_caps();
void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
void open_root_inode(MDSInternalContextBase *c);
void open_root();
void open_mydir_inode(MDSInternalContextBase *c);
+ void open_mydir_frag(MDSInternalContextBase *c);
void populate_mydir();
void _create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin);
}
if (waiting->empty()) {
put(PIN_WAITER);
- waiting.release();
+ waiting.reset();
}
}
void finish_waiting(uint64_t mask, int result = 0);
asok_hook,
"dump metadata cache for subtree");
assert(r == 0);
+ r = admin_socket->register_command("dump loads",
+ "dump loads",
+ asok_hook,
+ "dump metadata loads");
+ assert(r == 0);
r = admin_socket->register_command("session evict",
"session evict name=client_id,type=CephString",
asok_hook,
admin_socket->unregister_command("dump cache");
admin_socket->unregister_command("cache status");
admin_socket->unregister_command("dump tree");
+ admin_socket->unregister_command("dump loads");
admin_socket->unregister_command("session evict");
admin_socket->unregister_command("osdmap barrier");
admin_socket->unregister_command("session ls");
"mds_max_purge_ops",
"mds_max_purge_ops_per_pg",
"mds_max_purge_files",
+ "mds_inject_migrator_session_race",
"clog_to_graylog",
"clog_to_graylog_host",
"clog_to_graylog_port",
bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
- bool& is_valid, CryptoKey& session_key)
+ bool& is_valid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
Mutex::Locker l(mds_lock);
if (stopping) {
is_valid = authorize_handler->verify_authorizer(
cct, keys,
authorizer_data, authorizer_reply, name, global_id, caps_info,
- session_key);
+ session_key, nullptr, challenge);
} else {
dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
is_valid = false;
dout(10) << " new session " << s << " for " << s->info.inst << " con " << con << dendl;
con->set_priv(s);
s->connection = con;
+ if (mds_rank) {
+ mds_rank->kick_waiters_for_any_client_connection();
+ }
} else {
dout(10) << " existing session " << s << " for " << s->info.inst << " existing con " << s->connection
<< ", new/authorizing con " << con << dendl;
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
bool ms_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override;
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
void ms_handle_accept(Connection *con) override;
void ms_handle_connect(Connection *con) override;
bool ms_handle_reset(Connection *con) override;
}
// log
- mds_load_t load = balancer->get_load(ceph_clock_now());
-
if (logger) {
- logger->set(l_mds_load_cent, 100 * load.mds_load());
- logger->set(l_mds_dispatch_queue_len, messenger->get_dispatch_queue_len());
logger->set(l_mds_subtrees, mdcache->num_subtrees());
mdcache->log_stat();
{
Session *session = static_cast<Session *>(m->get_connection()->get_priv());
if (session) {
+ session->put(); // do not carry ref
dout(20) << "get_session have " << session << " " << session->info.inst
<< " state " << session->get_state_name() << dendl;
- session->put(); // not carry ref
+ // Check if we've imported an open session since (new sessions start closed)
+ if (session->is_closed()) {
+ Session *imported_session = sessionmap.get_session(session->info.inst.name);
+ if (imported_session && imported_session != session) {
+ dout(10) << __func__ << " replacing connection bootstrap session " << session << " with imported session " << imported_session << dendl;
+ imported_session->info.auth_name = session->info.auth_name;
+ //assert(session->info.auth_name == imported_session->info.auth_name);
+ assert(session->info.inst == imported_session->info.inst);
+ imported_session->connection = session->connection;
+ // send out any queued messages
+ while (!session->preopen_out_queue.empty()) {
+ imported_session->connection->send_message(session->preopen_out_queue.front());
+ session->preopen_out_queue.pop_front();
+ }
+ imported_session->auth_caps = session->auth_caps;
+ assert(session->get_nref() == 1);
+ imported_session->connection->set_priv(imported_session->get());
+ session = imported_session;
+ }
+ }
} else {
dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
}
MDSGatherBuilder gather(g_ceph_context,
new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
- mdcache->open_mydir_inode(gather.new_sub());
+ if (is_starting()) {
+ // load mydir frag for the first log segment (creating subtree map)
+ mdcache->open_mydir_frag(gather.new_sub());
+ } else {
+ mdcache->open_mydir_inode(gather.new_sub());
+ }
- if (is_starting() ||
- whoami == mdsmap->get_root()) { // load root inode off disk if we are auth
- mdcache->open_root_inode(gather.new_sub());
- } else {
- // replay. make up fake root inode to start with
- (void)mdcache->create_root_inode();
- }
+ if (whoami == mdsmap->get_root()) { // load root inode off disk if we are auth
+ mdcache->open_root_inode(gather.new_sub());
+ } else if (is_any_replay()) {
+ // replay. make up fake root inode to start with
+ mdcache->create_root_inode();
+ }
gather.activate();
}
break;
void MDSRank::validate_sessions()
{
assert(mds_lock.is_locked_by_me());
- std::vector<Session*> victims;
+ bool valid = true;
// Identify any sessions which have state inconsistent with other,
// after they have been loaded from rados during startup.
Session *session = i.second;
interval_set<inodeno_t> badones;
if (inotable->intersects_free(session->info.prealloc_inos, &badones)) {
- clog->error() << "Client session loaded with invalid preallocated "
- "inodes, evicting session " << *session;
-
- // Make the session consistent with inotable so that it can
- // be cleanly torn down
- session->info.prealloc_inos.subtract(badones);
-
- victims.push_back(session);
+ clog->error() << "client " << *session
+ << "loaded with preallocated inodes that are inconsistent with inotable";
+ valid = false;
}
}
- for (const auto &session: victims) {
- server->kill_session(session, nullptr);
+ if (!valid) {
+ damaged();
+ assert(valid);
}
}
assert(is_starting());
request_state(MDSMap::STATE_ACTIVE);
- mdcache->open_root();
-
- if (mdcache->is_open()) {
- mdlog->start_new_segment();
- } else {
- mdcache->wait_for_open(new MDSInternalContextWrapper(this,
- new FunctionContext([this] (int r) {
- mdlog->start_new_segment();
- })));
- }
+ mdlog->start_new_segment();
}
{
dout(1) << "active_start" << dendl;
- if (last_state == MDSMap::STATE_CREATING) {
+ if (last_state == MDSMap::STATE_CREATING ||
+ last_state == MDSMap::STATE_STARTING) {
mdcache->open_root();
}
// REJOIN
// is everybody finally rejoining?
- if (is_starting() || is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
+ if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
// did we start?
if (!oldmap->is_rejoining() && mdsmap->is_rejoining())
rejoin_joint_start();
f->reset();
}
}
+ } else if (command == "dump loads") {
+ Mutex::Locker l(mds_lock);
+ int r = balancer->dump_loads(f);
+ if (r != 0) {
+ ss << "Failed to dump loads: " << cpp_strerror(r);
+ f->reset();
+ }
} else if (command == "force_readonly") {
Mutex::Locker l(mds_lock);
mdcache->force_readonly();
void handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed)
{
+ mdcache->migrator->handle_conf_change(conf, changed, *mdsmap);
purge_queue.handle_conf_change(conf, changed, *mdsmap);
}
ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename)
list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
+ list<MDSInternalContextBase*> waiting_for_any_client_connection;
list<MDSInternalContextBase*> replay_queue;
map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
}
+ void wait_for_any_client_connection(MDSInternalContextBase *c) {
+ waiting_for_any_client_connection.push_back(c);
+ }
+ void kick_waiters_for_any_client_connection(void) {
+ finish_contexts(g_ceph_context, waiting_for_any_client_connection);
+ }
void wait_for_active(MDSInternalContextBase *c) {
waiting_for_active.push_back(c);
}
MDSMap *get_mds_map() { return mdsmap; }
- int get_req_rate() const { return logger->get(l_mds_request); }
+ uint64_t get_num_requests() const { return logger->get(l_mds_request); }
int get_mds_slow_req_count() const { return mds_slow_req_count; }
#include "Mutation.h"
#include "include/filepath.h"
+#include "common/likely.h"
#include "events/EExport.h"
#include "events/EImportStart.h"
handle_export_prep(static_cast<MExportDirPrep*>(m));
break;
case MSG_MDS_EXPORTDIR:
- handle_export_dir(static_cast<MExportDir*>(m));
+ if (unlikely(inject_session_race)) {
+ dout(0) << "waiting for inject_session_race" << dendl;
+ mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
+ } else {
+ handle_export_dir(static_cast<MExportDir*>(m));
+ }
break;
case MSG_MDS_EXPORTDIRFINISH:
handle_export_finish(static_cast<MExportDirFinish*>(m));
map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
assert(q != peer_imported.end());
- m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, peer, 0);
+ m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
+ (q->second.cap_id > 0 ? peer : -1), 0);
mds->send_message_client_counted(m, it->first);
}
in->clear_client_caps_after_export();
CDir *dir;
mds_rank_t from;
public:
- map<client_t,entity_inst_t> imported_client_map;
- map<client_t,uint64_t> sseqmap;
+ map<client_t,pair<Session*,uint64_t> > imported_session_map;
C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
}
void finish(int r) override {
- mig->import_logged_start(df, dir, from, imported_client_map, sseqmap);
+ mig->import_logged_start(df, dir, from, imported_session_map);
}
};
// new client sessions, open these after we journal
// include imported sessions in EImportStart
bufferlist::iterator cmp = m->client_map.begin();
- ::decode(onlogged->imported_client_map, cmp);
+ map<client_t,entity_inst_t> client_map;
+ decode(client_map, cmp);
assert(cmp.end());
- le->cmapv = mds->server->prepare_force_open_sessions(onlogged->imported_client_map, onlogged->sseqmap);
- le->client_map.claim(m->client_map);
+ le->cmapv = mds->server->prepare_force_open_sessions(client_map, onlogged->imported_session_map);
+ encode(client_map, le->client_map, mds->mdsmap->get_up_features());
bufferlist::iterator blp = m->export_data.begin();
int num_imported_inodes = 0;
if (stat.state == IMPORT_ACKING) {
// remove imported caps
for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
- p != stat.peer_exports.end();
- ++p) {
+ p != stat.peer_exports.end();
+ ++p) {
CInode *in = p->first;
for (map<client_t,Capability::Export>::iterator q = p->second.begin();
- q != p->second.end();
- ++q) {
+ q != p->second.end();
+ ++q) {
Capability *cap = in->get_client_cap(q->first);
- assert(cap);
+ if (!cap) {
+ assert(!stat.session_map.count(q->first));
+ continue;
+ }
if (cap->is_importing())
in->remove_client_cap(q->first);
}
in->put(CInode::PIN_IMPORTINGCAPS);
}
- for (map<client_t,entity_inst_t>::iterator p = stat.client_map.begin();
- p != stat.client_map.end();
- ++p) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
- assert(session);
+ for (auto& p : stat.session_map) {
+ Session *session = p.second.first;
session->dec_importing();
}
}
void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
- map<client_t,entity_inst_t>& imported_client_map,
- map<client_t,uint64_t>& sseqmap)
+ map<client_t,pair<Session*,uint64_t> >& imported_session_map)
{
map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
if (it == import_state.end() ||
it->second.state != IMPORT_LOGGINGSTART) {
dout(7) << "import " << df << " must have aborted" << dendl;
- mds->server->finish_force_open_sessions(imported_client_map, sseqmap);
+ mds->server->finish_force_open_sessions(imported_session_map);
return;
}
assert (g_conf->mds_kill_import_at != 7);
// force open client sessions and finish cap import
- mds->server->finish_force_open_sessions(imported_client_map, sseqmap, false);
- it->second.client_map.swap(imported_client_map);
+ mds->server->finish_force_open_sessions(imported_session_map, false);
map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
p != it->second.peer_exports.end();
++p) {
// parameter 'peer' is NONE, delay sending cap import messages to client
- finish_import_inode_caps(p->first, MDS_RANK_NONE, true, p->second, imported_caps[p->first->ino()]);
+ finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
+ p->second, imported_caps[p->first->ino()]);
}
+
+ it->second.session_map.swap(imported_session_map);
// send notify's etc.
dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
for (map<client_t,Capability::Export>::iterator q = p->second.begin();
q != p->second.end();
++q) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
- assert(session);
+ auto r = it->second.session_map.find(q->first);
+ if (r == it->second.session_map.end())
+ continue;
+
+ Session *session = r->second.first;
Capability *cap = in->get_client_cap(q->first);
assert(cap);
cap->merge(q->second, true);
p->second.clear();
in->replica_caps_wanted = 0;
}
- for (map<client_t,entity_inst_t>::iterator p = it->second.client_map.begin();
- p != it->second.client_map.end();
- ++p) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
- assert(session);
+ for (auto& p : it->second.session_map) {
+ Session *session = p.second.first;
session->dec_importing();
}
}
assert(!dn->get_linkage()->get_inode());
dn->dir->link_primary_inode(dn, in);
}
+
+ if (in->is_dir())
+ dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);
// add inode?
if (added) {
}
void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
- map<client_t,Capability::Export> &export_map,
+ const map<client_t,pair<Session*,uint64_t> >& session_map,
+ const map<client_t,Capability::Export> &export_map,
map<client_t,Capability::Import> &import_map)
{
- for (map<client_t,Capability::Export>::iterator it = export_map.begin();
- it != export_map.end();
- ++it) {
- dout(10) << "finish_import_inode_caps for client." << it->first << " on " << *in << dendl;
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(it->first.v));
- assert(session);
+ for (auto& it : export_map) {
+ dout(10) << "finish_import_inode_caps for client." << it.first << " on " << *in << dendl;
+
+ auto p = session_map.find(it.first);
+ if (p == session_map.end()) {
+ dout(10) << " no session for client." << it.first << dendl;
+ (void)import_map[it.first];
+ continue;
+ }
- Capability *cap = in->get_client_cap(it->first);
+ Session *session = p->second.first;
+
+ Capability *cap = in->get_client_cap(it.first);
if (!cap) {
- cap = in->add_client_cap(it->first, session);
+ cap = in->add_client_cap(it.first, session);
if (peer < 0)
cap->mark_importing();
}
- Capability::Import& im = import_map[it->first];
+ Capability::Import& im = import_map[it.first];
im.cap_id = cap->get_cap_id();
- im.mseq = auth_cap ? it->second.mseq : cap->get_mseq();
+ im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
im.issue_seq = cap->get_last_seq() + 1;
if (peer >= 0) {
- cap->merge(it->second, auth_cap);
- mds->mdcache->do_cap_import(session, in, cap, it->second.cap_id,
- it->second.seq, it->second.mseq - 1, peer,
+ cap->merge(it.second, auth_cap);
+ mds->mdcache->do_cap_import(session, in, cap, it.second.cap_id,
+ it.second.seq, it.second.mseq - 1, peer,
auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
}
}
CInode *in;
mds_rank_t from;
public:
+ map<client_t,pair<Session*,uint64_t> > imported_session_map;
map<CInode*, map<client_t,Capability::Export> > peer_exports;
- map<client_t,entity_inst_t> client_map;
- map<client_t,uint64_t> sseqmap;
C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
void finish(int r) override {
- mig->logged_import_caps(in, from, peer_exports, client_map, sseqmap);
+ mig->logged_import_caps(in, from, imported_session_map, peer_exports);
}
};
assert(in->is_auth());
// FIXME
- if (!in->can_auth_pin())
+ if (!in->can_auth_pin()) {
+ ex->put();
return;
+ }
+
in->auth_pin(this);
+ map<client_t,entity_inst_t> client_map;
+ client_map.swap(ex->client_map);
+
C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
this, in, mds_rank_t(ex->get_source().num()));
- finish->client_map = ex->client_map;
+ version_t pv = mds->server->prepare_force_open_sessions(client_map,
+ finish->imported_session_map);
// decode new caps
bufferlist::iterator blp = ex->cap_bl.begin();
decode_import_inode_caps(in, false, blp, finish->peer_exports);
assert(!finish->peer_exports.empty()); // thus, inode is pinned.
// journal open client sessions
- version_t pv = mds->server->prepare_force_open_sessions(finish->client_map, finish->sseqmap);
- ESessions *le = new ESessions(pv, ex->client_map);
+ ESessions *le = new ESessions(pv, client_map);
mds->mdlog->start_submit_entry(le, finish);
mds->mdlog->flush();
void Migrator::logged_import_caps(CInode *in,
mds_rank_t from,
- map<CInode*, map<client_t,Capability::Export> >& peer_exports,
- map<client_t,entity_inst_t>& client_map,
- map<client_t,uint64_t>& sseqmap)
+ map<client_t,pair<Session*,uint64_t> >& imported_session_map,
+ map<CInode*, map<client_t,Capability::Export> >& peer_exports)
{
dout(10) << "logged_import_caps on " << *in << dendl;
// see export_go() vs export_go_synced()
assert(in->is_auth());
// force open client sessions and finish cap import
- mds->server->finish_force_open_sessions(client_map, sseqmap);
+ mds->server->finish_force_open_sessions(imported_session_map);
map<client_t,Capability::Import> imported_caps;
- assert(peer_exports.count(in));
+ auto it = peer_exports.find(in);
+ assert(it != peer_exports.end());
+
// clients will release caps from the exporter when they receive the cap import message.
- finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
+ finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
mds->locker->eval(in, CEPH_CAP_LOCKS, true);
in->auth_unpin(this);
}
+
+void Migrator::handle_conf_change(const struct md_config_t *conf,
+ const std::set <std::string> &changed,
+ const MDSMap &mds_map)
+{
+ if (changed.count("mds_inject_migrator_session_race")) {
+ inject_session_race = conf->get_val<bool>("mds_inject_migrator_session_race");
+ dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
+ }
+}
class CDir;
class CInode;
class CDentry;
+class Session;
class MExportDirDiscover;
class MExportDirDiscoverAck;
}
// -- cons --
- Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {}
-
+ Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {
+ inject_session_race = g_conf->get_val<bool>("mds_inject_migrator_session_race");
+ }
+ void handle_conf_change(const struct md_config_t *conf,
+ const std::set <std::string> &changed,
+ const MDSMap &mds_map);
protected:
// export fun
set<mds_rank_t> bystanders;
list<dirfrag_t> bound_ls;
list<ScatterLock*> updated_scatterlocks;
- map<client_t,entity_inst_t> client_map;
+ map<client_t,pair<Session*,uint64_t> > session_map;
map<CInode*, map<client_t,Capability::Export> > peer_exports;
MutationRef mut;
import_state_t() : state(0), peer(0), tid(0), mut() {}
void import_notify_abort(CDir *dir, set<CDir*>& bounds);
void import_notify_finish(CDir *dir, set<CDir*>& bounds);
void import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
- map<client_t,entity_inst_t> &imported_client_map,
- map<client_t,uint64_t>& sseqmap);
+ map<client_t,pair<Session*,uint64_t> >& imported_session_map);
void handle_export_finish(MExportDirFinish *m);
void handle_export_caps(MExportCaps *m);
void logged_import_caps(CInode *in,
mds_rank_t from,
- map<CInode*, map<client_t,Capability::Export> >& cap_imports,
- map<client_t,entity_inst_t>& client_map,
- map<client_t,uint64_t>& sseqmap);
+ map<client_t,pair<Session*,uint64_t> >& imported_session_map,
+ map<CInode*, map<client_t,Capability::Export> >& cap_imports);
friend class C_MDS_ImportDirLoggedStart;
void decode_import_inode_caps(CInode *in, bool auth_cap, bufferlist::iterator &blp,
map<CInode*, map<client_t,Capability::Export> >& cap_imports);
void finish_import_inode_caps(CInode *in, mds_rank_t from, bool auth_cap,
- map<client_t,Capability::Export> &export_map,
+ const map<client_t,pair<Session*,uint64_t> >& smap,
+ const map<client_t,Capability::Export> &export_map,
map<client_t,Capability::Import> &import_map);
int decode_import_dir(bufferlist::iterator& blp,
mds_rank_t oldauth,
private:
MDSRank *mds;
MDCache *cache;
+ bool inject_session_race = false;
};
#endif
bool is_remote_frozen_authpin;
bool is_inode_exporter;
- map<client_t,entity_inst_t> imported_client_map;
- map<client_t,uint64_t> sseq_map;
+ map<client_t, pair<Session*, uint64_t> > imported_session_map;
map<CInode*, map<client_t,Capability::Export> > cap_imports;
// for lock/flock
bool could_consume = false;
while(can_consume()) {
- could_consume = true;
if (delayed_flush) {
// We are now going to read from the journal, so any proactive
return could_consume;
}
+ could_consume = true;
// The journaler is readable: consume an entry
bufferlist bl;
bool readable = journaler.try_read_entry(bl);
bool wait_for_active = true;
if (mds->is_stopping()) {
- if (m->get_source().is_mds())
- wait_for_active = false;
+ wait_for_active = false;
} else if (mds->is_clientreplay()) {
if (req->is_queued_for_replay()) {
wait_for_active = false;
if (session->is_opening() ||
session->is_open() ||
session->is_stale() ||
- session->is_killing()) {
+ session->is_killing() ||
+ terminating_sessions) {
dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
// set client metadata for session opened by prepare_force_open_sessions
if (!m->client_meta.empty())
});
if (blacklisted) {
- dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
+ dout(10) << "rejecting blacklisted client " << session->info.inst.addr << dendl;
+ mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
m->put();
return;
}
* - sessions learned from other MDSs during a cross-MDS rename
*/
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
- map<client_t,uint64_t>& sseqmap)
+ map<client_t, pair<Session*,uint64_t> >& smap)
{
version_t pv = mds->sessionmap.get_projected();
dout(10) << "prepare_force_open_sessions " << pv
<< " on " << cm.size() << " clients"
<< dendl;
- for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
+ mds->objecter->with_osdmap(
+ [this, &cm](const OSDMap &osd_map) {
+ for (auto p = cm.begin(); p != cm.end(); ) {
+ if (osd_map.is_blacklisted(p->second.addr)) {
+ dout(10) << " ignoring blacklisted client." << p->first
+ << " (" << p->second.addr << ")" << dendl;
+ cm.erase(p++);
+ } else {
+ ++p;
+ }
+ }
+ });
+
+ for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
Session *session = mds->sessionmap.get_or_add_session(p->second);
pv = mds->sessionmap.mark_projected(session);
+ uint64_t sseq;
if (session->is_closed() ||
session->is_closing() ||
- session->is_killing())
- sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
- else
+ session->is_killing()) {
+ sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+ } else {
assert(session->is_open() ||
session->is_opening() ||
session->is_stale());
+ sseq = 0;
+ }
+ smap[p->first] = make_pair(session, sseq);
session->inc_importing();
}
return pv;
}
-void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
- map<client_t,uint64_t>& sseqmap,
+void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
bool dec_import)
{
/*
* client trying to close a session and an MDS doing an import
* trying to force open a session...
*/
- dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
+ dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
<< " initial v " << mds->sessionmap.get_version() << dendl;
-
-
- for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
- Session *session = mds->sessionmap.get_session(p->second.name);
- assert(session);
-
- if (sseqmap.count(p->first)) {
- uint64_t sseq = sseqmap[p->first];
+ for (auto &it : smap) {
+ Session *session = it.second.first;
+ uint64_t sseq = it.second.second;
+ if (sseq > 0) {
if (session->get_state_seq() != sseq) {
dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
} else {
void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
{
reconnect_done = reconnect_done_;
- mds->sessionmap.get_client_set(client_reconnect_gather);
+
+ set<Session*> sessions;
+ mds->sessionmap.get_client_session_set(sessions);
+ for (auto session : sessions) {
+ if (session->is_open())
+ client_reconnect_gather.insert(session->get_client());
+ }
if (client_reconnect_gather.empty()) {
dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
<< ") from " << m->get_source_inst()
<< " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
deny = true;
- } else if (session->is_closed()) {
+ } else if (!session->is_open()) {
dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
mds->clog->info() << "denied reconnect attempt (mds is "
<< ceph_mds_state_name(mds->get_state())
mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
}
}
+ mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
// remove from gather set
client_reconnect_gather.erase(from);
} else {
mdcache->request_finish(mdr);
}
+ m->put();
return;
}
}
return;
}
- CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
+ bool want_auth = false;
+ int mask = req->head.args.getattr.mask;
+ if (mask & CEPH_STAT_RSTAT)
+ want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
+
+ CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, want_auth, false, NULL,
+ !is_lookup);
if (!ref) return;
/*
mdr->snapid <= cap->client_follows))
issued = cap->issued();
- int mask = req->head.args.getattr.mask;
if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
rdlocks.insert(&ref->linklock);
if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
if (!check_access(mdr, ref, MAY_READ))
return;
+ utime_t now = ceph_clock_now();
+ mdr->set_mds_stamp(now);
+
// note which caps are requested, so we return at least a snapshot
// value for them. (currently this matters for xattrs and inline data)
mdr->getattr_caps = mask;
- mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
+ mds->balancer->hit_inode(now, ref, META_POP_IRD,
req->get_source().num());
// reply
if (!check_access(mdr, cur, mask))
return;
+ utime_t now = ceph_clock_now();
+ mdr->set_mds_stamp(now);
+
if (cur->is_file() || cur->is_dir()) {
if (mdr->snapid == CEPH_NOSNAP) {
// register new cap
// hit pop
if (cmode & CEPH_FILE_MODE_WR)
- mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
+ mds->balancer->hit_inode(now, cur, META_POP_IWR);
else
- mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
+ mds->balancer->hit_inode(now, cur, META_POP_IRD,
mdr->client_request->get_source().num());
CDentry *dn = 0;
// dirty inode, dn, dir
newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
newi->mark_dirty(newi->inode.version+1, mdr->ls);
- newi->_mark_dirty_parent(mdr->ls, true);
+ newi->mark_dirty_parent(mdr->ls, true);
mdr->apply();
MDRequestRef null_ref;
get_mds()->mdcache->send_dentry_link(dn, null_ref);
- get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
server->respond_to_request(mdr, 0);
get_mds()->mdcache->truncate_inode(in, mdr->ls);
}
- get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
server->respond_to_request(mdr, 0);
}
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
// log + wait
// prepare
auto &pi = in->project_inode();
pi.inode.version = in->pre_dirty();
- pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.mtime = pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
// add the old pool to the inode
pi.inode.add_old_pool(old_layout.pool_id);
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
// log + wait
int64_t old_pool = pi.inode.layout.pool_id;
pi.inode.add_old_pool(old_pool);
pi.inode.layout = layout;
- pi.inode.ctime = mdr->get_op_stamp();
pip = &pi.inode;
} else if (name.compare(0, 10, "ceph.quota") == 0) {
if (!cur->is_dir() || cur->is_root()) {
mdr->no_early_reply = true;
pip = &pi.inode;
+
+ client_t exclude_ct = mdr->get_client();
+ mdcache->broadcast_quota_to_client(cur, exclude_ct);
} else if (name.find("ceph.dir.pin") == 0) {
if (!cur->is_dir() || cur->is_root()) {
respond_to_request(mdr, -EINVAL);
}
pip->change_attr++;
- pip->ctime = mdr->get_op_stamp();
+ pip->ctime = pip->rstat.rctime = mdr->get_op_stamp();
pip->version = cur->pre_dirty();
if (cur->is_file())
pip->update_backtrace();
mdr->apply();
- get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
server->respond_to_request(mdr, 0);
}
// project update
auto &pi = cur->project_inode(true);
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.xattr_version++;
auto &px = *pi.xattrs;
auto &pi = cur->project_inode(true);
auto &px = *pi.xattrs;
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.xattr_version++;
px.erase(mempool::mds_co::string(boost::string_view(name)));
// a new version of hte inode since it's just been created)
newi->inode.version--;
newi->mark_dirty(newi->inode.version + 1, mdr->ls);
- newi->_mark_dirty_parent(mdr->ls, true);
+ newi->mark_dirty_parent(mdr->ls, true);
// mkdir?
if (newi->inode.is_dir()) {
get_mds()->locker->share_inode_max_size(newi);
// hit pop
- get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
// reply
server->respond_to_request(mdr, 0);
// project inode update
auto &pi = targeti->project_inode();
pi.inode.nlink++;
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.version = tipv;
mdcache->send_dentry_link(dn, null_ref);
// bump target popularity
- mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
- mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ mds->balancer->hit_inode(now, targeti, META_POP_IWR);
+ mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
// reply
respond_to_request(mdr, 0);
mdcache->send_dentry_unlink(dn, NULL, null_ref);
// bump target popularity
- mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
- mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ mds->balancer->hit_inode(now, targeti, META_POP_IWR);
+ mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
// reply
respond_to_request(mdr, 0);
mdr->apply();
// hit pop
- mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ mds->balancer->hit_inode(now, targeti, META_POP_IWR);
// done.
mdr->slave_request->put();
}
// inode
- pi.inode.ctime = rollback.old_ctime;
+ pi.inode.ctime = pi.inode.rstat.rctime = rollback.old_ctime;
if (rollback.was_inc)
pi.inode.nlink--;
else
}
mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
pi.inode.version = in->pre_dirty();
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.nlink--;
if (pi.inode.nlink == 0)
mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
// bump pop
- mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
// reply
respond_to_request(mdr, 0);
assert(g_conf->mds_kill_rename_at != 6);
// bump popularity
- mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
if (destdnl->is_remote() && in->is_auth())
- mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
+ mds->balancer->hit_inode(now, in, META_POP_IWR);
// did we import srci? if so, explicitly ack that import that, before we unlock and reply.
bufferlist::iterator blp = mdr->more()->inode_import.begin();
// imported caps
- ::decode(mdr->more()->imported_client_map, blp);
- ::encode(mdr->more()->imported_client_map, *client_map_bl,
- mds->mdsmap->get_up_features());
- prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
+ map<client_t,entity_inst_t> client_map;
+ decode(client_map, blp);
+ prepare_force_open_sessions(client_map, mdr->more()->imported_session_map);
+ encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
list<ScatterLock*> updated_scatterlocks;
mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
if (!silent) {
if (spi) {
- spi->ctime = mdr->get_op_stamp();
+ spi->ctime = spi->rstat.rctime = mdr->get_op_stamp();
spi->change_attr++;
if (linkmerge)
spi->nlink--;
}
if (tpi) {
- tpi->ctime = mdr->get_op_stamp();
+ tpi->ctime = tpi->rstat.rctime = mdr->get_op_stamp();
tpi->change_attr++;
{
std::string t;
map<client_t,Capability::Import> imported_caps;
// finish cap imports
- finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
+ finish_force_open_sessions(mdr->more()->imported_session_map);
if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
- mdr->more()->srcdn_auth_mds, true,
- mdr->more()->cap_imports[destdnl->get_inode()],
- imported_caps);
+ mdr->more()->srcdn_auth_mds, true,
+ mdr->more()->imported_session_map,
+ mdr->more()->cap_imports[destdnl->get_inode()],
+ imported_caps);
}
mdr->more()->inode_import.clear();
destdnl = destdn->get_linkage();
// bump popularity
- mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
+ utime_t now = ceph_clock_now();
+ mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
- mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
- META_POP_IWR);
+ mds->balancer->hit_inode(now, destdnl->get_inode(), META_POP_IWR);
// done.
mdr->slave_request->put();
::decode(peer_imported, bp);
dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
- mdcache->migrator->finish_export_inode(destdnl->get_inode(),
- mdr->get_mds_stamp(),
+ mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
mdr->slave_to_mds, peer_imported, finished);
mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
} else
pip = in->get_projected_inode();
if (pip->ctime == rollback.ctime)
- pip->ctime = rollback.orig_src.old_ctime;
+ pip->ctime = pip->rstat.rctime = rollback.orig_src.old_ctime;
}
if (srcdn && srcdn->authority().first == whoami) {
} else
ti = target->get_projected_inode();
if (ti->ctime == rollback.ctime)
- ti->ctime = rollback.orig_dest.old_ctime;
+ ti->ctime = ti->rstat.rctime = rollback.orig_dest.old_ctime;
if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
info.stamp = mdr->get_op_stamp();
auto &pi = diri->project_inode(false, true);
- pi.inode.ctime = info.stamp;
+ pi.inode.ctime = pi.inode.rstat.rctime = info.stamp;
pi.inode.version = diri->pre_dirty();
// project the snaprealm
// journal
auto &pi = diri->project_inode(false, true);
pi.inode.version = diri->pre_dirty();
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "rmsnap");
// journal
auto &pi = diri->project_inode(false, true);
- pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.version = diri->pre_dirty();
// project the snaprealm
void _session_logged(Session *session, uint64_t state_seq,
bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
version_t prepare_force_open_sessions(map<client_t,entity_inst_t> &cm,
- map<client_t,uint64_t>& sseqmap);
- void finish_force_open_sessions(map<client_t,entity_inst_t> &cm,
- map<client_t,uint64_t>& sseqmap,
+ map<client_t,pair<Session*,uint64_t> >& smap);
+ void finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
bool dec_import=true);
void flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather);
void finish_flush_session(Session *session, version_t seq);
num_trim_flushes_warnings(0),
num_trim_requests_warnings(0) { }
~Session() override {
- assert(!item_session_list.is_on_list());
+ if (state == STATE_CLOSED) {
+ item_session_list.remove_myself();
+ } else {
+ assert(!item_session_list.is_on_list());
+ }
while (!preopen_out_queue.empty()) {
preopen_out_queue.front()->put();
preopen_out_queue.pop_front();
void dump();
- void get_client_set(set<client_t>& s) {
- for (ceph::unordered_map<entity_name_t,Session*>::iterator p = session_map.begin();
- p != session_map.end();
- ++p)
- if (p->second->info.inst.name.is_client())
- s.insert(p->second->info.inst.name.num());
- }
void get_client_session_set(set<Session*>& s) const {
for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
p != session_map.end();
s.insert(p->second);
}
- void open_sessions(map<client_t,entity_inst_t>& client_map) {
+ void replay_open_sessions(map<client_t,entity_inst_t>& client_map) {
for (map<client_t,entity_inst_t>::iterator p = client_map.begin();
p != client_map.end();
++p) {
Session *s = get_or_add_session(p->second);
set_state(s, Session::STATE_OPEN);
- version++;
+ replay_dirty_session(s);
}
}
if (p->is_dirty())
in->_mark_dirty(logseg);
if (p->is_dirty_parent())
- in->_mark_dirty_parent(logseg, p->is_dirty_pool());
+ in->mark_dirty_parent(logseg, p->is_dirty_pool());
if (p->need_snapflush())
logseg->open_files.push_back(&in->item_open_file);
if (dn->is_auth())
} else {
dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
<< " < " << cmapv << dendl;
- mds->sessionmap.open_sessions(client_map);
+ mds->sessionmap.replay_open_sessions(client_map);
assert(mds->sessionmap.get_version() == cmapv);
- mds->sessionmap.set_projected(mds->sessionmap.get_version());
}
update_segment();
}
map<client_t,entity_inst_t> cm;
bufferlist::iterator blp = client_map.begin();
::decode(cm, blp);
- mds->sessionmap.open_sessions(cm);
-
+ mds->sessionmap.replay_open_sessions(cm);
assert(mds->sessionmap.get_version() == cmapv);
- mds->sessionmap.set_projected(mds->sessionmap.get_version());
}
}
update_segment();
map<client_t,entity_inst_t> cm;
bufferlist::iterator blp = client_map.begin();
::decode(cm, blp);
- mds->sessionmap.open_sessions(cm);
+ mds->sessionmap.replay_open_sessions(cm);
if (mds->sessionmap.get_version() != cmapv)
{
derr << "sessionmap version " << mds->sessionmap.get_version()
mds->damaged();
ceph_abort(); // Should be unreachable because damaged() calls respawn()
}
- mds->sessionmap.set_projected(mds->sessionmap.get_version());
}
update_segment();
}
f->close_section();
}
+void dirfrag_load_vec_t::dump(Formatter *f, utime_t now, const DecayRate& rate)
+{
+ f->dump_float("meta_load", meta_load(now, rate));
+ f->dump_float("IRD", get(META_POP_IRD).get(now, rate));
+ f->dump_float("IWR", get(META_POP_IWR).get(now, rate));
+ f->dump_float("READDIR", get(META_POP_READDIR).get(now, rate));
+ f->dump_float("FETCH", get(META_POP_FETCH).get(now, rate));
+ f->dump_float("STORE", get(META_POP_STORE).get(now, rate));
+}
+
void dirfrag_load_vec_t::generate_test_instances(list<dirfrag_load_vec_t*>& ls)
{
utime_t sample;
decode(sample, p);
}
void dump(Formatter *f) const;
+ void dump(Formatter *f, utime_t now, const DecayRate& rate);
static void generate_test_instances(list<dirfrag_load_vec_t*>& ls);
DecayCounter &get(int t) {
2*vec[META_POP_FETCH].get(now, rate) +
4*vec[META_POP_STORE].get(now, rate);
}
- double meta_load() {
+ double meta_load() const {
return
1*vec[META_POP_IRD].get_last() +
2*vec[META_POP_IWR].get_last() +
c.decode(sample, p);
}
-inline std::ostream& operator<<(std::ostream& out, dirfrag_load_vec_t& dl)
+inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
{
- // ugliness!
- utime_t now = ceph_clock_now();
- DecayRate rate(g_conf->mds_decay_halflife);
- return out << "[" << dl.vec[0].get(now, rate) << "," << dl.vec[1].get(now, rate)
- << " " << dl.meta_load(now, rate)
- << "]";
+ return out << "[" << dl.vec[0].get_last() << "," << dl.vec[1].get_last()
+ << " " << dl.meta_load() << "]";
}
c.decode(sample, p);
}
-inline std::ostream& operator<<( std::ostream& out, mds_load_t& load )
+inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
{
return out << "mdsload<" << load.auth << "/" << load.all
<< ", req " << load.req_rate
#define CLIENT_CAPS_SYNC (0x1)
class MClientCaps : public Message {
- static const int HEAD_VERSION = 10;
+ static const int HEAD_VERSION = 11;
static const int COMPAT_VERSION = 1;
public:
struct ceph_mds_caps_head head;
- uint64_t size, max_size, truncate_size, change_attr;
- uint32_t truncate_seq;
+ uint64_t size = 0;
+ uint64_t max_size = 0;
+ uint64_t truncate_size = 0;
+ uint64_t change_attr = 0;
+ uint32_t truncate_seq = 0;
utime_t mtime, atime, ctime, btime;
- uint32_t time_warp_seq;
+ uint32_t time_warp_seq = 0;
+ int64_t nfiles = -1; // files in dir
+ int64_t nsubdirs = -1; // subdirs in dir
struct ceph_mds_cap_peer peer;
bufferlist snapbl;
bufferlist xattrbl;
bufferlist flockbl;
- version_t inline_version;
+ version_t inline_version = 0;
bufferlist inline_data;
// Receivers may not use their new caps until they have this OSD map
- epoch_t osd_epoch_barrier;
- ceph_tid_t oldest_flush_tid;
- uint32_t caller_uid;
- uint32_t caller_gid;
+ epoch_t osd_epoch_barrier = 0;
+ ceph_tid_t oldest_flush_tid = 0;
+ uint32_t caller_uid = 0;
+ uint32_t caller_gid = 0;
/* advisory CLIENT_CAPS_* flags to send to mds */
- unsigned flags;
+ unsigned flags = 0;
int get_caps() { return head.caps; }
int get_wanted() { return head.wanted; }
utime_t get_atime() { return atime; }
__u64 get_change_attr() { return change_attr; }
__u32 get_time_warp_seq() { return time_warp_seq; }
+ uint64_t get_nfiles() { return nfiles; }
+ uint64_t get_nsubdirs() { return nsubdirs; }
+ bool dirstat_is_valid() const { return nfiles != -1 || nsubdirs != -1; }
const file_layout_t& get_layout() {
return layout;
void clear_dirty() { head.dirty = 0; }
MClientCaps()
- : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
- size(0),
- max_size(0),
- truncate_size(0),
- change_attr(0),
- truncate_seq(0),
- time_warp_seq(0),
- osd_epoch_barrier(0),
- oldest_flush_tid(0),
- caller_uid(0), caller_gid(0),
- flags(0) {
- inline_version = 0;
- }
+ : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION) {}
MClientCaps(int op,
inodeno_t ino,
inodeno_t realm,
int mseq,
epoch_t oeb)
: Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
- size(0),
- max_size(0),
- truncate_size(0),
- change_attr(0),
- truncate_seq(0),
- time_warp_seq(0),
- osd_epoch_barrier(oeb),
- oldest_flush_tid(0),
- caller_uid(0), caller_gid(0),
- flags(0) {
+ osd_epoch_barrier(oeb) {
memset(&head, 0, sizeof(head));
head.op = op;
head.ino = ino;
head.dirty = dirty;
head.migrate_seq = mseq;
memset(&peer, 0, sizeof(peer));
- inline_version = 0;
}
MClientCaps(int op,
inodeno_t ino, inodeno_t realm,
uint64_t id, int mseq, epoch_t oeb)
: Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
- size(0),
- max_size(0),
- truncate_size(0),
- change_attr(0),
- truncate_seq(0),
- time_warp_seq(0),
- osd_epoch_barrier(oeb),
- oldest_flush_tid(0),
- caller_uid(0), caller_gid(0),
- flags(0) {
+ osd_epoch_barrier(oeb) {
memset(&head, 0, sizeof(head));
head.op = op;
head.ino = ino;
head.cap_id = id;
head.migrate_seq = mseq;
memset(&peer, 0, sizeof(peer));
- inline_version = 0;
}
private:
file_layout_t layout;
if (header.version >= 10) {
::decode(flags, p);
}
+ if (header.version >= 11) {
+ decode(nfiles, p);
+ decode(nsubdirs, p);
+ }
}
void encode_payload(uint64_t features) override {
header.version = HEAD_VERSION;
::encode(btime, payload);
::encode(change_attr, payload);
::encode(flags, payload);
+ ::encode(nfiles, payload);
+ ::encode(nsubdirs, payload);
}
};
class MHeartbeat : public Message {
mds_load_t load;
- __s32 beat;
+ __s32 beat = 0;
map<mds_rank_t, float> import_map;
public:
MMDSMap() :
Message(CEPH_MSG_MDS_MAP, HEAD_VERSION, COMPAT_VERSION) {}
- MMDSMap(const uuid_d &f, MDSMap *mm) :
+ MMDSMap(const uuid_d &f, const MDSMap *mm) :
Message(CEPH_MSG_MDS_MAP, HEAD_VERSION, COMPAT_VERSION),
fsid(f) {
epoch = mm->get_epoch();
public:
uuid_d fsid;
+ uint64_t encode_features = 0;
map<epoch_t, bufferlist> maps;
map<epoch_t, bufferlist> incremental_maps;
epoch_t oldest_map =0, newest_map = 0;
MOSDMap() : Message(CEPH_MSG_OSD_MAP, HEAD_VERSION) { }
- MOSDMap(const uuid_d &f)
+ MOSDMap(const uuid_d &f, const uint64_t features)
: Message(CEPH_MSG_OSD_MAP, HEAD_VERSION),
- fsid(f),
+ fsid(f), encode_features(features),
oldest_map(0), newest_map(0) { }
private:
~MOSDMap() override {}
-
public:
// marshalling
void decode_payload() override {
void encode_payload(uint64_t features) override {
header.version = HEAD_VERSION;
::encode(fsid, payload);
- if ((features & CEPH_FEATURE_PGID64) == 0 ||
- (features & CEPH_FEATURE_PGPOOL3) == 0 ||
- (features & CEPH_FEATURE_OSDENC) == 0 ||
- (features & CEPH_FEATURE_OSDMAP_ENC) == 0 ||
- (features & CEPH_FEATURE_MSG_ADDR2) == 0 ||
- !HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ if (OSDMap::get_significant_features(encode_features) !=
+ OSDMap::get_significant_features(features)) {
if ((features & CEPH_FEATURE_PGID64) == 0 ||
(features & CEPH_FEATURE_PGPOOL3) == 0)
header.version = 1; // old old_client version
OSDMap::Incremental inc;
bufferlist::iterator q = p->second.begin();
inc.decode(q);
+ // always encode with subset of osdmaps canonical features
+ uint64_t f = inc.encode_features & features;
p->second.clear();
if (inc.fullmap.length()) {
// embedded full map?
OSDMap m;
m.decode(inc.fullmap);
inc.fullmap.clear();
- m.encode(inc.fullmap, features | CEPH_FEATURE_RESERVED);
+ m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
}
if (inc.crush.length()) {
// embedded crush map
auto p = inc.crush.begin();
c.decode(p);
inc.crush.clear();
- c.encode(inc.crush, features);
+ c.encode(inc.crush, f);
}
- inc.encode(p->second, features | CEPH_FEATURE_RESERVED);
+ inc.encode(p->second, f | CEPH_FEATURE_RESERVED);
}
for (map<epoch_t,bufferlist>::iterator p = maps.begin();
p != maps.end();
++p) {
OSDMap m;
m.decode(p->second);
+ // always encode with subset of osdmaps canonical features
+ uint64_t f = m.get_encoding_features() & features;
p->second.clear();
- m.encode(p->second, features | CEPH_FEATURE_RESERVED);
+ m.encode(p->second, f | CEPH_FEATURE_RESERVED);
}
}
::encode(incremental_maps, payload);
struct MOSDRepScrub : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 7;
+ static const int HEAD_VERSION = 9;
static const int COMPAT_VERSION = 6;
spg_t pgid; // PG to scrub
hobject_t start; // lower bound of scrub, inclusive
hobject_t end; // upper bound of scrub, exclusive
bool deep; // true if scrub should be deep
- uint32_t seed; // seed value for digest calculation
+ bool allow_preemption = false;
+ int32_t priority = 0;
+ bool high_priority = false;
epoch_t get_map_epoch() const override {
return map_epoch;
MOSDRepScrub()
: MOSDFastDispatchOp(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION),
chunky(false),
- deep(false),
- seed(0) { }
+ deep(false) { }
MOSDRepScrub(spg_t pgid, eversion_t scrub_to, epoch_t map_epoch, epoch_t min_epoch,
- hobject_t start, hobject_t end, bool deep, uint32_t seed)
+ hobject_t start, hobject_t end, bool deep,
+ bool preemption, int prio, bool highprio)
: MOSDFastDispatchOp(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION),
pgid(pgid),
scrub_to(scrub_to),
start(start),
end(end),
deep(deep),
- seed(seed) { }
+ allow_preemption(preemption),
+ priority(prio),
+ high_priority(highprio) { }
private:
public:
const char *get_type_name() const override { return "replica scrub"; }
void print(ostream& out) const override {
- out << "replica scrub(pg: ";
- out << pgid << ",from:" << scrub_from << ",to:" << scrub_to
+ out << "replica_scrub(pg: " << pgid
+ << ",from:" << scrub_from
+ << ",to:" << scrub_to
<< ",epoch:" << map_epoch << "/" << min_epoch
<< ",start:" << start << ",end:" << end
<< ",chunky:" << chunky
<< ",deep:" << deep
- << ",seed:" << seed
- << ",version:" << header.version;
- out << ")";
+ << ",version:" << header.version
+ << ",allow_preemption:" << (int)allow_preemption
+ << ",priority=" << priority
+ << (high_priority ? " (high)":"")
+ << ")";
}
void encode_payload(uint64_t features) override {
::encode(end, payload);
::encode(deep, payload);
::encode(pgid.shard, payload);
- ::encode(seed, payload);
+ ::encode((uint32_t)-1, payload); // seed
::encode(min_epoch, payload);
+ ::encode(allow_preemption, payload);
+ ::encode(priority, payload);
+ ::encode(high_priority, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
::decode(end, p);
::decode(deep, p);
::decode(pgid.shard, p);
- ::decode(seed, p);
+ {
+ uint32_t seed;
+ ::decode(seed, p);
+ }
if (header.version >= 7) {
::decode(min_epoch, p);
} else {
min_epoch = map_epoch;
}
+ if (header.version >= 8) {
+ ::decode(allow_preemption, p);
+ }
+ if (header.version >= 9) {
+ ::decode(priority, p);
+ ::decode(high_priority, p);
+ }
}
};
struct MOSDRepScrubMap : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
spg_t pgid; // primary spg_t
epoch_t map_epoch = 0;
pg_shard_t from; // whose scrubmap this is
bufferlist scrub_map_bl;
+ bool preempted = false;
epoch_t get_map_epoch() const override {
return map_epoch;
const char *get_type_name() const { return "rep_scrubmap"; }
void print(ostream& out) const {
out << "rep_scrubmap(" << pgid << " e" << map_epoch
- << " from shard " << from << ")";
+ << " from shard " << from
+ << (preempted ? " PREEMPTED":"") << ")";
}
void encode_payload(uint64_t features) {
::encode(pgid, payload);
::encode(map_epoch, payload);
::encode(from, payload);
+ ::encode(preempted, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
::decode(pgid, p);
::decode(map_epoch, p);
::decode(from, p);
+ if (header.version >= 2) {
+ ::decode(preempted, p);
+ }
}
};
f.dump_int(i.first.c_str(), i.second);
}
f.close_section();
+ f.open_object_section("pg_stats_sum");
+ pg_map.pg_sum.dump(&f);
+ f.close_section();
}
);
return f.get();
Mutex::Locker l2(metadata->lock);
if (metadata->perf_counters.instances.count(path)) {
auto counter_instance = metadata->perf_counters.instances.at(path);
- const auto &data = counter_instance.get_data();
- for (const auto &datapoint : data) {
- f.open_array_section("datapoint");
- f.dump_unsigned("t", datapoint.t.sec());
- f.dump_unsigned("v", datapoint.v);
- f.close_section();
-
+ auto counter_type = metadata->perf_counters.types.at(path);
+ if (counter_type.type & PERFCOUNTER_LONGRUNAVG) {
+ const auto &avg_data = counter_instance.get_data_avg();
+ for (const auto &datapoint : avg_data) {
+ f.open_array_section("datapoint");
+ f.dump_unsigned("t", datapoint.t.sec());
+ f.dump_unsigned("s", datapoint.s);
+ f.dump_unsigned("c", datapoint.c);
+ f.close_section();
+ }
+ } else {
+ const auto &data = counter_instance.get_data();
+ for (const auto &datapoint : data) {
+ f.open_array_section("datapoint");
+ f.dump_unsigned("t", datapoint.t.sec());
+ f.dump_unsigned("v", datapoint.v);
+ f.close_section();
+ }
}
} else {
dout(4) << "Missing counter: '" << path << "' ("
}
-bool DaemonServer::ms_verify_authorizer(Connection *con,
- int peer_type,
- int protocol,
- ceph::bufferlist& authorizer_data,
- ceph::bufferlist& authorizer_reply,
- bool& is_valid,
- CryptoKey& session_key)
+bool DaemonServer::ms_verify_authorizer(
+ Connection *con,
+ int peer_type,
+ int protocol,
+ ceph::bufferlist& authorizer_data,
+ ceph::bufferlist& authorizer_reply,
+ bool& is_valid,
+ CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
AuthAuthorizeHandler *handler = nullptr;
if (peer_type == CEPH_ENTITY_TYPE_OSD ||
authorizer_data,
authorizer_reply, s->entity_name,
s->global_id, caps_info,
- session_key);
+ session_key,
+ nullptr,
+ challenge);
} else {
dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
is_valid = false;
bool ms_handle_refused(Connection *con) override;
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer,
bool force_new) override;
- bool ms_verify_authorizer(Connection *con,
- int peer_type,
- int protocol,
- ceph::bufferlist& authorizer,
- ceph::bufferlist& authorizer_reply,
- bool& isvalid,
- CryptoKey& session_key) override;
+ bool ms_verify_authorizer(
+ Connection *con,
+ int peer_type,
+ int protocol,
+ ceph::bufferlist& authorizer,
+ ceph::bufferlist& authorizer_reply,
+ bool& isvalid,
+ CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
bool handle_open(MMgrOpen *m);
bool handle_report(MMgrReport *m);
for (const auto &t : report->declare_types) {
types.insert(std::make_pair(t.path, t));
session->declared_types.insert(t.path);
+ instances.insert(std::pair<std::string, PerfCounterInstance>(
+ t.path, PerfCounterInstance(t.type)));
}
// Remove any old types
for (const auto &t : report->undeclare_types) {
if (t.type & PERFCOUNTER_LONGRUNAVG) {
::decode(avgcount, p);
::decode(avgcount2, p);
+ instances.at(t_path).push_avg(now, val, avgcount);
+ } else {
+ instances.at(t_path).push(now, val);
}
- // TODO: interface for insertion of avgs
- instances[t_path].push(now, val);
}
DECODE_FINISH(p);
}
buffer.push_back({t, v});
}
+void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s,
+ uint64_t const &c)
+{
+ avg_buffer.push_back({t, s, c});
+}
{}
};
+ class AvgDataPoint
+ {
+ public:
+ utime_t t;
+ uint64_t s;
+ uint64_t c;
+ AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_)
+ : t(t_), s(s_), c(c_)
+ {}
+ };
+
boost::circular_buffer<DataPoint> buffer;
+ boost::circular_buffer<AvgDataPoint> avg_buffer;
+
uint64_t get_current() const;
public:
{
return buffer;
}
+ const boost::circular_buffer<AvgDataPoint> & get_data_avg() const
+ {
+ return avg_buffer;
+ }
void push(utime_t t, uint64_t const &v);
- PerfCounterInstance()
- : buffer(20) {}
+ void push_avg(utime_t t, uint64_t const &s, uint64_t const &c);
+
+ PerfCounterInstance(enum perfcounter_type_d type)
+ {
+ if (type & PERFCOUNTER_LONGRUNAVG)
+ avg_buffer = boost::circular_buffer<AvgDataPoint>(20);
+ else
+ buffer = boost::circular_buffer<DataPoint>(20);
+ };
};
supported.erase(CEPH_AUTH_CEPHX);
}
}
+ } else if (!m->get_connection()->has_feature(CEPH_FEATURE_CEPHX_V2)) {
+ if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+ if (g_conf->cephx_cluster_require_version >= 2 ||
+ g_conf->cephx_require_version >= 2) {
+ dout(1) << m->get_source_inst()
+ << " supports cephx but not v2 and"
+ << " 'cephx [cluster] require version >= 2';"
+ << " disallowing cephx" << dendl;
+ supported.erase(CEPH_AUTH_CEPHX);
+ }
+ } else {
+ if (g_conf->cephx_service_require_version >= 2 ||
+ g_conf->cephx_require_version >= 2) {
+ dout(1) << m->get_source_inst()
+ << " supports cephx but not v2 and"
+ << " 'cephx [service] require version >= 2';"
+ << " disallowing cephx" << dendl;
+ supported.erase(CEPH_AUTH_CEPHX);
+ }
+ }
}
int type;
for (const auto &sys_cap : wanted_caps) {
if (entity_auth.caps.count(sys_cap.first) == 0 ||
!entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) {
- ss << "key for " << entity << " exists but cap " << sys_cap.first
- << " does not match";
+ ss << entity << " already has fs capabilities that differ from those supplied. To generate a new auth key for "
+ << entity << ", first remove " << entity << " from configuration files, execute 'ceph auth rm " << entity << "', then execute this command again.";
err = -EINVAL;
goto done;
}
PGMonitor.cc
PGMap.cc
ConfigKeyService.cc
- ../mgr/mgr_commands.cc)
+ ../mgr/mgr_commands.cc
+ ../osd/OSDCap.cc)
add_library(mon STATIC
${lib_mon_srcs}
$<TARGET_OBJECTS:kv_objs>
return false;
}
+static bool is_binary_string(const string& s)
+{
+ for (auto c : s) {
+ // \n and \t are escaped in JSON; other control characters are not.
+ if ((c < 0x20 && c != '\n' && c != '\t') || c >= 0x7f) {
+ return true;
+ }
+ }
+ return false;
+}
+
void ConfigKeyService::store_dump(stringstream &ss)
{
KeyValueDB::Iterator iter =
f.open_object_section("config-key store");
while (iter->valid()) {
- f.dump_string(iter->key().c_str(), iter->value().to_str());
+ string s = iter->value().to_str();
+ if (is_binary_string(s)) {
+ ostringstream ss;
+ ss << "<<< binary blob of length " << s.size() << " >>>";
+ f.dump_string(iter->key().c_str(), ss.str());
+ } else {
+ f.dump_string(iter->key().c_str(), s);
+ }
iter->next();
}
f.close_section();
return false;
done:
+ mon->no_reply(op);
return true;
}
}
version_t summary_ver = summary.version;
- while (sv <= summary_ver) {
+ while (sv && sv <= summary_ver) {
bufferlist bl;
int err = get_version(sv, bl);
assert(err == 0);
#define dout_subsys ceph_subsys_mon
#undef dout_prefix
-#define dout_prefix _prefix(_dout, mon, fsmap)
-static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
+#define dout_prefix _prefix(_dout, mon, get_fsmap())
+static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
return *_dout << "mon." << mon->name << "@" << mon->rank
<< "(" << mon->get_state_name()
<< ").mds e" << fsmap.get_epoch() << " ";
// my methods
-void MDSMonitor::print_map(FSMap &m, int dbl)
+void MDSMonitor::print_map(const FSMap &m, int dbl)
{
dout(dbl) << "print_map\n";
m.print(*_dout);
void MDSMonitor::update_from_paxos(bool *need_bootstrap)
{
version_t version = get_last_committed();
- if (version == fsmap.epoch)
+ if (version == get_fsmap().epoch)
return;
dout(10) << __func__ << " version " << version
- << ", my e " << fsmap.epoch << dendl;
- assert(version > fsmap.epoch);
+ << ", my e " << get_fsmap().epoch << dendl;
+ assert(version > get_fsmap().epoch);
load_health();
assert(fsmap_bl.length() > 0);
dout(10) << __func__ << " got " << version << dendl;
- fsmap.decode(fsmap_bl);
+ PaxosFSMap::decode(fsmap_bl);
// new map
dout(4) << "new map" << dendl;
- print_map(fsmap, 0);
+ print_map(get_fsmap(), 0);
if (!g_conf->mon_mds_skip_sanity) {
- fsmap.sanity();
+ get_fsmap().sanity();
}
check_subs();
void MDSMonitor::create_pending()
{
- pending_fsmap = fsmap;
- pending_fsmap.epoch++;
+ auto &fsmap = PaxosFSMap::create_pending();
if (mon->osdmon()->is_readable()) {
- auto &osdmap = mon->osdmon()->osdmap;
- pending_fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
+ const auto &osdmap = mon->osdmon()->osdmap;
+ fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
}
- dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
+ dout(10) << "create_pending e" << fsmap.epoch << dendl;
}
void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
- dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
+ auto &pending = get_pending_fsmap_writeable();
+ auto &epoch = pending.epoch;
+ dout(10) << "encode_pending e" << epoch << dendl;
// print map iff 'debug mon = 30' or higher
- print_map(pending_fsmap, 30);
+ print_map(get_pending_fsmap(), 30);
if (!g_conf->mon_mds_skip_sanity) {
- pending_fsmap.sanity();
+ pending.sanity();
}
// Set 'modified' on maps modified this epoch
- for (auto &i : fsmap.filesystems) {
- if (i.second->mds_map.epoch == fsmap.epoch) {
- i.second->mds_map.modified = ceph_clock_now();
+ for (auto &p : pending.filesystems) {
+ if (p.second->mds_map.epoch == epoch) {
+ p.second->mds_map.modified = ceph_clock_now();
}
}
// apply to paxos
- assert(get_last_committed() + 1 == pending_fsmap.epoch);
- bufferlist fsmap_bl;
- pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
+ assert(get_last_committed() + 1 == pending.epoch);
+ bufferlist pending_bl;
+ pending.encode(pending_bl, mon->get_quorum_con_features());
/* put everything in the transaction */
- put_version(t, pending_fsmap.epoch, fsmap_bl);
- put_last_committed(t, pending_fsmap.epoch);
+ put_version(t, pending.epoch, pending_bl);
+ put_last_committed(t, pending.epoch);
// Encode MDSHealth data
for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
// health
health_check_map_t new_checks;
- const auto info_map = pending_fsmap.get_mds_info();
+ const auto &info_map = pending.get_mds_info();
for (const auto &i : info_map) {
const auto &gid = i.first;
const auto &info = i.second;
mds_metric_summary(metric.type));
ostringstream ss;
ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
- for (auto p = metric.metadata.begin();
- p != metric.metadata.end();
- ++p) {
- if (p != metric.metadata.begin()) {
+ bool first = true;
+ for (auto &p : metric.metadata) {
+ if (first) {
+ ss << " ";
+ } else {
ss << ", ";
- }
- ss << p->first << ": " << p->second;
+ }
+ ss << p.first << ": " << p.second;
+ first = false;
}
check->detail.push_back(ss.str());
}
}
- pending_fsmap.get_health_checks(&new_checks);
+ pending.get_health_checks(&new_checks);
for (auto& p : new_checks.checks) {
p.second.summary = boost::regex_replace(
p.second.summary,
{
dout(10) << "update_logger" << dendl;
+ const auto &fsmap = get_fsmap();
+
uint64_t up = 0;
uint64_t in = 0;
uint64_t failed = 0;
MDSMap::mds_info_t info;
epoch_t effective_epoch = 0;
+ const auto &fsmap = get_working_fsmap();
+
// check privileges, ignore if fails
MonSession *session = m->get_session();
assert(session);
}
// fw to leader?
- if (!mon->is_leader())
+ if (!is_leader())
return false;
// booted, but not in map?
- if (!pending_fsmap.gid_exists(gid)) {
+ if (!fsmap.gid_exists(gid)) {
if (state != MDSMap::STATE_BOOT) {
dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
<< ceph_mds_state_name(state) << ")" << dendl;
}
}
dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
- info = pending_fsmap.get_info_gid(gid);
+ info = fsmap.get_info_gid(gid);
// old seq?
if (info.state_seq > seq) {
// Work out the latest epoch that this daemon should have seen
{
- fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
+ fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
if (fscid == FS_CLUSTER_ID_NONE) {
- effective_epoch = pending_fsmap.standby_epochs.at(gid);
+ effective_epoch = fsmap.standby_epochs.at(gid);
} else {
- effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
+ effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
}
if (effective_epoch != m->get_last_epoch_seen()) {
dout(10) << "mds_beacon " << *m
op->mark_mdsmon_event(__func__);
MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
+
+ auto &fsmap = get_working_fsmap();
// check privileges, ignore message if fails
MonSession *session = m->get_session();
MDSMap::DaemonState state = m->get_state();
version_t seq = m->get_seq();
+ auto &pending = get_pending_fsmap_writeable();
+
dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
// Calculate deltas of health metrics created and removed
for (const auto &new_metric: new_health) {
if (old_types.count(new_metric.type) == 0) {
- std::stringstream msg;
- msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
- << new_metric.message;
- if (new_metric.sev == HEALTH_ERR) {
- mon->clog->error() << msg.str();
- } else if (new_metric.sev == HEALTH_WARN) {
- mon->clog->warn() << msg.str();
- } else {
- mon->clog->info() << msg.str();
- }
+ dout(10) << "MDS health message (" << m->get_orig_source_inst().name
+ << "): " << new_metric.sev << " " << new_metric.message << dendl;
}
}
// zap previous instance of this name?
if (g_conf->mds_enforce_unique_name) {
bool failed_mds = false;
- while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
+ while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
if (!mon->osdmon()->is_writeable()) {
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
}
const MDSMap::mds_info_t &existing_info =
- pending_fsmap.get_info_gid(existing);
+ pending.get_info_gid(existing);
mon->clog->info() << existing_info.human_name() << " restarted";
fail_mds_gid(existing);
failed_mds = true;
}
// Add this daemon to the map
- if (pending_fsmap.mds_roles.count(gid) == 0) {
+ if (pending.mds_roles.count(gid) == 0) {
MDSMap::mds_info_t new_info;
new_info.global_id = gid;
new_info.name = m->get_name();
new_info.standby_for_name = m->get_standby_for_name();
new_info.standby_for_fscid = m->get_standby_for_fscid();
new_info.standby_replay = m->get_standby_replay();
- pending_fsmap.insert(new_info);
+ pending.insert(new_info);
}
// Resolve standby_for_name to a rank
- const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
+ const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
if (!info.standby_for_name.empty()) {
- const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
+ const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
info.standby_for_name);
if (leaderinfo && (leaderinfo->rank >= 0)) {
- auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
- auto fs = pending_fsmap.get_filesystem(fscid);
+ const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
+ const auto &fs = pending.get_filesystem(fscid);
- pending_fsmap.modify_daemon(gid, [fscid, leaderinfo](
+ pending.modify_daemon(gid, [fscid, leaderinfo](
MDSMap::mds_info_t *info) {
info->standby_for_rank = leaderinfo->rank;
info->standby_for_fscid = fscid;
last_beacon[gid].seq = seq;
// new incompat?
- if (!pending_fsmap.compat.writeable(m->get_compat())) {
- dout(10) << " fsmap " << pending_fsmap.compat
+ if (!pending.compat.writeable(m->get_compat())) {
+ dout(10) << " fsmap " << pending.compat
<< " can't write to new mds' " << m->get_compat()
<< ", updating fsmap and killing old mds's"
<< dendl;
- pending_fsmap.update_compat(m->get_compat());
+ pending.update_compat(m->get_compat());
}
update_metadata(m->get_global_id(), m->get_sys_info());
} else {
// state update
- const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
+ const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
// Old MDS daemons don't mention that they're standby replay until
// after they've sent their boot beacon, so update this field.
if (info.standby_replay != m->get_standby_replay()) {
- pending_fsmap.modify_daemon(info.global_id, [&m](
+ pending.modify_daemon(info.global_id, [&m](
MDSMap::mds_info_t *i)
{
i->standby_replay = m->get_standby_replay();
if (info.laggy()) {
dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
- pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
+ pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
{
info->clear_laggy();
}
<< " standby_for_rank=" << m->get_standby_for_rank()
<< dendl;
if (state == MDSMap::STATE_STOPPED) {
- const auto fscid = pending_fsmap.mds_roles.at(gid);
- auto fs = pending_fsmap.get_filesystem(fscid);
+ const auto fscid = pending.mds_roles.at(gid);
+ const auto &fs = pending.get_filesystem(fscid);
mon->clog->info() << info.human_name() << " finished "
<< "deactivating rank " << info.rank << " in filesystem "
<< fs->mds_map.fs_name << " (now has "
<< fs->mds_map.get_num_in_mds() - 1 << " ranks)";
- auto erased = pending_fsmap.stop(gid);
+ auto erased = pending.stop(gid);
erased.push_back(gid);
for (const auto &erased_gid : erased) {
until += g_conf->get_val<double>("mon_mds_blacklist_interval");
const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
request_proposal(mon->osdmon());
- pending_fsmap.damaged(gid, blacklist_epoch);
+ pending.damaged(gid, blacklist_epoch);
last_beacon.erase(gid);
// Respond to MDS, so that it knows it can continue to shut down
mon->send_reply(op,
new MMDSBeacon(
mon->monmap->fsid, m->get_global_id(),
- m->get_name(), fsmap.get_epoch(), state, seq,
+ m->get_name(), pending.get_epoch(), state, seq,
CEPH_FEATURES_SUPPORTED_DEFAULT));
} else if (state == MDSMap::STATE_DNE) {
if (!mon->osdmon()->is_writeable()) {
mon->send_reply(op,
new MMDSBeacon(
mon->monmap->fsid, m->get_global_id(),
- m->get_name(), fsmap.get_epoch(), state, seq,
+ m->get_name(), pending.get_epoch(), state, seq,
CEPH_FEATURES_SUPPORTED_DEFAULT));
} else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
// Standby daemons should never modify their own
return true;
} else {
if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
- auto fscid = pending_fsmap.mds_roles.at(gid);
- auto fs = pending_fsmap.get_filesystem(fscid);
+ const auto &fscid = pending.mds_roles.at(gid);
+ const auto &fs = pending.get_filesystem(fscid);
mon->clog->info() << info.human_name() << " is now active in "
<< "filesystem " << fs->mds_map.fs_name << " as rank "
<< info.rank;
// Made it through special cases and validations, record the
// daemon's reported state to the FSMap.
- pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
+ pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
info->state = state;
info->state_seq = seq;
});
}
dout(7) << "prepare_beacon pending map now:" << dendl;
- print_map(pending_fsmap);
+ print_map(pending);
wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
if (r >= 0)
bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
{
+ auto &pending = get_pending_fsmap_writeable();
+
op->mark_mdsmon_event(__func__);
MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
mds_gid_t gid = m->global_id;
- if (pending_fsmap.gid_has_rank(gid)) {
+ if (pending.gid_has_rank(gid)) {
dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
- pending_fsmap.update_export_targets(gid, m->targets);
+ pending.update_export_targets(gid, m->targets);
} else {
dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
}
void MDSMonitor::_updated(MonOpRequestRef op)
{
+ const auto &fsmap = get_fsmap();
op->mark_mdsmon_event(__func__);
MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
tick();
update_logger();
- if (mon->is_leader()) {
- mon->clog->debug() << "fsmap " << fsmap;
+ if (is_leader()) {
+ mon->clog->debug() << "fsmap " << get_fsmap();
}
}
list<pair<health_status_t, string> > *detail,
CephContext* cct) const
{
+ const auto &fsmap = get_fsmap();
+
fsmap.get_health(summary, detail);
// For each MDS GID...
- const auto info_map = fsmap.get_mds_info();
+ const auto &info_map = fsmap.get_mds_info();
for (const auto &i : info_map) {
const auto &gid = i.first;
const auto &info = i.second;
void MDSMonitor::dump_info(Formatter *f)
{
f->open_object_section("fsmap");
- fsmap.dump(f);
+ get_fsmap().dump(f);
f->close_section();
f->dump_unsigned("mdsmap_first_committed", get_first_committed());
stringstream ss, ds;
map<string, cmd_vartype> cmdmap;
+ const auto &fsmap = get_working_fsmap();
+
if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
// ss has reason for failure
string rs = ss.str();
int64_t epocharg;
epoch_t epoch;
- FSMap *p = &fsmap;
+ const FSMap *fsmapp = &get_fsmap();
+ FSMap dummy;
if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
epoch = epocharg;
bufferlist b;
int err = get_version(epoch, b);
if (err == -ENOENT) {
- p = 0;
r = -ENOENT;
+ goto out;
} else {
assert(err == 0);
assert(b.length());
- p = new FSMap;
- p->decode(b);
+ dummy.decode(b);
+ fsmapp = &dummy;
}
}
- if (p) {
- stringstream ds;
- const MDSMap *mdsmap = nullptr;
- MDSMap blank;
- blank.epoch = fsmap.epoch;
- if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
- mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
- } else {
- mdsmap = &blank;
- }
- if (f != NULL) {
- f->open_object_section("mdsmap");
- mdsmap->dump(f.get());
- f->close_section();
- f->flush(ds);
- r = 0;
- } else {
- mdsmap->print(ds);
- r = 0;
- }
-
- rdata.append(ds);
- ss << "dumped fsmap epoch " << p->get_epoch();
- if (p != &fsmap) {
- delete p;
- }
+ stringstream ds;
+ const MDSMap *mdsmapp = nullptr;
+ MDSMap blank;
+ blank.epoch = fsmapp->epoch;
+ if (fsmapp->legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+ mdsmapp = &fsmapp->filesystems.at(fsmapp->legacy_client_fscid)->mds_map;
+ } else {
+ mdsmapp = &blank;
}
+ if (f != NULL) {
+ f->open_object_section("mdsmap");
+ mdsmapp->dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ r = 0;
+ } else {
+ mdsmapp->print(ds);
+ r = 0;
+ }
+
+ rdata.append(ds);
+ ss << "dumped fsmap epoch " << fsmapp->get_epoch();
} else if (prefix == "fs dump") {
int64_t epocharg;
epoch_t epoch;
- FSMap *p = &fsmap;
+ const FSMap *fsmapp = &get_fsmap();
+ FSMap dummy;
if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
epoch = epocharg;
bufferlist b;
int err = get_version(epoch, b);
if (err == -ENOENT) {
- p = 0;
r = -ENOENT;
+ goto out;
} else {
assert(err == 0);
assert(b.length());
- p = new FSMap;
- p->decode(b);
+ dummy.decode(b);
+ fsmapp = &dummy;
}
}
- if (p) {
- stringstream ds;
- if (f != NULL) {
- f->open_object_section("fsmap");
- p->dump(f.get());
- f->close_section();
- f->flush(ds);
- r = 0;
- } else {
- p->print(ds);
- r = 0;
- }
-
- rdata.append(ds);
- ss << "dumped fsmap epoch " << p->get_epoch();
- if (p != &fsmap)
- delete p;
+ stringstream ds;
+ if (f != NULL) {
+ f->open_object_section("fsmap");
+ fsmapp->dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ r = 0;
+ } else {
+ fsmapp->print(ds);
+ r = 0;
}
+
+ rdata.append(ds);
+ ss << "dumped fsmap epoch " << fsmapp->get_epoch();
} else if (prefix == "mds metadata") {
if (!f)
f.reset(Formatter::create("json-pretty"));
} else if (prefix == "fs get") {
string fs_name;
cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
- auto fs = fsmap.get_filesystem(fs_name);
+ const auto &fs = fsmap.get_filesystem(fs_name);
if (fs == nullptr) {
ss << "filesystem '" << fs_name << "' not found";
r = -ENOENT;
if (f) {
f->open_array_section("filesystems");
{
- for (const auto i : fsmap.filesystems) {
- const auto fs = i.second;
+ for (const auto &p : fsmap.filesystems) {
+ const auto &fs = p.second;
f->open_object_section("filesystem");
{
const MDSMap &mds_map = fs->mds_map;
f->close_section();
f->flush(ds);
} else {
- for (const auto i : fsmap.filesystems) {
- const auto fs = i.second;
+ for (const auto &p : fsmap.filesystems) {
+ const auto &fs = p.second;
const MDSMap &mds_map = fs->mds_map;
const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
mds_map.metadata_pool);
r = 0;
}
+out:
if (r != -1) {
rdata.append(ds);
string rs;
bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
{
- const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
+ auto &pending = get_pending_fsmap_writeable();
+
+ const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
epoch_t blacklist_epoch = 0;
blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
}
- pending_fsmap.erase(gid, blacklist_epoch);
+ pending.erase(gid, blacklist_epoch);
last_beacon.erase(gid);
if (pending_daemon_health.count(gid)) {
pending_daemon_health.erase(gid);
mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
{
- const FSMap *relevant_fsmap = mon->is_leader() ? &pending_fsmap : &fsmap;
+ const auto &fsmap = get_working_fsmap();
// Try parsing as a role
mds_role_t role;
int r = parse_role(arg, &role, ignore_err);
if (r == 0) {
// See if a GID is assigned to this role
- auto fs = relevant_fsmap->get_filesystem(role.fscid);
+ const auto &fs = fsmap.get_filesystem(role.fscid);
assert(fs != nullptr); // parse_role ensures it exists
if (fs->mds_map.is_up(role.rank)) {
dout(10) << __func__ << ": validated rank/GID " << role
unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
if (!err.empty()) {
// Not a role or a GID, try as a daemon name
- const MDSMap::mds_info_t *mds_info = relevant_fsmap->find_by_name(arg);
+ const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
if (!mds_info) {
ss << "MDS named '" << arg
<< "' does not exist, or is not up";
dout(10) << __func__ << ": treating MDS reference '" << arg
<< "' as an integer " << maybe_gid << dendl;
- if (relevant_fsmap->gid_exists(mds_gid_t(maybe_gid))) {
+ if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
return mds_gid_t(maybe_gid);
}
}
// Take a copy of the info before removing the MDS from the map,
// so that the caller knows which mds (if any) they ended up removing.
- *failed_info = pending_fsmap.get_info_gid(gid);
+ *failed_info = get_pending_fsmap().get_info_gid(gid);
fail_mds_gid(gid);
ss << "failed mds gid " << gid;
return true;
}
+ auto &pending = get_pending_fsmap_writeable();
+
bool batched_propose = false;
- for (auto h : handlers) {
+ for (const auto &h : handlers) {
if (h->can_handle(prefix)) {
batched_propose = h->batched_propose();
if (batched_propose) {
paxos->plug();
}
- r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
+ r = h->handle(mon, pending, op, cmdmap, ss);
if (batched_propose) {
paxos->unplug();
}
} else {
if (r == 0) {
// On successful updates, print the updated map
- print_map(pending_fsmap);
+ print_map(pending);
}
// Successful or not, we're done: respond.
goto out;
}
// Only handle legacy commands if there is a filesystem configured
- if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
- if (pending_fsmap.filesystems.size() == 0) {
+ if (pending.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
+ if (pending.filesystems.size() == 0) {
ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
} else {
ss << "No filesystem set for use with legacy commands";
mds_role_t *role,
std::ostream &ss)
{
- const FSMap *relevant_fsmap = &fsmap;
- if (mon->is_leader()) {
- relevant_fsmap = &pending_fsmap;
- }
- return relevant_fsmap->parse_role(role_str, role, ss);
+ return get_working_fsmap().parse_role(role_str, role, ss);
}
int MDSMonitor::filesystem_command(
string whostr;
cmd_getval(g_ceph_context, cmdmap, "who", whostr);
+ auto &pending = get_pending_fsmap_writeable();
if (prefix == "mds stop" ||
prefix == "mds deactivate") {
-
mds_role_t role;
r = parse_role(whostr, &role, ss);
if (r < 0 ) {
return r;
}
- auto fs = pending_fsmap.get_filesystem(role.fscid);
+ const auto &fs = pending.get_filesystem(role.fscid);
if (!fs->mds_map.is_active(role.rank)) {
r = -EEXIST;
r = 0;
mds_gid_t gid = fs->mds_map.up.at(role.rank);
ss << "telling mds." << role << " "
- << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
+ << pending.get_info_gid(gid).addr << " to deactivate";
- pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
+ pending.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
info->state = MDSMap::STATE_STOPPING;
});
}
<< cmd_vartype_stringify(cmdmap["state"]) << "'";
return -EINVAL;
}
- if (pending_fsmap.gid_exists(gid)) {
- pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
+ if (pending.gid_exists(gid)) {
+ pending.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
info->state = state;
});
ss << "set mds gid " << gid << " to state " << state << " "
<< cmd_vartype_stringify(cmdmap["gid"]) << "'";
return -EINVAL;
}
- if (!pending_fsmap.gid_exists(gid)) {
+ if (!pending.gid_exists(gid)) {
ss << "mds gid " << gid << " dne";
r = 0;
} else {
- MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
+ const auto &info = pending.get_info_gid(gid);
+ MDSMap::DaemonState state = info.state;
if (state > 0) {
- ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
- << " rank " << pending_fsmap.get_info_gid(gid).rank;
+ ss << "cannot remove active mds." << info.name
+ << " rank " << info.rank;
return -EBUSY;
} else {
- pending_fsmap.erase(gid, {});
+ pending.erase(gid, {});
ss << "removed mds gid " << gid;
return 0;
}
return -EINVAL;
}
- pending_fsmap.modify_filesystem(
+ pending.modify_filesystem(
role.fscid,
[role](std::shared_ptr<Filesystem> fs)
{
<< cmd_vartype_stringify(cmdmap["feature"]) << "'";
return -EINVAL;
}
- if (pending_fsmap.compat.compat.contains(f)) {
+ if (pending.compat.compat.contains(f)) {
ss << "removing compat feature " << f;
- CompatSet modified = pending_fsmap.compat;
+ CompatSet modified = pending.compat;
modified.compat.remove(f);
- pending_fsmap.update_compat(modified);
+ pending.update_compat(modified);
} else {
- ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
+ ss << "compat feature " << f << " not present in " << pending.compat;
}
r = 0;
} else if (prefix == "mds compat rm_incompat") {
<< cmd_vartype_stringify(cmdmap["feature"]) << "'";
return -EINVAL;
}
- if (pending_fsmap.compat.incompat.contains(f)) {
+ if (pending.compat.incompat.contains(f)) {
ss << "removing incompat feature " << f;
- CompatSet modified = pending_fsmap.compat;
+ CompatSet modified = pending.compat;
modified.incompat.remove(f);
- pending_fsmap.update_compat(modified);
+ pending.update_compat(modified);
} else {
- ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
+ ss << "incompat feature " << f << " not present in " << pending.compat;
}
r = 0;
} else if (prefix == "mds repaired") {
return r;
}
- bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
+ bool modified = pending.undamaged(role.fscid, role.rank);
if (modified) {
dout(4) << "repaired: restoring rank " << role << dendl;
} else {
void MDSMonitor::modify_legacy_filesystem(
std::function<void(std::shared_ptr<Filesystem> )> fn)
{
+ auto &pending_fsmap = get_pending_fsmap_writeable();
pending_fsmap.modify_filesystem(
pending_fsmap.legacy_client_fscid,
fn
string whostr;
cmd_getval(g_ceph_context, cmdmap, "who", whostr);
+ auto &pending_fsmap = get_pending_fsmap_writeable();
+
assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
if (prefix == "mds set_max_mds") {
types.push_back("fsmap");
types.push_back("fsmap.user");
types.push_back("mdsmap");
- for (const auto &i : fsmap.filesystems) {
- auto fscid = i.first;
+ for (const auto &p : get_fsmap().filesystems) {
+ const auto &fscid = p.first;
std::ostringstream oss;
oss << "mdsmap." << fscid;
types.push_back(oss.str());
{
dout(20) << __func__ << ": " << sub->type << dendl;
+ const auto &fsmap = get_fsmap();
+
if (sub->type == "fsmap") {
if (sub->next <= fsmap.get_epoch()) {
sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
FSMapUser fsmap_u;
fsmap_u.epoch = fsmap.get_epoch();
fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
- for (auto p = fsmap.filesystems.begin();
- p != fsmap.filesystems.end();
- ++p) {
- FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
- fs_info.cid = p->first;
- fs_info.name= p->second->mds_map.fs_name;
+ for (const auto &p : fsmap.filesystems) {
+ FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
+ fs_info.cid = p.second->fscid;
+ fs_info.name = p.second->mds_map.fs_name;
}
sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
if (sub->onetime) {
dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
// Work out the effective latest epoch
- MDSMap *mds_map = nullptr;
+ const MDSMap *mds_map = nullptr;
MDSMap null_map;
null_map.compat = fsmap.compat;
if (fscid == FS_CLUSTER_ID_NONE) {
// For a client, we should have already dropped out
assert(is_mds);
- if (fsmap.standby_daemons.count(mds_gid)) {
+ auto it = fsmap.standby_daemons.find(mds_gid);
+ if (it != fsmap.standby_daemons.end()) {
// For an MDS, we need to feed it an MDSMap with its own state in
- null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
- null_map.epoch = fsmap.standby_epochs[mds_gid];
+ null_map.mds_info[mds_gid] = it->second;
+ null_map.epoch = fsmap.standby_epochs.at(mds_gid);
} else {
null_map.epoch = fsmap.epoch;
}
mds_map = &null_map;
} else {
// Check the effective epoch
- mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
+ mds_map = &fsmap.get_filesystem(fscid)->mds_map;
}
assert(mds_map != nullptr);
bool update = false;
for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
i != pending_metadata.end(); ) {
- if (!pending_fsmap.gid_exists(i->first)) {
+ if (!get_pending_fsmap().gid_exists(i->first)) {
pending_metadata.erase(i++);
update = true;
} else {
continue;
}
const mds_gid_t gid = it->first;
- if (!fsmap.gid_exists(gid)) {
+ if (!get_fsmap().gid_exists(gid)) {
dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
continue;
}
- const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
+ const MDSMap::mds_info_t& mds_info = get_fsmap().get_info_gid(gid);
// FIXME: include filesystem name with rank here
mdses[hostname->second].push_back(mds_info.rank);
}
* If a cluster is undersized (with respect to max_mds), then
* attempt to find daemons to grow it.
*/
-bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
+bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
{
bool do_propose = false;
+ auto &pending = get_pending_fsmap_writeable();
if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
return do_propose;
while (fs->mds_map.is_in(mds)) {
mds++;
}
- mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
+ mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds},
name, g_conf->mon_force_standby_active);
if (newgid == MDS_GID_NONE) {
break;
}
- const auto &new_info = pending_fsmap.get_info_gid(newgid);
+ const auto &new_info = pending.get_info_gid(newgid);
dout(1) << "assigned standby " << new_info.addr
<< " as mds." << mds << dendl;
"filesystem " << fs->mds_map.fs_name << " as rank "
<< mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
<< " ranks)";
- pending_fsmap.promote(newgid, fs, mds);
+ pending.promote(newgid, fs, mds);
do_propose = true;
}
assert(mds_propose != nullptr);
assert(osd_propose != nullptr);
- const auto fscid = pending_fsmap.mds_roles.at(gid);
+ auto &pending = get_pending_fsmap_writeable();
+ const auto fscid = pending.mds_roles.at(gid);
// We will only take decisive action (replacing/removing a daemon)
// if we have some indicating that some other daemon(s) are successfully
info.state != MDSMap::STATE_STANDBY &&
info.state != MDSMap::STATE_STANDBY_REPLAY &&
may_replace &&
- !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
- (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
+ !pending.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
+ (sgid = pending.find_replacement_for({fscid, info.rank}, info.name,
g_conf->mon_force_standby_active)) != MDS_GID_NONE)
{
- MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
+ MDSMap::mds_info_t si = pending.get_info_gid(sgid);
dout(10) << " replacing " << gid << " " << info.addr << " mds."
<< info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " with standby " << si.human_name();
// Remember what NS the old one was in
- const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
+ const fs_cluster_id_t fscid = pending.mds_roles.at(gid);
// Remove the old one
*osd_propose |= fail_mds_gid(gid);
// Promote the replacement
- auto fs = pending_fsmap.filesystems.at(fscid);
- pending_fsmap.promote(sgid, fs, info.rank);
+ auto fs = pending.filesystems.at(fscid);
+ pending.promote(sgid, fs, info.rank);
*mds_propose = true;
} else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " laggy" << dendl;
- pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
+ pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
info->laggy_since = ceph_clock_now();
});
*mds_propose = true;
}
}
-bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
+bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> &fs)
{
assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
+ auto &pending = get_pending_fsmap_writeable();
+
bool do_propose = false;
// have a standby take over?
set<mds_rank_t>::iterator p = failed.begin();
while (p != failed.end()) {
mds_rank_t f = *p++;
- mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
+ mds_gid_t sgid = pending.find_replacement_for({fs->fscid, f}, {},
g_conf->mon_force_standby_active);
if (sgid) {
- const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
+ const MDSMap::mds_info_t si = pending.get_info_gid(sgid);
dout(0) << " taking over failed mds." << f << " with " << sgid
<< "/" << si.name << " " << si.addr << dendl;
mon->clog->info() << "Standby " << si.human_name()
<< " assigned to filesystem " << fs->mds_map.fs_name
<< " as rank " << f;
- pending_fsmap.promote(sgid, fs, f);
+ pending.promote(sgid, fs, f);
do_propose = true;
}
}
// them while perhaps-modifying standby_daemons during the loop
// (if we promote anyone they are removed from standby_daemons)
std::vector<mds_gid_t> standby_gids;
- for (const auto &j : pending_fsmap.standby_daemons) {
+ for (const auto &j : pending.standby_daemons) {
standby_gids.push_back(j.first);
}
for (const auto &gid : standby_gids) {
- const auto &info = pending_fsmap.standby_daemons.at(gid);
+ const auto &info = pending.standby_daemons.at(gid);
assert(info.state == MDSMap::STATE_STANDBY);
if (!info.standby_replay) {
// the standby_for_rank refers to: lookup via legacy_client_fscid
mds_role_t target_role = {
info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
- pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
+ pending.legacy_client_fscid : info.standby_for_fscid,
info.standby_for_rank};
// It is possible that the map contains a standby_for_fscid
// that doesn't correspond to an existing filesystem, especially
// if we loaded from a version with a bug (#17466)
if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
- && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
+ && !pending.filesystem_exists(info.standby_for_fscid)) {
derr << "gid " << gid << " has invalid standby_for_fscid "
<< info.standby_for_fscid << dendl;
continue;
// If we managed to resolve a full target role
if (target_role.fscid != FS_CLUSTER_ID_NONE) {
- auto fs = pending_fsmap.get_filesystem(target_role.fscid);
+ const auto &fs = pending.get_filesystem(target_role.fscid);
if (fs->mds_map.is_followable(target_role.rank)) {
do_propose |= try_standby_replay(
info,
}
// check everyone
- for (auto fs_i : pending_fsmap.filesystems) {
- const MDSMap &mds_map = fs_i.second->mds_map;
- for (auto mds_i : mds_map.mds_info) {
- MDSMap::mds_info_t &cand_info = mds_i.second;
+ for (const auto &p : pending.filesystems) {
+ if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
+ info.standby_for_fscid != p.first)
+ continue;
+
+ bool assigned = false;
+ const auto &fs = p.second;
+ const MDSMap &mds_map = fs->mds_map;
+ for (const auto &mds_i : mds_map.mds_info) {
+ const MDSMap::mds_info_t &cand_info = mds_i.second;
if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
info.standby_for_rank != MDS_RANK_NONE) {
continue; // we're supposed to follow someone else
}
- if (try_standby_replay(info, *(fs_i.second), cand_info)) {
- do_propose = true;
+ if (try_standby_replay(info, *fs, cand_info)) {
+ assigned = true;
break;
}
- continue;
}
}
+ if (assigned) {
+ do_propose = true;
+ break;
+ }
}
}
}
{
// make sure mds's are still alive
// ...if i am an active leader
+
if (!is_active()) return;
- dout(10) << fsmap << dendl;
+ dout(10) << get_working_fsmap() << dendl;
- bool do_propose = false;
+ if (!is_leader()) return;
+
+ auto &pending = get_pending_fsmap_writeable();
- if (!mon->is_leader()) return;
+ bool do_propose = false;
- do_propose |= pending_fsmap.check_health();
+ do_propose |= pending.check_health();
// expand mds cluster (add new nodes to @in)?
- for (auto i : pending_fsmap.filesystems) {
- do_propose |= maybe_expand_cluster(i.second);
+ for (auto &p : pending.filesystems) {
+ do_propose |= maybe_expand_cluster(p.second);
}
const auto now = ceph_clock_now();
cutoff -= g_conf->mds_beacon_grace;
// make sure last_beacon is fully populated
- for (const auto &p : pending_fsmap.mds_roles) {
+ for (auto &p : pending.mds_roles) {
auto &gid = p.first;
if (last_beacon.count(gid) == 0) {
last_beacon[gid].stamp = now;
auto beacon_info = p->second;
++p;
- if (!pending_fsmap.gid_exists(gid)) {
+ if (!pending.gid_exists(gid)) {
// clean it out
last_beacon.erase(gid);
continue;
}
if (beacon_info.stamp < cutoff) {
- auto &info = pending_fsmap.get_info_gid(gid);
+ auto &info = pending.get_info_gid(gid);
dout(1) << "no beacon from mds." << info.rank << "." << info.inc
<< " (gid: " << gid << " addr: " << info.addr
<< " state: " << ceph_mds_state_name(info.state) << ")"
request_proposal(mon->osdmon());
}
- for (auto i : pending_fsmap.filesystems) {
- auto fs = i.second;
+ for (auto &p : pending.filesystems) {
+ auto &fs = p.second;
if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
do_propose |= maybe_promote_standby(fs);
}
} else {
// Assign the new role to the standby
dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
- pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
+ get_pending_fsmap_writeable().assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
return true;
}
}
using namespace std;
#include "include/types.h"
-#include "mds/FSMap.h"
-#include "mds/MDSMap.h"
+#include "PaxosFSMap.h"
#include "PaxosService.h"
#include "msg/Messenger.h"
#include "messages/MMDSBeacon.h"
class MMDSMap;
class FileSystemCommandHandler;
-class MDSMonitor : public PaxosService {
+class MDSMonitor : public PaxosService, public PaxosFSMap {
public:
MDSMonitor(Monitor *mn, Paxos *p, string service_name);
void check_subs();
void check_sub(Subscription *sub);
- const FSMap &get_pending() const { return pending_fsmap; }
- const FSMap &get_fsmap() const { return fsmap; }
void dump_info(Formatter *f);
int print_nodes(Formatter *f);
* Return true if a blacklist was done (i.e. OSD propose needed)
*/
bool fail_mds_gid(mds_gid_t gid);
- protected:
- // mds maps
- FSMap fsmap; // current
- FSMap pending_fsmap; // current + pending updates
+ bool is_leader() const override { return mon->is_leader(); }
+
+ protected:
// my helpers
- void print_map(FSMap &m, int dbl=7);
+ void print_map(const FSMap &m, int dbl=7);
void update_logger();
void _updated(MonOpRequestRef op);
std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
- bool maybe_promote_standby(std::shared_ptr<Filesystem> fs);
- bool maybe_expand_cluster(std::shared_ptr<Filesystem> fs);
+ bool maybe_promote_standby(std::shared_ptr<Filesystem> &fs);
+ bool maybe_expand_cluster(std::shared_ptr<Filesystem> &fs);
void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
bool *mds_propose, bool *osd_propose);
void tick() override; // check state, take actions
<< " daemon " << pending_map.active_name;
} else {
dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
- mon->clog->warn() << "Manager daemon " << old_active_name
+ mon->clog->info() << "Manager daemon " << old_active_name
<< " is unresponsive. No standby daemons available.";
}
} else if (pending_map.active_gid == 0) {
"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
COMMAND("osd pool get " \
"name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block", \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites", \
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
bool Monitor::ms_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer_data,
bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key)
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
dout(10) << "ms_verify_authorizer " << con->get_peer_addr()
<< " " << ceph_entity_type_name(peer_type)
if (authorizer_data.length()) {
bool ret = cephx_verify_authorizer(g_ceph_context, &keyring, iter,
- auth_ticket_info, authorizer_reply);
+ auth_ticket_info, challenge, authorizer_reply);
if (ret) {
session_key = auth_ticket_info.session_key;
isvalid = true;
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
bool ms_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override;
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
bool ms_handle_reset(Connection *con) override;
void ms_handle_remote_reset(Connection *con) override {}
bool ms_handle_refused(Connection *con) override;
#include "include/str_map.h"
#include "include/scope_guard.h"
+#include "auth/cephx/CephxKeyServer.h"
+#include "osd/OSDCap.h"
+
#include "json_spirit/json_spirit_reader.h"
#include <boost/algorithm/string/predicate.hpp>
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
+bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
+ // Note: this doesn't include support for the application tag match
+ if ((grant.spec.allow & OSD_CAP_W) != 0) {
+ auto& match = grant.match;
+ if (match.is_match_all()) {
+ return true;
+ } else if (pool_name != nullptr && match.auid < 0 &&
+ !match.pool_namespace.pool_name.empty() &&
+ match.pool_namespace.pool_name == *pool_name) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool is_unmanaged_snap_op_permitted(CephContext* cct,
+ const KeyServer& key_server,
+ const EntityName& entity_name,
+ const MonCap& mon_caps,
+ const std::string* pool_name)
+{
+ typedef std::map<std::string, std::string> CommandArgs;
+
+ if (mon_caps.is_capable(cct, CEPH_ENTITY_TYPE_MON,
+ entity_name, "osd",
+ "osd pool op unmanaged-snap",
+ (pool_name == nullptr ?
+ CommandArgs{} /* pool DNE, require unrestricted cap */ :
+ CommandArgs{{"poolname", *pool_name}}),
+ false, true, false)) {
+ return true;
+ }
+
+ AuthCapsInfo caps_info;
+ if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
+ caps_info)) {
+ dout(10) << "unable to locate OSD cap data for " << entity_name
+ << " in auth db" << dendl;
+ return false;
+ }
+
+ string caps_str;
+ if (caps_info.caps.length() > 0) {
+ auto p = caps_info.caps.begin();
+ try {
+ decode(caps_str, p);
+ } catch (const buffer::error &err) {
+ derr << "corrupt OSD cap data for " << entity_name << " in auth db"
+ << dendl;
+ return false;
+ }
+ }
+
+ OSDCap osd_cap;
+ if (!osd_cap.parse(caps_str, nullptr)) {
+ dout(10) << "unable to parse OSD cap data for " << entity_name
+ << " in auth db" << dendl;
+ return false;
+ }
+
+ // if the entity has write permissions in one or all pools, permit
+ // usage of unmanaged-snapshots
+ if (osd_cap.allow_all()) {
+ return true;
+ }
+
+ for (auto& grant : osd_cap.grants) {
+ if (grant.profile.is_valid()) {
+ for (auto& profile_grant : grant.profile_grants) {
+ if (is_osd_writable(profile_grant, pool_name)) {
+ return true;
+ }
+ }
+ } else if (is_osd_writable(grant, pool_name)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
} // anonymous namespace
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
cct(cct),
inc_osd_cache(g_conf->mon_osd_cache_size),
full_osd_cache(g_conf->mon_osd_cache_size),
- last_attempted_minwait_time(utime_t()),
mapper(mn->cct, &mn->cpu_tp),
op_tracker(cct, true, 1)
{}
}
// encode into pending incremental
+ uint64_t features = newmap.get_encoding_features();
newmap.encode(pending_inc.fullmap,
- mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
+ features | CEPH_FEATURE_RESERVED);
pending_inc.full_crc = newmap.get_crc();
dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
uint32_t match_count = 0;
// CephFS
- FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
+ const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
if (pending_fsmap.pool_in_use(pool_id)) {
dout(10) << __func__ << " auto-enabling CephFS on pool '"
<< pool_name << "'" << dendl;
osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);
// features for osdmap and its incremental
- uint64_t features = mon->get_quorum_con_features();
+ uint64_t features;
// encode full map and determine its crc
OSDMap tmp;
tmp.apply_incremental(pending_inc);
// determine appropriate features
- if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
- dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
- << dendl;
- features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
- }
- if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
- dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
- << "MSG_ADDR2" << dendl;
- features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
- CEPH_FEATURE_MSG_ADDR2);
- }
- if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
- dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
- features &= ~CEPH_FEATURE_SERVER_JEWEL;
- }
- dout(10) << __func__ << " encoding full map with " << features << dendl;
+ features = tmp.get_encoding_features();
+ dout(10) << __func__ << " encoding full map with "
+ << ceph_release_name(tmp.require_osd_release)
+ << " features " << features << dendl;
+
+ // the features should be a subset of the mon quorum's features!
+ assert((features & ~mon->get_quorum_con_features()) == 0);
bufferlist fullbl;
::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
}
dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
+
+ // get feature of the peer
+ // use quorum_con_features, if it's an anonymous connection.
+ uint64_t features = s->con_features ? s->con_features :
+ mon->get_quorum_con_features();
// whatev, they'll request more if they need it
- MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
+ MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
s->con->send_message(m);
// NOTE: do *not* record osd has up to this epoch (as we do
// elsewhere) as they may still need to request older values.
return true;
}
- // propose as fast as possible if updating up_thru or pg_temp
- // want to merge OSDMap changes as much as possible
- if ((pending_inc.new_primary_temp.size() == 1
- || pending_inc.new_up_thru.size() == 1)
- && pending_inc.new_state.size() < 2) {
- dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;
-
- utime_t now = ceph_clock_now();
- if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
- && now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
- delay = g_conf->paxos_min_wait;
- last_attempted_minwait_time = now;
- return true;
- }
- }
-
return PaxosService::should_propose(delay);
}
{
op->mark_osdmon_event(__func__);
MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
+
+ uint64_t features = mon->get_quorum_con_features();
+ if (m->get_session() && m->get_session()->con_features)
+ features = m->get_session()->con_features;
+
dout(10) << __func__ << " " << *m << dendl;
- MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
+ MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
epoch_t first = get_first_committed();
epoch_t last = osdmap.get_epoch();
int max = g_conf->osd_map_message_max;
for (epoch_t e = MAX(first, m->get_full_first());
e <= MIN(last, m->get_full_last()) && max > 0;
++e, --max) {
- int r = get_version_full(e, reply->maps[e]);
+ int r = get_version_full(e, features, reply->maps[e]);
assert(r >= 0);
}
for (epoch_t e = MAX(first, m->get_inc_first());
e <= MIN(last, m->get_inc_last()) && max > 0;
++e, --max) {
- int r = get_version(e, reply->incremental_maps[e]);
+ int r = get_version(e, features, reply->incremental_maps[e]);
assert(r >= 0);
}
reply->oldest_map = first;
return false;
didit:
+ mon->no_reply(op);
return true;
}
o->mark_event(__func__);
MOSDFailure *m = o->get_req<MOSDFailure>();
send_latest(o, m->get_epoch());
+ mon->no_reply(o);
}
ls.pop_front();
}
}
-MOSDMap *OSDMonitor::build_latest_full()
+MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
{
- MOSDMap *r = new MOSDMap(mon->monmap->fsid);
- get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
+ MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
+ get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
r->oldest_map = get_first_committed();
r->newest_map = osdmap.get_epoch();
return r;
}
-MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
+MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
- dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
- MOSDMap *m = new MOSDMap(mon->monmap->fsid);
+ dout(10) << "build_incremental [" << from << ".." << to << "] with features " << std::hex << features << dendl;
+ MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
m->oldest_map = get_first_committed();
m->newest_map = osdmap.get_epoch();
for (epoch_t e = to; e >= from && e > 0; e--) {
bufferlist bl;
- int err = get_version(e, bl);
+ int err = get_version(e, features, bl);
if (err == 0) {
assert(bl.length());
// if (get_version(e, bl) > 0) {
} else {
assert(err == -ENOENT);
assert(!bl.length());
- get_version_full(e, bl);
+ get_version_full(e, features, bl);
if (bl.length() > 0) {
//else if (get_version("full", e, bl) > 0) {
dout(20) << "build_incremental full " << e << " "
{
op->mark_osdmon_event(__func__);
dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
- mon->send_reply(op, build_latest_full());
+ mon->send_reply(op, build_latest_full(op->get_session()->con_features));
}
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
<< " to " << session->inst << dendl;
+  // get the features of the peer;
+  // fall back to quorum_con_features if it's an anonymous connection.
+ uint64_t features = session->con_features ? session->con_features :
+ mon->get_quorum_con_features();
+
if (first <= session->osd_epoch) {
dout(10) << __func__ << " " << session->inst << " should already have epoch "
<< session->osd_epoch << dendl;
if (first < get_first_committed()) {
first = get_first_committed();
bufferlist bl;
- int err = get_version_full(first, bl);
+ int err = get_version_full(first, features, bl);
assert(err == 0);
assert(bl.length());
dout(20) << "send_incremental starting with base full "
<< first << " " << bl.length() << " bytes" << dendl;
- MOSDMap *m = new MOSDMap(osdmap.get_fsid());
+ MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
m->oldest_map = get_first_committed();
m->newest_map = osdmap.get_epoch();
m->maps[first] = bl;
}
while (first <= osdmap.get_epoch()) {
- epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
- osdmap.get_epoch());
- MOSDMap *m = build_incremental(first, last);
+ epoch_t last = std::min<epoch_t>(first + g_conf->osd_map_message_max - 1,
+ osdmap.get_epoch());
+ MOSDMap *m = build_incremental(first, last, features);
if (req) {
// send some maps. it may not be all of them, but it will get them
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
- if (inc_osd_cache.lookup(ver, &bl)) {
- return 0;
- }
- int ret = PaxosService::get_version(ver, bl);
- if (!ret) {
- inc_osd_cache.add(ver, bl);
- }
+ return get_version(ver, mon->get_quorum_con_features(), bl);
+}
+
+void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
+{
+ OSDMap::Incremental inc;
+ bufferlist::iterator q = bl.begin();
+ inc.decode(q);
+ // always encode with subset of osdmap's canonical features
+ uint64_t f = features & inc.encode_features;
+ dout(20) << __func__ << " " << inc.epoch << " with features " << f
+ << dendl;
+ bl.clear();
+ if (inc.fullmap.length()) {
+ // embedded full map?
+ OSDMap m;
+ m.decode(inc.fullmap);
+ inc.fullmap.clear();
+ m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
+ }
+ if (inc.crush.length()) {
+ // embedded crush map
+ CrushWrapper c;
+ auto p = inc.crush.begin();
+ c.decode(p);
+ inc.crush.clear();
+ c.encode(inc.crush, f);
+ }
+ inc.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
+{
+ OSDMap m;
+ bufferlist::iterator q = bl.begin();
+ m.decode(q);
+ // always encode with subset of osdmap's canonical features
+ uint64_t f = features & m.get_encoding_features();
+ dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
+ << dendl;
+ bl.clear();
+ m.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
+{
+ uint64_t significant_features = OSDMap::get_significant_features(features);
+ if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version(ver, bl);
+ if (ret < 0) {
return ret;
+ }
+ // NOTE: this check is imprecise; the OSDMap encoding features may
+ // be a subset of the latest mon quorum features, but worst case we
+ // reencode once and then cache the (identical) result under both
+ // feature masks.
+ if (significant_features !=
+ OSDMap::get_significant_features(mon->get_quorum_con_features())) {
+ reencode_incremental_map(bl, features);
+ }
+ inc_osd_cache.add({ver, significant_features}, bl);
+ return 0;
}
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
- if (full_osd_cache.lookup(ver, &bl)) {
- return 0;
- }
- int ret = PaxosService::get_version_full(ver, bl);
- if (!ret) {
- full_osd_cache.add(ver, bl);
- }
+ return get_version_full(ver, mon->get_quorum_con_features(), bl);
+}
+
+int OSDMonitor::get_version_full(version_t ver, uint64_t features,
+ bufferlist& bl)
+{
+ uint64_t significant_features = OSDMap::get_significant_features(features);
+ if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version_full(ver, bl);
+ if (ret < 0) {
return ret;
+ }
+ // NOTE: this check is imprecise; the OSDMap encoding features may
+ // be a subset of the latest mon quorum features, but worst case we
+ // reencode once and then cache the (identical) result under both
+ // feature masks.
+ if (significant_features !=
+ OSDMap::get_significant_features(mon->get_quorum_con_features())) {
+ reencode_full_map(bl, features);
+ }
+ full_osd_cache.add({ver, significant_features}, bl);
+ return 0;
}
epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
if (sub->next >= 1)
send_incremental(sub->next, sub->session, sub->incremental_onetime);
else
- sub->session->con->send_message(build_latest_full());
+ sub->session->con->send_message(build_latest_full(sub->session->con_features));
if (sub->onetime)
mon->session_map.remove_sub(sub);
else
namespace {
enum osd_pool_get_choices {
SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
- PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
+ PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
NODELETE, NOPGCHANGE, NOSIZECHANGE,
WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
{"min_size", MIN_SIZE},
{"crash_replay_interval", CRASH_REPLAY_INTERVAL},
{"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
- {"crush_rule", CRUSH_RULE},
- {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
+ {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
+ {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
{"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
{"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
{"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
};
const choices_set_t ONLY_ERASURE_CHOICES = {
- ERASURE_CODE_PROFILE
+ EC_OVERWRITES, ERASURE_CODE_PROFILE
};
choices_set_t selected_choices;
f->dump_string("crush_rule", stringify(p->get_crush_rule()));
}
break;
+ case EC_OVERWRITES:
+ f->dump_bool("allow_ec_overwrites",
+ p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
+ break;
case HASHPSPOOL:
case NODELETE:
case NOPGCHANGE:
ss << "hit_set_search_last_n: " <<
p->hit_set_search_last_n << "\n";
break;
+ case EC_OVERWRITES:
+ ss << "allow_ec_overwrites: " <<
+ (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
+ "\n";
+ break;
case HASHPSPOOL:
case NODELETE:
case NOPGCHANGE:
return true;
}
-bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
+bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
{
op->mark_osdmon_event(__func__);
+
MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
-
+ MonSession *session = m->get_session();
+ if (!session) {
+ _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+ return true;
+ }
+
+ switch (m->op) {
+ case POOL_OP_CREATE_UNMANAGED_SNAP:
+ case POOL_OP_DELETE_UNMANAGED_SNAP:
+ {
+ const std::string* pool_name = nullptr;
+ const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
+ if (pg_pool != nullptr) {
+ pool_name = &osdmap.get_pool_name(m->pool);
+ }
+
+ if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
+ session->entity_name, session->caps,
+ pool_name)) {
+ dout(0) << "got unmanaged-snap pool op from entity with insufficient "
+ << "privileges. message: " << *m << std::endl
+ << "caps: " << session->caps << dendl;
+ _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+ return true;
+ }
+ }
+ break;
+ default:
+ if (!session->is_capable("osd", MON_CAP_W)) {
+ dout(0) << "got pool op from entity with insufficient privileges. "
+ << "message: " << *m << std::endl
+ << "caps: " << session->caps << dendl;
+ _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
+
+ if (enforce_pool_op_caps(op)) {
+ return true;
+ }
+
if (m->fsid != mon->monmap->fsid) {
dout(0) << __func__ << " drop message on fsid " << m->fsid
<< " != " << mon->monmap->fsid << " for " << *m << dendl;
{
op->mark_osdmon_event(__func__);
MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
- MonSession *session = m->get_session();
- if (!session) {
- _pool_op_reply(op, -EPERM, osdmap.get_epoch());
- return true;
- }
- if (!session->is_capable("osd", MON_CAP_W)) {
- dout(5) << "attempt to create new pool without sufficient auid privileges!"
- << "message: " << *m << std::endl
- << "caps: " << session->caps << dendl;
- _pool_op_reply(op, -EPERM, osdmap.get_epoch());
- return true;
- }
-
int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
if (pool >= 0) {
_pool_op_reply(op, 0, osdmap.get_epoch());
case POOL_OP_DELETE_UNMANAGED_SNAP:
if (!pp.is_removed_snap(m->snapid)) {
+ if (m->snapid > pp.get_snap_seq()) {
+ _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
+ return false;
+ }
pp.remove_unmanaged_snap(m->snapid);
changed = true;
}
const string& poolstr = osdmap.get_pool_name(pool_id);
// If the Pool is in use by CephFS, refuse to delete it
- FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
+ FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
if (pending_fsmap.pool_in_use(pool_id)) {
*ss << "pool '" << poolstr << "' is in use by CephFS";
return -EBUSY;
const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
- const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
+ const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
if (pending_fsmap.pool_in_use(tier_pool_id)) {
*ss << "pool '" << tier_pool_name << "' is in use by CephFS";
*err = -EBUSY;
const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
// Apply CephFS-specific checks
- const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
+ const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
if (pending_fsmap.pool_in_use(base_pool_id)) {
if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
// If the underlying pool is erasure coded and does not allow EC
#include "erasure-code/ErasureCodeInterface.h"
#include "mon/MonOpRequest.h"
+#include <boost/functional/hash.hpp>
+// re-include our assert to clobber the system one; fix dout:
+#include "include/assert.h"
/// information about a particular peer's failure reports for one osd
struct failure_reporter_t {
map<int,double> osd_weight;
- SimpleLRU<version_t, bufferlist> inc_osd_cache;
- SimpleLRU<version_t, bufferlist> full_osd_cache;
+ using osdmap_key_t = std::pair<version_t, uint64_t>;
+ using osdmap_cache_t = SimpleLRU<osdmap_key_t,
+ bufferlist,
+ std::less<osdmap_key_t>,
+ boost::hash<osdmap_key_t>>;
+ osdmap_cache_t inc_osd_cache;
+ osdmap_cache_t full_osd_cache;
bool check_failures(utime_t now);
bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
void force_failure(int target_osd, int by);
- // the time of last msg(MSG_ALIVE and MSG_PGTEMP) proposed without delay
- utime_t last_attempted_minwait_time;
-
bool _have_pending_crush();
CrushWrapper &_get_stable_crush();
void _get_pending_crush(CrushWrapper& newcrush);
bool can_mark_in(int o);
// ...
- MOSDMap *build_latest_full();
- MOSDMap *build_incremental(epoch_t first, epoch_t last);
+ MOSDMap *build_latest_full(uint64_t features);
+ MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
void send_full(MonOpRequestRef op);
void send_incremental(MonOpRequestRef op, epoch_t first);
public:
int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
int _prepare_rename_pool(int64_t pool, string newname);
+ bool enforce_pool_op_caps(MonOpRequestRef op);
bool preprocess_pool_op (MonOpRequestRef op);
bool preprocess_pool_op_create (MonOpRequestRef op);
bool prepare_pool_op (MonOpRequestRef op);
int load_metadata(int osd, map<string, string>& m, ostream *err);
void count_metadata(const string& field, Formatter *f);
+
+ void reencode_incremental_map(bufferlist& bl, uint64_t features);
+ void reencode_full_map(bufferlist& bl, uint64_t features);
public:
void count_metadata(const string& field, map<string,int> *out);
protected:
}
int get_version(version_t ver, bufferlist& bl) override;
+ int get_version(version_t ver, uint64_t feature, bufferlist& bl);
+
+ int get_version_full(version_t ver, uint64_t feature, bufferlist& bl);
int get_version_full(version_t ver, bufferlist& bl) override;
epoch_t blacklist(const entity_addr_t& a, utime_t until);
checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
}
+ // LARGE_OMAP_OBJECTS
+ if (pg_sum.stats.sum.num_large_omap_objects) {
+ list<string> detail;
+ for (auto &pool : pools) {
+ const string& pool_name = osdmap.get_pool_name(pool.first);
+ auto it2 = pg_pool_sum.find(pool.first);
+ if (it2 == pg_pool_sum.end()) {
+ continue;
+ }
+ const pool_stat_t *pstat = &it2->second;
+ if (pstat == nullptr) {
+ continue;
+ }
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ if (sum.num_large_omap_objects) {
+ stringstream ss;
+ ss << sum.num_large_omap_objects << " large objects found in pool "
+ << "'" << pool_name << "'";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
+ auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
+ stringstream tip;
+ tip << "Search the cluster log for 'Large omap object found' for more "
+ << "details.";
+ detail.push_back(tip.str());
+ d.detail.swap(detail);
+ }
+ }
+
// CACHE_POOL_NEAR_FULL
{
list<string> detail;
}
if (!warn_detail.empty()) {
- ostringstream ss;
- ss << warn << " slow requests are blocked > "
- << cct->_conf->mon_osd_warn_op_age << " sec";
- auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
- d.detail.swap(warn_detail);
int left = max;
+ set<int> implicated_osds;
for (auto& p : warn_osd_by_max) {
ostringstream ss;
+ implicated_osds.insert(p.second.begin(), p.second.end());
if (p.second.size() > 1) {
ss << "osds " << p.second
<< " have blocked requests > " << p.first << " sec";
ss << "osd." << *p.second.begin()
<< " has blocked requests > " << p.first << " sec";
}
- d.detail.push_back(ss.str());
+ warn_detail.push_back(ss.str());
if (--left == 0) {
break;
}
}
+ ostringstream ss;
+ ss << warn << " slow requests are blocked > "
+ << cct->_conf->mon_osd_warn_op_age << " sec. Implicated osds "
+ << implicated_osds;
+ auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
+ d.detail.swap(warn_detail);
}
if (!error_detail.empty()) {
- ostringstream ss;
- ss << error << " stuck requests are blocked > "
- << err_age << " sec";
- auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
- d.detail.swap(error_detail);
int left = max;
+ set<int> implicated_osds;
for (auto& p : error_osd_by_max) {
ostringstream ss;
+ implicated_osds.insert(p.second.begin(), p.second.end());
if (p.second.size() > 1) {
ss << "osds " << p.second
<< " have stuck requests > " << p.first << " sec";
ss << "osd." << *p.second.begin()
<< " has stuck requests > " << p.first << " sec";
}
- d.detail.push_back(ss.str());
+ error_detail.push_back(ss.str());
if (--left == 0) {
break;
}
}
+ ostringstream ss;
+ ss << error << " stuck requests are blocked > "
+ << err_age << " sec. Implicated osds " << implicated_osds;
+ auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
+ d.detail.swap(error_detail);
}
}
* @return the first committed version
*/
version_t get_first_committed() { return first_committed; }
- /**
- * Get the last commit time
- *
- * @returns Our last commit time
- */
- utime_t get_last_commit_time() const{
- return last_commit_time;
- }
/**
* Check if a given version is readable.
*
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAXOS_FSMAP_H
+#define CEPH_PAXOS_FSMAP_H
+
+#include "mds/FSMap.h"
+#include "mds/MDSMap.h"
+
+#include "include/assert.h"
+
+class PaxosFSMap {
+public:
+ virtual ~PaxosFSMap() {}
+
+ const FSMap &get_pending_fsmap() const { assert(is_leader()); return pending_fsmap; }
+ const FSMap &get_fsmap() const { return fsmap; }
+
+ virtual bool is_leader() const = 0;
+
+protected:
+ FSMap &get_pending_fsmap_writeable() { assert(is_leader()); return pending_fsmap; }
+
+  /* get_working_fsmap returns the "relevant" version of the fsmap (see MDSMonitor.cc history),
+   * used by helper methods of MDSMonitor.cc depending on whether we are the leader.
+ *
+ * This is technically evil and will be removed in the future.
+ *
+ * See discussion: https://github.com/ceph/ceph/pull/21458#discussion_r182081366
+ */
+ const FSMap &get_working_fsmap() const { return is_leader() ? pending_fsmap : fsmap; }
+
+ FSMap &create_pending() {
+ assert(is_leader());
+ pending_fsmap = fsmap;
+ pending_fsmap.epoch++;
+ return pending_fsmap;
+ }
+
+ void decode(bufferlist &bl) {
+ fsmap.decode(bl);
+ pending_fsmap = FSMap(); /* nuke it to catch invalid access */
+ }
+
+private:
+ /* Keep these PRIVATE to prevent unprotected manipulation. */
+ FSMap fsmap; /* the current epoch */
+ FSMap pending_fsmap; /* the next epoch */
+};
+
+
+#endif
#define CEPH_DISPATCHER_H
#include "include/assert.h"
+#include <memory>
#include "include/buffer_fwd.h"
#include "include/assert.h"
class AuthAuthorizer;
class CryptoKey;
class CephContext;
+class AuthAuthorizerChallenge;
class Dispatcher {
public:
ceph::bufferlist& authorizer,
ceph::bufferlist& authorizer_reply,
bool& isvalid,
- CryptoKey& session_key) { return false; }
+ CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
+ return false;
+ }
/**
* @} //Authentication
*/
*/
bool ms_deliver_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
for (list<Dispatcher*>::iterator p = dispatchers.begin();
p != dispatchers.end();
++p) {
- if ((*p)->ms_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply, isvalid, session_key))
+ if ((*p)->ms_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply,
+ isvalid, session_key, challenge))
return true;
}
return false;
ldout(async_msgr->cct, 20) << __func__ << " connect peer addr for me is " << peer_addr_for_me << dendl;
lock.unlock();
async_msgr->learned_addr(peer_addr_for_me);
- if (async_msgr->cct->_conf->ms_inject_internal_delays) {
+ if (async_msgr->cct->_conf->ms_inject_internal_delays
+ && async_msgr->cct->_conf->ms_inject_socket_failures) {
if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
ldout(msgr->cct, 10) << __func__ << " sleep for "
<< async_msgr->cct->_conf->ms_inject_internal_delays << dendl;
case STATE_CONNECTING_SEND_CONNECT_MSG:
{
- if (!got_bad_auth) {
- delete authorizer;
+ if (!authorizer) {
authorizer = async_msgr->get_authorizer(peer_type, false);
}
bufferlist bl;
}
authorizer_reply.append(state_buffer, connect_reply.authorizer_len);
- bufferlist::iterator iter = authorizer_reply.begin();
+
+ if (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ ldout(async_msgr->cct,10) << __func__ << " connect got auth challenge" << dendl;
+ authorizer->add_challenge(async_msgr->cct, authorizer_reply);
+ state = STATE_CONNECTING_SEND_CONNECT_MSG;
+ break;
+ }
+
+ auto iter = authorizer_reply.begin();
if (authorizer && !authorizer->verify_reply(iter)) {
ldout(async_msgr->cct, 0) << __func__ << " failed verifying authorize reply" << dendl;
goto fail;
// require signatures for cephx?
if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) {
if (peer_type == CEPH_ENTITY_TYPE_OSD ||
- peer_type == CEPH_ENTITY_TYPE_MDS) {
+ peer_type == CEPH_ENTITY_TYPE_MDS ||
+ peer_type == CEPH_ENTITY_TYPE_MGR) {
if (async_msgr->cct->_conf->cephx_require_signatures ||
async_msgr->cct->_conf->cephx_cluster_require_signatures) {
ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for cluster" << dendl;
policy.features_required |= CEPH_FEATURE_MSG_AUTH;
}
+ if (async_msgr->cct->_conf->cephx_require_version >= 2 ||
+ async_msgr->cct->_conf->cephx_cluster_require_version >= 2) {
+ ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring cephx v2 feature bit for cluster" << dendl;
+ policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
} else {
if (async_msgr->cct->_conf->cephx_require_signatures ||
async_msgr->cct->_conf->cephx_service_require_signatures) {
ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for service" << dendl;
policy.features_required |= CEPH_FEATURE_MSG_AUTH;
}
+ if (async_msgr->cct->_conf->cephx_require_version >= 2 ||
+ async_msgr->cct->_conf->cephx_service_require_version >= 2) {
+ ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring cephx v2 feature bit for service" << dendl;
+ policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
}
}
+
uint64_t feat_missing = policy.features_required & ~(uint64_t)connect.features;
if (feat_missing) {
ldout(async_msgr->cct, 1) << __func__ << " peer missing required features "
lock.unlock();
bool authorizer_valid;
- if (!async_msgr->verify_authorizer(this, peer_type, connect.authorizer_protocol, authorizer_bl,
- authorizer_reply, authorizer_valid, session_key) || !authorizer_valid) {
+ bool need_challenge = HAVE_FEATURE(connect.features, CEPHX_V2);
+ bool had_challenge = (bool)authorizer_challenge;
+ if (!async_msgr->verify_authorizer(
+ this, peer_type, connect.authorizer_protocol, authorizer_bl,
+ authorizer_reply, authorizer_valid, session_key,
+ need_challenge ? &authorizer_challenge : nullptr) ||
+ !authorizer_valid) {
lock.lock();
- ldout(async_msgr->cct,0) << __func__ << ": got bad authorizer" << dendl;
+ char tag;
+ if (need_challenge && !had_challenge && authorizer_challenge) {
+ ldout(async_msgr->cct,0) << __func__ << ": challenging authorizer"
+ << dendl;
+ assert(authorizer_reply.length());
+ tag = CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER;
+ } else {
+ ldout(async_msgr->cct,0) << __func__ << ": got bad authorizer" << dendl;
+ tag = CEPH_MSGR_TAG_BADAUTHORIZER;
+ }
session_security.reset();
- return _reply_accept(CEPH_MSGR_TAG_BADAUTHORIZER, connect, reply, authorizer_reply);
+ return _reply_accept(tag, connect, reply, authorizer_reply);
}
// We've verified the authorizer for this AsyncConnection, so set up the session security structure. PLR
// there shouldn't exist any buffer
assert(recv_start == recv_end);
+ existing->authorizer_challenge.reset();
+
auto deactivate_existing = std::bind(
[existing, new_worker, new_center, connect, reply, authorizer_reply](ConnectedSocket &cs) mutable {
// we need to delete time event in original thread
Worker *worker;
EventCenter *center;
ceph::shared_ptr<AuthSessionHandler> session_security;
+ std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge; // accept side
public:
// used by eventcallback
* This wraps ms_deliver_verify_authorizer; we use it for AsyncConnection.
*/
bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth, bufferlist& auth_reply,
- bool& isvalid, CryptoKey& session_key) {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
return ms_deliver_verify_authorizer(con, peer_type, protocol, auth,
- auth_reply, isvalid, session_key);
+ auth_reply, isvalid, session_key, challenge);
}
/**
* Increment the global sequence for this AsyncMessenger and return it.
// used for reading in the remote acked seq on connect
uint64_t newly_acked_seq = 0;
+ bool need_challenge = false;
+ bool had_challenge = false;
+ std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge;
+
recv_reset();
set_socket_options();
// require signatures for cephx?
if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) {
if (peer_type == CEPH_ENTITY_TYPE_OSD ||
- peer_type == CEPH_ENTITY_TYPE_MDS) {
+ peer_type == CEPH_ENTITY_TYPE_MDS ||
+ peer_type == CEPH_ENTITY_TYPE_MGR) {
if (msgr->cct->_conf->cephx_require_signatures ||
msgr->cct->_conf->cephx_cluster_require_signatures) {
ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for cluster" << dendl;
policy.features_required |= CEPH_FEATURE_MSG_AUTH;
}
+ if (msgr->cct->_conf->cephx_require_version >= 2 ||
+ msgr->cct->_conf->cephx_cluster_require_version >= 2) {
+ ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for cluster" << dendl;
+ policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
} else {
if (msgr->cct->_conf->cephx_require_signatures ||
msgr->cct->_conf->cephx_service_require_signatures) {
ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for service" << dendl;
policy.features_required |= CEPH_FEATURE_MSG_AUTH;
}
+ if (msgr->cct->_conf->cephx_require_version >= 2 ||
+ msgr->cct->_conf->cephx_service_require_version >= 2) {
+ ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for cluster" << dendl;
+ policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
}
}
pipe_lock.Unlock();
- if (!msgr->verify_authorizer(connection_state.get(), peer_type, connect.authorizer_protocol, authorizer,
- authorizer_reply, authorizer_valid, session_key) ||
+ need_challenge = HAVE_FEATURE(connect.features, CEPHX_V2);
+ had_challenge = (bool)authorizer_challenge;
+ authorizer_reply.clear();
+ if (!msgr->verify_authorizer(
+ connection_state.get(), peer_type, connect.authorizer_protocol, authorizer,
+ authorizer_reply, authorizer_valid, session_key,
+ need_challenge ? &authorizer_challenge : nullptr) ||
!authorizer_valid) {
- ldout(msgr->cct,0) << "accept: got bad authorizer" << dendl;
pipe_lock.Lock();
if (state != STATE_ACCEPTING)
goto shutting_down_msgr_unlocked;
- reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER;
+ if (!had_challenge && need_challenge && authorizer_challenge) {
+ ldout(msgr->cct,0) << "accept: challenging authorizer "
+ << authorizer_reply.length()
+ << " bytes" << dendl;
+ assert(authorizer_reply.length());
+ reply.tag = CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER;
+ } else {
+ ldout(msgr->cct,0) << "accept: got bad authorizer" << dendl;
+ reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER;
+ }
session_security.reset();
goto reply;
}
while (1) {
- delete authorizer;
- authorizer = msgr->get_authorizer(peer_type, false);
+ if (!authorizer) {
+ authorizer = msgr->get_authorizer(peer_type, false);
+ }
bufferlist authorizer_reply;
ceph_msg_connect connect;
authorizer_reply.push_back(bp);
}
+ if (reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ authorizer->add_challenge(msgr->cct, authorizer_reply);
+ ldout(msgr->cct,10) << " got authorizer challenge, " << authorizer_reply.length()
+ << " bytes" << dendl;
+ continue;
+ }
+
if (authorizer) {
bufferlist::iterator iter = authorizer_reply.begin();
if (!authorizer->verify_reply(iter)) {
bool SimpleMessenger::verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid,CryptoKey& session_key)
+ bool& isvalid,CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
- return ms_deliver_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply, isvalid,session_key);
+ return ms_deliver_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply,
+ isvalid, session_key,
+ challenge);
}
ConnectionRef SimpleMessenger::get_connection(const entity_inst_t& dest)
/**
* This wraps ms_deliver_verify_authorizer; we use it for Pipe.
*/
- bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth, bufferlist& auth_reply,
- bool& isvalid,CryptoKey& session_key);
+ bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth,
+ bufferlist& auth_reply,
+ bool& isvalid,CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge);
/**
* Increment the global sequence for this SimpleMessenger and return it.
* This is for the connect protocol, although it doesn't hurt if somebody
virtual void inject_mdata_error(const ghobject_t &oid) {}
virtual void compact() {}
+ virtual bool has_builtin_csum() const {
+ return false;
+ }
};
WRITE_CLASS_ENCODER(ObjectStore::Transaction)
WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData)
<< " removing self from set " << get_parent()
<< dendl;
if (get_parent()) {
- if (get_parent()->try_remove(this)) {
- delete this;
- } else {
- ldout(coll->store->cct, 20)
- << __func__ << " " << this << " lost race to remove myself from set"
- << dendl;
- }
- } else {
- delete this;
+ get_parent()->remove(this);
}
+ delete this;
}
}
string bfn;
struct stat st;
- if (read_meta("path_block.db", &bfn) < 0) {
- bfn = path + "/block.db";
- }
+ bfn = path + "/block.db";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (r < 0) {
}
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
bluefs_single_shared_device = false;
- } else if (::lstat(bfn.c_str(), &st) == -1) {
- bluefs_shared_bdev = BlueFS::BDEV_DB;
} else {
- //symlink exist is bug
- derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
r = -errno;
- goto free_bluefs;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ bluefs_shared_bdev = BlueFS::BDEV_DB;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
}
// shared device
- if (read_meta("path_block", &bfn) < 0) {
- bfn = path + "/block";
- }
+ bfn = path + "/block";
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
bluefs_extents.insert(start, initial);
}
- if (read_meta("path_block.wal", &bfn) < 0) {
- bfn = path + "/block.wal";
- }
+ bfn = path + "/block.wal";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (r < 0) {
}
cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
bluefs_single_shared_device = false;
- } else if (::lstat(bfn.c_str(), &st) == -1) {
- cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
} else {
- //symlink exist is bug
- derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
r = -errno;
- goto free_bluefs;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
}
if (create) {
int BlueStore::_open_collections(int *errors)
{
+ dout(10) << __func__ << dendl;
assert(coll_map.empty());
KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
for (it->upper_bound(string());
<< pretty_binary_string(it->key()) << dendl;
return -EIO;
}
- dout(20) << __func__ << " opened " << cid << " " << c << dendl;
+ dout(20) << __func__ << " opened " << cid << " " << c
+ << " " << c->cnode << dendl;
coll_map[cid] = c;
} else {
derr << __func__ << " unrecognized collection " << it->key() << dendl;
}
if (cct->_conf->bluestore_block_preallocate_file) {
-#ifdef HAVE_POSIX_FALLOCATE
- r = ::posix_fallocate(fd, 0, size);
- if (r) {
+ r = ::ceph_posix_fallocate(fd, 0, size);
+ if (r > 0) {
derr << __func__ << " failed to prefallocate " << name << " file to "
<< size << ": " << cpp_strerror(r) << dendl;
VOID_TEMP_FAILURE_RETRY(::close(fd));
return -r;
}
-#else
- char data[1024*128];
- for (uint64_t off = 0; off < size; off += sizeof(data)) {
- if (off + sizeof(data) > size)
- r = ::write(fd, data, size - off);
- else
- r = ::write(fd, data, sizeof(data));
- if (r < 0) {
- r = -errno;
- derr << __func__ << " failed to prefallocate w/ write " << name << " file to "
- << size << ": " << cpp_strerror(r) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return r;
- }
- }
-#endif
}
dout(1) << __func__ << " resized " << name << " file to "
<< pretty_si_t(size) << "B" << dendl;
if (r < 0)
goto out_close_fsid;
- {
- string wal_path = cct->_conf->get_val<string>("bluestore_block_wal_path");
- if (wal_path.size()) {
- write_meta("path_block.wal", wal_path);
- }
- string db_path = cct->_conf->get_val<string>("bluestore_block_db_path");
- if (db_path.size()) {
- write_meta("path_block.db", db_path);
- }
- }
-
// choose min_alloc_size
if (cct->_conf->bluestore_min_alloc_size) {
min_alloc_size = cct->_conf->bluestore_min_alloc_size;
continue;
}
c->cid.is_pg(&pgid);
- dout(20) << __func__ << " collection " << c->cid << dendl;
+ dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
+ << dendl;
}
if (!expecting_shards.empty()) {
}
out:
- if (r == 0 && _debug_data_eio(oid)) {
+ if (r >= 0 && _debug_data_eio(oid)) {
r = -EIO;
derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
} else if (cct->_conf->bluestore_debug_random_read_err &&
SharedBlobRef lookup(uint64_t sbid) {
std::lock_guard<std::mutex> l(lock);
auto p = sb_map.find(sbid);
- if (p == sb_map.end()) {
+ if (p == sb_map.end() ||
+ p->second->nref == 0) {
return nullptr;
}
return p->second;
sb->coll = coll;
}
- bool try_remove(SharedBlob *sb) {
- std::lock_guard<std::mutex> l(lock);
- if (sb->nref == 0) {
- assert(sb->get_parent() == this);
- sb_map.erase(sb->get_sbid());
- return true;
- }
- return false;
- }
-
void remove(SharedBlob *sb) {
std::lock_guard<std::mutex> l(lock);
assert(sb->get_parent() == this);
- sb_map.erase(sb->get_sbid());
+ // only remove if it still points to us
+ auto p = sb_map.find(sb->get_sbid());
+ if (p != sb_map.end() &&
+ p->second == sb) {
+ sb_map.erase(p);
+ }
}
bool empty() {
assert(db);
db->compact();
}
-
+ bool has_builtin_csum() const override {
+ return true;
+ }
+
private:
bool _debug_data_eio(const ghobject_t& o) {
if (!cct->_conf->bluestore_debug_inject_read_err) {
}
}
+static bool is_expected_ioerr(const int r)
+{
+ // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
+ return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
+ r == -ENOLINK || r == -EREMOTEIO || r == -EBADE ||
+ r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
+ r == -EAGAIN || r == -EREMCHG || r == -EIO);
+}
+
void KernelDevice::_aio_thread()
{
dout(10) << __func__ << " start" << dendl;
long r = aio[i]->get_return_value();
if (r < 0) {
- derr << __func__ << " got " << cpp_strerror(r) << dendl;
- if (ioc->allow_eio && r == -EIO) {
- ioc->set_return_value(r);
+ derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")"
+ << dendl;
+ if (ioc->allow_eio && is_expected_ioerr(r)) {
+ derr << __func__ << " translating the error to EIO for upper layer"
+ << dendl;
+ ioc->set_return_value(-EIO);
} else {
- assert(0 == "got unexpected error from io_getevents");
+ assert(0 == "got unexpected error from aio_t::get_return_value. "
+ "This may suggest HW issue. Please check your dmesg!");
}
} else if (aio[i]->length != (uint64_t)r) {
derr << "aio to " << aio[i]->offset << "~" << aio[i]->length
ldout(cct, 30) << __func__ << " demoting1 0x" << std::hex << off << "~" << len
<< std::dec << " to bin " << newbin << dendl;
_insert_free(off, len);
- return false;
+ return true;
}
- return true;
+ return false;
});
++it;
}
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+#include <algorithm>
#include "aio.h"
#if defined(HAVE_LIBAIO)
}
int done = 0;
while (left > 0) {
- int r = io_submit(ctx, left, piocb + done);
+ int r = io_submit(ctx, std::min(left, max_iodepth), piocb + done);
if (r < 0) {
if (r == -EAGAIN && attempts-- > 0) {
usleep(delay);
assert(r > 0);
done += r;
left -= r;
+ attempts = 16;
+ delay = 125;
}
return done;
}
block_size(4096) { }
uint64_t block_mask() const {
- return ~(block_size - 1);
+ return ~((uint64_t)block_size - 1);
}
void encode(bufferlist& bl) const;
o.push_back(new bluestore_cnode_t(123));
}
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l)
+{
+ return out << "cnode(bits " << l.bits << ")";
+}
+
// bluestore_extent_ref_map_t
void bluestore_extent_ref_map_t::_check() const
};
WRITE_CLASS_DENC(bluestore_cnode_t)
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l);
+
class AllocExtent;
typedef mempool::bluestore_alloc::vector<AllocExtent> AllocExtentVector;
class AllocExtent {
<< newsize << " bytes: " << cpp_strerror(err) << dendl;
return -err;
}
-#ifdef HAVE_POSIX_FALLOCATE
- ret = ::posix_fallocate(fd, 0, newsize);
+ ret = ceph_posix_fallocate(fd, 0, newsize);
if (ret) {
derr << "FileJournal::_open_file : unable to preallocation journal to "
<< newsize << " bytes: " << cpp_strerror(ret) << dendl;
return -ret;
}
max_size = newsize;
-#elif defined(__APPLE__)
- fstore_t store;
- store.fst_flags = F_ALLOCATECONTIG;
- store.fst_posmode = F_PEOFPOSMODE;
- store.fst_offset = 0;
- store.fst_length = newsize;
-
- ret = ::fcntl(fd, F_PREALLOCATE, &store);
- if (ret == -1) {
- ret = -errno;
- derr << "FileJournal::_open_file : unable to preallocation journal to "
- << newsize << " bytes: " << cpp_strerror(ret) << dendl;
- return ret;
- }
- max_size = newsize;
-#else
-# error "Journal pre-allocation not supported on platform."
-#endif
}
else {
max_size = oldsize;
plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied");
plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue");
plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied");
- plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency");
+ plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency",
+ NULL, PerfCountersBuilder::PRIO_USEFUL);
plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs");
plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written");
plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits");
plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit");
plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full");
- plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg", "Store operation queue latency");
+ plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg",
+ "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL);
plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs");
logger = plb.create_perf_counters();
const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
_cid : _cid.get_temp();
tracepoint(objectstore, omap_clear_enter, osr_name);
- r = _omap_clear(cid, oid, spos);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_clear(cid, oid, spos);
tracepoint(objectstore, omap_clear_exit, r);
}
break;
map<string, bufferlist> aset;
i.decode_attrset(aset);
tracepoint(objectstore, omap_setkeys_enter, osr_name);
- r = _omap_setkeys(cid, oid, aset, spos);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_setkeys(cid, oid, aset, spos);
tracepoint(objectstore, omap_setkeys_exit, r);
}
break;
set<string> keys;
i.decode_keyset(keys);
tracepoint(objectstore, omap_rmkeys_enter, osr_name);
- r = _omap_rmkeys(cid, oid, keys, spos);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_rmkeys(cid, oid, keys, spos);
tracepoint(objectstore, omap_rmkeys_exit, r);
}
break;
first = i.decode_string();
last = i.decode_string();
tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
- r = _omap_rmkeyrange(cid, oid, first, last, spos);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_rmkeyrange(cid, oid, first, last, spos);
tracepoint(objectstore, omap_rmkeyrange_exit, r);
}
break;
bufferlist bl;
i.decode_bl(bl);
tracepoint(objectstore, omap_setheader_enter, osr_name);
- r = _omap_setheader(cid, oid, bl, spos);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_setheader(cid, oid, bl, spos);
tracepoint(objectstore, omap_setheader_exit, r);
}
break;
object_map->compact();
}
+ bool has_builtin_csum() const override {
+ return false;
+ }
+
void debug_obj_on_delete(const ghobject_t &oid);
bool debug_data_eio(const ghobject_t &oid);
bool debug_mdata_eio(const ghobject_t &oid);
struct RecoveryMessages {
map<hobject_t,
ECBackend::read_request_t> reads;
+ map<hobject_t, set<int>> want_to_read;
void read(
ECBackend *ec,
const hobject_t &hoid, uint64_t off, uint64_t len,
+ set<int> &&_want_to_read,
const set<pg_shard_t> &need,
bool attrs) {
list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
to_read.push_back(boost::make_tuple(off, len, 0));
assert(!reads.count(hoid));
+ want_to_read.insert(make_pair(hoid, std::move(_want_to_read)));
reads.insert(
make_pair(
hoid,
return;
start_read_op(
priority,
+ m.want_to_read,
m.reads,
OpRequestRef(),
false, true);
op.hoid,
op.recovery_progress.data_recovered_to,
amount,
+ std::move(want),
to_read,
op.recovery_progress.first && !op.obc);
op.extent_requested = make_pair(
// not conflict with ECSubWrite's operator<<.
MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
_op->get_nonconst_req());
+ parent->maybe_preempt_replica_scrub(op->op.soid);
handle_sub_write(op->op.from, _op, op->op, _op->pg_trace);
return true;
}
have.insert(j->first.shard);
dout(20) << __func__ << " have shard=" << j->first.shard << dendl;
}
- set<int> want_to_read, dummy_minimum;
- get_want_to_read_shards(&want_to_read);
+ set<int> dummy_minimum;
int err;
- if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) {
+ if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) {
dout(20) << __func__ << " minimum_to_decode failed" << dendl;
if (rop.in_progress.empty()) {
// If we don't have enough copies and we haven't sent reads for all shards
void ECBackend::get_all_avail_shards(
const hobject_t &hoid,
+ const set<pg_shard_t> &error_shards,
set<int> &have,
map<shard_id_t, pg_shard_t> &shards,
bool for_recovery)
++i) {
dout(10) << __func__ << ": checking acting " << *i << dendl;
const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
if (!missing.is_missing(hoid)) {
assert(!have.count(i->shard));
have.insert(i->shard);
get_parent()->get_backfill_shards().begin();
i != get_parent()->get_backfill_shards().end();
++i) {
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
if (have.count(i->shard)) {
assert(shards.count(i->shard));
continue;
if (m) {
assert(!(*m).is_missing(hoid));
}
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
have.insert(i->shard);
shards.insert(make_pair(i->shard, *i));
}
set<int> have;
map<shard_id_t, pg_shard_t> shards;
+ set<pg_shard_t> error_shards;
- get_all_avail_shards(hoid, have, shards, for_recovery);
+ get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
set<int> need;
int r = ec_impl->minimum_to_decode(want, have, &need);
int ECBackend::get_remaining_shards(
const hobject_t &hoid,
const set<int> &avail,
+ const set<int> &want,
+ const read_result_t &result,
set<pg_shard_t> *to_read,
bool for_recovery)
{
set<int> have;
map<shard_id_t, pg_shard_t> shards;
+ set<pg_shard_t> error_shards;
+ for (auto &p : result.errors) {
+ error_shards.insert(p.first);
+ }
+
+ get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+ set<int> need;
+ int r = ec_impl->minimum_to_decode(want, have, &need);
+ if (r < 0) {
+ dout(0) << __func__ << " not enough shards left to try for " << hoid
+ << " read result was " << result << dendl;
+ return -EIO;
+ }
- get_all_avail_shards(hoid, have, shards, for_recovery);
+ set<int> shards_left;
+ for (auto p : need) {
+ if (avail.find(p) == avail.end()) {
+ shards_left.insert(p);
+ }
+ }
- for (set<int>::iterator i = have.begin();
- i != have.end();
+ for (set<int>::iterator i = shards_left.begin();
+ i != shards_left.end();
++i) {
assert(shards.count(shard_id_t(*i)));
- if (avail.find(*i) == avail.end())
- to_read->insert(shards[shard_id_t(*i)]);
+ assert(avail.find(*i) == avail.end());
+ to_read->insert(shards[shard_id_t(*i)]);
}
return 0;
}
void ECBackend::start_read_op(
int priority,
+ map<hobject_t, set<int>> &want_to_read,
map<hobject_t, read_request_t> &to_read,
OpRequestRef _op,
bool do_redundant_reads,
do_redundant_reads,
for_recovery,
_op,
+ std::move(want_to_read),
std::move(to_read))).first->second;
dout(10) << __func__ << ": starting " << op << dendl;
if (_op) {
return;
}
+ map<hobject_t, set<int>> obj_want_to_read;
set<int> want_to_read;
get_want_to_read_shards(&want_to_read);
shards,
false,
c)));
+ obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
}
start_read_op(
CEPH_MSG_PRIO_DEFAULT,
+ obj_want_to_read,
for_read_op,
OpRequestRef(),
fast_read, false);
already_read.insert(i->shard);
dout(10) << __func__ << " have/error shards=" << already_read << dendl;
set<pg_shard_t> shards;
- int r = get_remaining_shards(hoid, already_read, &shards, rop.for_recovery);
+ int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
+ rop.complete[hoid], &shards, rop.for_recovery);
if (r)
return r;
- if (shards.empty())
- return -EIO;
-
- dout(10) << __func__ << " Read remaining shards " << shards << dendl;
- // TODOSAM: this doesn't seem right
list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets =
rop.to_read.find(hoid)->second.to_read;
GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
rop.to_read.find(hoid)->second.cb;
- map<hobject_t, read_request_t> for_read_op;
- for_read_op.insert(
- make_pair(
+ rop.to_read.erase(hoid);
+ rop.to_read.insert(make_pair(
hoid,
read_request_t(
offsets,
shards,
false,
c)));
-
- rop.to_read.swap(for_read_op);
do_read_op(rop);
return 0;
}
old_size));
}
-void ECBackend::be_deep_scrub(
+int ECBackend::be_deep_scrub(
const hobject_t &poid,
- uint32_t seed,
- ScrubMap::object &o,
- ThreadPool::TPHandle &handle) {
- bufferhash h(-1); // we always used -1
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o)
+{
+ dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
int r;
+
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ utime_t sleeptime;
+ sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+ if (sleeptime != utime_t()) {
+ lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+ sleeptime.sleep();
+ }
+
+ if (pos.data_pos == 0) {
+ pos.data_hash = bufferhash(-1);
+ }
+
uint64_t stride = cct->_conf->osd_deep_scrub_stride;
if (stride % sinfo.get_chunk_size())
stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size());
- uint64_t pos = 0;
- uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
-
- while (true) {
- bufferlist bl;
- handle.reset_tp_timeout();
- r = store->read(
- ch,
- ghobject_t(
- poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
- pos,
- stride, bl,
- fadvise_flags);
- if (r < 0)
- break;
- if (bl.length() % sinfo.get_chunk_size()) {
- r = -EIO;
- break;
- }
- pos += r;
- h << bl;
- if ((unsigned)r < stride)
- break;
+ bufferlist bl;
+ r = store->read(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ pos.data_pos,
+ stride, bl,
+ fadvise_flags);
+ if (r < 0) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
}
-
- if (r == -EIO) {
- dout(0) << "_scan_list " << poid << " got "
- << r << " on read, read_error" << dendl;
+ if (bl.length() % sinfo.get_chunk_size()) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, not chunk size " << sinfo.get_chunk_size() << " aligned"
+ << dendl;
o.read_error = true;
- return;
+ return 0;
+ }
+ if (r > 0) {
+ pos.data_hash << bl;
+ }
+ pos.data_pos += r;
+ if (r == (int)stride) {
+ return -EINPROGRESS;
}
ECUtil::HashInfoRef hinfo = get_hash_info(poid, false, &o.attrs);
dout(0) << "_scan_list " << poid << " could not retrieve hash info" << dendl;
o.read_error = true;
o.digest_present = false;
- return;
+ return 0;
} else {
if (!get_parent()->get_pool().allows_ecoverwrites()) {
assert(hinfo->has_chunk_hash());
- if (hinfo->get_total_chunk_size() != pos) {
- dout(0) << "_scan_list " << poid << " got incorrect size on read" << dendl;
+ if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
+ dout(0) << "_scan_list " << poid << " got incorrect size on read 0x"
+ << std::hex << pos
+ << " expected 0x" << hinfo->get_total_chunk_size() << std::dec
+ << dendl;
o.ec_size_mismatch = true;
- return;
+ return 0;
}
- if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != h.digest()) {
- dout(0) << "_scan_list " << poid << " got incorrect hash on read" << dendl;
+ if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
+ pos.data_hash.digest()) {
+ dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x"
+ << std::hex << pos.data_hash.digest() << " != expected 0x"
+ << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
+ << std::dec << dendl;
o.ec_hash_mismatch = true;
- return;
+ return 0;
}
/* We checked above that we match our own stored hash. We cannot
}
}
- o.omap_digest = seed;
+ o.omap_digest = -1;
o.omap_digest_present = true;
+ return 0;
}
RecoveryMessages *m);
void get_all_avail_shards(
const hobject_t &hoid,
+ const set<pg_shard_t> &error_shards,
set<int> &have,
map<shard_id_t, pg_shard_t> &shards,
bool for_recovery);
ZTracer::Trace trace;
+ map<hobject_t, set<int>> want_to_read;
map<hobject_t, read_request_t> to_read;
map<hobject_t, read_result_t> complete;
bool do_redundant_reads,
bool for_recovery,
OpRequestRef op,
+ map<hobject_t, set<int>> &&_want_to_read,
map<hobject_t, read_request_t> &&_to_read)
: priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads),
- for_recovery(for_recovery), to_read(std::move(_to_read)) {
+ for_recovery(for_recovery), want_to_read(std::move(_want_to_read)),
+ to_read(std::move(_to_read)) {
for (auto &&hpair: to_read) {
auto &returned = complete[hpair.first].returned;
for (auto &&extent: hpair.second.to_read) {
map<pg_shard_t, set<ceph_tid_t> > shard_to_read_map;
void start_read_op(
int priority,
+ map<hobject_t, set<int>> &want_to_read,
map<hobject_t, read_request_t> &to_read,
OpRequestRef op,
bool do_redundant_reads, bool for_recovery);
int get_remaining_shards(
const hobject_t &hoid,
const set<int> &avail,
+ const set<int> &want,
+ const read_result_t &result,
set<pg_shard_t> *to_read,
bool for_recovery);
bool scrub_supported() override { return true; }
bool auto_repair_supported() const override { return true; }
- void be_deep_scrub(
- const hobject_t &obj,
- uint32_t seed,
- ScrubMap::object &o,
- ThreadPool::TPHandle &handle) override;
+ int be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) override;
uint64_t be_get_ondisk_size(uint64_t logical_size) override {
return sinfo.logical_to_next_chunk_offset(logical_size);
}
if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
- << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+ << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
+ << ")" << dendl;
can_inc = true;
} else {
- dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+ dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
+ << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
}
return can_inc;
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
OSDSuperblock& sblock)
{
- MOSDMap *m = new MOSDMap(monc->get_fsid());
+ MOSDMap *m = new MOSDMap(monc->get_fsid(),
+ osdmap->get_encoding_features());
m->oldest_map = max_oldest_map;
m->newest_map = sblock.newest_map;
OSDSuperblock sblock(get_superblock());
if (since < sblock.oldest_map) {
// just send latest full map
- MOSDMap *m = new MOSDMap(monc->get_fsid());
+ MOSDMap *m = new MOSDMap(monc->get_fsid(),
+ osdmap->get_encoding_features());
m->oldest_map = max_oldest_map;
m->newest_map = sblock.newest_map;
get_map_bl(to, m->maps[to]);
update_log_config();
peering_tp.start();
+
+ service.init();
+ service.publish_map(osdmap);
+ service.publish_superblock(superblock);
+ service.max_oldest_map = superblock.oldest_map;
+
osd_op_tp.start();
disk_tp.start();
command_tp.start();
tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
}
- service.init();
- service.publish_map(osdmap);
- service.publish_superblock(superblock);
- service.max_oldest_map = superblock.oldest_map;
-
osd_lock.Unlock();
r = monc->authenticate();
}
-bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
- int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key)
+bool OSD::ms_verify_authorizer(
+ Connection *con, int peer_type,
+ int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
AuthAuthorizeHandler *authorize_handler = 0;
switch (peer_type) {
isvalid = authorize_handler->verify_authorizer(
cct, keys,
authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
- &auid);
+ &auid, challenge);
} else {
dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
isvalid = false;
struct tm bdt;
time_t tt = now.sec();
localtime_r(&tt, &bdt);
+
+ bool day_permit = false;
+ if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
+ if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+ day_permit = true;
+ }
+ } else {
+ if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+ day_permit = true;
+ }
+ }
+
+ if (!day_permit) {
+ dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
+ << " - " << cct->_conf->osd_scrub_end_week_day
+ << " now " << bdt.tm_wday << " = no" << dendl;
+ return false;
+ }
+
bool time_permit = false;
if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
}
bool ms_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
isvalid = true;
return true;
}
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
bool ms_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override;
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
void ms_handle_connect(Connection *con) override;
void ms_handle_fast_connect(Connection *con) override;
void ms_handle_fast_accept(Connection *con) override;
return 0;
}
+// ----------------------------------
+// OSDMap
bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
{
OSDMap tmpmap;
tmpmap.deepish_copy_from(osdmap);
tmpmap.apply_incremental(*pending_inc);
+ set<pg_t> to_check;
+ set<pg_t> to_cancel;
+ map<int, map<int, float>> rule_weight_map;
for (auto& p : tmpmap.pg_upmap) {
- ldout(cct, 10) << __func__ << " pg_upmap entry "
- << "[" << p.first << ":" << p.second << "]"
- << dendl;
- auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+ to_check.insert(p.first);
+ }
+ for (auto& p : tmpmap.pg_upmap_items) {
+ to_check.insert(p.first);
+ }
+ for (auto& p : pending_inc->new_pg_upmap) {
+ to_check.insert(p.first);
+ }
+ for (auto& p : pending_inc->new_pg_upmap_items) {
+ to_check.insert(p.first);
+ }
+ for (auto& pg : to_check) {
+ auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
if (crush_rule < 0) {
lderr(cct) << __func__ << " unable to load crush-rule of pg "
- << p.first << dendl;
+ << pg << dendl;
continue;
}
+ map<int, float> weight_map;
+ auto it = rule_weight_map.find(crush_rule);
+ if (it == rule_weight_map.end()) {
+ auto r = tmpmap.crush->get_rule_weight_osd_map(crush_rule, &weight_map);
+ if (r < 0) {
+ lderr(cct) << __func__ << " unable to get crush weight_map for "
+ << "crush_rule " << crush_rule << dendl;
+ continue;
+ }
+ rule_weight_map[crush_rule] = weight_map;
+ } else {
+ weight_map = it->second;
+ }
auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
if (type < 0) {
lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
- << p.first << dendl;
- continue;
- } else if (type == 0) {
- ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
- << " is osd-level, skipping"
- << dendl;
+ << pg << dendl;
continue;
}
- ldout(cct, 10) << __func__ << " pg " << p.first
+ ldout(cct, 10) << __func__ << " pg " << pg
<< " crush-rule-id " << crush_rule
+ << " weight_map " << weight_map
<< " failure-domain-type " << type
<< dendl;
vector<int> raw;
int primary;
- tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+ tmpmap.pg_to_raw_up(pg, &raw, &primary);
set<int> parents;
- bool error = false;
- bool collide = false;
for (auto osd : raw) {
- auto parent = tmpmap.crush->get_parent_of_type(osd, type);
- if (parent >= 0) {
- lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
- << ", pg " << p.first
- << dendl;
- error = true;
+ if (type > 0) {
+ auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule);
+ if (parent >= 0) {
+ lderr(cct) << __func__ << " unable to get parent of raw osd."
+ << osd << " of pg " << pg
+ << dendl;
+ break;
+ }
+ auto r = parents.insert(parent);
+ if (!r.second) {
+ // two up-set osds come from same parent
+ to_cancel.insert(pg);
+ break;
+ }
+ }
+ // the above check validates collision only
+ // below we continue to check against crush-topology changing..
+ auto it = weight_map.find(osd);
+ if (it == weight_map.end()) {
+ // osd is gone or has been moved out of the specific crush-tree
+ to_cancel.insert(pg);
break;
}
- auto r = parents.insert(parent);
- if (!r.second) {
- collide = true;
+ auto adjusted_weight = tmpmap.get_weightf(it->first) * it->second;
+ if (adjusted_weight == 0) {
+ // osd is out/crush-out
+ to_cancel.insert(pg);
break;
}
}
- if (!error && collide) {
- ldout(cct, 10) << __func__ << " removing invalid pg_upmap "
- << "[" << p.first << ":" << p.second << "]"
- << ", final mapping result will be: " << raw
- << dendl;
- auto it = pending_inc->new_pg_upmap.find(p.first);
+ }
+ for (auto &pg: to_cancel) {
+ { // pg_upmap
+ auto it = pending_inc->new_pg_upmap.find(pg);
if (it != pending_inc->new_pg_upmap.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap entry "
+ << it->first << "->" << it->second
+ << dendl;
pending_inc->new_pg_upmap.erase(it);
}
- if (osdmap.pg_upmap.count(p.first)) {
- pending_inc->old_pg_upmap.insert(p.first);
- }
- }
- }
- for (auto& p : tmpmap.pg_upmap_items) {
- ldout(cct, 10) << __func__ << " pg_upmap_items entry "
- << "[" << p.first << ":" << p.second << "]"
- << dendl;
- auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
- if (crush_rule < 0) {
- lderr(cct) << __func__ << " unable to load crush-rule of pg "
- << p.first << dendl;
- continue;
- }
- auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
- if (type < 0) {
- lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
- << p.first << dendl;
- continue;
- } else if (type == 0) {
- ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
- << " is osd-level, skipping"
- << dendl;
- continue;
- }
- ldout(cct, 10) << __func__ << " pg " << p.first
- << " crush_rule_id " << crush_rule
- << " failure_domain_type " << type
- << dendl;
- vector<int> raw;
- int primary;
- tmpmap.pg_to_raw_up(p.first, &raw, &primary);
- set<int> parents;
- bool error = false;
- bool collide = false;
- for (auto osd : raw) {
- auto parent = tmpmap.crush->get_parent_of_type(osd, type);
- if (parent >= 0) {
- lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
- << ", pg " << p.first
- << dendl;
- error = true;
- break;
- }
- auto r = parents.insert(parent);
- if (!r.second) {
- collide = true;
- break;
+ if (osdmap.pg_upmap.count(pg)) {
+ ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
+ << osdmap.pg_upmap.find(pg)->first << "->"
+ << osdmap.pg_upmap.find(pg)->second
+ << dendl;
+ pending_inc->old_pg_upmap.insert(pg);
}
}
- if (!error && collide) {
- ldout(cct, 10) << __func__ << " removing invalid pg_upmap_items "
- << "[" << p.first << ":" << p.second << "]"
- << ", final mapping result will be: " << raw
- << dendl;
- // This is overkilling, but simpler..
- auto it = pending_inc->new_pg_upmap_items.find(p.first);
+ { // pg_upmap_items
+ auto it = pending_inc->new_pg_upmap_items.find(pg);
if (it != pending_inc->new_pg_upmap_items.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap_items entry "
+ << it->first << "->" << it->second
+ << dendl;
pending_inc->new_pg_upmap_items.erase(it);
}
- if (osdmap.pg_upmap_items.count(p.first)) {
- pending_inc->old_pg_upmap_items.insert(p.first);
+ if (osdmap.pg_upmap_items.count(pg)) {
+ ldout(cct, 10) << __func__ << " cancel invalid "
+ << "pg_upmap_items entry "
+ << osdmap.pg_upmap_items.find(pg)->first << "->"
+ << osdmap.pg_upmap_items.find(pg)->second
+ << dendl;
+ pending_inc->old_pg_upmap_items.insert(pg);
}
}
}
return false; // same primary (tho replicas may have changed)
}
+uint64_t OSDMap::get_encoding_features() const
+{
+ uint64_t f = SIGNIFICANT_FEATURES;
+ if (require_osd_release < CEPH_RELEASE_LUMINOUS) {
+ f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
+ CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
+ }
+ if (require_osd_release < CEPH_RELEASE_KRAKEN) {
+ f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
+ CEPH_FEATURE_MSG_ADDR2 |
+ CEPH_FEATURE_CRUSH_TUNABLES5);
+ }
+ if (require_osd_release < CEPH_RELEASE_JEWEL) {
+ f &= ~(CEPH_FEATURE_SERVER_JEWEL |
+ CEPH_FEATURE_NEW_OSDOP_ENCODING);
+ }
+ return f;
+}
// serialize, unserialize
void OSDMap::encode_client_old(bufferlist& bl) const
ENCODE_START(8, 7, bl);
{
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
uint8_t v = 6;
if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
v = 3;
}
{
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
uint8_t target_v = 5;
if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
target_v = 1;
for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
int osd = p->second;
float deviation = p->first;
+ // make sure osd is still there (belongs to this crush-tree)
+ assert(osd_weight.count(osd));
float target = osd_weight[osd] * pgs_per_weight;
assert(target > 0);
if (deviation/target < max_deviation_ratio) {
int32_t max_osd;
vector<uint32_t> osd_state;
+ // These features affect OSDMap[::Incremental] encoding, or the
+ // encoding of some type embedded therein (CrushWrapper, something
+ // from osd_types, etc.).
+ static constexpr uint64_t SIGNIFICANT_FEATURES =
+ CEPH_FEATUREMASK_PGID64 |
+ CEPH_FEATUREMASK_PGPOOL3 |
+ CEPH_FEATUREMASK_OSDENC |
+ CEPH_FEATUREMASK_OSDMAP_ENC |
+ CEPH_FEATUREMASK_OSD_POOLRESEND |
+ CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
+ CEPH_FEATUREMASK_MSG_ADDR2 |
+ CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
+ CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
+ CEPH_FEATUREMASK_SERVER_LUMINOUS ;
struct addrs_s {
mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > client_addr;
mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > cluster_addr;
OSDMap& operator=(const OSDMap& other) = default;
public:
+ /// return feature mask subset that is relevant to OSDMap encoding
+ static uint64_t get_significant_features(uint64_t features) {
+ return SIGNIFICANT_FEATURES & features;
+ }
+
+ uint64_t get_encoding_features() const;
+
void deepish_copy_from(const OSDMap& o) {
*this = o;
primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
: reserved(false), reserve_failed(false),
epoch_start(0),
active(false),
- waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
+ shallow_errors(0), deep_errors(0), fixed(0),
must_scrub(false), must_deep_scrub(false), must_repair(false),
auto_repair(false),
num_digest_updates_pending(0),
state(INACTIVE),
- deep(false),
- seed(0)
+ deep(false)
{}
PG::Scrubber::~Scrubber() {}
<< scrubber.received_maps[m->from].valid_through
<< dendl;
- --scrubber.waiting_on;
+ dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
+ << dendl;
+ assert(scrubber.waiting_on_whom.count(m->from));
scrubber.waiting_on_whom.erase(m->from);
- if (scrubber.waiting_on == 0) {
+ if (m->preempted) {
+ dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+ scrub_preempted = true;
+ }
+ if (scrubber.waiting_on_whom.empty()) {
if (ops_blocked_by_scrub()) {
requeue_scrub(true);
} else {
<< scrubber.received_maps[m->from].valid_through
<< dendl;
- --scrubber.waiting_on;
scrubber.waiting_on_whom.erase(m->from);
- if (scrubber.waiting_on == 0) {
+ if (scrubber.waiting_on_whom.empty()) {
if (ops_blocked_by_scrub()) {
requeue_scrub(true);
} else {
void PG::_request_scrub_map(
pg_shard_t replica, eversion_t version,
hobject_t start, hobject_t end,
- bool deep, uint32_t seed)
+ bool deep,
+ bool allow_preemption)
{
assert(replica != pg_whoami);
dout(10) << "scrub requesting scrubmap from osd." << replica
- << " deep " << (int)deep << " seed " << seed << dendl;
+ << " deep " << (int)deep << dendl;
MOSDRepScrub *repscrubop = new MOSDRepScrub(
spg_t(info.pgid.pgid, replica.shard), version,
get_osdmap()->get_epoch(),
get_last_peering_reset(),
- start, end, deep, seed);
+ start, end, deep,
+ allow_preemption,
+ scrubber.priority,
+ ops_blocked_by_scrub());
// default priority, we want the rep scrub processed prior to any recovery
// or client io messages (we are holding a lock!)
osd->send_message_osd_cluster(
{
hobject_t head;
SnapSet snapset;
+
+ // The standalone test qa/standalone/scrub/osd-scrub-snaps.sh relies on this
+ // debug message to verify that callers go through clean_meta_map() properly.
+ dout(20) << __func__ << " start" << dendl;
+
for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
i != smap.objects.rend();
++i) {
const hobject_t &hoid = i->first;
ScrubMap::object &o = i->second;
+ dout(20) << __func__ << " " << hoid << dendl;
+
if (hoid.is_head() || hoid.is_snapdir()) {
// parse the SnapSet
bufferlist bl;
}
}
}
-
-/*
- * build a scrub map over a chunk without releasing the lock
- * only used by chunky scrub
- */
int PG::build_scrub_map_chunk(
ScrubMap &map,
- hobject_t start, hobject_t end, bool deep, uint32_t seed,
+ ScrubMapBuilder &pos,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
ThreadPool::TPHandle &handle)
{
dout(10) << __func__ << " [" << start << "," << end << ") "
- << " seed " << seed << dendl;
-
- map.valid_through = info.last_update;
+ << " pos " << pos
+ << dendl;
- // objects
- vector<hobject_t> ls;
- vector<ghobject_t> rollback_obs;
- int ret = get_pgbackend()->objects_list_range(
- start,
- end,
- 0,
- &ls,
- &rollback_obs);
- if (ret < 0) {
- dout(5) << "objects_list_range error: " << ret << dendl;
- return ret;
+ // start
+ while (pos.empty()) {
+ pos.deep = deep;
+ map.valid_through = info.last_update;
+ osr->flush();
+
+ // objects
+ vector<ghobject_t> rollback_obs;
+ pos.ret = get_pgbackend()->objects_list_range(
+ start,
+ end,
+ 0,
+ &pos.ls,
+ &rollback_obs);
+ if (pos.ret < 0) {
+ dout(5) << "objects_list_range error: " << pos.ret << dendl;
+ return pos.ret;
+ }
+ if (pos.ls.empty()) {
+ break;
+ }
+ _scan_rollback_obs(rollback_obs, handle);
+ pos.pos = 0;
+ return -EINPROGRESS;
}
+ // scan objects
+ while (!pos.done()) {
+ int r = get_pgbackend()->be_scan_list(map, pos);
+ if (r == -EINPROGRESS) {
+ return r;
+ }
+ }
- get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
- _scan_rollback_obs(rollback_obs, handle);
- _scan_snaps(map);
+ // finish
+ dout(20) << __func__ << " finishing" << dendl;
+ assert(pos.done());
_repair_oinfo_oid(map);
+ if (!is_primary()) {
+ ScrubMap for_meta_scrub;
+ // In case we restarted with a smaller chunk, clear old data
+ scrubber.cleaned_meta_map.clear_from(scrubber.start);
+ scrubber.cleaned_meta_map.insert(map);
+ scrubber.clean_meta_map(for_meta_scrub);
+ _scan_snaps(for_meta_scrub);
+ }
- dout(20) << __func__ << " done" << dendl;
+ dout(20) << __func__ << " done, got " << map.objects.size() << " items"
+ << dendl;
return 0;
}
return;
}
- ScrubMap map;
-
assert(msg->chunky);
if (last_update_applied < msg->scrub_to) {
dout(10) << "waiting for last_update_applied to catch up" << dendl;
return;
}
- // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
- hobject_t start = msg->start;
- hobject_t end = msg->end;
- if (!start.is_max())
- start.pool = info.pgid.pool();
- if (!end.is_max())
- end.pool = info.pgid.pool();
-
- build_scrub_map_chunk(
- map, start, end, msg->deep, msg->seed,
- handle);
-
- if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
- MOSDRepScrubMap *reply = new MOSDRepScrubMap(
- spg_t(info.pgid.pgid, get_primary().shard),
- msg->map_epoch,
- pg_whoami);
- ::encode(map, reply->get_data());
- osd->send_message_osd_cluster(reply, msg->get_connection());
+ scrubber.state = Scrubber::BUILD_MAP_REPLICA;
+ scrubber.replica_scrub_start = msg->min_epoch;
+ scrubber.start = msg->start;
+ scrubber.end = msg->end;
+ scrubber.max_end = msg->end;
+ scrubber.deep = msg->deep;
+ scrubber.epoch_start = info.history.same_interval_since;
+ if (msg->priority) {
+ scrubber.priority = msg->priority;
} else {
- // for jewel compatibility
- vector<OSDOp> scrub(1);
- scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
- hobject_t poid;
- eversion_t v;
- osd_reqid_t reqid;
- MOSDSubOp *subop = new MOSDSubOp(
- reqid,
- pg_whoami,
- spg_t(info.pgid.pgid, get_primary().shard),
- poid,
- 0,
- msg->map_epoch,
- osd->get_tid(),
- v);
- ::encode(map, subop->get_data());
- subop->ops = scrub;
- osd->send_message_osd_cluster(subop, msg->get_connection());
+ scrubber.priority = get_scrub_priority();
}
+
+ scrub_can_preempt = msg->allow_preemption;
+ scrub_preempted = false;
+ scrubber.replica_scrubmap_pos.reset();
+
+ requeue_scrub(msg->high_priority);
}
/* Scrub:
scrub_queued = false;
scrubber.needs_sleep = true;
+ // for the replica
+ if (!is_primary() &&
+ scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
+ chunky_scrub(handle);
+ return;
+ }
+
if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
dout(10) << "scrub -- not primary or active or not clean" << dendl;
state_clear(PG_STATE_SCRUBBING);
while (!done) {
dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
- << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
+ << " [" << scrubber.start << "," << scrubber.end << ")"
+ << " max_end " << scrubber.max_end << dendl;
switch (scrubber.state) {
case PG::Scrubber::INACTIVE:
dout(10) << "scrub start" << dendl;
+ assert(is_primary());
publish_stats_to_osd();
scrubber.epoch_start = info.history.same_interval_since;
osd->clog->debug(oss);
}
- scrubber.seed = -1;
-
+ scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+ "osd_scrub_max_preemptions");
+ scrubber.preempt_divisor = 1;
break;
case PG::Scrubber::NEW_CHUNK:
scrubber.primary_scrubmap = ScrubMap();
scrubber.received_maps.clear();
+ // begin (possible) preemption window
+ if (scrub_preempted) {
+ scrubber.preempt_left--;
+ scrubber.preempt_divisor *= 2;
+ dout(10) << __func__ << " preempted, " << scrubber.preempt_left
+ << " left" << dendl;
+ scrub_preempted = false;
+ }
+ scrub_can_preempt = scrubber.preempt_left > 0;
+
{
/* get the start and end of our scrub chunk
*
* left end of the range if we are a tier because they may legitimately
* not exist (see _scrub).
*/
- int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
+ int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
+ scrubber.preempt_divisor);
+ int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
+ scrubber.preempt_divisor);
hobject_t start = scrubber.start;
hobject_t candidate_end;
vector<hobject_t> objects;
+ osr->flush();
ret = get_pgbackend()->objects_list_partial(
start,
min,
- MAX(min, cct->_conf->osd_scrub_chunk_max),
+ max,
&objects,
&candidate_end);
assert(ret >= 0);
break;
}
scrubber.end = candidate_end;
+ if (scrubber.end > scrubber.max_end)
+ scrubber.max_end = scrubber.end;
}
// walk the log to find the latest update that affects our chunk
// ask replicas to wait until
// last_update_applied >= scrubber.subset_last_update and then scan
scrubber.waiting_on_whom.insert(pg_whoami);
- ++scrubber.waiting_on;
// request maps from replicas
for (set<pg_shard_t>::iterator i = actingbackfill.begin();
if (*i == pg_whoami) continue;
_request_scrub_map(*i, scrubber.subset_last_update,
scrubber.start, scrubber.end, scrubber.deep,
- scrubber.seed);
+ scrubber.preempt_left > 0);
scrubber.waiting_on_whom.insert(*i);
- ++scrubber.waiting_on;
}
+ dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
+ << dendl;
scrubber.state = PG::Scrubber::WAIT_PUSHES;
-
break;
case PG::Scrubber::WAIT_PUSHES:
break;
case PG::Scrubber::WAIT_LAST_UPDATE:
- if (last_update_applied >= scrubber.subset_last_update) {
- scrubber.state = PG::Scrubber::BUILD_MAP;
- } else {
+ if (last_update_applied < scrubber.subset_last_update) {
// will be requeued by op_applied
dout(15) << "wait for writes to flush" << dendl;
done = true;
- }
+ break;
+ }
+
+ scrubber.state = PG::Scrubber::BUILD_MAP;
+ scrubber.primary_scrubmap_pos.reset();
break;
case PG::Scrubber::BUILD_MAP:
assert(last_update_applied >= scrubber.subset_last_update);
// build my own scrub map
- ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
- scrubber.start, scrubber.end,
- scrubber.deep, scrubber.seed,
- handle);
- if (ret < 0) {
- dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
+ if (scrub_preempted) {
+ dout(10) << __func__ << " preempted" << dendl;
+ scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
+ break;
+ }
+ ret = build_scrub_map_chunk(
+ scrubber.primary_scrubmap,
+ scrubber.primary_scrubmap_pos,
+ scrubber.start, scrubber.end,
+ scrubber.deep,
+ handle);
+ if (ret == -EINPROGRESS) {
+ requeue_scrub();
+ done = true;
+ break;
+ }
+ scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
+ break;
+
+ case PG::Scrubber::BUILD_MAP_DONE:
+ if (scrubber.primary_scrubmap_pos.ret < 0) {
+ dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
+ << ", aborting" << dendl;
scrub_clear_state();
scrub_unreserve_replicas();
return;
}
-
- --scrubber.waiting_on;
+ dout(10) << __func__ << " waiting_on_whom was "
+ << scrubber.waiting_on_whom << dendl;
+ assert(scrubber.waiting_on_whom.count(pg_whoami));
scrubber.waiting_on_whom.erase(pg_whoami);
scrubber.state = PG::Scrubber::WAIT_REPLICAS;
break;
case PG::Scrubber::WAIT_REPLICAS:
- if (scrubber.waiting_on > 0) {
+ if (!scrubber.waiting_on_whom.empty()) {
// will be requeued by sub_op_scrub_map
dout(10) << "wait for replicas to build scrub map" << dendl;
done = true;
- } else {
+ break;
+ }
+ // end (possible) preemption window
+ scrub_can_preempt = false;
+ if (scrub_preempted) {
+ dout(10) << __func__ << " preempted, restarting chunk" << dendl;
+ scrubber.state = PG::Scrubber::NEW_CHUNK;
+ } else {
scrubber.state = PG::Scrubber::COMPARE_MAPS;
}
break;
case PG::Scrubber::COMPARE_MAPS:
assert(last_update_applied >= scrubber.subset_last_update);
- assert(scrubber.waiting_on == 0);
+ assert(scrubber.waiting_on_whom.empty());
scrub_compare_maps();
scrubber.start = scrubber.end;
break;
}
+ scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+ "osd_scrub_max_preemptions");
+ scrubber.preempt_divisor = 1;
+
if (!(scrubber.end.is_max())) {
- scrubber.state = PG::Scrubber::NEW_CHUNK;
+ scrubber.state = PG::Scrubber::NEW_CHUNK;
requeue_scrub();
done = true;
} else {
break;
+ case PG::Scrubber::BUILD_MAP_REPLICA:
+ // build my own scrub map
+ if (scrub_preempted) {
+ dout(10) << __func__ << " preempted" << dendl;
+ ret = 0;
+ } else {
+ ret = build_scrub_map_chunk(
+ scrubber.replica_scrubmap,
+ scrubber.replica_scrubmap_pos,
+ scrubber.start, scrubber.end,
+ scrubber.deep,
+ handle);
+ }
+ if (ret == -EINPROGRESS) {
+ requeue_scrub();
+ done = true;
+ break;
+ }
+ // reply
+ if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
+ MOSDRepScrubMap *reply = new MOSDRepScrubMap(
+ spg_t(info.pgid.pgid, get_primary().shard),
+ scrubber.replica_scrub_start,
+ pg_whoami);
+ reply->preempted = scrub_preempted;
+ ::encode(scrubber.replica_scrubmap, reply->get_data());
+ osd->send_message_osd_cluster(
+ get_primary().osd, reply,
+ scrubber.replica_scrub_start);
+ } else {
+ // for jewel compatibility
+ vector<OSDOp> scrub(1);
+ scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
+ hobject_t poid;
+ eversion_t v;
+ osd_reqid_t reqid;
+ MOSDSubOp *subop = new MOSDSubOp(
+ reqid,
+ pg_whoami,
+ spg_t(info.pgid.pgid, get_primary().shard),
+ poid,
+ 0,
+ scrubber.replica_scrub_start,
+ osd->get_tid(),
+ v);
+ ::encode(scrubber.replica_scrubmap, subop->get_data());
+ subop->ops = scrub;
+ osd->send_message_osd_cluster(
+ get_primary().osd, subop,
+ scrubber.replica_scrub_start);
+ }
+ scrub_preempted = false;
+ scrub_can_preempt = false;
+ scrubber.state = PG::Scrubber::INACTIVE;
+ scrubber.replica_scrubmap = ScrubMap();
+ scrubber.replica_scrubmap_pos = ScrubMapBuilder();
+ scrubber.start = hobject_t();
+ scrubber.end = hobject_t();
+ scrubber.max_end = hobject_t();
+ done = true;
+ break;
+
default:
ceph_abort();
}
}
dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
- << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
+ << " [" << scrubber.start << "," << scrubber.end << ")"
+ << " max_end " << scrubber.max_end << dendl;
+}
+
+bool PG::write_blocked_by_scrub(const hobject_t& soid)
+{
+ if (soid < scrubber.start || soid >= scrubber.end) {
+ return false;
+ }
+ if (scrub_can_preempt) {
+ if (!scrub_preempted) {
+ dout(10) << __func__ << " " << soid << " preempted" << dendl;
+ scrub_preempted = true;
+ } else {
+ dout(10) << __func__ << " " << soid << " already preempted" << dendl;
+ }
+ return false;
+ }
+ return true;
+}
+
+bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
+{
+ // does [start, end] intersect [scrubber.start, scrubber.max_end)
+ return (start < scrubber.max_end &&
+ end >= scrubber.start);
}
void PG::scrub_clear_state()
// construct authoritative scrub map for type specific scrubbing
scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
- map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
+ map<hobject_t,
+ pair<boost::optional<uint32_t>,
+ boost::optional<uint32_t>>> missing_digest;
+
+ map<pg_shard_t, ScrubMap *> maps;
+ maps[pg_whoami] = &scrubber.primary_scrubmap;
+
+ for (const auto& i : actingbackfill) {
+ if (i == pg_whoami) continue;
+ dout(2) << __func__ << " replica " << i << " has "
+ << scrubber.received_maps[i].objects.size()
+ << " items" << dendl;
+ maps[i] = &scrubber.received_maps[i];
+ }
+
+ set<hobject_t> master_set;
+
+ // Construct master set
+ for (const auto map : maps) {
+ for (const auto i : map.second->objects) {
+ master_set.insert(i.first);
+ }
+ }
+
+ stringstream ss;
+ get_pgbackend()->be_large_omap_check(maps, master_set,
+ scrubber.large_omap_objects, ss);
+ if (!ss.str().empty()) {
+ osd->clog->warn(ss);
+ }
if (acting.size() > 1) {
dout(10) << __func__ << " comparing replica scrub maps" << dendl;
- stringstream ss;
-
// Map from object with errors to good peer
map<hobject_t, list<pg_shard_t>> authoritative;
- map<pg_shard_t, ScrubMap *> maps;
dout(2) << __func__ << " osd." << acting[0] << " has "
<< scrubber.primary_scrubmap.objects.size() << " items" << dendl;
- maps[pg_whoami] = &scrubber.primary_scrubmap;
- for (set<pg_shard_t>::iterator i = actingbackfill.begin();
- i != actingbackfill.end();
- ++i) {
- if (*i == pg_whoami) continue;
- dout(2) << __func__ << " replica " << *i << " has "
- << scrubber.received_maps[*i].objects.size()
- << " items" << dendl;
- maps[*i] = &scrubber.received_maps[*i];
- }
+ ss.str("");
+ ss.clear();
get_pgbackend()->be_compare_scrubmaps(
maps,
+ master_set,
state_test(PG_STATE_REPAIR),
scrubber.missing,
scrubber.inconsistent,
}
ScrubMap for_meta_scrub;
- if (scrubber.end.is_max() ||
- scrubber.cleaned_meta_map.objects.empty()) {
- scrubber.cleaned_meta_map.swap(for_meta_scrub);
- } else {
- auto iter = scrubber.cleaned_meta_map.objects.end();
- --iter; // not empty, see if clause
- auto begin = scrubber.cleaned_meta_map.objects.begin();
- while (iter != begin) {
- auto next = iter--;
- if (next->first.get_head() != iter->first.get_head()) {
- ++iter;
- break;
- }
- }
- for_meta_scrub.objects.insert(begin, iter);
- scrubber.cleaned_meta_map.objects.erase(begin, iter);
- }
+ scrubber.clean_meta_map(for_meta_scrub);
// ok, do the pg-type specific scrubbing
scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+ // _scan_snaps() is called here on the primary; a non-primary caller can
+ // instead use an authoritative map (see build_scrub_map_chunk)
+ _scan_snaps(for_meta_scrub);
if (!scrubber.store->empty()) {
if (state_test(PG_STATE_REPAIR)) {
dout(10) << __func__ << ": discarding scrub results" << dendl;
info.history.last_clean_scrub_stamp = now;
info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
+ info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
} else {
info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
// XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
assert(entries.begin()->version > info.last_update);
PGLogEntryHandler rollbacker{this, &t};
- if (roll_forward_to) {
- pg_log.roll_forward(&rollbacker);
- }
bool invalidate_stats =
pg_log.append_new_log_entries(info.last_backfill,
info.last_backfill_bitwise,
PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
{
PG *pg = context< RecoveryMachine >().pg;
+ if (!pg->state_test(PG_STATE_RECOVERING)) {
+ // we may have finished recovery and have an AllReplicasRecovered
+ // event queued to move us to the next state.
+ ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
+ return discard_event();
+ }
ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
pg->state_clear(PG_STATE_RECOVERING);
pg->state_set(PG_STATE_RECOVERY_WAIT);
q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
q.f->dump_stream("scrubber.start") << pg->scrubber.start;
q.f->dump_stream("scrubber.end") << pg->scrubber.end;
+ q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
- q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
- q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
{
q.f->open_array_section("scrubber.waiting_on_whom");
for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
int64_t poolnum = pg->info.pgid.pool();
// Reset if min_size turn smaller than previous value, pg might now be able to go active
- if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
+ if (!advmap.osdmap->have_pg_pool(poolnum) ||
+ advmap.lastmap->get_pools().find(poolnum)->second.min_size >
advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
post_event(advmap);
return transit< Reset >();
// common to both scrubs
bool active;
- int waiting_on;
set<pg_shard_t> waiting_on_whom;
int shallow_errors;
int deep_errors;
+ int large_omap_objects = 0;
int fixed;
ScrubMap primary_scrubmap;
+ ScrubMapBuilder primary_scrubmap_pos;
+ epoch_t replica_scrub_start = 0;
+ ScrubMap replica_scrubmap;
+ ScrubMapBuilder replica_scrubmap_pos;
map<pg_shard_t, ScrubMap> received_maps;
OpRequestRef active_rep_scrub;
utime_t scrub_reg_stamp; // stamp we registered for
// Cleaned map pending snap metadata scrub
ScrubMap cleaned_meta_map;
+ void clean_meta_map(ScrubMap &for_meta_scrub) {
+ if (end.is_max() ||
+ cleaned_meta_map.objects.empty()) {
+ cleaned_meta_map.swap(for_meta_scrub);
+ } else {
+ auto iter = cleaned_meta_map.objects.end();
+ --iter; // not empty, see if clause
+ auto begin = cleaned_meta_map.objects.begin();
+ if (iter->first.has_snapset()) {
+ ++iter;
+ } else {
+ while (iter != begin) {
+ auto next = iter--;
+ if (next->first.get_head() != iter->first.get_head()) {
+ ++iter;
+ break;
+ }
+ }
+ }
+ for_meta_scrub.objects.insert(begin, iter);
+ cleaned_meta_map.objects.erase(begin, iter);
+ }
+ }
+
// digest updates which we are waiting on
int num_digest_updates_pending;
// chunky scrub
- hobject_t start, end;
+ hobject_t start, end; // [start,end)
+ hobject_t max_end; // Largest end that may have been sent to replicas
eversion_t subset_last_update;
// chunky scrub state
WAIT_PUSHES,
WAIT_LAST_UPDATE,
BUILD_MAP,
+ BUILD_MAP_DONE,
WAIT_REPLICAS,
COMPARE_MAPS,
WAIT_DIGEST_UPDATES,
FINISH,
+ BUILD_MAP_REPLICA,
} state;
std::unique_ptr<Scrub::Store> store;
// deep scrub
bool deep;
- uint32_t seed;
+ int preempt_left;
+ int preempt_divisor;
list<Context*> callbacks;
void add_callback(Context *context) {
case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
case BUILD_MAP: ret = "BUILD_MAP"; break;
+ case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
case FINISH: ret = "FINISH"; break;
+ case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
}
return ret;
}
bool is_chunky_scrub_active() const { return state != INACTIVE; }
- // classic (non chunk) scrubs block all writes
- // chunky scrubs only block writes to a range
- bool write_blocked_by_scrub(const hobject_t &soid) {
- return (soid >= start && soid < end);
- }
-
// clear all state
void reset() {
active = false;
- waiting_on = 0;
waiting_on_whom.clear();
if (active_rep_scrub) {
active_rep_scrub = OpRequestRef();
state = PG::Scrubber::INACTIVE;
start = hobject_t();
end = hobject_t();
+ max_end = hobject_t();
subset_last_update = eversion_t();
shallow_errors = 0;
deep_errors = 0;
+ large_omap_objects = 0;
fixed = 0;
deep = false;
- seed = 0;
run_callbacks();
inconsistent.clear();
missing.clear();
authoritative.clear();
num_digest_updates_pending = 0;
+ primary_scrubmap = ScrubMap();
+ primary_scrubmap_pos.reset();
+ replica_scrubmap = ScrubMap();
+ replica_scrubmap_pos.reset();
cleaned_meta_map = ScrubMap();
sleeping = false;
needs_sleep = true;
int active_pushes;
+ bool scrub_can_preempt = false;
+ bool scrub_preempted = false;
+
+ // we allow some number of preemptions of the scrub, during which we do
+ // not block.  then we start to block.  once we start blocking, we do
+ // not stop until the scrub range is completed.
+ bool write_blocked_by_scrub(const hobject_t &soid);
+
+ /// true if the given range intersects the scrub interval in any way
+ bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
+
void repair_object(
const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
pg_shard_t bad_peer);
ThreadPool::TPHandle &handle);
void _request_scrub_map(pg_shard_t replica, eversion_t version,
hobject_t start, hobject_t end, bool deep,
- uint32_t seed);
+ bool allow_preemption);
int build_scrub_map_chunk(
ScrubMap &map,
- hobject_t start, hobject_t end, bool deep, uint32_t seed,
+ ScrubMapBuilder &pos,
+ hobject_t start, hobject_t end, bool deep,
ThreadPool::TPHandle &handle);
/**
* returns true if [begin, end) is good to scrub at this time
const hobject_t &begin, const hobject_t &end) = 0;
virtual void scrub_snapshot_metadata(
ScrubMap &map,
- const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
+ const std::map<hobject_t,
+ pair<boost::optional<uint32_t>,
+ boost::optional<uint32_t>>> &missing_digest) { }
virtual void _scrub_clear_state() { }
virtual void _scrub_finish() { }
virtual void split_colls(
}
}
-/*
- * pg lock may or may not be held
- */
-void PGBackend::be_scan_list(
- ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
- ThreadPool::TPHandle &handle)
+int PGBackend::be_scan_list(
+ ScrubMap &map,
+ ScrubMapBuilder &pos)
{
- dout(10) << __func__ << " scanning " << ls.size() << " objects"
- << (deep ? " deeply" : "") << dendl;
- int i = 0;
- for (vector<hobject_t>::const_iterator p = ls.begin();
- p != ls.end();
- ++p, i++) {
- handle.reset_tp_timeout();
- hobject_t poid = *p;
-
- struct stat st;
- int r = store->stat(
+ dout(10) << __func__ << " " << pos << dendl;
+ assert(!pos.done());
+ assert(pos.pos < pos.ls.size());
+ hobject_t& poid = pos.ls[pos.pos];
+
+ struct stat st;
+ int r = store->stat(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &st,
+ true);
+ if (r == 0) {
+ ScrubMap::object &o = map.objects[poid];
+ o.size = st.st_size;
+ assert(!o.negative);
+ store->getattrs(
ch,
ghobject_t(
poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
- &st,
- true);
- if (r == 0) {
- ScrubMap::object &o = map.objects[poid];
- o.size = st.st_size;
- assert(!o.negative);
- store->getattrs(
- ch,
- ghobject_t(
- poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
- o.attrs);
-
- // calculate the CRC32 on deep scrubs
- if (deep) {
- be_deep_scrub(*p, seed, o, handle);
- }
+ o.attrs);
- dout(25) << __func__ << " " << poid << dendl;
- } else if (r == -ENOENT) {
- dout(25) << __func__ << " " << poid << " got " << r
- << ", skipping" << dendl;
- } else if (r == -EIO) {
- dout(25) << __func__ << " " << poid << " got " << r
- << ", stat_error" << dendl;
- ScrubMap::object &o = map.objects[poid];
- o.stat_error = true;
- } else {
- derr << __func__ << " got: " << cpp_strerror(r) << dendl;
- ceph_abort();
+ if (pos.deep) {
+ r = be_deep_scrub(poid, map, pos, o);
}
+ dout(25) << __func__ << " " << poid << dendl;
+ } else if (r == -ENOENT) {
+ dout(25) << __func__ << " " << poid << " got " << r
+ << ", skipping" << dendl;
+ } else if (r == -EIO) {
+ dout(25) << __func__ << " " << poid << " got " << r
+ << ", stat_error" << dendl;
+ ScrubMap::object &o = map.objects[poid];
+ o.stat_error = true;
+ } else {
+ derr << __func__ << " got: " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ if (r == -EINPROGRESS) {
+ return -EINPROGRESS;
}
+ pos.next_object();
+ return 0;
}
bool PGBackend::be_compare_scrub_objects(
void PGBackend::be_compare_scrubmaps(
const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
bool repair,
map<hobject_t, set<pg_shard_t>> &missing,
map<hobject_t, set<pg_shard_t>> &inconsistent,
map<hobject_t, list<pg_shard_t>> &authoritative,
- map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
+ map<hobject_t, pair<boost::optional<uint32_t>,
+ boost::optional<uint32_t>>> &missing_digest,
int &shallow_errors, int &deep_errors,
Scrub::Store *store,
const spg_t& pgid,
const vector<int> &acting,
ostream &errorstream)
{
- map<hobject_t,ScrubMap::object>::const_iterator i;
- map<pg_shard_t, ScrubMap *>::const_iterator j;
- set<hobject_t> master_set;
utime_t now = ceph_clock_now();
- // Construct master set
- for (j = maps.begin(); j != maps.end(); ++j) {
- for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
- master_set.insert(i->first);
- }
- }
-
// Check maps against master set and each other
for (set<hobject_t>::const_iterator k = master_set.begin();
k != master_set.end();
set<pg_shard_t> cur_missing;
set<pg_shard_t> cur_inconsistent;
- for (j = maps.begin(); j != maps.end(); ++j) {
+ for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
if (j == auth)
shard_map[auth->first].selected_oi = true;
if (j->second->objects.count(*k)) {
FORCE = 2,
} update = NO;
- if (auth_object.digest_present && auth_object.omap_digest_present &&
- (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
- dout(20) << __func__ << " missing digest on " << *k << dendl;
+ if (auth_object.digest_present && !auth_oi.is_data_digest()) {
+ dout(20) << __func__ << " missing data digest on " << *k << dendl;
update = MAYBE;
}
- if (auth_object.digest_present && auth_object.omap_digest_present &&
- cct->_conf->osd_debug_scrub_chance_rewrite_digest &&
- (((unsigned)rand() % 100) >
- cct->_conf->osd_debug_scrub_chance_rewrite_digest)) {
- dout(20) << __func__ << " randomly updating digest on " << *k << dendl;
+ if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
+ dout(20) << __func__ << " missing omap digest on " << *k << dendl;
update = MAYBE;
}
utime_t age = now - auth_oi.local_mtime;
if (update == FORCE ||
age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
- dout(20) << __func__ << " will update digest on " << *k << dendl;
- missing_digest[*k] = make_pair(auth_object.digest,
- auth_object.omap_digest);
+ boost::optional<uint32_t> data_digest, omap_digest;
+ if (auth_object.digest_present) {
+ data_digest = auth_object.digest;
+ dout(20) << __func__ << " will update data digest on " << *k << dendl;
+ }
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ dout(20) << __func__ << " will update omap digest on " << *k << dendl;
+ }
+ missing_digest[*k] = make_pair(data_digest, omap_digest);
} else {
dout(20) << __func__ << " missing digest but age " << age
<< " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
}
}
}
+
+void PGBackend::be_large_omap_check(const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ int& large_omap_objects,
+ ostream &warnstream) const
+{
+ bool needs_check = false;
+ for (const auto& map : maps) {
+ if (map.second->has_large_omap_object_errors) {
+ needs_check = true;
+ break;
+ }
+ }
+
+ if (!needs_check) {
+ return;
+ }
+
+ // Iterate through objects and check large omap object flag
+ for (const auto& k : master_set) {
+ for (const auto& map : maps) {
+ ScrubMap::object& obj = map.second->objects[k];
+ if (obj.large_omap_object_found) {
+ large_omap_objects++;
+ warnstream << "Large omap object found. Object: " << k << " Key count: "
+ << obj.large_omap_object_key_count << " Size (bytes): "
+ << obj.large_omap_object_value_size << '\n';
+ break;
+ }
+ }
+ }
+}
eversion_t v,
Context *on_complete) = 0;
+
/**
* Bless a context
*
virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
+ virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
virtual ~Listener() {}
};
Listener *parent;
virtual bool scrub_supported() = 0;
virtual bool auto_repair_supported() const = 0;
- void be_scan_list(
- ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
- ThreadPool::TPHandle &handle);
+ int be_scan_list(
+ ScrubMap &map,
+ ScrubMapBuilder &pos);
bool be_compare_scrub_objects(
pg_shard_t auth_shard,
const ScrubMap::object &auth,
inconsistent_obj_wrapper &object_error);
void be_compare_scrubmaps(
const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
bool repair,
map<hobject_t, set<pg_shard_t>> &missing,
map<hobject_t, set<pg_shard_t>> &inconsistent,
map<hobject_t, list<pg_shard_t>> &authoritative,
- map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
+ map<hobject_t, pair<boost::optional<uint32_t>,
+ boost::optional<uint32_t>>> &missing_digest,
int &shallow_errors, int &deep_errors,
Scrub::Store *store,
const spg_t& pgid,
ostream &errorstream);
virtual uint64_t be_get_ondisk_size(
uint64_t logical_size) = 0;
- virtual void be_deep_scrub(
- const hobject_t &poid,
- uint32_t seed,
- ScrubMap::object &o,
- ThreadPool::TPHandle &handle) = 0;
+ virtual int be_deep_scrub(
+ const hobject_t &oid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) = 0;
+ void be_large_omap_check(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ int& large_omap_objects,
+ ostream &warnstream) const;
static PGBackend *build_pg_backend(
const pg_pool_t &pool,
) {
auto &op = get_object_op_for_modify(hoid);
for (auto &&i: attrs) {
- op.attr_updates[i.first] = i.second;
+ auto& d = op.attr_updates[i.first];
+ d = i.second;
+ d->rebuild();
}
}
void setattr(
bufferlist &bl ///< [in] val to write, may be claimed
) {
auto &op = get_object_op_for_modify(hoid);
- op.attr_updates[attrname] = bl;
+ auto& d = op.attr_updates[attrname];
+ d = bl;
+ d->rebuild();
}
void rmattr(
const hobject_t &hoid, ///< [in] object to write
return;
}
- if (write_ordered &&
- scrubber.write_blocked_by_scrub(head)) {
+ if (write_ordered && scrubber.is_chunky_scrub_active() &&
+ write_blocked_by_scrub(head)) {
dout(20) << __func__ << ": waiting for scrub" << dendl;
waiting_for_scrub.push_back(op);
op->mark_delayed("waiting for scrub");
{
hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
assert(hoid != hobject_t());
- if (scrubber.write_blocked_by_scrub(hoid)) {
+ if (write_blocked_by_scrub(hoid)) {
dout(10) << __func__ << " " << hoid
<< " blocked by scrub" << dendl;
if (op) {
bufferlist::iterator *bl_it)
{
dout(20) << __func__ << dendl;
+ bool skip_data_digest =
+ (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+ g_conf->osd_distrust_data_digest;
auto& op = osd_op.op;
if (op.checksum.chunk_size > 0) {
// If there is a data digest and it is possible we are reading
// entire object, pass the digest.
boost::optional<uint32_t> maybe_crc;
- if (oi.is_data_digest() && op.checksum.offset == 0 &&
+ if (!skip_data_digest &&
+ oi.is_data_digest() && op.checksum.offset == 0 &&
op.checksum.length >= oi.size) {
maybe_crc = oi.data_digest;
}
{
dout(20) << __func__ << dendl;
ceph_osd_op& op = osd_op.op;
+ bool skip_data_digest =
+ (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+ g_conf->osd_distrust_data_digest;
auto& oi = ctx->new_obs.oi;
uint64_t size = oi.size;
// If there is a data digest and it is possible we are reading
// entire object, pass the digest.
boost::optional<uint32_t> maybe_crc;
- if (oi.is_data_digest() && op.checksum.offset == 0 &&
+ if (!skip_data_digest &&
+ oi.is_data_digest() && op.checksum.offset == 0 &&
op.checksum.length >= oi.size) {
maybe_crc = oi.data_digest;
}
__u32 seq = oi.truncate_seq;
uint64_t size = oi.size;
bool trimmed_read = false;
+ bool skip_data_digest =
+ (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+ g_conf->osd_distrust_data_digest;
// are we beyond truncate_size?
if ( (seq < op.extent.truncate_seq) &&
// If there is a data digest and it is possible we are reading
// entire object, pass the digest. FillInVerifyExtent will
// will check the oi.size again.
- if (oi.is_data_digest() && op.extent.offset == 0 &&
+ if (!skip_data_digest &&
+ oi.is_data_digest() && op.extent.offset == 0 &&
op.extent.length >= oi.size)
maybe_crc = oi.data_digest;
ctx->pending_async_reads.push_back(
<< " bytes from obj " << soid << dendl;
// whole object? can we verify the checksum?
- if (op.extent.length == oi.size && oi.is_data_digest()) {
+ if (!skip_data_digest &&
+ op.extent.length == oi.size && oi.is_data_digest()) {
uint32_t crc = osd_op.outdata.crc32c(-1);
if (oi.data_digest != crc) {
osd->clog->error() << info.pgid << std::hex
auto& op = osd_op.op;
auto& oi = ctx->new_obs.oi;
auto& soid = oi.soid;
+ bool skip_data_digest =
+ (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+ g_conf->osd_distrust_data_digest;
if (op.extent.truncate_seq) {
dout(0) << "sparse_read does not support truncation sequence " << dendl;
// Maybe at first, there is no much whole objects. With continued use, more
// and more whole object exist. So from this point, for spare-read add
// checksum make sense.
- if (total_read == oi.size && oi.is_data_digest()) {
+ if (!skip_data_digest &&
+ total_read == oi.size && oi.is_data_digest()) {
uint32_t crc = data_bl.crc32c(-1);
if (oi.data_digest != crc) {
osd->clog->error() << info.pgid << std::hex
ObjectState& obs = ctx->new_obs;
object_info_t& oi = obs.oi;
const hobject_t& soid = oi.soid;
+ bool skip_data_digest =
+ (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+ g_conf->osd_distrust_data_digest;
PGTransaction* t = ctx->op_t.get();
soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
}
- if (op.extent.offset == 0 && op.extent.length >= oi.size)
+ if (op.extent.offset == 0 && op.extent.length >= oi.size
+ && !skip_data_digest) {
obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
- else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
- obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
- else
+ } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
+ if (skip_data_digest) {
+ obs.oi.clear_data_digest();
+ } else {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
+ }
+ } else {
obs.oi.clear_data_digest();
+ }
write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
op.extent.offset, op.extent.length);
if (op.extent.length) {
t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
}
- obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+ if (!skip_data_digest) {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+ } else {
+ obs.oi.clear_data_digest();
+ }
write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
0, op.extent.length, true);
int result = 0;
object_copy_cursor_t cursor;
uint64_t out_max;
+ bool skip_data_digest =
+ (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+ g_conf->osd_distrust_data_digest;
+
try {
::decode(cursor, bp);
::decode(out_max, bp);
} else {
reply_obj.snap_seq = obc->ssc->snapset.seq;
}
- if (oi.is_data_digest()) {
+ if (!skip_data_digest && oi.is_data_digest()) {
reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
reply_obj.data_digest = oi.data_digest;
}
// CopyFromCallback fills this in for us
obs.oi.user_version = ctx->user_at_version;
- obs.oi.set_data_digest(cb->results->data_digest);
- obs.oi.set_omap_digest(cb->results->omap_digest);
+ if (cb->results->is_data_digest()) {
+ obs.oi.set_data_digest(cb->results->data_digest);
+ } else {
+ obs.oi.clear_data_digest();
+ }
+ if (cb->results->is_omap_digest()) {
+ obs.oi.set_omap_digest(cb->results->omap_digest);
+ } else {
+ obs.oi.clear_omap_digest();
+ }
obs.oi.truncate_seq = cb->results->truncate_seq;
obs.oi.truncate_size = cb->results->truncate_size;
}
tctx->new_obs.oi.size = results->object_size;
tctx->new_obs.oi.user_version = results->user_version;
- // Don't care src object whether have data or omap digest
- if (results->object_size)
+ if (results->is_data_digest()) {
tctx->new_obs.oi.set_data_digest(results->data_digest);
- if (results->has_omap)
+ } else {
+ tctx->new_obs.oi.clear_data_digest();
+ }
+ if (results->is_omap_digest()) {
tctx->new_obs.oi.set_omap_digest(results->omap_digest);
+ } else {
+ tctx->new_obs.oi.clear_omap_digest();
+ }
tctx->new_obs.oi.truncate_seq = results->truncate_seq;
tctx->new_obs.oi.truncate_size = results->truncate_size;
}
if (!fop->blocking &&
- scrubber.write_blocked_by_scrub(oid)) {
+ write_blocked_by_scrub(oid)) {
if (fop->op) {
dout(10) << __func__ << " blocked by scrub" << dendl;
requeue_op(fop->op);
fop->op)) {
dout(20) << __func__ << " took write lock" << dendl;
} else if (fop->op) {
- dout(10) << __func__ << " waiting on write lock" << dendl;
+ dout(10) << __func__ << " waiting on write lock " << fop->op << " "
+ << fop->dup_ops << dendl;
close_op_ctx(ctx.release());
- requeue_op(fop->op);
- requeue_ops(fop->dup_ops);
+ // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
+ for (auto op : fop->dup_ops) {
+ bool locked = ctx->lock_manager.get_lock_type(
+ ObjectContext::RWState::RWWRITE,
+ oid,
+ obc,
+ op);
+ assert(!locked);
+ }
return -EAGAIN; // will retry
} else {
dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
return;
}
- if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+ if (write_blocked_by_scrub(obc->obs.oi.soid)) {
dout(10) << "handle_watch_timeout waiting for scrub on obj "
<< obc->obs.oi.soid
<< dendl;
} else {
auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
assert(p != obc->ssc->snapset.clone_snaps.end());
+ if (p->second.empty()) {
+ dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
+ assert(!cct->_conf->osd_debug_verify_snaps);
+ return -ENOENT;
+ }
first = p->second.back();
last = p->second.front();
}
if (bi->version < info.log_tail) {
dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
<< dendl;
+ osr->flush();
if (last_update_applied >= info.log_tail) {
bi->version = last_update_applied;
} else {
- osr->flush();
bi->version = info.last_update;
}
scan_range(local_min, local_max, bi, handle);
// Once we hit a degraded object just skip
if (is_degraded_or_backfilling_object(aoid))
return;
- if (scrubber.write_blocked_by_scrub(aoid))
+ if (write_blocked_by_scrub(aoid))
return;
}
// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
return;
- if (scrubber.write_blocked_by_scrub(aoid))
+ if (write_blocked_by_scrub(aoid))
return;
}
new_hset.using_gmt);
// If the current object is degraded we skip this persist request
- if (scrubber.write_blocked_by_scrub(oid))
+ if (write_blocked_by_scrub(oid))
return;
hit_set->seal();
osd->logger->inc(l_osd_agent_skip);
continue;
}
- if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+ if (range_intersects_scrub(obc->obs.oi.soid,
+ obc->obs.oi.soid.get_head())) {
dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
osd->logger->inc(l_osd_agent_skip);
continue;
*/
void PrimaryLogPG::scrub_snapshot_metadata(
ScrubMap &scrubmap,
- const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
+ const map<hobject_t,
+ pair<boost::optional<uint32_t>,
+ boost::optional<uint32_t>>> &missing_digest)
{
dout(10) << __func__ << dendl;
if (head && (head_error.errors || soid_error_count))
scrubber.store->add_snap_error(pool.id, head_error);
- for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
- missing_digest.begin();
- p != missing_digest.end();
- ++p) {
+ for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
if (p->first.is_snapdir())
continue;
dout(10) << __func__ << " recording digests for " << p->first << dendl;
OpContextUPtr ctx = simple_opc_create(obc);
ctx->at_version = get_next_version();
ctx->mtime = utime_t(); // do not update mtime
- ctx->new_obs.oi.set_data_digest(p->second.first);
- ctx->new_obs.oi.set_omap_digest(p->second.second);
+ if (p->second.first) {
+ ctx->new_obs.oi.set_data_digest(*p->second.first);
+ } else {
+ ctx->new_obs.oi.clear_data_digest();
+ }
+ if (p->second.second) {
+ ctx->new_obs.oi.set_omap_digest(*p->second.second);
+ } else {
+ ctx->new_obs.oi.clear_omap_digest();
+ }
finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
ctx->register_on_success(
if (!to_req.empty()) {
// requeue at front of scrub blocking queue if we are blocked by scrub
for (auto &&p: to_req) {
- if (scrubber.write_blocked_by_scrub(p.first.get_head())) {
+ if (write_blocked_by_scrub(p.first.get_head())) {
+ for (auto& op : p.second) {
+ op->mark_delayed("waiting for scrub");
+ }
waiting_for_scrub.splice(
waiting_for_scrub.begin(),
p.second,
const hobject_t &begin, const hobject_t &end) override;
void scrub_snapshot_metadata(
ScrubMap &map,
- const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) override;
+ const std::map<hobject_t,
+ pair<boost::optional<uint32_t>,
+ boost::optional<uint32_t>>> &missing_digest) override;
void _scrub_clear_state() override;
void _scrub_finish() override;
object_stat_collection_t scrub_cstat;
void on_shutdown() override;
bool check_failsafe_full(ostream &ss) override;
bool check_osdmap_full(const set<pg_shard_t> &missing_on) override;
+ bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
+ return write_blocked_by_scrub(oid);
+ }
int rep_repair_primary_object(const hobject_t& soid, OpRequestRef op);
// attr cache handling
}
}
-void ReplicatedBackend::be_deep_scrub(
+int ReplicatedBackend::be_deep_scrub(
const hobject_t &poid,
- uint32_t seed,
- ScrubMap::object &o,
- ThreadPool::TPHandle &handle)
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o)
{
- dout(10) << __func__ << " " << poid << " seed "
- << std::hex << seed << std::dec << dendl;
- bufferhash h(seed), oh(seed);
- bufferlist bl, hdrbl;
+ dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
int r;
- __u64 pos = 0;
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
- uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
-
- while (true) {
- handle.reset_tp_timeout();
- r = store->read(
- ch,
- ghobject_t(
- poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
- pos,
- cct->_conf->osd_deep_scrub_stride, bl,
- fadvise_flags);
- if (r <= 0)
- break;
-
- h << bl;
- pos += bl.length();
- bl.clear();
+ utime_t sleeptime;
+ sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+ if (sleeptime != utime_t()) {
+ lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+ sleeptime.sleep();
}
- if (r == -EIO) {
- dout(25) << __func__ << " " << poid << " got "
- << r << " on read, read_error" << dendl;
- o.read_error = true;
- return;
- }
- o.digest = h.digest();
- o.digest_present = true;
- bl.clear();
- r = store->omap_get_header(
- coll,
- ghobject_t(
- poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
- &hdrbl, true);
- // NOTE: bobtail to giant, we would crc the head as (len, head).
- // that changes at the same time we start using a non-zero seed.
- if (r == 0 && hdrbl.length()) {
- dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length())
- << dendl;
- if (seed == 0) {
- // legacy
- bufferlist bl;
- ::encode(hdrbl, bl);
- oh << bl;
- } else {
- oh << hdrbl;
+ assert(poid == pos.ls[pos.pos]);
+ if (!pos.data_done()) {
+ if (pos.data_pos == 0) {
+ pos.data_hash = bufferhash(-1);
+ }
+
+ bufferlist bl;
+ r = store->read(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ pos.data_pos,
+ cct->_conf->osd_deep_scrub_stride, bl,
+ fadvise_flags);
+ if (r < 0) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r > 0) {
+ pos.data_hash << bl;
+ }
+ pos.data_pos += r;
+ if (r == cct->_conf->osd_deep_scrub_stride) {
+ dout(20) << __func__ << " " << poid << " more data, digest so far 0x"
+ << std::hex << pos.data_hash.digest() << std::dec << dendl;
+ return -EINPROGRESS;
+ }
+ // done with bytes
+ pos.data_pos = -1;
+ o.digest = pos.data_hash.digest();
+ o.digest_present = true;
+ dout(20) << __func__ << " " << poid << " done with data, digest 0x"
+ << std::hex << o.digest << std::dec << dendl;
+ }
+
+ // omap header
+ if (pos.omap_pos.empty()) {
+ pos.omap_hash = bufferhash(-1);
+
+ bufferlist hdrbl;
+ r = store->omap_get_header(
+ coll,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &hdrbl, true);
+ if (r == -EIO) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on omap header read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r == 0 && hdrbl.length()) {
+ dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length())
+ << dendl;
+ pos.omap_hash << hdrbl;
}
- } else if (r == -EIO) {
- dout(25) << __func__ << " " << poid << " got "
- << r << " on omap header read, read_error" << dendl;
- o.read_error = true;
- return;
}
+ // omap
ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
coll,
ghobject_t(
poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
assert(iter);
- for (iter->seek_to_first(); iter->status() == 0 && iter->valid();
- iter->next(false)) {
- handle.reset_tp_timeout();
-
- dout(25) << "CRC key " << iter->key() << " value:\n";
- iter->value().hexdump(*_dout);
- *_dout << dendl;
-
+ if (pos.omap_pos.length()) {
+ iter->lower_bound(pos.omap_pos);
+ } else {
+ iter->seek_to_first();
+ }
+ int max = g_conf->osd_deep_scrub_keys;
+ while (iter->status() == 0 && iter->valid()) {
+ pos.omap_bytes += iter->value().length();
+ ++pos.omap_keys;
+ --max;
+ // fixme: we can do this more efficiently.
+ bufferlist bl;
::encode(iter->key(), bl);
::encode(iter->value(), bl);
- oh << bl;
- bl.clear();
+ pos.omap_hash << bl;
+
+ iter->next();
+
+ if (iter->valid() && max == 0) {
+ pos.omap_pos = iter->key();
+ return -EINPROGRESS;
+ }
+ if (iter->status() < 0) {
+ dout(25) << __func__ << " " << poid
+ << " on omap scan, db status error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
}
- if (iter->status() < 0) {
- dout(25) << __func__ << " " << poid
- << " on omap scan, db status error" << dendl;
- o.read_error = true;
- return;
+ if (pos.omap_keys > cct->_conf->
+ osd_deep_scrub_large_omap_object_key_threshold ||
+ pos.omap_bytes > cct->_conf->
+ osd_deep_scrub_large_omap_object_value_sum_threshold) {
+ dout(25) << __func__ << " " << poid
+ << " large omap object detected. Object has " << pos.omap_keys
+ << " keys and size " << pos.omap_bytes << " bytes" << dendl;
+ o.large_omap_object_found = true;
+ o.large_omap_object_key_count = pos.omap_keys;
+ o.large_omap_object_value_size = pos.omap_bytes;
+ map.has_large_omap_object_errors = true;
}
- //Store final calculated CRC32 of omap header & key/values
- o.omap_digest = oh.digest();
+ o.omap_digest = pos.omap_hash.digest();
o.omap_digest_present = true;
- dout(20) << __func__ << " " << poid << " omap_digest "
+ dout(20) << __func__ << " done with " << poid << " omap_digest "
<< std::hex << o.omap_digest << std::dec << dendl;
+
+ // done!
+ return 0;
}
void ReplicatedBackend::_do_push(OpRequestRef op)
// we better not be missing this.
assert(!parent->get_log().get_missing().is_missing(soid));
+ parent->maybe_preempt_replica_scrub(soid);
+
int ackerosd = m->get_source().num();
op->mark_started();
bool auto_repair_supported() const override { return false; }
- void be_deep_scrub(
- const hobject_t &obj,
- uint32_t seed,
- ScrubMap::object &o,
- ThreadPool::TPHandle &handle) override;
+ int be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) override;
uint64_t be_get_ondisk_size(uint64_t logical_size) override { return logical_size; }
};
bufferlist::iterator bp = got.begin()->second.begin();
::decode(*out, bp);
dout(20) << __func__ << " " << oid << " " << out->snaps << dendl;
- assert(!out->snaps.empty());
+ if (out->snaps.empty()) {
+ dout(1) << __func__ << " " << oid << " empty snapset" << dendl;
+ assert(!cct->_conf->osd_debug_verify_snaps);
+ }
} else {
dout(20) << __func__ << " " << oid << " (out == NULL)" << dendl;
}
assert(is_unmanaged_snaps_mode());
removed_snaps.insert(s);
snap_seq = snap_seq + 1;
- removed_snaps.insert(get_snap_seq());
+ // try to add in the new seq, just to try to keep the interval_set contiguous
+ if (!removed_snaps.contains(get_snap_seq())) {
+ removed_snaps.insert(get_snap_seq());
+ }
}
SnapContext pg_pool_t::get_snap_context() const
}
uint8_t v = 26;
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
// this was the first post-hammer thing we added; if it's missing, encode
// like hammer.
f->dump_int("num_evict_mode_full", num_evict_mode_full);
f->dump_int("num_objects_pinned", num_objects_pinned);
f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
+ f->dump_int("num_large_omap_objects", num_large_omap_objects);
}
void object_stat_sum_t::encode(bufferlist& bl) const
{
- ENCODE_START(16, 14, bl);
+ ENCODE_START(17, 14, bl);
#if defined(CEPH_LITTLE_ENDIAN)
bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
#else
::encode(num_objects_pinned, bl);
::encode(num_objects_missing, bl);
::encode(num_legacy_snapsets, bl);
+ ::encode(num_large_omap_objects, bl);
#endif
ENCODE_FINISH(bl);
}
void object_stat_sum_t::decode(bufferlist::iterator& bl)
{
bool decode_finish = false;
- DECODE_START(16, bl);
+ DECODE_START(17, bl); // make sure to also update fast decode below
#if defined(CEPH_LITTLE_ENDIAN)
- if (struct_v >= 16) {
+ if (struct_v >= 17) { // this must match newest decode version
bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
decode_finish = true;
}
} else {
num_legacy_snapsets = num_object_clones; // upper bound
}
+ if (struct_v >= 17) {
+ ::decode(num_large_omap_objects, bl);
+ }
}
DECODE_FINISH(bl);
}
a.num_evict_mode_some = 1;
a.num_evict_mode_full = 0;
a.num_objects_pinned = 20;
+ a.num_large_omap_objects = 5;
o.push_back(new object_stat_sum_t(a));
}
num_evict_mode_full += o.num_evict_mode_full;
num_objects_pinned += o.num_objects_pinned;
num_legacy_snapsets += o.num_legacy_snapsets;
+ num_large_omap_objects += o.num_large_omap_objects;
}
void object_stat_sum_t::sub(const object_stat_sum_t& o)
num_evict_mode_full -= o.num_evict_mode_full;
num_objects_pinned -= o.num_objects_pinned;
num_legacy_snapsets -= o.num_legacy_snapsets;
+ num_large_omap_objects -= o.num_large_omap_objects;
}
bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
l.num_evict_mode_some == r.num_evict_mode_some &&
l.num_evict_mode_full == r.num_evict_mode_full &&
l.num_objects_pinned == r.num_objects_pinned &&
- l.num_legacy_snapsets == r.num_legacy_snapsets;
+ l.num_legacy_snapsets == r.num_legacy_snapsets &&
+ l.num_large_omap_objects == r.num_large_omap_objects;
}
// -- object_stat_collection_t --
void ScrubMap::object::encode(bufferlist& bl) const
{
bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
- ENCODE_START(8, 7, bl);
+ ENCODE_START(9, 7, bl);
::encode(size, bl);
::encode(negative, bl);
::encode(attrs, bl);
::encode(read_error, bl);
::encode(ec_hash_mismatch, bl);
::encode(ec_size_mismatch, bl);
+ ::encode(large_omap_object_found, bl);
+ ::encode(large_omap_object_key_count, bl);
+ ::encode(large_omap_object_value_size, bl);
ENCODE_FINISH(bl);
}
void ScrubMap::object::decode(bufferlist::iterator& bl)
{
- DECODE_START(8, bl);
+ DECODE_START(9, bl);
::decode(size, bl);
bool tmp, compat_read_error = false;
::decode(tmp, bl);
// If older encoder found a read_error, set read_error
if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
read_error = true;
+ if (struct_v >= 9) {
+ ::decode(tmp, bl);
+ large_omap_object_found = tmp;
+ ::decode(large_omap_object_key_count, bl);
+ ::decode(large_omap_object_value_size, bl);
+ }
DECODE_FINISH(bl);
}
int64_t num_objects_pinned;
int64_t num_objects_missing;
int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
+ int64_t num_large_omap_objects = 0;
object_stat_sum_t()
: num_bytes(0),
FLOOR(num_rd_kb);
FLOOR(num_wr);
FLOOR(num_wr_kb);
+ FLOOR(num_large_omap_objects);
FLOOR(num_shallow_scrub_errors);
FLOOR(num_deep_scrub_errors);
num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
out[i].num_deep_scrub_errors;
}
+ SPLIT(num_large_omap_objects);
SPLIT(num_objects_recovered);
SPLIT(num_bytes_recovered);
SPLIT(num_keys_recovered);
sizeof(num_wr) +
sizeof(num_wr_kb) +
sizeof(num_scrub_errors) +
+ sizeof(num_large_omap_objects) +
sizeof(num_objects_recovered) +
sizeof(num_bytes_recovered) +
sizeof(num_keys_recovered) +
omap_digest = -1;
}
void new_object() {
- set_data_digest(-1);
- set_omap_digest(-1);
+ clear_data_digest();
+ clear_omap_digest();
}
void encode(bufferlist& bl, uint64_t features) const;
bool stat_error:1;
bool ec_hash_mismatch:1;
bool ec_size_mismatch:1;
+ bool large_omap_object_found:1;
+ uint64_t large_omap_object_key_count = 0;
+ uint64_t large_omap_object_value_size = 0;
object() :
// Init invalid size so it won't match if we get a stat EIO error
size(-1), omap_digest(0), digest(0),
- negative(false), digest_present(false), omap_digest_present(false),
- read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}
+ negative(false), digest_present(false), omap_digest_present(false),
+ read_error(false), stat_error(false), ec_hash_mismatch(false),
+ ec_size_mismatch(false), large_omap_object_found(false) {}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
map<hobject_t,object> objects;
eversion_t valid_through;
eversion_t incr_since;
+ bool has_large_omap_object_errors:1;
void merge_incr(const ScrubMap &l);
+ void clear_from(const hobject_t& start) {
+ objects.erase(objects.lower_bound(start), objects.end());
+ }
void insert(const ScrubMap &r) {
objects.insert(r.objects.begin(), r.objects.end());
}
WRITE_CLASS_ENCODER(ScrubMap::object)
WRITE_CLASS_ENCODER(ScrubMap)
+struct ScrubMapBuilder {
+ bool deep = false;
+ vector<hobject_t> ls;
+ size_t pos = 0;
+ int64_t data_pos = 0;
+ string omap_pos;
+ int ret = 0;
+  bufferhash data_hash, omap_hash;  ///< accumulating hash value
+ uint64_t omap_keys = 0;
+ uint64_t omap_bytes = 0;
+
+ bool empty() {
+ return ls.empty();
+ }
+ bool done() {
+ return pos >= ls.size();
+ }
+ void reset() {
+ *this = ScrubMapBuilder();
+ }
+
+ bool data_done() {
+ return data_pos < 0;
+ }
+
+ void next_object() {
+ ++pos;
+ data_pos = 0;
+ omap_pos.clear();
+ omap_keys = 0;
+ omap_bytes = 0;
+ }
+
+ friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
+ out << "(" << pos.pos << "/" << pos.ls.size();
+ if (pos.pos < pos.ls.size()) {
+ out << " " << pos.ls[pos.pos];
+ }
+ if (pos.data_pos < 0) {
+ out << " byte " << pos.data_pos;
+ }
+ if (!pos.omap_pos.empty()) {
+ out << " key " << pos.omap_pos;
+ }
+ if (pos.deep) {
+ out << " deep";
+ }
+ if (pos.ret) {
+ out << " ret " << pos.ret;
+ }
+ return out << ")";
+ }
+};
+
struct OSDOp {
ceph_osd_op op;
sobject_t soid;
if (waiting_for_zero_pos > flush_pos) {
_do_flush(waiting_for_zero_pos - flush_pos);
}
+
+ if (prezero_pos == prezeroing_pos &&
+ !waitfor_prezero.empty()) {
+ list<Context*> ls;
+ ls.swap(waitfor_prezero);
+ finish_contexts(cct, ls, 0);
+ }
} else {
pending_zero.insert(start, len);
}
<< dendl;
}
+void Journaler::wait_for_prezero(Context *onfinish)
+{
+ assert(onfinish);
+ lock_guard l(lock);
+
+ if (prezero_pos == prezeroing_pos) {
+ finisher->queue(onfinish, 0);
+ return;
+ }
+ waitfor_prezero.push_back(wrap_finisher(onfinish));
+}
/***************** READING *******************/
uint64_t waiting_for_zero_pos;
interval_set<uint64_t> pending_zero; // non-contig bits we've zeroed
+ list<Context*> waitfor_prezero;
+
std::map<uint64_t, uint64_t> pending_safe; // flush_pos -> safe_pos
// when safe through given offset
std::map<uint64_t, std::list<Context*> > waitfor_safe;
void flush(Context *onsafe = 0);
void wait_for_readable(Context *onfinish);
bool have_waiter() const;
+ void wait_for_prezero(Context *onfinish);
// Synchronous setters
// ===================
}
}
-void ObjectCacher::Object::discard(loff_t off, loff_t len)
+void ObjectCacher::Object::discard(loff_t off, loff_t len,
+ C_GatherBuilder* commit_gather)
{
assert(oc->lock.is_locked());
ldout(oc->cct, 10) << "discard " << *this << " " << off << "~" << len
++p;
ldout(oc->cct, 10) << "discard " << *this << " bh " << *bh << dendl;
- assert(bh->waitfor_read.empty());
replace_journal_tid(bh, 0);
+
+ if (bh->is_tx() && commit_gather != nullptr) {
+ // wait for the writeback to commit
+ waitfor_commit[bh->last_write_tid].emplace_back(commit_gather->new_sub());
+ } else if (bh->is_rx()) {
+ // cannot remove bh with in-flight read, but we can ensure the
+ // read won't overwrite the discard
+ bh->last_read_tid = ++oc->last_read_tid;
+ bh->bl.clear();
+ bh->set_nocache(true);
+ oc->mark_zero(bh);
+ // we should mark all Rx bh to zero
+ continue;
+ } else {
+ assert(bh->waitfor_read.empty());
+ }
+
oc->bh_remove(this, bh);
delete bh;
}
void ObjectCacher::discard_set(ObjectSet *oset, const vector<ObjectExtent>& exls)
{
assert(lock.is_locked());
- if (oset->objects.empty()) {
- ldout(cct, 10) << "discard_set on " << oset << " dne" << dendl;
+ bool was_dirty = oset->dirty_or_tx > 0;
+
+ _discard(oset, exls, nullptr);
+ _discard_finish(oset, was_dirty, nullptr);
+}
+
+/**
+ * discard object extents from an ObjectSet by removing the objects in
+ * exls from the in-memory oset. If the bh is in TX state, the discard
+ * will wait for the write to commit prior to invoking on_finish.
+ */
+void ObjectCacher::discard_writeback(ObjectSet *oset,
+ const vector<ObjectExtent>& exls,
+ Context* on_finish)
+{
+ assert(lock.is_locked());
+ bool was_dirty = oset->dirty_or_tx > 0;
+
+ C_GatherBuilder gather(cct);
+ _discard(oset, exls, &gather);
+
+ if (gather.has_subs()) {
+ bool flushed = was_dirty && oset->dirty_or_tx == 0;
+ gather.set_finisher(new FunctionContext(
+ [this, oset, flushed, on_finish](int) {
+ assert(lock.is_locked());
+ if (flushed && flush_set_callback)
+ flush_set_callback(flush_set_callback_arg, oset);
+ if (on_finish)
+ on_finish->complete(0);
+ }));
+ gather.activate();
return;
}
- ldout(cct, 10) << "discard_set " << oset << dendl;
+ _discard_finish(oset, was_dirty, on_finish);
+}
+
+void ObjectCacher::_discard(ObjectSet *oset, const vector<ObjectExtent>& exls,
+ C_GatherBuilder* gather)
+{
+ if (oset->objects.empty()) {
+ ldout(cct, 10) << __func__ << " on " << oset << " dne" << dendl;
+ return;
+ }
- bool were_dirty = oset->dirty_or_tx > 0;
+ ldout(cct, 10) << __func__ << " " << oset << dendl;
- for (vector<ObjectExtent>::const_iterator p = exls.begin();
- p != exls.end();
- ++p) {
- ldout(cct, 10) << "discard_set " << oset << " ex " << *p << dendl;
- const ObjectExtent &ex = *p;
+ for (auto& ex : exls) {
+ ldout(cct, 10) << __func__ << " " << oset << " ex " << ex << dendl;
sobject_t soid(ex.oid, CEPH_NOSNAP);
if (objects[oset->poolid].count(soid) == 0)
continue;
Object *ob = objects[oset->poolid][soid];
- ob->discard(ex.offset, ex.length);
+ ob->discard(ex.offset, ex.length, gather);
}
+}
+
+void ObjectCacher::_discard_finish(ObjectSet *oset, bool was_dirty,
+ Context* on_finish)
+{
+ assert(lock.is_locked());
// did we truncate off dirty data?
- if (flush_set_callback &&
- were_dirty && oset->dirty_or_tx == 0)
+ if (flush_set_callback && was_dirty && oset->dirty_or_tx == 0) {
flush_set_callback(flush_set_callback_arg, oset);
+ }
+
+ // notify that in-flight writeback has completed
+ if (on_finish != nullptr) {
+ on_finish->complete(0);
+ }
}
void ObjectCacher::verify_stats() const
void replace_journal_tid(BufferHead *bh, ceph_tid_t tid);
void truncate(loff_t s);
- void discard(loff_t off, loff_t len);
+ void discard(loff_t off, loff_t len, C_GatherBuilder* commit_gather);
// reference counting
int get() {
void maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace);
bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
+ void _discard(ObjectSet *oset, const vector<ObjectExtent>& exls,
+ C_GatherBuilder* gather);
+ void _discard_finish(ObjectSet *oset, bool was_dirty, Context* on_finish);
+
public:
bool set_is_empty(ObjectSet *oset);
bool set_is_cached(ObjectSet *oset);
uint64_t release_all();
void discard_set(ObjectSet *oset, const vector<ObjectExtent>& ex);
+ void discard_writeback(ObjectSet *oset, const vector<ObjectExtent>& ex,
+ Context* on_finish);
/**
* Retry any in-flight reads that get -ENOENT instead of marking
info->on_reg_commit->complete(r);
info->on_reg_commit = NULL;
}
+ if (r < 0 && info->on_notify_finish) {
+ info->on_notify_finish->complete(r);
+ info->on_notify_finish = nullptr;
+ }
// only tell the user the first time we do this
info->registered = true;
}
if (op->map_dne_bound > 0) {
if (osdmap->get_epoch() >= op->map_dne_bound) {
+ LingerOp::unique_lock wl{op->watch_lock};
if (op->on_reg_commit) {
op->on_reg_commit->complete(-ENOENT);
+ op->on_reg_commit = nullptr;
+ }
+ if (op->on_notify_finish) {
+ op->on_notify_finish->complete(-ENOENT);
+ op->on_notify_finish = nullptr;
}
*need_unregister = true;
}
if (c->map_dne_bound == 0)
c->map_dne_bound = latest;
+ OSDSession::unique_lock sul(c->session->lock);
objecter->_check_command_map_dne(c);
+ sul.unlock();
c->put();
}
void Objecter::_check_command_map_dne(CommandOp *c)
{
// rwlock is locked unique
+ // session is locked unique
ldout(cct, 10) << "_check_command_map_dne tid " << c->tid
<< " current " << osdmap->get_epoch()
void Objecter::_send_command_map_check(CommandOp *c)
{
// rwlock is locked unique
+ // session is locked unique
// ask the monitor
if (check_latest_map_commands.count(c->tid) == 0) {
sl.unlock();
-
+ OSDSession::unique_lock sul(s->lock);
_finish_command(c, m->r, m->rs);
+ sul.unlock();
+
m->put();
if (s)
s->put();
CommandOp *op = it->second;
_command_cancel_map_check(op);
+ OSDSession::unique_lock sl(op->session->lock);
_finish_command(op, r, "");
+ sl.unlock();
return 0;
}
void Objecter::_finish_command(CommandOp *c, int r, string rs)
{
// rwlock is locked unique
+ // session lock is locked
ldout(cct, 10) << "_finish_command " << c->tid << " = " << r << " "
<< rs << dendl;
timer.cancel_event(c->ontimeout);
OSDSession *s = c->session;
- OSDSession::unique_lock sl(s->lock);
_session_command_op_remove(c->session, c);
- sl.unlock();
c->put();
* 1 - Initial version
* 2 - Added get_object, put_object, delete_object methods to CephFSVolumeClient
-
+ * 3 - Allow volumes to be created without RADOS namespace isolation
"""
"""
# Current version
- version = 2
+ version = 3
# Where shall we create our volumes?
POOL_PREFIX = "fsvolume_"
except cephfs.ObjectNotFound:
self.fs.mkdir(subpath, 0o755)
- def create_volume(self, volume_path, size=None, data_isolated=False):
+ def create_volume(self, volume_path, size=None, data_isolated=False, namespace_isolated=True):
"""
Set up metadata, pools and auth for a volume.
:param volume_path: VolumePath instance
:param size: In bytes, or None for no size limit
:param data_isolated: If true, create a separate OSD pool for this volume
+ :param namespace_isolated: If true, use separate RADOS namespace for this volume
:return:
"""
path = self._get_path(volume_path)
})
self.fs.setxattr(path, 'ceph.dir.layout.pool', pool_name, 0)
- # enforce security isolation, use seperate namespace for this volume
- namespace = "{0}{1}".format(self.pool_ns_prefix, volume_path.volume_id)
- log.info("create_volume: {0}, using rados namespace {1} to isolate data.".format(volume_path, namespace))
- self.fs.setxattr(path, 'ceph.dir.layout.pool_namespace', namespace, 0)
+ # enforce security isolation, use separate namespace for this volume
+ if namespace_isolated:
+ namespace = "{0}{1}".format(self.pool_ns_prefix, volume_path.volume_id)
+ log.info("create_volume: {0}, using rados namespace {1} to isolate data.".format(volume_path, namespace))
+ self.fs.setxattr(path, 'ceph.dir.layout.pool_namespace', namespace, 0)
+ else:
+ # Since the volume's namespace layout is not set, pin the pool
+ # layout explicitly; otherwise it would remain unset and
+ # undesirably change along with the ancestor's pool layout.
+ pool_name = self._get_ancestor_xattr(path, "ceph.dir.layout.pool")
+ self.fs.setxattr(path, 'ceph.dir.layout.pool', pool_name, 0)
# Create a volume meta file, if it does not already exist, to store
# data about auth ids having access to the volume
# First I need to work out what the data pool is for this share:
# read the layout
pool_name = self._get_ancestor_xattr(path, "ceph.dir.layout.pool")
- namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+
+ try:
+ namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+ except cephfs.NoData:
+ namespace = None
# Now construct auth capabilities that give the guest just enough
# permissions to access the share
client_entity = "client.{0}".format(auth_id)
want_access_level = 'r' if readonly else 'rw'
want_mds_cap = 'allow {0} path={1}'.format(want_access_level, path)
- want_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
- want_access_level, pool_name, namespace)
+ if namespace:
+ want_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
+ want_access_level, pool_name, namespace)
+ else:
+ want_osd_cap = 'allow {0} pool={1}'.format(want_access_level,
+ pool_name)
try:
existing = self._rados_command(
# auth caps.
unwanted_access_level = 'r' if want_access_level is 'rw' else 'rw'
unwanted_mds_cap = 'allow {0} path={1}'.format(unwanted_access_level, path)
- unwanted_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
- unwanted_access_level, pool_name, namespace)
+ if namespace:
+ unwanted_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
+ unwanted_access_level, pool_name, namespace)
+ else:
+ unwanted_osd_cap = 'allow {0} pool={1}'.format(
+ unwanted_access_level, pool_name)
+
+ def cap_update(
+ orig_mds_caps, orig_osd_caps, want_mds_cap,
+ want_osd_cap, unwanted_mds_cap, unwanted_osd_cap):
- def cap_update(orig, want, unwanted):
- # Updates the existing auth caps such that there is a single
- # occurrence of wanted auth caps and no occurrence of
- # conflicting auth caps.
+ if not orig_mds_caps:
+ return want_mds_cap, want_osd_cap
- if not orig:
- return want
+ mds_cap_tokens = orig_mds_caps.split(",")
+ osd_cap_tokens = orig_osd_caps.split(",")
- cap_tokens = set(orig.split(","))
+ if want_mds_cap in mds_cap_tokens:
+ return orig_mds_caps, orig_osd_caps
- cap_tokens.discard(unwanted)
- cap_tokens.add(want)
+ if unwanted_mds_cap in mds_cap_tokens:
+ mds_cap_tokens.remove(unwanted_mds_cap)
+ osd_cap_tokens.remove(unwanted_osd_cap)
- return ",".join(cap_tokens)
+ mds_cap_tokens.append(want_mds_cap)
+ osd_cap_tokens.append(want_osd_cap)
- osd_cap_str = cap_update(cap['caps'].get('osd', ""), want_osd_cap, unwanted_osd_cap)
- mds_cap_str = cap_update(cap['caps'].get('mds', ""), want_mds_cap, unwanted_mds_cap)
+ return ",".join(mds_cap_tokens), ",".join(osd_cap_tokens)
+
+ orig_mds_caps = cap['caps'].get('mds', "")
+ orig_osd_caps = cap['caps'].get('osd', "")
+
+ mds_cap_str, osd_cap_str = cap_update(
+ orig_mds_caps, orig_osd_caps, want_mds_cap, want_osd_cap,
+ unwanted_mds_cap, unwanted_osd_cap)
caps = self._rados_command(
'auth caps',
client_entity = "client.{0}".format(auth_id)
path = self._get_path(volume_path)
pool_name = self._get_ancestor_xattr(path, "ceph.dir.layout.pool")
- namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+ try:
+ namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+ except cephfs.NoData:
+ namespace = None
# The auth_id might have read-only or read-write mount access for the
# volume path.
access_levels = ('r', 'rw')
- want_mds_caps = {'allow {0} path={1}'.format(access_level, path)
- for access_level in access_levels}
- want_osd_caps = {'allow {0} pool={1} namespace={2}'.format(
- access_level, pool_name, namespace)
- for access_level in access_levels}
+ want_mds_caps = ['allow {0} path={1}'.format(access_level, path)
+ for access_level in access_levels]
+ if namespace:
+ want_osd_caps = ['allow {0} pool={1} namespace={2}'.format(access_level, pool_name, namespace)
+ for access_level in access_levels]
+ else:
+ want_osd_caps = ['allow {0} pool={1}'.format(access_level, pool_name)
+ for access_level in access_levels]
+
try:
existing = self._rados_command(
}
)
- def cap_remove(orig, want):
- cap_tokens = set(orig.split(","))
- return ",".join(cap_tokens.difference(want))
+ def cap_remove(orig_mds_caps, orig_osd_caps, want_mds_caps, want_osd_caps):
+ mds_cap_tokens = orig_mds_caps.split(",")
+ osd_cap_tokens = orig_osd_caps.split(",")
+
+ for want_mds_cap, want_osd_cap in zip(want_mds_caps, want_osd_caps):
+ if want_mds_cap in mds_cap_tokens:
+ mds_cap_tokens.remove(want_mds_cap)
+ osd_cap_tokens.remove(want_osd_cap)
+ break
+
+ return ",".join(mds_cap_tokens), ",".join(osd_cap_tokens)
cap = existing[0]
- osd_cap_str = cap_remove(cap['caps'].get('osd', ""), want_osd_caps)
- mds_cap_str = cap_remove(cap['caps'].get('mds', ""), want_mds_caps)
- if (not osd_cap_str) and (not mds_cap_str):
+ orig_mds_caps = cap['caps'].get('mds', "")
+ orig_osd_caps = cap['caps'].get('osd', "")
+ mds_cap_str, osd_cap_str = cap_remove(orig_mds_caps, orig_osd_caps,
+ want_mds_caps, want_osd_caps)
+
+ if not mds_cap_str:
self._rados_command('auth del', {'entity': client_entity}, decode=False)
else:
self._rados_command(
-
from datetime import datetime
from threading import Event
import json
import errno
+import time
from mgr_module import MgrModule
try:
from influxdb import InfluxDBClient
from influxdb.exceptions import InfluxDBClientError
+ from requests.exceptions import ConnectionError
except ImportError:
InfluxDBClient = None
+
class Module(MgrModule):
COMMANDS = [
+ {
+ "cmd": "influx config-set name=key,type=CephString "
+ "name=value,type=CephString",
+ "desc": "Set a configuration value",
+ "perm": "rw"
+ },
+ {
+ "cmd": "influx config-show",
+ "desc": "Show current configuration",
+ "perm": "r"
+ },
+ {
+ "cmd": "influx send",
+ "desc": "Force sending data to Influx",
+ "perm": "rw"
+ },
{
"cmd": "influx self-test",
"desc": "debug the module",
- "perm": "rw"
+ "perm": "rw"
},
]
+ config_keys = {
+ 'hostname': None,
+ 'port': 8086,
+ 'database': 'ceph',
+ 'username': None,
+ 'password': None,
+ 'interval': 5,
+ 'ssl': 'false',
+ 'verify_ssl': 'true'
+ }
def __init__(self, *args, **kwargs):
super(Module, self).__init__(*args, **kwargs)
self.event = Event()
- self.run = True
+ self.run = True
+ self.config = dict()
+ def get_fsid(self):
+ return self.get('mon_map')['fsid']
def get_latest(self, daemon_type, daemon_name, stat):
data = self.get_counter(daemon_type, daemon_name, stat)[stat]
if data:
return data[-1][1]
- else:
- return 0
+ return 0
def get_df_stats(self):
df = self.get("df")
data = []
+ now = datetime.utcnow().isoformat() + 'Z'
+
df_types = [
'bytes_used',
+ 'kb_used',
'dirty',
+ 'rd',
'rd_bytes',
'raw_bytes_used',
+ 'wr',
'wr_bytes',
'objects',
- 'max_avail'
+ 'max_avail',
+ 'quota_objects',
+ 'quota_bytes'
]
for df_type in df_types:
point = {
"measurement": "ceph_pool_stats",
"tags": {
- "pool_name" : pool['name'],
- "pool_id" : pool['id'],
- "type_instance" : df_type,
- "mgr_id" : self.get_mgr_id(),
+ "pool_name": pool['name'],
+ "pool_id": pool['id'],
+ "type_instance": df_type,
+ "fsid": self.get_fsid()
},
- "time" : datetime.utcnow().isoformat() + 'Z',
- "fields": {
- "value" : pool['stats'][df_type],
- }
+ "time": now,
+ "fields": {
+ "value": pool['stats'][df_type],
+ }
}
data.append(point)
return data
def get_daemon_stats(self):
data = []
+ now = datetime.utcnow().isoformat() + 'Z'
+
for daemon, counters in self.get_all_perf_counters().iteritems():
- svc_type, svc_id = daemon.split(".")
+ svc_type, svc_id = daemon.split(".", 1)
metadata = self.get_metadata(svc_type, svc_id)
for path, counter_info in counters.items():
"tags": {
"ceph_daemon": daemon,
"type_instance": path,
- "host": metadata['hostname']
+ "host": metadata['hostname'],
+ "fsid": self.get_fsid()
},
- "time": datetime.utcnow().isoformat() + 'Z',
+ "time": now,
"fields": {
"value": value
}
return data
+ def set_config_option(self, option, value):
+ if option not in self.config_keys.keys():
+ raise RuntimeError('{0} is a unknown configuration '
+ 'option'.format(option))
+
+ if option in ['port', 'interval']:
+ try:
+ value = int(value)
+ except (ValueError, TypeError):
+ raise RuntimeError('invalid {0} configured. Please specify '
+ 'a valid integer'.format(option))
+
+ if option == 'interval' and value < 5:
+ raise RuntimeError('interval should be set to at least 5 seconds')
+
+ if option in ['ssl', 'verify_ssl']:
+ value = value.lower() == 'true'
+
+ self.config[option] = value
+
+ def init_module_config(self):
+ self.config['hostname'] = \
+ self.get_config("hostname", default=self.config_keys['hostname'])
+ self.config['port'] = \
+ int(self.get_config("port", default=self.config_keys['port']))
+ self.config['database'] = \
+ self.get_config("database", default=self.config_keys['database'])
+ self.config['username'] = \
+ self.get_config("username", default=self.config_keys['username'])
+ self.config['password'] = \
+ self.get_config("password", default=self.config_keys['password'])
+ self.config['interval'] = \
+ int(self.get_config("interval",
+ default=self.config_keys['interval']))
+ ssl = self.get_config("ssl", default=self.config_keys['ssl'])
+ self.config['ssl'] = ssl.lower() == 'true'
+ verify_ssl = \
+ self.get_config("verify_ssl", default=self.config_keys['verify_ssl'])
+ self.config['verify_ssl'] = verify_ssl.lower() == 'true'
+
def send_to_influx(self):
- host = self.get_config("hostname")
- if not host:
- self.log.error("No InfluxDB server configured, please set"
- "`hostname` configuration key.")
+ if not self.config['hostname']:
+ self.log.error("No Influx server configured, please set one using: "
+ "ceph influx config-set hostname <hostname>")
+ self.set_health_checks({
+ 'MGR_INFLUX_NO_SERVER': {
+ 'severity': 'warning',
+ 'summary': 'No InfluxDB server configured',
+ 'detail': ['Configuration option hostname not set']
+ }
+ })
return
- port = int(self.get_config("port", default="8086"))
- database = self.get_config("database", default="ceph")
-
# If influx server has authentication turned off then
# missing username/password is valid.
- username = self.get_config("username", default="")
- password = self.get_config("password", default="")
-
- client = InfluxDBClient(host, port, username, password, database)
+ self.log.debug("Sending data to Influx host: %s",
+ self.config['hostname'])
+ client = InfluxDBClient(self.config['hostname'], self.config['port'],
+ self.config['username'],
+ self.config['password'],
+ self.config['database'],
+ self.config['ssl'],
+ self.config['verify_ssl'])
- # using influx client get_list_database requires admin privs, instead we'll catch the not found exception and inform the user if db can't be created
+ # using influx client get_list_database requires admin privs,
+ # instead we'll catch the not found exception and inform the user if
+ # db can not be created
try:
client.write_points(self.get_df_stats(), 'ms')
client.write_points(self.get_daemon_stats(), 'ms')
+ self.set_health_checks(dict())
+ except ConnectionError as e:
+ self.log.exception("Failed to connect to Influx host %s:%d",
+ self.config['hostname'], self.config['port'])
+ self.set_health_checks({
+ 'MGR_INFLUX_SEND_FAILED': {
+ 'severity': 'warning',
+ 'summary': 'Failed to send data to InfluxDB server at %s:%d'
+ ' due to an connection error'
+ % (self.config['hostname'], self.config['port']),
+ 'detail': [str(e)]
+ }
+ })
except InfluxDBClientError as e:
if e.code == 404:
- self.log.info("Database '{0}' not found, trying to create (requires admin privs). You can also create manually and grant write privs to user '{1}'".format(database,username))
- client.create_database(database)
+ self.log.info("Database '%s' not found, trying to create "
+ "(requires admin privs). You can also create "
+ "manually and grant write privs to user "
+ "'%s'", self.config['database'],
+ self.config['username'])
+ client.create_database(self.config['database'])
else:
+ self.set_health_checks({
+ 'MGR_INFLUX_SEND_FAILED': {
+ 'severity': 'warning',
+ 'summary': 'Failed to send data to InfluxDB',
+ 'detail': [str(e)]
+ }
+ })
raise
def shutdown(self):
self.event.set()
def handle_command(self, cmd):
+ if cmd['prefix'] == 'influx config-show':
+ return 0, json.dumps(self.config), ''
+ elif cmd['prefix'] == 'influx config-set':
+ key = cmd['key']
+ value = cmd['value']
+ if not value:
+ return -errno.EINVAL, '', 'Value should not be empty or None'
+
+ self.log.debug('Setting configuration option %s to %s', key, value)
+ self.set_config_option(key, value)
+ self.set_config(key, value)
+ return 0, 'Configuration option {0} updated'.format(key), ''
+ elif cmd['prefix'] == 'influx send':
+ self.send_to_influx()
+ return 0, 'Sending data to Influx', ''
if cmd['prefix'] == 'influx self-test':
daemon_stats = self.get_daemon_stats()
assert len(daemon_stats)
df_stats = self.get_df_stats()
+
result = {
'daemon_stats': daemon_stats,
'df_stats': df_stats
}
+
return 0, json.dumps(result, indent=2), 'Self-test OK'
- else:
- return (-errno.EINVAL, '',
- "Command not found '{0}'".format(cmd['prefix']))
+
+ return (-errno.EINVAL, '',
+ "Command not found '{0}'".format(cmd['prefix']))
def serve(self):
if InfluxDBClient is None:
return
self.log.info('Starting influx module')
+ self.init_module_config()
self.run = True
+
while self.run:
+ start = time.time()
self.send_to_influx()
- self.log.debug("Running interval loop")
- interval = self.get_config("interval")
- if interval is None:
- interval = 5
- self.log.debug("sleeping for %d seconds",interval)
- self.event.wait(interval)
-
+ runtime = time.time() - start
+ self.log.debug('Finished sending data in Influx in %.3f seconds',
+ runtime)
+ self.log.debug("Sleeping for %d seconds", self.config['interval'])
+ self.event.wait(self.config['interval'])
PERFCOUNTER_LONGRUNAVG = 4
PERFCOUNTER_COUNTER = 8
PERFCOUNTER_HISTOGRAM = 0x10
- PERFCOUNTER_TYPE_MASK = ~2
+ PERFCOUNTER_TYPE_MASK = ~3
def __init__(self, module_name, py_modules_ptr, this_ptr):
self.module_name = module_name
return ''
+ def _perfvalue_to_value(self, stattype, value):
+ if stattype & self.PERFCOUNTER_TIME:
+ # Convert from ns to seconds
+ return value / 1000000000.0
+ else:
+ return value
+
def get_server(self, hostname):
"""
Called by the plugin to load information about a particular
else:
return 0
+ def get_latest_avg(daemon_type, daemon_name, counter):
+ data = self.get_counter(daemon_type, daemon_name, counter)[counter]
+ if data:
+ return (data[-1][1], data[-1][2])
+ else:
+ return (0, 0)
+
for server in self.list_servers():
for service in server['services']:
if service['type'] not in ("rgw", "mds", "osd", "mon"):
if counter_schema['priority'] < prio_limit:
continue
- counter_info = counter_schema
- counter_info['value'] = get_latest(service['type'], service['id'], counter_path)
+ counter_info = dict(counter_schema)
+
+ # Also populate count for the long running avgs
+ if counter_schema['type'] & self.PERFCOUNTER_LONGRUNAVG:
+ v, c = get_latest_avg(
+ service['type'],
+ service['id'],
+ counter_path
+ )
+ counter_info['value'], counter_info['count'] = v, c
+ result[svc_full_name][counter_path] = counter_info
+ else:
+ counter_info['value'] = get_latest(
+ service['type'],
+ service['id'],
+ counter_path
+ )
+
result[svc_full_name][counter_path] = counter_info
self.log.debug("returning {0} counter".format(len(result)))
and/or the monitor cluster is down.
"""
- return self._ceph_have_mon_connection()
\ No newline at end of file
+ return self._ceph_have_mon_connection()
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
'norecover', 'noscrub', 'nodeep-scrub')
-FS_METADATA = ('data_pools', 'id', 'metadata_pool', 'name')
+FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')
-MDS_METADATA = ('id', 'fs', 'hostname', 'public_addr', 'rank', 'ceph_version')
+MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
+ 'ceph_version')
-MON_METADATA = ('id', 'hostname', 'public_addr', 'rank', 'ceph_version')
+MON_METADATA = ('ceph_daemon', 'hostname', 'public_addr', 'rank', 'ceph_version')
-OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'hostname', 'public_addr',
- 'ceph_version')
+OSD_METADATA = ('ceph_daemon', 'cluster_addr', 'device_class', 'hostname',
+ 'public_addr', 'ceph_version')
OSD_STATUS = ['weight', 'up', 'in']
POOL_METADATA = ('pool_id', 'name')
-RGW_METADATA = ('id', 'hostname', 'ceph_version')
+RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')
-DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
+DISK_OCCUPATION = ( 'ceph_daemon', 'device','instance')
+
+NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
class Metrics(object):
'DF pool {}'.format(state),
('pool_id',)
)
+ for state in NUM_OBJECTS:
+ path = 'num_objects_{}'.format(state)
+ metrics[path] = Metric(
+ 'gauge',
+ path,
+ 'Number of {} objects'.format(state),
+ )
return metrics
fs['id'],
fs['mdsmap']['metadata_pool'],
fs['mdsmap']['fs_name']))
+ self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
for gid, daemon in fs['mdsmap']['info'].items():
id_ = daemon['name']
host_version = servers.get((id_, 'mds'), ('',''))
self.metrics.append('mds_metadata', 1,
- (id_, fs['id'], host_version[0],
- daemon['addr'], daemon['rank'],
- host_version[1]))
+ ('mds.{}'.format(id_), fs['id'],
+ host_version[0], daemon['addr'],
+ daemon['rank'], host_version[1]))
def get_quorum_status(self):
mon_status = json.loads(self.get('mon_status')['json'])
id_ = mon['name']
host_version = servers.get((id_, 'mon'), ('',''))
self.metrics.append('mon_metadata', 1,
- (id_, host_version[0],
+ ('mon.{}'.format(id_), host_version[0],
mon['public_addr'].split(':')[0], rank,
host_version[1]))
in_quorum = int(rank in mon_status['quorum'])
self.metrics.append('mon_quorum_status', in_quorum,
- ('mon_{}'.format(id_),))
+ ('mon.{}'.format(id_),))
def get_pg_status(self):
# TODO add per pool status?
host_version = servers.get((str(id_), 'osd'), ('',''))
self.metrics.append('osd_metadata', 1, (
+ 'osd.{}'.format(id_),
c_addr,
dev_class,
- id_, host_version[0],
+ host_version[0],
p_addr, host_version[1]
))
self.log.debug("Got dev for osd {0}: {1}/{2}".format(
id_, osd_hostname, osd_dev_node))
self.metrics.set('disk_occupation', 1, (
- osd_hostname,
+ "osd.{0}".format(id_),
osd_dev_node,
- "osd.{0}".format(id_)
+ osd_hostname
))
else:
self.log.info("Missing dev node metadata for osd {0}, skipping "
self.metrics.append(
'rgw_metadata',
1,
- (service_id, hostname, version)
+ ('{}.{}'.format(service_type, service_id), hostname, version)
)
+ def get_num_objects(self):
+ pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
+ for obj in NUM_OBJECTS:
+ stat = 'num_objects_{}'.format(obj)
+ self.metrics.set(stat, pg_sum[stat])
+
def collect(self):
self.get_health()
self.get_df()
self.get_quorum_status()
self.get_metadata_and_osd_status()
self.get_pg_status()
+ self.get_num_objects()
for daemon, counters in self.get_all_perf_counters().items():
for path, counter_info in counters.items():
+ # Skip histograms, they are represented by long running avgs
stattype = self._stattype_to_str(counter_info['type'])
- # XXX simplify first effort: no histograms
- # averages are already collapsed to one value for us
if not stattype or stattype == 'histogram':
self.log.debug('ignoring %s, type %s' % (path, stattype))
continue
- self.metrics.add_metric(path, Metric(
+ # Get the value of the counter
+ value = self._perfvalue_to_value(counter_info['type'], counter_info['value'])
+
+ # Represent the long running avgs as sum/count pairs
+ if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
+ _path = path + '_sum'
+ self.metrics.add_metric(_path, Metric(
+ stattype,
+ _path,
+ counter_info['description'] + ' Total',
+ ("ceph_daemon",),
+ ))
+ self.metrics.append(_path, value, (daemon,))
+
+ _path = path + '_count'
+ self.metrics.add_metric(_path, Metric(
+ 'counter',
+ _path,
+ counter_info['description'] + ' Count',
+ ("ceph_daemon",),
+ ))
+ self.metrics.append(_path, counter_info['count'], (daemon,))
+ else:
+ self.metrics.add_metric(path, Metric(
stattype,
path,
counter_info['description'],
("ceph_daemon",),
))
+ self.metrics.append(path, value, (daemon,))
- self.metrics.append(path, counter_info['value'], (daemon,))
# It is sufficient to reset the pending metrics once per scrape
self.metrics.reset()
# Valid values for the 'var' argument to 'ceph osd pool set'
POOL_PROPERTIES_1 = [
'size', 'min_size', 'crash_replay_interval', 'pg_num',
- 'crush_rule', 'hashpspool',
+ 'crush_rule', 'hashpspool', 'auid',
]
POOL_PROPERTIES_2 = [
'prefix': 'osd pool set',
'pool': pool_name,
'var': var,
+ 'val': args[var],
})
return commands
self.finished
),
'waiting': map(
- lambda x: {
- 'command': x.command,
- 'outs': x.outs,
- 'outb': x.outb,
- },
+ lambda x: map(
+ lambda y: common.humanify_command(y),
+ x
+ ),
self.waiting
),
'failed': map(
int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t * snaps, int maxlen)
int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id, time_t * t)
+ int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid)
+ int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid)
+ int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+ rados_snap_t snap_seq,
+ rados_snap_t *snap,
+ int num_snaps)
+ int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io, const char *oid,
+ rados_snap_t snapid)
+
int rados_lock_exclusive(rados_ioctx_t io, const char * oid, const char * name,
const char * cookie, const char * desc,
timeval * duration, uint8_t flags)
if ret != 0:
raise make_ex(ret, "Failed to rollback %s" % oid)
+ def create_self_managed_snap(self):
+ """
+ Creates a self-managed snapshot
+
+ :returns: snap id on success
+
+ :raises: :class:`Error`
+ """
+ self.require_ioctx_open()
+ cdef:
+ rados_snap_t _snap_id
+ with nogil:
+ ret = rados_ioctx_selfmanaged_snap_create(self.io, &_snap_id)
+ if ret != 0:
+ raise make_ex(ret, "Failed to create self-managed snapshot")
+ return int(_snap_id)
+
+ @requires(('snap_id', int))
+ def remove_self_managed_snap(self, snap_id):
+ """
+ Removes a self-managed snapshot
+
+ :param snap_id: the id of the snapshot
+ :type snap_id: int
+
+ :raises: :class:`TypeError`
+ :raises: :class:`Error`
+ """
+ self.require_ioctx_open()
+ cdef:
+ rados_snap_t _snap_id = snap_id
+ with nogil:
+ ret = rados_ioctx_selfmanaged_snap_remove(self.io, _snap_id)
+ if ret != 0:
+ raise make_ex(ret, "Failed to remove self-managed snapshot")
+
+ def set_self_managed_snap_write(self, snaps):
+ """
+ Updates the write context to the specified self-managed
+ snapshot ids.
+
+ :param snaps: all associated self-managed snapshot ids
+ :type snaps: list
+
+ :raises: :class:`TypeError`
+ :raises: :class:`Error`
+ """
+ self.require_ioctx_open()
+ sorted_snaps = []
+ snap_seq = 0
+ if snaps:
+ sorted_snaps = sorted([int(x) for x in snaps], reverse=True)
+ snap_seq = sorted_snaps[0]
+
+ cdef:
+ rados_snap_t _snap_seq = snap_seq
+ rados_snap_t *_snaps = NULL
+ int _num_snaps = len(sorted_snaps)
+ try:
+ _snaps = <rados_snap_t *>malloc(_num_snaps * sizeof(rados_snap_t))
+ for i in range(len(sorted_snaps)):
+ _snaps[i] = sorted_snaps[i]
+ with nogil:
+ ret = rados_ioctx_selfmanaged_snap_set_write_ctx(self.io,
+ _snap_seq,
+ _snaps,
+ _num_snaps)
+ if ret != 0:
+ raise make_ex(ret, "Failed to update snapshot write context")
+ finally:
+ free(_snaps)
+
+ @requires(('oid', str_type), ('snap_id', int))
+ def rollback_self_managed_snap(self, oid, snap_id):
+ """
+ Rolls a specific object back to a self-managed snapshot revision
+
+ :param oid: the name of the object
+ :type oid: str
+ :param snap_id: the id of the snapshot
+ :type snap_id: int
+
+ :raises: :class:`TypeError`
+ :raises: :class:`Error`
+ """
+ self.require_ioctx_open()
+ oid = cstr(oid, 'oid')
+ cdef:
+ char *_oid = oid
+ rados_snap_t _snap_id = snap_id
+ with nogil:
+ ret = rados_ioctx_selfmanaged_snap_rollback(self.io, _oid, _snap_id)
+ if ret != 0:
+ raise make_ex(ret, "Failed to rollback %s" % oid)
+
def get_last_version(self):
"""
Return the version of the last object read or written to.
int rbd_get_id(rbd_image_t image, char *id, size_t id_len)
int rbd_get_block_name_prefix(rbd_image_t image, char *prefix,
size_t prefix_len)
+ int64_t rbd_get_data_pool_id(rbd_image_t image)
int rbd_get_parent_info2(rbd_image_t image,
char *parent_poolname, size_t ppoolnamelen,
char *parent_name, size_t pnamelen,
return None
if isinstance(val, bytes):
return val
- elif isinstance(val, unicode):
+ elif isinstance(val, str):
+ return val.encode(encoding)
+ elif sys.version_info < (3, 0) and isinstance(val, unicode):
return val.encode(encoding)
else:
raise InvalidArgument('%s must be a string' % name)
:raises: :class:`FunctionNotSupported`
"""
name = cstr(name, 'name')
+ data_pool = cstr(data_pool, 'data_pool', opt=True)
cdef:
rados_ioctx_t _ioctx = convert_ioctx(ioctx)
char *_name = name
p_snapname = cstr(p_snapname, 'p_snapname')
p_name = cstr(p_name, 'p_name')
c_name = cstr(c_name, 'c_name')
+ data_pool = cstr(data_pool, 'data_pool', opt=True)
cdef:
rados_ioctx_t _p_ioctx = convert_ioctx(p_ioctx)
rados_ioctx_t _c_ioctx = convert_ioctx(c_ioctx)
'id' : decode_cstr(c_info.id),
'name' : decode_cstr(c_info.name),
'source' : __source_string[c_info.source],
- 'deletion_time' : datetime.fromtimestamp(c_info.deletion_time),
- 'deferment_end_time' : datetime.fromtimestamp(c_info.deferment_end_time)
+ 'deletion_time' : datetime.utcfromtimestamp(c_info.deletion_time),
+ 'deferment_end_time' : datetime.utcfromtimestamp(c_info.deferment_end_time)
}
rbd_trash_get_cleanup(&c_info)
return info
},
'state' : self.images[i].state,
'description' : decode_cstr(self.images[i].description),
- 'last_update' : datetime.fromtimestamp(self.images[i].last_update),
+ 'last_update' : datetime.utcfromtimestamp(self.images[i].last_update),
'up' : self.images[i].up,
}
if self.size < self.max_read:
finally:
free(prefix)
+ def data_pool_id(self):
+ """
+ Get the pool id of the pool where the data of this RBD image is stored.
+
+ :returns: int - the pool id
+ """
+ return rbd_get_data_pool_id(self.image)
+
def parent_info(self):
"""
Get information about a cloned image's parent (if any)
:raises: :class:`ArgumentOutOfRange`
"""
dest_name = cstr(dest_name, 'dest_name')
+ data_pool = cstr(data_pool, 'data_pool', opt=True)
cdef:
rados_ioctx_t _dest_ioctx = convert_ioctx(dest_ioctx)
char *_dest_name = dest_name
ret = rbd_snap_get_timestamp(self.image, _snap_id, ×tamp)
if ret != 0:
raise make_ex(ret, 'error getting snapshot timestamp for image: %s, snap_id: %d' % (self.name, snap_id))
- return datetime.fromtimestamp(timestamp.tv_sec)
+ return datetime.utcfromtimestamp(timestamp.tv_sec)
def remove_snap_limit(self):
"""
ret = rbd_get_create_timestamp(self.image, ×tamp)
if ret != 0:
raise make_ex(ret, 'error getting create timestamp for image: %s' % (self.name))
- return datetime.fromtimestamp(timestamp.tv_sec)
+ return datetime.utcfromtimestamp(timestamp.tv_sec)
def flatten(self):
"""
},
'state' : c_status.state,
'description' : decode_cstr(c_status.description),
- 'last_update' : datetime.fromtimestamp(c_status.last_update),
+ 'last_update' : datetime.utcfromtimestamp(c_status.last_update),
'up' : c_status.up,
}
free(c_status.name)
'id' : decode_cstr(self.entries[i].id),
'name' : decode_cstr(self.entries[i].name),
'source' : TrashIterator.__source_string[self.entries[i].source],
- 'deletion_time' : datetime.fromtimestamp(self.entries[i].deletion_time),
- 'deferment_end_time' : datetime.fromtimestamp(self.entries[i].deferment_end_time)
+ 'deletion_time' : datetime.utcfromtimestamp(self.entries[i].deletion_time),
+ 'deferment_end_time' : datetime.utcfromtimestamp(self.entries[i].deferment_end_time)
}
def __dealloc__(self):
#define SWIFT_GROUP_ALL_USERS ".r:*"
-static int parse_list(const std::string& uid_list,
+static int parse_list(const char* uid_list,
std::vector<std::string>& uids) /* out */
{
- char *s = strdup(uid_list.c_str());
+ char *s = strdup(uid_list);
if (!s) {
return -ENOMEM;
}
int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store,
const rgw_user& id,
const std::string& name,
- const std::string& read_list,
- const std::string& write_list,
+ const char* read_list,
+ const char* write_list,
uint32_t& rw_mask)
{
acl.create_default(id, name);
owner.set_name(name);
rw_mask = 0;
- if (read_list.size()) {
+ if (read_list) {
std::vector<std::string> uids;
int r = parse_list(read_list, uids);
if (r < 0) {
}
rw_mask |= SWIFT_PERM_READ;
}
- if (write_list.size()) {
+ if (write_list) {
std::vector<std::string> uids;
int r = parse_list(write_list, uids);
if (r < 0) {
int create(RGWRados *store,
const rgw_user& id,
const std::string& name,
- const std::string& read_list,
- const std::string& write_list,
+ const char* read_list,
+ const char* write_list,
uint32_t& rw_mask);
void filter_merge(uint32_t mask, RGWAccessControlPolicy_SWIFT *policy);
void to_str(std::string& read, std::string& write);
cout << " --max-buckets max number of buckets for a user\n";
cout << " --admin set the admin flag on the user\n";
cout << " --system set the system flag on the user\n";
- cout << " --bucket=<bucket>\n";
- cout << " --pool=<pool>\n";
- cout << " --object=<object>\n";
- cout << " --date=<date>\n";
- cout << " --start-date=<date>\n";
- cout << " --end-date=<date>\n";
- cout << " --bucket-id=<bucket-id>\n";
- cout << " --shard-id=<shard-id> optional for mdlog list\n";
+ cout << " --bucket=<bucket> Specify the bucket name. Also used by the quota command.\n";
+ cout << " --pool=<pool> Specify the pool name. Also used to scan for leaked rados objects.\n";
+ cout << " --object=<object> object name\n";
+ cout << " --date=<date> date in the format yyyy-mm-dd\n";
+ cout << " --start-date=<date> start date in the format yyyy-mm-dd\n";
+ cout << " --end-date=<date> end date in the format yyyy-mm-dd\n";
+ cout << " --bucket-id=<bucket-id> bucket id\n";
+ cout << " --shard-id=<shard-id> optional for: \n";
+ cout << " mdlog list\n";
+ cout << " data sync status\n";
cout << " required for: \n";
cout << " mdlog trim\n";
cout << " replica mdlog get/delete\n";
cout << " replica datalog get/delete\n";
+ cout << " --max-entries=<entries> max entries for listing operations\n";
cout << " --metadata-key=<key> key to retrieve metadata from with metadata get\n";
cout << " --remote=<remote> zone or zonegroup id of remote gateway\n";
cout << " --period=<id> period id\n";
OPT_BUCKET_STATS,
OPT_BUCKET_CHECK,
OPT_BUCKET_SYNC_STATUS,
+ OPT_BUCKET_SYNC_MARKERS,
OPT_BUCKET_SYNC_INIT,
OPT_BUCKET_SYNC_RUN,
OPT_BUCKET_SYNC_DISABLE,
if (strcmp(prev_cmd, "sync") == 0) {
if (strcmp(cmd, "status") == 0)
return OPT_BUCKET_SYNC_STATUS;
+ if (strcmp(cmd, "markers") == 0)
+ return OPT_BUCKET_SYNC_MARKERS;
if (strcmp(cmd, "init") == 0)
return OPT_BUCKET_SYNC_INIT;
if (strcmp(cmd, "run") == 0)
int num_full = 0;
int num_inc = 0;
int total_shards = 0;
+ set<int> shards_behind_set;
for (auto marker_iter : sync_status.sync_markers) {
full_total += marker_iter.second.total_entries;
total_shards++;
+ int shard_id = marker_iter.first;
if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
num_full++;
full_complete += marker_iter.second.pos;
+ shards_behind_set.insert(shard_id);
} else {
full_complete += marker_iter.second.total_entries;
}
if (local_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync &&
master_marker > local_iter.second.marker) {
shards_behind[shard_id] = local_iter.second.marker;
+ shards_behind_set.insert(shard_id);
}
}
}
push_ss(ss, status) << "metadata is caught up with master";
} else {
push_ss(ss, status) << "metadata is behind on " << total_behind << " shards";
+
+ push_ss(ss, status) << "behind shards: " << "[" << shards_behind_set << "]";
map<int, rgw_mdlog_shard_data> master_pos;
ret = sync.read_master_log_shards_next(sync_status.sync_info.period, shards_behind, &master_pos);
return;
}
+ set<int> recovering_shards;
+ ret = sync.read_recovering_shards(sync_status.sync_info.num_shards, recovering_shards);
+ if (ret < 0 && ret != ENOENT) {
+ push_ss(ss, status, tab) << string("failed read recovering shards: ") + cpp_strerror(-ret);
+ return;
+ }
+
string status_str;
switch (sync_status.sync_info.state) {
case rgw_data_sync_info::StateInit:
int num_full = 0;
int num_inc = 0;
int total_shards = 0;
+ set<int> shards_behind_set;
for (auto marker_iter : sync_status.sync_markers) {
+ int shard_id = marker_iter.first;
full_total += marker_iter.second.total_entries;
total_shards++;
if (marker_iter.second.state == rgw_data_sync_marker::SyncState::FullSync) {
num_full++;
full_complete += marker_iter.second.pos;
+ shards_behind_set.insert(shard_id);
} else {
full_complete += marker_iter.second.total_entries;
}
if (local_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync &&
master_marker > local_iter.second.marker) {
shards_behind[shard_id] = local_iter.second.marker;
+ shards_behind_set.insert(shard_id);
}
}
int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc);
- if (total_behind == 0) {
+ int total_recovering = recovering_shards.size();
+ if (total_behind == 0 && total_recovering == 0) {
push_ss(ss, status, tab) << "data is caught up with source";
- } else {
+ } else if (total_behind > 0) {
push_ss(ss, status, tab) << "data is behind on " << total_behind << " shards";
+ push_ss(ss, status, tab) << "behind shards: " << "[" << shards_behind_set << "]" ;
+
map<int, rgw_datalog_shard_data> master_pos;
ret = sync.read_source_log_shards_next(shards_behind, &master_pos);
if (ret < 0) {
}
}
+ if (total_recovering > 0) {
+ push_ss(ss, status, tab) << total_recovering << " shards are recovering";
+ push_ss(ss, status, tab) << "recovering shards: " << "[" << recovering_shards << "]";
+ }
+
flush_ss(ss, status);
}
tab_dump("data sync", width, data_status);
}
+struct indented {
+ int w; // indent width
+ boost::string_view header;
+ indented(int w, boost::string_view header = "") : w(w), header(header) {}
+};
+std::ostream& operator<<(std::ostream& out, const indented& h) {
+ return out << std::setw(h.w) << h.header << std::setw(1) << ' ';
+}
+
+static int remote_bilog_markers(RGWRados *store, const RGWZone& source,
+ RGWRESTConn *conn, const RGWBucketInfo& info,
+ BucketIndexShardsManager *markers)
+{
+ const auto instance_key = info.bucket.get_key();
+ const rgw_http_param_pair params[] = {
+ { "type" , "bucket-index" },
+ { "bucket-instance", instance_key.c_str() },
+ { "info" , nullptr },
+ { nullptr, nullptr }
+ };
+ rgw_bucket_index_marker_info result;
+ int r = conn->get_json_resource("/admin/log/", params, result);
+ if (r < 0) {
+ lderr(store->ctx()) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = markers->from_string(result.max_marker, -1);
+ if (r < 0) {
+ lderr(store->ctx()) << "failed to decode remote log markers" << dendl;
+ return r;
+ }
+ return 0;
+}
+
+static int bucket_source_sync_status(RGWRados *store, const RGWZone& zone,
+ const RGWZone& source, RGWRESTConn *conn,
+ const RGWBucketInfo& bucket_info,
+ int width, std::ostream& out)
+{
+ out << indented{width, "source zone"} << source.id << " (" << source.name << ")\n";
+
+ // syncing from this zone?
+ if (!zone.syncs_from(source.id)) {
+ out << indented{width} << "not in sync_from\n";
+ return 0;
+ }
+ std::vector<rgw_bucket_shard_sync_info> status;
+ int r = rgw_bucket_sync_status(store, source.id, bucket_info, &status);
+ if (r < 0) {
+ lderr(store->ctx()) << "failed to read bucket sync status: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ int num_full = 0;
+ int num_inc = 0;
+ uint64_t full_complete = 0;
+ const int total_shards = status.size();
+
+ using BucketSyncState = rgw_bucket_shard_sync_info::SyncState;
+ for (size_t shard_id = 0; shard_id < total_shards; shard_id++) {
+ auto& m = status[shard_id];
+ if (m.state == BucketSyncState::StateFullSync) {
+ num_full++;
+ full_complete += m.full_marker.count;
+ } else if (m.state == BucketSyncState::StateIncrementalSync) {
+ num_inc++;
+ }
+ }
+
+ out << indented{width} << "full sync: " << num_full << "/" << total_shards << " shards\n";
+ if (num_full > 0) {
+ out << indented{width} << "full sync: " << full_complete << " objects completed\n";
+ }
+ out << indented{width} << "incremental sync: " << num_inc << "/" << total_shards << " shards\n";
+
+ BucketIndexShardsManager remote_markers;
+ r = remote_bilog_markers(store, source, conn, bucket_info, &remote_markers);
+ if (r < 0) {
+ lderr(store->ctx()) << "failed to read remote log: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::set<int> shards_behind;
+ for (auto& r : remote_markers.get()) {
+ auto shard_id = r.first;
+ auto& m = status[shard_id];
+ if (r.second.empty()) {
+ continue; // empty bucket index shard
+ }
+ auto pos = BucketIndexShardsManager::get_shard_marker(m.inc_marker.position);
+ if (m.state != BucketSyncState::StateIncrementalSync || pos != r.second) {
+ shards_behind.insert(shard_id);
+ }
+ }
+ if (shards_behind.empty()) {
+ out << indented{width} << "bucket is caught up with source\n";
+ } else {
+ out << indented{width} << "bucket is behind on " << shards_behind.size() << " shards\n";
+ out << indented{width} << "behind shards: [" << shards_behind << "]\n" ;
+ }
+ return 0;
+}
+
+static int bucket_sync_status(RGWRados *store, const RGWBucketInfo& info,
+ const std::string& source_zone_id,
+ std::ostream& out)
+{
+ RGWRealm& realm = store->realm;
+ RGWZoneGroup& zonegroup = store->get_zonegroup();
+ RGWZone& zone = store->get_zone();
+ constexpr int width = 15;
+
+ out << indented{width, "realm"} << realm.get_id() << " (" << realm.get_name() << ")\n";
+ out << indented{width, "zonegroup"} << zonegroup.get_id() << " (" << zonegroup.get_name() << ")\n";
+ out << indented{width, "zone"} << zone.id << " (" << zone.name << ")\n";
+ out << indented{width, "bucket"} << info.bucket << "\n\n";
+
+ if (!info.datasync_flag_enabled()) {
+ out << "Sync is disabled for bucket " << info.bucket.name << '\n';
+ return 0;
+ }
+
+ if (!source_zone_id.empty()) {
+ auto z = zonegroup.zones.find(source_zone_id);
+ if (z == zonegroup.zones.end()) {
+ lderr(store->ctx()) << "Source zone not found in zonegroup "
+ << zonegroup.get_name() << dendl;
+ return -EINVAL;
+ }
+ auto c = store->zone_conn_map.find(source_zone_id);
+ if (c == store->zone_conn_map.end()) {
+ lderr(store->ctx()) << "No connection to zone " << z->second.name << dendl;
+ return -EINVAL;
+ }
+ return bucket_source_sync_status(store, zone, z->second, c->second,
+ info, width, out);
+ }
+
+ for (const auto& z : zonegroup.zones) {
+ auto c = store->zone_conn_map.find(z.second.id);
+ if (c != store->zone_conn_map.end()) {
+ bucket_source_sync_status(store, zone, z.second, c->second,
+ info, width, out);
+ }
+ }
+ return 0;
+}
+
static void parse_tier_config_param(const string& s, map<string, string, ltstr_nocase>& out)
{
list<string> confs;
OPT_REALM_RENAME, OPT_REALM_SET,
OPT_REALM_DEFAULT, OPT_REALM_PULL};
+ std::set<int> readonly_ops_list = {
+ OPT_USER_INFO,
+ OPT_USER_STATS,
+ OPT_BUCKETS_LIST,
+ OPT_BUCKET_LIMIT_CHECK,
+ OPT_BUCKET_STATS,
+ OPT_BUCKET_SYNC_STATUS,
+ OPT_LOG_LIST,
+ OPT_LOG_SHOW,
+ OPT_USAGE_SHOW,
+ OPT_OBJECT_STAT,
+ OPT_BI_GET,
+ OPT_BI_LIST,
+ OPT_OLH_GET,
+ OPT_OLH_READLOG,
+ OPT_GC_LIST,
+ OPT_LC_LIST,
+ OPT_ORPHANS_LIST_JOBS,
+ OPT_ZONEGROUP_GET,
+ OPT_ZONEGROUP_LIST,
+ OPT_ZONEGROUP_PLACEMENT_LIST,
+ OPT_ZONE_GET,
+ OPT_ZONE_LIST,
+ OPT_ZONE_PLACEMENT_LIST,
+ OPT_METADATA_GET,
+ OPT_METADATA_LIST,
+ OPT_METADATA_SYNC_STATUS,
+ OPT_MDLOG_LIST,
+ OPT_MDLOG_STATUS,
+ OPT_SYNC_ERROR_LIST,
+ OPT_BILOG_LIST,
+ OPT_BILOG_STATUS,
+ OPT_DATA_SYNC_STATUS,
+ OPT_DATALOG_LIST,
+ OPT_DATALOG_STATUS,
+ OPT_OPSTATE_LIST,
+ OPT_REPLICALOG_GET,
+ OPT_REALM_GET,
+ OPT_REALM_GET_DEFAULT,
+ OPT_REALM_LIST,
+ OPT_REALM_LIST_PERIODS,
+ OPT_PERIOD_GET,
+ OPT_PERIOD_GET_CURRENT,
+ OPT_PERIOD_LIST,
+ OPT_GLOBAL_QUOTA_GET,
+ OPT_SYNC_STATUS,
+ OPT_ROLE_GET,
+ OPT_ROLE_LIST,
+ OPT_ROLE_POLICY_LIST,
+ OPT_ROLE_POLICY_GET,
+ OPT_RESHARD_LIST,
+ OPT_RESHARD_STATUS,
+ };
bool raw_storage_op = (raw_storage_ops_list.find(opt_cmd) != raw_storage_ops_list.end() ||
raw_period_update);
+ bool need_cache = readonly_ops_list.find(opt_cmd) == readonly_ops_list.end();
if (raw_storage_op) {
store = RGWStoreManager::get_raw_storage(g_ceph_context);
} else {
- store = RGWStoreManager::get_storage(g_ceph_context, false, false, false, false, false);
+ store = RGWStoreManager::get_storage(g_ceph_context, false, false, false, false, false,
+ need_cache && g_conf->rgw_cache_enabled);
}
if (!store) {
cerr << "couldn't init storage provider" << std::endl;
}
rgw_data_sync_status sync_status;
- ret = sync.read_sync_status(&sync_status);
- if (ret < 0 && ret != -ENOENT) {
- cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl;
- return -ret;
- }
+ if (specified_shard_id) {
+ set<string> pending_buckets;
+ set<string> recovering_buckets;
+ rgw_data_sync_marker sync_marker;
+ ret = sync.read_shard_status(shard_id, pending_buckets, recovering_buckets, &sync_marker,
+ max_entries_specified ? max_entries : 20);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: sync.read_shard_status() returned ret=" << ret << std::endl;
+ return -ret;
+ }
+ formatter->open_object_section("summary");
+ encode_json("shard_id", shard_id, formatter);
+ encode_json("marker", sync_marker, formatter);
+ encode_json("pending_buckets", pending_buckets, formatter);
+ encode_json("recovering_buckets", recovering_buckets, formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ } else {
+ ret = sync.read_sync_status(&sync_status);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl;
+ return -ret;
+ }
- formatter->open_object_section("summary");
- encode_json("sync_status", sync_status, formatter);
+ formatter->open_object_section("summary");
+ encode_json("sync_status", sync_status, formatter);
- uint64_t full_total = 0;
- uint64_t full_complete = 0;
+ uint64_t full_total = 0;
+ uint64_t full_complete = 0;
- for (auto marker_iter : sync_status.sync_markers) {
- full_total += marker_iter.second.total_entries;
- if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
- full_complete += marker_iter.second.pos;
- } else {
- full_complete += marker_iter.second.total_entries;
+ for (auto marker_iter : sync_status.sync_markers) {
+ full_total += marker_iter.second.total_entries;
+ if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
+ full_complete += marker_iter.second.pos;
+ } else {
+ full_complete += marker_iter.second.total_entries;
+ }
}
- }
- formatter->open_object_section("full_sync");
- encode_json("total", full_total, formatter);
- encode_json("complete", full_complete, formatter);
- formatter->close_section();
- formatter->close_section();
+ formatter->open_object_section("full_sync");
+ encode_json("total", full_total, formatter);
+ encode_json("complete", full_complete, formatter);
+ formatter->close_section();
+ formatter->close_section();
- formatter->flush(cout);
+ formatter->flush(cout);
+ }
}
if (opt_cmd == OPT_DATA_SYNC_INIT) {
ret = set_bucket_sync_enabled(store, opt_cmd, tenant, bucket_name);
if (ret < 0)
return -ret;
-}
+ }
if (opt_cmd == OPT_BUCKET_SYNC_STATUS) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+ RGWBucketInfo bucket_info;
+ rgw_bucket bucket;
+ int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
+ if (ret < 0) {
+ return -ret;
+ }
+ bucket_sync_status(store, bucket_info, source_zone, std::cout);
+ }
+
+ if (opt_cmd == OPT_BUCKET_SYNC_MARKERS) {
if (source_zone.empty()) {
cerr << "ERROR: source zone not specified" << std::endl;
return EINVAL;
friend void intrusive_ptr_release(Connection *c) { c->put(); }
};
-
class AsioFrontend {
RGWProcessEnv env;
RGWFrontendConfig* conf;
boost::asio::io_service service;
- tcp::acceptor acceptor;
- tcp::socket peer_socket;
+ struct Listener {
+ tcp::endpoint endpoint;
+ tcp::acceptor acceptor;
+ tcp::socket socket;
+
+ Listener(boost::asio::io_service& service)
+ : acceptor(service), socket(service) {}
+ };
+ std::vector<Listener> listeners;
std::vector<std::thread> threads;
Pauser pauser;
CephContext* ctx() const { return env.store->ctx(); }
- void accept(boost::system::error_code ec);
+ void accept(Listener& listener, boost::system::error_code ec);
public:
AsioFrontend(const RGWProcessEnv& env, RGWFrontendConfig* conf)
- : env(env), conf(conf), acceptor(service), peer_socket(service) {}
+ : env(env), conf(conf) {}
int init();
int run();
void unpause(RGWRados* store, rgw_auth_registry_ptr_t);
};
-int AsioFrontend::init()
+unsigned short parse_port(const char *input, boost::system::error_code& ec)
+{
+ char *end = nullptr;
+ auto port = std::strtoul(input, &end, 10);
+ if (port > std::numeric_limits<unsigned short>::max()) {
+ ec.assign(ERANGE, boost::system::system_category());
+ } else if (port == 0 && end == input) {
+ ec.assign(EINVAL, boost::system::system_category());
+ }
+ return port;
+}
+
+tcp::endpoint parse_endpoint(BOOST_ASIO_STRING_VIEW_PARAM input,
+ boost::system::error_code& ec)
{
- std::string port_str;
- conf->get_val("port", "80", &port_str);
+ tcp::endpoint endpoint;
- unsigned short port;
- boost::asio::ip::address addr; // default to 'any'
+ auto colon = input.find(':');
+ if (colon != input.npos) {
+ auto port_str = input.substr(colon + 1);
+ endpoint.port(parse_port(port_str.data(), ec));
+ } else {
+ endpoint.port(80);
+ }
+ if (!ec) {
+ auto addr = input.substr(0, colon);
+ endpoint.address(boost::asio::ip::make_address(addr, ec));
+ }
+ return endpoint;
+}
+
+int AsioFrontend::init()
+{
boost::system::error_code ec;
+ auto& config = conf->get_config_map();
- auto colon = port_str.find(':');
- if (colon != port_str.npos) {
- addr = boost::asio::ip::make_address(port_str.substr(0, colon), ec);
+ // parse endpoints
+ auto range = config.equal_range("port");
+ for (auto i = range.first; i != range.second; ++i) {
+ auto port = parse_port(i->second.c_str(), ec);
if (ec) {
- lderr(ctx()) << "failed to parse address '" << port_str << "': " << ec.message() << dendl;
+ lderr(ctx()) << "failed to parse port=" << i->second << dendl;
return -ec.value();
}
- port = std::stoul(port_str.substr(colon + 1), nullptr, 0);
- } else {
- port = std::stoul(port_str, nullptr, 0);
+ listeners.emplace_back(service);
+ listeners.back().endpoint.port(port);
}
- tcp::endpoint ep = {addr, port};
- ldout(ctx(), 4) << "frontend listening on " << ep << dendl;
-
- acceptor.open(ep.protocol(), ec);
- if (ec) {
- lderr(ctx()) << "failed to open socket: " << ec.message() << dendl;
- return -ec.value();
+ range = config.equal_range("endpoint");
+ for (auto i = range.first; i != range.second; ++i) {
+ auto endpoint = parse_endpoint(i->second, ec);
+ if (ec) {
+ lderr(ctx()) << "failed to parse endpoint=" << i->second << dendl;
+ return -ec.value();
+ }
+ listeners.emplace_back(service);
+ listeners.back().endpoint = endpoint;
}
- acceptor.set_option(tcp::acceptor::reuse_address(true));
- acceptor.bind(ep, ec);
- if (ec) {
- lderr(ctx()) << "failed to bind address " << ep <<
- ": " << ec.message() << dendl;
- return -ec.value();
+
+ // start listeners
+ for (auto& l : listeners) {
+ l.acceptor.open(l.endpoint.protocol(), ec);
+ if (ec) {
+ lderr(ctx()) << "failed to open socket: " << ec.message() << dendl;
+ return -ec.value();
+ }
+ l.acceptor.set_option(tcp::acceptor::reuse_address(true));
+ l.acceptor.bind(l.endpoint, ec);
+ if (ec) {
+ lderr(ctx()) << "failed to bind address " << l.endpoint
+ << ": " << ec.message() << dendl;
+ return -ec.value();
+ }
+ l.acceptor.listen(boost::asio::socket_base::max_connections);
+ l.acceptor.async_accept(l.socket,
+ [this, &l] (boost::system::error_code ec) {
+ accept(l, ec);
+ });
+
+ ldout(ctx(), 4) << "frontend listening on " << l.endpoint << dendl;
}
- acceptor.listen(boost::asio::socket_base::max_connections);
- acceptor.async_accept(peer_socket,
- [this] (boost::system::error_code ec) {
- return accept(ec);
- });
return 0;
}
-void AsioFrontend::accept(boost::system::error_code ec)
+void AsioFrontend::accept(Listener& l, boost::system::error_code ec)
{
- if (!acceptor.is_open()) {
+ if (!l.acceptor.is_open()) {
return;
} else if (ec == boost::asio::error::operation_aborted) {
return;
} else if (ec) {
throw ec;
}
- auto socket = std::move(peer_socket);
- acceptor.async_accept(peer_socket,
- [this] (boost::system::error_code ec) {
- return accept(ec);
- });
+ auto socket = std::move(l.socket);
+ l.acceptor.async_accept(l.socket,
+ [this, &l] (boost::system::error_code ec) {
+ accept(l, ec);
+ });
boost::intrusive_ptr<Connection> conn{new Connection(env, std::move(socket))};
conn->on_connect();
going_down = true;
boost::system::error_code ec;
- acceptor.close(ec);
+ // close all listeners
+ for (auto& listener : listeners) {
+ listener.acceptor.close(ec);
+ }
// unblock the run() threads
service.stop();
<< ", is_admin=" << info.is_admin << ")";
}
+void rgw::auth::ImplicitTenants::recompute_value(const md_config_t *c)
+{
+ std::string s = c->get_val<std::string>("rgw_keystone_implicit_tenants");
+ int v = 0;
+ if (boost::iequals(s, "both")
+ || boost::iequals(s, "true")
+ || boost::iequals(s, "1")) {
+ v = IMPLICIT_TENANTS_S3|IMPLICIT_TENANTS_SWIFT;
+ } else if (boost::iequals(s, "0")
+ || boost::iequals(s, "none")
+ || boost::iequals(s, "false")) {
+ v = 0;
+ } else if (boost::iequals(s, "s3")) {
+ v = IMPLICIT_TENANTS_S3;
+ } else if (boost::iequals(s, "swift")) {
+ v = IMPLICIT_TENANTS_SWIFT;
+ } else { /* "" (and anything else) */
+ v = IMPLICIT_TENANTS_BAD;
+ // assert(0);
+ }
+ saved = v;
+}
+
+const char **rgw::auth::ImplicitTenants::get_tracked_conf_keys() const
+{
+ static const char *keys[] = {
+ "rgw_keystone_implicit_tenants",
+ NULL };
+ return keys;
+}
+
+void rgw::auth::ImplicitTenants::handle_conf_change(const struct md_config_t *c,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("rgw_keystone_implicit_tenants")) {
+ recompute_value(c);
+ }
+}
+
void rgw::auth::RemoteApplier::create_account(const rgw_user& acct_user,
+ bool implicit_tenant,
RGWUserInfo& user_info) const /* out */
{
rgw_user new_acct_user = acct_user;
/* An upper layer may enforce creating new accounts within their own
* tenants. */
- if (new_acct_user.tenant.empty() && implicit_tenants) {
+ if (new_acct_user.tenant.empty() && implicit_tenant) {
new_acct_user.tenant = new_acct_user.id;
}
* that belongs to the authenticated identity. Another policy may be
* applied by using a RGWThirdPartyAccountAuthApplier decorator. */
const rgw_user& acct_user = info.acct_user;
+ auto implicit_value = implicit_tenant_context.get_value();
+ bool implicit_tenant = implicit_value.implicit_tenants_for_(implicit_tenant_bit);
+ bool split_mode = implicit_value.is_split_mode();
/* Normally, empty "tenant" field of acct_user means the authenticated
* identity has the legacy, global tenant. However, due to inclusion
* the wiser.
* If that fails, we look up in the requested (possibly empty) tenant.
* If that fails too, we create the account within the global or separated
- * namespace depending on rgw_keystone_implicit_tenants. */
- if (acct_user.tenant.empty()) {
+ * namespace depending on rgw_keystone_implicit_tenants.
+ * For compatibility with previous versions of ceph, it is possible
+ * to enable implicit_tenants for only s3 or only swift.
+ * in this mode ("split_mode"), we must constrain the id lookups to
+ * only use the identifier space that would be used if the id were
+ * to be created. */
+
+ if (split_mode && !implicit_tenant)
+ ; /* suppress lookup for id used by "other" protocol */
+ else if (acct_user.tenant.empty()) {
const rgw_user tenanted_uid(acct_user.id, acct_user.id);
if (rgw_get_user_info_by_uid(store, tenanted_uid, user_info) >= 0) {
}
}
- if (rgw_get_user_info_by_uid(store, acct_user, user_info) < 0) {
- ldout(cct, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl;
- create_account(acct_user, user_info);
+ if (split_mode && implicit_tenant)
+ ; /* suppress lookup for id used by "other" protocol */
+ else if (rgw_get_user_info_by_uid(store, acct_user, user_info) >= 0) {
+ /* Succeeded. */
+ return;
}
+ ldout(cct, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl;
+ create_account(acct_user, implicit_tenant, user_info);
+
/* Succeeded if we are here (create_account() hasn't throwed). */
}
* Each new Strategy should be exposed to it. */
class StrategyRegistry;
+class ImplicitTenants: public md_config_obs_t {
+public:
+ enum implicit_tenant_flag_bits {IMPLICIT_TENANTS_SWIFT=1,
+ IMPLICIT_TENANTS_S3=2, IMPLICIT_TENANTS_BAD = -1, };
+private:
+ int saved;
+ void recompute_value(const md_config_t *);
+ class ImplicitTenantValue {
+ friend class ImplicitTenants;
+ int v;
+ ImplicitTenantValue(int v) : v(v) {};
+ public:
+ bool inline is_split_mode()
+ {
+ assert(v != IMPLICIT_TENANTS_BAD);
+ return v == IMPLICIT_TENANTS_SWIFT || v == IMPLICIT_TENANTS_S3;
+ }
+ bool inline implicit_tenants_for_(const implicit_tenant_flag_bits bit)
+ {
+ assert(v != IMPLICIT_TENANTS_BAD);
+ return !!(v&bit);
+ }
+ };
+public:
+ ImplicitTenants(md_config_t &c) { recompute_value(&c);}
+ ImplicitTenantValue get_value() {
+ return ImplicitTenantValue(saved);
+ }
+private:
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const struct md_config_t *conf,
+ const std::set <std::string> &changed) override;
+};
+
+std::tuple<bool,bool> implicit_tenants_enabled_for_swift(CephContext * const cct);
+std::tuple<bool,bool> implicit_tenants_enabled_for_s3(CephContext * const cct);
+
/* rgw::auth::RemoteApplier targets those authentication engines which don't
* need to ask the RADOS store while performing the auth process. Instead,
* they obtain credentials from an external source like Keystone or LDAP.
const acl_strategy_t extra_acl_strategy;
const AuthInfo info;
- const bool implicit_tenants;
+ rgw::auth::ImplicitTenants& implicit_tenant_context;
+ const rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit;
virtual void create_account(const rgw_user& acct_user,
+ bool implicit_tenant,
RGWUserInfo& user_info) const; /* out */
public:
RGWRados* const store,
acl_strategy_t&& extra_acl_strategy,
const AuthInfo& info,
- const bool implicit_tenants)
+ rgw::auth::ImplicitTenants& implicit_tenant_context,
+ rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit)
: cct(cct),
store(store),
extra_acl_strategy(std::move(extra_acl_strategy)),
info(info),
- implicit_tenants(implicit_tenants) {
+ implicit_tenant_context(implicit_tenant_context),
+ implicit_tenant_bit(implicit_tenant_bit) {
}
uint32_t get_perms_from_aclspec(const aclspec_t& aclspec) const override;
s3_main_strategy_plain_t s3_main_strategy_plain;
s3_main_strategy_boto2_t s3_main_strategy_boto2;
- s3_main_strategy_t(CephContext* const cct, RGWRados* const store)
- : s3_main_strategy_plain(cct, store),
- s3_main_strategy_boto2(cct, store) {
+ s3_main_strategy_t(CephContext* const cct,
+ ImplicitTenants& implicit_tenant_context,
+ RGWRados* const store)
+ : s3_main_strategy_plain(cct, implicit_tenant_context, store),
+ s3_main_strategy_boto2(cct, implicit_tenant_context, store) {
add_engine(Strategy::Control::SUFFICIENT, s3_main_strategy_plain);
add_engine(Strategy::Control::FALLBACK, s3_main_strategy_boto2);
}
public:
StrategyRegistry(CephContext* const cct,
+ ImplicitTenants& implicit_tenant_context,
RGWRados* const store)
- : s3_main_strategy(cct, store),
- s3_post_strategy(cct, store),
- swift_strategy(cct, store) {
+ : s3_main_strategy(cct, implicit_tenant_context, store),
+ s3_post_strategy(cct, implicit_tenant_context, store),
+ swift_strategy(cct, implicit_tenant_context, store) {
}
const s3_main_strategy_t& get_s3_main() const {
static std::shared_ptr<StrategyRegistry>
create(CephContext* const cct,
+ ImplicitTenants& implicit_tenant_context,
RGWRados* const store) {
- return std::make_shared<StrategyRegistry>(cct, store);
+ return std::make_shared<StrategyRegistry>(cct, implicit_tenant_context, store);
}
};
#include <boost/container/small_vector.hpp>
#include <boost/utility/string_view.hpp>
+#include <boost/algorithm/string/trim_all.hpp>
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rgw
std::string canonical_hdrs;
for (const auto& header : canonical_hdrs_map) {
const boost::string_view& name = header.first;
- const std::string& value = header.second;
+ std::string value = header.second;
+ boost::trim_all<std::string>(value);
canonical_hdrs.append(name.data(), name.length())
.append(":", std::strlen(":"))
public rgw::auth::RemoteApplier::Factory {
typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;
RGWRados* const store;
+ rgw::auth::ImplicitTenants& implicit_tenant_context;
using keystone_config_t = rgw::keystone::CephCtxConfig;
using keystone_cache_t = rgw::keystone::TokenCache;
) const override {
auto apl = rgw::auth::add_sysreq(cct, store, s,
rgw::auth::RemoteApplier(cct, store, std::move(acl_alg), info,
- cct->_conf->rgw_keystone_implicit_tenants));
+ implicit_tenant_context,
+ rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3));
/* TODO(rzarzynski): replace with static_ptr. */
return aplptr_t(new decltype(apl)(std::move(apl)));
}
public:
ExternalAuthStrategy(CephContext* const cct,
RGWRados* const store,
+ rgw::auth::ImplicitTenants& implicit_tenant_context,
AWSEngine::VersionAbstractor* const ver_abstractor)
: store(store),
+ implicit_tenant_context(implicit_tenant_context),
ldap_engine(cct, store, *ver_abstractor,
static_cast<rgw::auth::RemoteApplier::Factory*>(this)) {
public:
AWSAuthStrategy(CephContext* const cct,
+ rgw::auth::ImplicitTenants& implicit_tenant_context,
RGWRados* const store)
: store(store),
ver_abstractor(cct),
anonymous_engine(cct,
static_cast<rgw::auth::LocalApplier::Factory*>(this)),
- external_engines(cct, store, &ver_abstractor),
+ external_engines(cct, store, implicit_tenant_context, &ver_abstractor),
local_engine(cct, store, ver_abstractor,
static_cast<rgw::auth::LocalApplier::Factory*>(this)) {
/* The anynoymous auth. */
ldout(cct, 10) << "cache put: name=" << name << " info.flags=0x"
<< std::hex << info.flags << std::dec << dendl;
- map<string, ObjectCacheEntry>::iterator iter = cache_map.find(name);
- if (iter == cache_map.end()) {
- ObjectCacheEntry entry;
+
+ const std::pair<std::map<std::string,
+ ObjectCacheEntry>::iterator, bool>& emp_pair
+ = cache_map.emplace(name, ObjectCacheEntry{});
+ ObjectCacheEntry& entry = emp_pair.first->second;
+ bool inserted = emp_pair.second;
+ entry.info.time_added = ceph::coarse_mono_clock::now();
+ if (inserted) {
entry.lru_iter = lru.end();
- cache_map.insert(pair<string, ObjectCacheEntry>(name, entry));
- iter = cache_map.find(name);
}
- ObjectCacheEntry& entry = iter->second;
ObjectCacheInfo& target = entry.info;
invalidate_lru(entry);
map<string, bufferlist> rm_xattrs;
ObjectMetaInfo meta;
obj_version version = {};
- ceph::coarse_mono_time time_added = ceph::coarse_mono_clock::now();
+ ceph::coarse_mono_time time_added;
ObjectCacheInfo() = default;
return 0;
}
for (c = 0; c < len; c += ret) {
- ret = mg_read(conn, buf, len);
+ ret = mg_read(conn, buf+c, len-c);
if (ret < 0) {
throw rgw::io::Exception(EIO, std::system_category());
}
int RGWCivetWebFrontend::run()
{
auto& conf_map = conf->get_config_map();
- string port_str;
set_conf_default(conf_map, "num_threads",
std::to_string(g_conf->rgw_thread_pool_size));
set_conf_default(conf_map, "validate_http_method", "no");
set_conf_default(conf_map, "canonicalize_url_path", "no");
set_conf_default(conf_map, "enable_auth_domain_check", "no");
- conf->get_val("port", "80", &port_str);
- std::replace(port_str.begin(), port_str.end(), '+', ',');
- conf_map["listening_ports"] = port_str;
+
+ std::string listening_ports;
+ // support multiple port= entries
+ auto range = conf_map.equal_range("port");
+ for (auto p = range.first; p != range.second; ++p) {
+ std::string port_str = p->second;
+ // support port= entries with multiple values
+ std::replace(port_str.begin(), port_str.end(), '+', ',');
+ if (!listening_ports.empty()) {
+ listening_ports.append(1, ',');
+ }
+ listening_ports.append(port_str);
+ }
+ if (listening_ports.empty()) {
+ listening_ports = "80";
+ }
+ conf_map.emplace("listening_ports", std::move(listening_ports));
/* Set run_as_user. This will cause civetweb to invoke setuid() and setgid()
* based on pw_uid and pw_gid obtained from pw_name. */
std::string uid_string = g_ceph_context->get_set_uid_string();
if (! uid_string.empty()) {
- conf_map["run_as_user"] = std::move(uid_string);
+ conf_map.emplace("run_as_user", std::move(uid_string));
}
/* Prepare options for CivetWeb. */
WRITE_CLASS_ENCODER(rgw_bucket)
inline ostream& operator<<(ostream& out, const rgw_bucket &b) {
- out << b.name << "[" << b.marker << "])";
+ out << b.name << "[" << b.marker << "]";
return out;
}
while (blocked_count - interval_wait_count >= ops_window) {
ret = completion_mgr->get_next((void **)&blocked_stack);
if (ret < 0) {
- ldout(cct, 0) << "ERROR: failed to clone shard, completion_mgr.get_next() returned ret=" << ret << dendl;
+ ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
}
handle_unblocked_stack(context_stacks, scheduled_stacks, blocked_stack, &blocked_count);
}
while (scheduled_stacks.empty() && blocked_count > 0) {
ret = completion_mgr->get_next((void **)&blocked_stack);
if (ret < 0) {
- ldout(cct, 0) << "ERROR: failed to clone shard, completion_mgr.get_next() returned ret=" << ret << dendl;
+ ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
}
if (going_down) {
ldout(cct, 5) << __func__ << "(): was stopped, exiting" << dendl;
RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(RGWRados *_store,
const rgw_raw_obj& _obj,
const string& _marker,
- map<string, bufferlist> *_entries, int _max_entries) : RGWSimpleCoroutine(_store->ctx()),
+ std::set<std::string> *_entries, int _max_entries) : RGWSimpleCoroutine(_store->ctx()),
store(_store),
marker(_marker),
- entries(_entries), max_entries(_max_entries), rval(0),
+ entries(_entries), max_entries(_max_entries),
obj(_obj), cn(NULL)
{
set_description() << "set omap keys dest=" << obj << " marker=" << marker;
set_status() << "send request";
librados::ObjectReadOperation op;
- op.omap_get_vals2(marker, max_entries, entries, nullptr, &rval);
+ op.omap_get_keys2(marker, max_entries, entries, nullptr, nullptr);
cn = stack->create_completion_notifier();
return ref.ioctx.aio_operate(ref.oid, cn->completion(), &op, NULL);
}
+// Completion hook for the async omap-keys read issued by send_request():
+// surface the aio return value as the coroutine result (replaces the old
+// rval-captured-by-omap_get_vals2 scheme removed above).
+int RGWRadosGetOmapKeysCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(RGWRados *_store,
const rgw_raw_obj& _obj,
const set<string>& _keys) : RGWSimpleCoroutine(_store->ctx()),
RGWRados *store;
string marker;
- map<string, bufferlist> *entries;
+ std::set<std::string> *entries;
int max_entries;
- int rval;
rgw_rados_ref ref;
rgw_raw_obj obj;
RGWRadosGetOmapKeysCR(RGWRados *_store,
const rgw_raw_obj& _obj,
const string& _marker,
- map<string, bufferlist> *_entries, int _max_entries);
+ std::set<std::string> *_entries, int _max_entries);
int send_request() override;
-
- int request_complete() override {
- return rval;
- }
+ int request_complete() override;
};
class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine {
return true;
}
+// Collecting coroutine that, for every datalog shard, reads the omap keys of
+// the shard's ".retry" error object into entries_map[shard_id]; a non-empty
+// key set marks the shard as still recovering.  Runs at most
+// MAX_CONCURRENT_SHARDS reads in flight (via RGWShardCollectCR).
+class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWDataSyncEnv *env;
+
+  uint64_t max_entries;
+  int num_shards;
+  int shard_id{0}; // next shard to spawn (fixed stray ';;')
+
+  string marker;
+  map<int, std::set<std::string>> &entries_map;
+
+ public:
+  RGWReadDataSyncRecoveringShardsCR(RGWDataSyncEnv *env, uint64_t _max_entries, int _num_shards,
+                                    map<int, std::set<std::string>>& _entries_map)
+    : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS), env(env),
+      max_entries(_max_entries), num_shards(_num_shards), entries_map(_entries_map)
+  {}
+  bool spawn_next() override;
+};
+
+// Spawn one omap-keys read per datalog shard, collecting each shard's
+// ".retry" error-object keys into entries_map.  Returns false when all
+// shards have been dispatched.
+bool RGWReadDataSyncRecoveringShardsCR::spawn_next()
+{
+  // valid shard ids are [0, num_shards); the previous '>' test read one
+  // past the end, querying a nonexistent shard object
+  if (shard_id >= num_shards)
+    return false;
+
+  string error_oid = RGWDataSyncStatusManager::shard_obj_name(env->source_zone, shard_id) + ".retry";
+  spawn(new RGWRadosGetOmapKeysCR(env->store, rgw_raw_obj(env->store->get_zone_params().log_pool, error_oid),
+                                  marker, &entries_map[shard_id], max_entries), false);
+
+  ++shard_id;
+  return true;
+}
+
class RGWReadDataSyncStatusCoroutine : public RGWCoroutine {
RGWDataSyncEnv *sync_env;
rgw_data_sync_status *sync_status;
return ret;
}
+// Report which data-sync shards still hold entries in their ".retry" error
+// objects (i.e. are recovering).  On success (ret == 0) fills
+// 'recovering_shards' with the ids of shards whose retry log is non-empty.
+int RGWRemoteDataLog::read_recovering_shards(const int num_shards, set<int>& recovering_shards)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
+  RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.set_threaded();
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  map<int, std::set<std::string>> entries_map;
+  // one key per shard is enough to distinguish "has errors" from "clean"
+  uint64_t max_entries{1};
+  ret = crs.run(new RGWReadDataSyncRecoveringShardsCR(&sync_env_local, max_entries, num_shards, entries_map));
+  http_manager.stop();
+
+  if (ret == 0) {
+    for (const auto& entry : entries_map) {
+      if (entry.second.size() != 0) {
+        recovering_shards.insert(entry.first);
+      }
+    }
+  }
+
+  return ret;
+}
+
int RGWRemoteDataLog::init_sync_status(int num_shards)
{
rgw_data_sync_status sync_status;
uint32_t shard_id;
rgw_data_sync_marker sync_marker;
- map<string, bufferlist> entries;
- map<string, bufferlist>::iterator iter;
+ std::set<std::string> entries;
+ std::set<std::string>::iterator iter;
string oid;
string error_oid;
RGWOmapAppend *error_repo;
- map<string, bufferlist> error_entries;
+ std::set<std::string> error_entries;
string error_marker;
int max_error_entries;
}
iter = entries.begin();
for (; iter != entries.end(); ++iter) {
- ldout(sync_env->cct, 20) << __func__ << ": full sync: " << iter->first << dendl;
+ ldout(sync_env->cct, 20) << __func__ << ": full sync: " << *iter << dendl;
total_entries++;
- if (!marker_tracker->start(iter->first, total_entries, real_time())) {
- ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << iter->first << ". Duplicate entry?" << dendl;
+ if (!marker_tracker->start(*iter, total_entries, real_time())) {
+ ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << *iter << ". Duplicate entry?" << dendl;
} else {
// fetch remote and write locally
- yield spawn(new RGWDataSyncSingleEntryCR(sync_env, iter->first, iter->first, marker_tracker, error_repo, false), false);
+ yield spawn(new RGWDataSyncSingleEntryCR(sync_env, *iter, *iter, marker_tracker, error_repo, false), false);
if (retcode < 0) {
lease_cr->go_down();
drain_all();
return set_cr_error(retcode);
}
}
- sync_marker.marker = iter->first;
+ sync_marker.marker = *iter;
}
} while ((int)entries.size() == max_entries);
ldout(sync_env->cct, 20) << __func__ << "(): read error repo, got " << error_entries.size() << " entries" << dendl;
iter = error_entries.begin();
for (; iter != error_entries.end(); ++iter) {
- ldout(sync_env->cct, 20) << __func__ << "(): handle error entry: " << iter->first << dendl;
- spawn(new RGWDataSyncSingleEntryCR(sync_env, iter->first, iter->first, nullptr /* no marker tracker */, error_repo, true), false);
- error_marker = iter->first;
+ error_marker = *iter;
+ ldout(sync_env->cct, 20) << __func__ << "(): handle error entry: " << error_marker << dendl;
+ spawn(new RGWDataSyncSingleEntryCR(sync_env, error_marker, error_marker, nullptr /* no marker tracker */, error_repo, true), false);
}
if ((int)error_entries.size() != max_error_entries) {
if (error_marker.empty() && error_entries.empty()) {
return 0;
}
-struct bucket_index_marker_info {
- string bucket_ver;
- string master_ver;
- string max_marker;
- bool syncstopped{false};
-
- void decode_json(JSONObj *obj) {
- JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
- JSONDecoder::decode_json("master_ver", master_ver, obj);
- JSONDecoder::decode_json("max_marker", max_marker, obj);
- JSONDecoder::decode_json("syncstopped", syncstopped, obj);
- }
-};
-
class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine {
RGWDataSyncEnv *sync_env;
const string instance_key;
- bucket_index_marker_info *info;
+ rgw_bucket_index_marker_info *info;
public:
RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncEnv *_sync_env,
const rgw_bucket_shard& bs,
- bucket_index_marker_info *_info)
+ rgw_bucket_index_marker_info *_info)
: RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
instance_key(bs.get_key()), info(_info) {}
{ NULL, NULL } };
string p = "/admin/log/";
- call(new RGWReadRESTResourceCR<bucket_index_marker_info>(sync_env->cct, sync_env->conn, sync_env->http_manager, p, pairs, info));
+ call(new RGWReadRESTResourceCR<rgw_bucket_index_marker_info>(sync_env->cct, sync_env->conn, sync_env->http_manager, p, pairs, info));
}
if (retcode < 0) {
return set_cr_error(retcode);
rgw_bucket_shard_sync_info& status;
- bucket_index_marker_info info;
+ rgw_bucket_index_marker_info info;
public:
RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncEnv *_sync_env,
const rgw_bucket_shard& bs,
}
return 0;
}
+
+#define OMAP_READ_MAX_ENTRIES 10
+// Coroutine that lists the bucket shards currently in error-retry state for
+// one datalog shard, by paging through the omap keys of the shard's
+// ".retry" object (OMAP_READ_MAX_ENTRIES keys per read, up to _max_entries
+// total) into 'recovering_buckets'.
+class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  RGWRados *store;
+
+  const int shard_id;
+  int max_entries;
+
+  set<string>& recovering_buckets;  // output: caller-owned result set
+  string marker;                    // omap pagination cursor
+  string error_oid;                 // "<shard status oid>.retry"
+
+  set<string> error_entries;        // per-page read buffer
+  int max_omap_entries;
+  int count;
+
+public:
+  RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncEnv *_sync_env, const int _shard_id,
+                                      set<string>& _recovering_buckets, const int _max_entries)
+  : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+  store(sync_env->store), shard_id(_shard_id), max_entries(_max_entries),
+  recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES)
+  {
+    error_oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id) + ".retry";
+  }
+
+  int operate() override;
+};
+
+// Stackless-coroutine body (boost reenter/yield): page through the retry
+// object's omap keys until a short page, max_entries collected, or the
+// object is absent (no errors recorded).
+int RGWReadRecoveringBucketShardsCoroutine::operate()
+{
+  reenter(this){
+    //read recovering bucket shards
+    count = 0;
+    do {
+      yield call(new RGWRadosGetOmapKeysCR(store, rgw_raw_obj(store->get_zone_params().log_pool, error_oid),
+            marker, &error_entries, max_omap_entries));
+
+      // missing retry object simply means no errors for this shard
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldout(sync_env->cct, 0) << "failed to read recovering bucket shards with "
+          << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      if (error_entries.empty()) {
+        break;
+      }
+
+      count += error_entries.size();
+      // advance pagination cursor to the last key returned
+      marker = *error_entries.rbegin();
+      recovering_buckets.insert(error_entries.begin(), error_entries.end());
+    }while((int)error_entries.size() == max_omap_entries && count < max_entries);
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+// Coroutine that lists the bucket shards with datalog entries not yet
+// synced on one datalog shard: reads the shard's sync-status marker, then
+// walks the remote datalog from that marker collecting entry keys into
+// 'pending_buckets' (up to _max_entries).
+class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  RGWRados *store;
+
+  const int shard_id;
+  int max_entries;
+
+  set<string>& pending_buckets;   // output: caller-owned result set
+  string marker;                  // datalog cursor, seeded from sync marker
+  string status_oid;
+
+  rgw_data_sync_marker* sync_marker;  // caller-owned; filled by the status read
+  int count;
+
+  list<rgw_data_change_log_entry> log_entries;
+  bool truncated{false};  // initialized: only assigned by the log-shard read in operate()
+
+public:
+  RGWReadPendingBucketShardsCoroutine(RGWDataSyncEnv *_sync_env, const int _shard_id,
+                                      set<string>& _pending_buckets,
+                                      rgw_data_sync_marker* _sync_marker, const int _max_entries)
+  : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+  store(sync_env->store), shard_id(_shard_id), max_entries(_max_entries),
+  pending_buckets(_pending_buckets), sync_marker(_sync_marker)
+  {
+    status_oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id);
+  }
+
+  int operate() override;
+};
+
+// Stackless-coroutine body: fetch the shard's persisted sync marker, then
+// page through the remote datalog from that marker until it reports
+// !truncated, returns no entries/-ENOENT, or max_entries are collected.
+int RGWReadPendingBucketShardsCoroutine::operate()
+{
+  reenter(this){
+    //read sync status marker
+    using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+    yield call(new CR(sync_env->async_rados, store,
+                      rgw_raw_obj(store->get_zone_params().log_pool, status_oid),
+                      sync_marker));
+    if (retcode < 0) {
+      ldout(sync_env->cct,0) << "failed to read sync status marker with "
+        << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+
+    //read pending bucket shards
+    marker = sync_marker->marker;
+    count = 0;
+    do{
+      // advances 'marker' and sets 'truncated' on each page
+      yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, &marker, &log_entries, &truncated));
+
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldout(sync_env->cct,0) << "failed to read remote data log info with "
+          << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      if (log_entries.empty()) {
+        break;
+      }
+
+      count += log_entries.size();
+      for (const auto& entry : log_entries) {
+        pending_buckets.insert(entry.entry.key);
+      }
+    }while(truncated && count < max_entries);
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+// Gather per-shard sync status for `radosgw-admin data sync status`:
+// concurrently collects the bucket shards still pending sync and those in
+// error-retry state for one datalog shard.  Outputs pending_buckets,
+// recovering_buckets and *sync_marker; returns the coroutine-run result.
+int RGWRemoteDataLog::read_shard_status(int shard_id, set<string>& pending_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
+  RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.set_threaded();
+  if (ret < 0) {
+    // message previously named http_manager.start(); fixed to match the call
+    ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  // run both collectors in parallel on separate stacks
+  list<RGWCoroutinesStack *> stacks;
+  RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(store->ctx(), &crs);
+  recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sync_env_local, shard_id, recovering_buckets, max_entries));
+  stacks.push_back(recovering_stack);
+  RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(store->ctx(), &crs);
+  pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sync_env_local, shard_id, pending_buckets, sync_marker, max_entries));
+  stacks.push_back(pending_stack);
+  ret = crs.run(stacks);
+  http_manager.stop();
+  return ret;
+}
+
RGWCoroutine *RGWRemoteBucketLog::read_sync_status_cr(rgw_bucket_shard_sync_info *sync_status)
{
return new RGWReadBucketSyncStatusCoroutine(&sync_env, bs, sync_status);
sync_status = retcode;
}
if (!error_ss.str().empty()) {
- yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data", error_ss.str(), -retcode, "failed to sync object"));
+ yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object") + cpp_strerror(-sync_status)));
}
done:
if (sync_status == 0) {
};
int rgw_bucket_sync_status(RGWRados *store, const std::string& source_zone,
- const rgw_bucket& bucket,
+ const RGWBucketInfo& bucket_info,
std::vector<rgw_bucket_shard_sync_info> *status)
{
- // read the bucket instance info for num_shards
- RGWObjectCtx ctx(store);
- RGWBucketInfo info;
- int ret = store->get_bucket_instance_info(ctx, bucket, info, nullptr, nullptr);
- if (ret < 0) {
- return ret;
- }
+ const auto num_shards = bucket_info.num_shards;
status->clear();
- status->resize(std::max<size_t>(1, info.num_shards));
+ status->resize(std::max<size_t>(1, num_shards));
RGWDataSyncEnv env;
RGWSyncModuleInstanceRef module; // null sync module
nullptr, nullptr, source_zone, module, nullptr);
RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
- return crs.run(new RGWCollectBucketSyncStatusCR(store, &env, info.num_shards,
- bucket, status));
+ return crs.run(new RGWCollectBucketSyncStatusCR(store, &env, num_shards,
+ bucket_info.bucket, status));
}
}
void dump(Formatter *f) const {
- encode_json("state", (int)state, f);
+ const char *s{nullptr};
+ switch ((SyncState)state) {
+ case FullSync:
+ s = "full-sync";
+ break;
+ case IncrementalSync:
+ s = "incremental-sync";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+ encode_json("status", s, f);
encode_json("marker", marker, f);
encode_json("next_step_marker", next_step_marker, f);
encode_json("total_entries", total_entries, f);
encode_json("timestamp", utime_t(timestamp), f);
}
void decode_json(JSONObj *obj) {
- int s;
- JSONDecoder::decode_json("state", s, obj);
- state = s;
+ std::string s;
+ JSONDecoder::decode_json("status", s, obj);
+ if (s == "full-sync") {
+ state = FullSync;
+ } else if (s == "incremental-sync") {
+ state = IncrementalSync;
+ }
JSONDecoder::decode_json("marker", marker, obj);
JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
JSONDecoder::decode_json("total_entries", total_entries, obj);
int read_source_log_shards_info(map<int, RGWDataChangesLogInfo> *shards_info);
int read_source_log_shards_next(map<int, string> shard_markers, map<int, rgw_datalog_shard_data> *result);
int read_sync_status(rgw_data_sync_status *sync_status);
+ int read_recovering_shards(const int num_shards, set<int>& recovering_shards);
+ int read_shard_status(int shard_id, set<string>& lagging_buckets,set<string>& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries);
int init_sync_status(int num_shards);
int run_sync(int num_shards);
int read_sync_status(rgw_data_sync_status *sync_status) {
return source_log.read_sync_status(sync_status);
}
+
+ int read_recovering_shards(const int num_shards, set<int>& recovering_shards) {
+ return source_log.read_recovering_shards(num_shards, recovering_shards);
+ }
+
+ int read_shard_status(int shard_id, set<string>& lagging_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) {
+ return source_log.read_shard_status(shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries);
+ }
int init_sync_status() { return source_log.init_sync_status(num_shards); }
int read_log_info(rgw_datalog_info *log_info) {
};
WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info)
+// Remote bucket-index log info returned by the /admin/log bilog endpoint.
+// Moved here (renamed from the file-local 'bucket_index_marker_info' removed
+// from the .cc above) so other translation units can use it.
+struct rgw_bucket_index_marker_info {
+  string bucket_ver;
+  string master_ver;
+  string max_marker;
+  bool syncstopped{false};  // set when bilog sync was explicitly stopped
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
+    JSONDecoder::decode_json("master_ver", master_ver, obj);
+    JSONDecoder::decode_json("max_marker", max_marker, obj);
+    JSONDecoder::decode_json("syncstopped", syncstopped, obj);
+  }
+};
+
class RGWRemoteBucketLog : public RGWCoroutinesManager {
RGWRados *store;
/// read the sync status of all bucket shards from the given source zone
int rgw_bucket_sync_status(RGWRados *store, const std::string& source_zone,
- const rgw_bucket& bucket,
+ const RGWBucketInfo& bucket_info,
std::vector<rgw_bucket_shard_sync_info> *status);
class RGWDefaultSyncModule : public RGWSyncModule {
#include "rgw_auth_s3.h"
#include "rgw_user.h"
#include "rgw_bucket.h"
-
#include "rgw_file.h"
#include "rgw_lib_frontend.h"
+#include "common/errno.h"
#include <atomic>
}
RGWFileHandle::~RGWFileHandle() {
+ /* !recycle case, handle may STILL be in handle table, BUT
+ * the partition lock is not held in this path */
+ if (fh_hook.is_linked()) {
+ fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_LOCK);
+ }
/* cond-unref parent */
if (parent && (! parent->is_mount())) {
/* safe because if parent->unref causes its deletion,
op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs,
(delete_at ? *delete_at : real_time()),
- if_match, if_nomatch);
+ if_match, if_nomatch);
if (op_ret != 0) {
/* revert attr updates */
rgw_fh->set_mtime(omtime);
struct rgw_statvfs *vfs_st, uint32_t flags)
{
RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ struct rados_cluster_stat_t stats;
+
+ RGWGetClusterStatReq req(fs->get_context(), fs->get_user(), stats);
+ int rc = rgwlib.get_fe()->execute_req(&req);
+ if (rc < 0) {
+ lderr(fs->get_context()) << "ERROR: getting total cluster usage"
+ << cpp_strerror(-rc) << dendl;
+ return rc;
+ }
- /* XXX for now, just publish a huge capacity and
- * limited utiliztion */
- vfs_st->f_bsize = 1024*1024 /* 1M */;
- vfs_st->f_frsize = 1024; /* minimal allocation unit (who cares) */
- vfs_st->f_blocks = UINT64_MAX;
- vfs_st->f_bfree = UINT64_MAX;
- vfs_st->f_bavail = UINT64_MAX;
- vfs_st->f_files = 1024; /* object count, do we have an est? */
- vfs_st->f_ffree = UINT64_MAX;
+ //Set block size to 1M.
+ constexpr uint32_t CEPH_BLOCK_SHIFT = 20;
+ vfs_st->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ vfs_st->f_frsize = 1 << CEPH_BLOCK_SHIFT;
+ vfs_st->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
+ vfs_st->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
+ vfs_st->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
+ vfs_st->f_files = stats.num_objects;
+ vfs_st->f_ffree = -1;
vfs_st->f_fsid[0] = fs->get_fsid();
vfs_st->f_fsid[1] = fs->get_fsid();
vfs_st->f_flag = 0;
{
RGWFileHandle* rgw_fh = get_rgwfh(fh);
- /* XXX
+ /* XXX
* need to track specific opens--at least read opens and
* a write open; we need to know when a write open is returned,
* that closes a write transaction
}; /* RGWSetAttrsRequest */
+/*
+ * librgw-internal request that fetches rados cluster stats (kb, kb_avail,
+ * kb_used, num_objects); used by rgw_statfs() above to report real
+ * capacity/usage instead of the old hard-coded values.
+ */
+class RGWGetClusterStatReq : public RGWLibRequest,
+        public RGWGetClusterStat {
+public:
+  struct rados_cluster_stat_t& stats_req;  // caller-owned; filled in send_response()
+  RGWGetClusterStatReq(CephContext* _cct,RGWUserInfo *_user,
+                       rados_cluster_stat_t& _stats):
+    RGWLibRequest(_cct, _user), stats_req(_stats){
+    op = this;
+  }
+
+  int op_init() override {
+    // assign store, s, and dialect_handler
+    RGWObjectCtx* rados_ctx
+      = static_cast<RGWObjectCtx*>(get_state()->obj_ctx);
+    // framework promises to call op_init after parent init
+    assert(rados_ctx);
+    RGWOp::init(rados_ctx->store, get_state(), this);
+    op = this; // assign self as op: REQUIRED
+    return 0;
+  }
+
+  int header_init() override {
+    struct req_state* s = get_state();
+    s->info.method = "GET";
+    s->op = OP_GET;
+    s->user = user;
+    return 0;
+  }
+
+  int get_params() override { return 0; }
+  bool only_bucket() override { return false; }
+  void send_response() override {
+    // copy the stats gathered by RGWGetClusterStat::execute() to the caller
+    stats_req.kb = stats_op.kb;
+    stats_req.kb_avail = stats_op.kb_avail;
+    stats_req.kb_used = stats_op.kb_used;
+    stats_req.num_objects = stats_op.num_objects;
+  }
+}; /* RGWGetClusterStatReq */
+
+
} /* namespace rgw */
#endif /* RGW_FILE_H */
#define dout_subsys ceph_subsys_rgw
int RGWFrontendConfig::parse_config(const string& config,
- map<string, string>& config_map)
+ std::multimap<string, string>& config_map)
{
list<string> config_list;
get_str_list(config, " ", config_list);
ssize_t pos = entry.find('=');
if (pos < 0) {
dout(0) << "framework conf key: " << entry << dendl;
- config_map[entry] = "";
+ config_map.emplace(std::move(entry), "");
continue;
}
}
dout(0) << "framework conf key: " << key << ", val: " << val << dendl;
- config_map[key] = val;
+ config_map.emplace(std::move(key), std::move(val));
}
return 0;
bool RGWFrontendConfig::get_val(const string& key, const string& def_val,
string *out)
{
- map<string, string>::iterator iter = config_map.find(key);
+ auto iter = config_map.find(key);
if (iter == config_map.end()) {
*out = def_val;
return false;
class RGWFrontendConfig {
std::string config;
- std::map<std::string, std::string> config_map;
+ std::multimap<std::string, std::string> config_map;
std::string framework;
int parse_config(const std::string& config,
- std::map<std::string, std::string>& config_map);
+ std::multimap<std::string, std::string>& config_map);
public:
RGWFrontendConfig(const std::string& config)
return config;
}
- std::map<std::string, std::string>& get_config_map() {
+ std::multimap<std::string, std::string>& get_config_map() {
return config_map;
}
struct mg_context* ctx;
RGWMongooseEnv env;
- void set_conf_default(std::map<std::string, std::string>& m,
+ void set_conf_default(std::multimap<std::string, std::string>& m,
const std::string& key,
const std::string& def_val) {
if (m.find(key) == std::end(m)) {
- m[key] = def_val;
+ m.emplace(key, def_val);
}
}
class RGWFrontendPauser : public RGWRealmReloader::Pauser {
std::list<RGWFrontend*> &frontends;
RGWRealmReloader::Pauser* pauser;
+ rgw::auth::ImplicitTenants& implicit_tenants;
public:
RGWFrontendPauser(std::list<RGWFrontend*> &frontends,
+ rgw::auth::ImplicitTenants& implicit_tenants,
RGWRealmReloader::Pauser* pauser = nullptr)
- : frontends(frontends), pauser(pauser) {}
+ : frontends(frontends),
+ pauser(pauser),
+ implicit_tenants(implicit_tenants) {
+ }
void pause() override {
for (auto frontend : frontends)
/* Initialize the registry of auth strategies which will coordinate
* the dynamic reconfiguration. */
auto auth_registry = \
- rgw::auth::StrategyRegistry::create(g_ceph_context, store);
+ rgw::auth::StrategyRegistry::create(g_ceph_context, implicit_tenants, store);
for (auto frontend : frontends)
frontend->unpause_with_new_config(store, auth_registry);
}
}
- val.append(": ");
- val.append(p.second);
+ // curl won't send headers with empty values unless it ends with a ; instead
+ if (p.second.empty()) {
+ val.append(1, ';');
+ } else {
+ val.append(": ");
+ val.append(p.second);
+ }
h = curl_slist_append(h, val.c_str());
}
{ "s3:GetObjectVersionTagging", s3GetObjectVersionTagging},
{ "s3:GetReplicationConfiguration", s3GetReplicationConfiguration },
{ "s3:ListAllMyBuckets", s3ListAllMyBuckets },
- { "s3:ListBucketMultiPartUploads", s3ListBucketMultiPartUploads },
+ { "s3:ListBucketMultipartUploads", s3ListBucketMultipartUploads },
{ "s3:ListBucket", s3ListBucket },
{ "s3:ListBucketVersions", s3ListBucketVersions },
{ "s3:ListMultipartUploadParts", s3ListMultipartUploadParts },
case s3ListAllMyBuckets:
return "s3:ListAllMyBuckets";
- case s3ListBucketMultiPartUploads:
- return "s3:ListBucketMultiPartUploads";
+ case s3ListBucketMultipartUploads:
+ return "s3:ListBucketMultipartUploads";
case s3GetAccelerateConfiguration:
return "s3:GetAccelerateConfiguration";
static constexpr std::uint64_t s3ListBucket = 1ULL << 16;
static constexpr std::uint64_t s3ListBucketVersions = 1ULL << 17;
static constexpr std::uint64_t s3ListAllMyBuckets = 1ULL << 18;
-static constexpr std::uint64_t s3ListBucketMultiPartUploads = 1ULL << 19;
+static constexpr std::uint64_t s3ListBucketMultipartUploads = 1ULL << 19;
static constexpr std::uint64_t s3GetAccelerateConfiguration = 1ULL << 20;
static constexpr std::uint64_t s3PutAccelerateConfiguration = 1ULL << 21;
static constexpr std::uint64_t s3GetBucketAcl = 1ULL << 22;
case s3GetObjectVersionTagging:
case s3ListAllMyBuckets:
case s3ListBucket:
- case s3ListBucketMultiPartUploads:
+ case s3ListBucketMultipartUploads:
case s3ListBucketVersions:
case s3ListMultipartUploadParts:
return RGW_PERM_READ;
bool RGWLifecycleConfiguration_S3::xml_end(const char *el) {
XMLObjIter iter = find("Rule");
LCRule_S3 *rule = static_cast<LCRule_S3 *>(iter.get_next());
+ if (!rule)
+ return false;
while (rule) {
add_rule(rule);
rule = static_cast<LCRule_S3 *>(iter.get_next());
RGWRados *store = RGWStoreManager::get_storage(g_ceph_context,
g_conf->rgw_enable_gc_threads, g_conf->rgw_enable_lc_threads, g_conf->rgw_enable_quota_threads,
- g_conf->rgw_run_sync_thread, g_conf->rgw_dynamic_resharding);
+ g_conf->rgw_run_sync_thread, g_conf->rgw_dynamic_resharding, g_conf->rgw_cache_enabled);
if (!store) {
mutex.Lock();
init_timer.cancel_all_events();
/* Initialize the registry of auth strategies which will coordinate
* the dynamic reconfiguration. */
+ rgw::auth::ImplicitTenants implicit_tenant_context{*g_conf};
+ g_conf->add_observer(&implicit_tenant_context);
auto auth_registry = \
- rgw::auth::StrategyRegistry::create(g_ceph_context, store);
+ rgw::auth::StrategyRegistry::create(g_ceph_context, implicit_tenant_context, store);
/* Header custom behavior */
rest.register_x_headers(g_conf->rgw_log_http_headers);
// add a watcher to respond to realm configuration changes
RGWPeriodPusher pusher(store);
- RGWFrontendPauser pauser(fes, &pusher);
+ RGWFrontendPauser pauser(fes, implicit_tenant_context, &pusher);
RGWRealmReloader reloader(store, service_map_meta, &pauser);
RGWRealmWatcher realm_watcher(g_ceph_context, store->realm);
rgw_tools_cleanup();
rgw_shutdown_resolver();
rgw::curl::cleanup_curl();
+ g_conf->remove_observer(&implicit_tenant_context);
rgw_perf_stop(g_ceph_context);
static MultipartMetaFilter mp_filter;
-static int parse_range(const char *range, off_t& ofs, off_t& end, bool *partial_content)
+int RGWGetObj::parse_range(void)
{
int r = -ERANGE;
- string s(range);
+ string rs(range_str);
string ofs_str;
string end_str;
- *partial_content = false;
+ ignore_invalid_range = s->cct->_conf->rgw_ignore_get_invalid_range;
+ partial_content = false;
- size_t pos = s.find("bytes=");
+ size_t pos = rs.find("bytes=");
if (pos == string::npos) {
pos = 0;
- while (isspace(s[pos]))
+ while (isspace(rs[pos]))
pos++;
int end = pos;
- while (isalpha(s[end]))
+ while (isalpha(rs[end]))
end++;
- if (strncasecmp(s.c_str(), "bytes", end - pos) != 0)
+ if (strncasecmp(rs.c_str(), "bytes", end - pos) != 0)
return 0;
- while (isspace(s[end]))
+ while (isspace(rs[end]))
end++;
- if (s[end] != '=')
+ if (rs[end] != '=')
return 0;
- s = s.substr(end + 1);
+ rs = rs.substr(end + 1);
} else {
- s = s.substr(pos + 6); /* size of("bytes=") */
+ rs = rs.substr(pos + 6); /* size of("bytes=") */
}
- pos = s.find('-');
+ pos = rs.find('-');
if (pos == string::npos)
goto done;
- *partial_content = true;
+ partial_content = true;
- ofs_str = s.substr(0, pos);
- end_str = s.substr(pos + 1);
+ ofs_str = rs.substr(0, pos);
+ end_str = rs.substr(pos + 1);
if (end_str.length()) {
end = atoll(end_str.c_str());
if (end < 0)
if (end >= 0 && end < ofs)
goto done;
- r = 0;
+ range_parsed = true;
+ return 0;
+
done:
+ if (ignore_invalid_range) {
+ partial_content = false;
+ ofs = 0;
+ end = -1;
+ range_parsed = false; // allow retry
+ r = 0;
+ }
+
return r;
}
bool prefetch_first_chunk = true;
range_str = s->info.env->get("HTTP_RANGE");
- if(range_str) {
- int r = parse_range(range_str, ofs, end, &partial_content);
- /* error on parsing the range, stop prefetch and will fail in execte() */
+ if (range_str) {
+ int r = parse_range();
+ /* error on parsing the range, stop prefetch and will fail in execute() */
if (r < 0) {
- range_parsed = false;
- return false;
- } else {
- range_parsed = true;
+ return false; /* range_parsed==false */
}
- /* range get goes to shadown objects, stop prefetch */
+ /* range get goes to shadow objects, stop prefetch */
if (ofs >= s->cct->_conf->rgw_max_chunk_size) {
prefetch_first_chunk = false;
}
return get_data && prefetch_first_chunk;
}
+
void RGWGetObj::pre_exec()
{
rgw_bucket_object_pre_exec(s);
{
attr_iter = attrs.find(RGW_ATTR_CRYPT_MODE);
if (attr_iter != attrs.end() && attr_iter->second.to_str() == "SSE-C-AES256") {
- op_ret = -ERR_INVALID_REQUEST;
+ ldout(s->cct, 0) << "ERROR: torrents are not supported for objects "
+ "encrypted with SSE-C" << dendl;
+ op_ret = -EINVAL;
goto done_err;
}
torrent.init(s, store);
int RGWGetObj::init_common()
{
if (range_str) {
- /* range parsed error when prefetch*/
+ /* range parsed error when prefetch */
if (!range_parsed) {
- int r = parse_range(range_str, ofs, end, &partial_content);
+ int r = parse_range();
if (r < 0)
return r;
}
char *endptr;
max = strtol(max_keys.c_str(), &endptr, 10);
if (endptr) {
+ if (endptr == max_keys.c_str()) return -EINVAL;
while (*endptr && isspace(*endptr)) // ignore white space
endptr++;
if (*endptr) {
op_ret = -ENOENT;
goto done;
}
- lst = astate->size - 1;
+ lst = astate->accounted_size - 1;
} else {
lst = copy_source_range_lst;
}
}
}
op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &s->bucket_info.objv_tracker);
- string shard_id = s->bucket.name + ':' +s->bucket.bucket_id;
+ string shard_id = s->bucket.tenant + ':' + s->bucket.name + ':' + s->bucket.bucket_id;
pair<string, int> entry(shard_id, lc_uninitial);
string oid;
get_lc_oid(s, oid);
int RGWListBucketMultiparts::verify_permission()
{
if (!verify_bucket_permission(s,
- rgw::IAM::s3ListBucketMultiPartUploads))
+ rgw::IAM::s3ListBucketMultipartUploads))
return -EACCES;
return 0;
return op_ret;
});
}
+
+// Fetch cluster-wide utilization statistics into stats_op via the
+// librados cluster_stat() call; the result code is reported through
+// op_ret (standard RGWOp error-propagation convention).
+void RGWGetClusterStat::execute()
+{
+  op_ret = this->store->get_rados_handle()->cluster_stat(stats_op);
+}
+
+
map<string, bufferlist> attrs;
bool get_data;
bool partial_content;
+ bool ignore_invalid_range;
bool range_parsed;
bool skip_manifest;
bool skip_decrypt{false};
void set_get_data(bool get_data) {
this->get_data = get_data;
}
+
int verify_permission() override;
void pre_exec() override;
void execute() override;
+ int parse_range();
int read_user_manifest_part(
rgw_bucket& bucket,
const rgw_bucket_dir_entry& ent,
virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; }
};
+// REST op exposing cluster-level storage statistics (kb/kb_used/etc. as
+// returned by librados cluster_stat()).  Protocol frontends subclass this
+// and supply get_params()/send_response() for their wire format.
+class RGWGetClusterStat : public RGWOp {
+protected:
+  struct rados_cluster_stat_t stats_op;  // populated by execute()
+public:
+  RGWGetClusterStat() {}
+
+  void init(RGWRados *store, struct req_state *s, RGWHandler *h) override {
+    RGWOp::init(store, s, h);
+  }
+  // NOTE(review): permission check is a no-op — presumably access is gated
+  // by the admin/REST handler that dispatches this op; confirm at call site.
+  int verify_permission() override {return 0;}
+  virtual void send_response() = 0;
+  virtual int get_params() = 0;
+  void execute() override;
+  virtual const string name() { return "get_cluster_stat"; }
+};
+
+
+
#endif /* CEPH_RGW_OP_H */
return r;
}
// reflect period objects if this is the latest version
- r = period.reflect();
- if (r < 0) {
- return r;
+ if (store->realm.get_current_period() == period_id) {
+ r = period.reflect();
+ if (r < 0) {
+ return r;
+ }
}
ldout(store->ctx(), 14) << "period " << period_id
<< " pulled and written to local storage" << dendl;
int r = rados->ioctx_create(pool.name.c_str(), ioctx);
if (r == -ENOENT && create) {
r = rados->pool_create(pool.name.c_str());
+ if (r == -ERANGE) {
+ dout(0)
+ << __func__
+ << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
+ << " (this can be due to a pool or placement group misconfiguration, e.g."
+ << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
+ << dendl;
+ }
if (r < 0 && r != -EEXIST) {
return r;
}
notify_oid.append(buf);
}
-int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
+int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
{
- librados::Rados *rad = get_rados_handle();
- int r = rgw_init_ioctx(rad, pool, io_ctx);
- if (r != -ENOENT)
- return r;
-
- if (!pools_initialized)
- return r;
-
- r = rad->pool_create(pool.name.c_str());
- if (r < 0 && r != -EEXIST)
- return r;
-
- r = rgw_init_ioctx(rad, pool, io_ctx);
- if (r < 0)
- return r;
-
- r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
- if (r < 0 && r != -EOPNOTSUPP)
- return r;
- return 0;
+ constexpr bool create = true; // create the pool if it doesn't exist
+ return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
}
void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
*/
int RGWRados::create_pool(const rgw_pool& pool)
{
- int ret = 0;
-
- librados::Rados *rad = get_rados_handle();
- ret = rad->pool_create(pool.name.c_str(), 0);
- if (ret == -EEXIST)
- ret = 0;
- else if (ret == -ERANGE) {
- ldout(cct, 0)
- << __func__
- << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
- << " (this can be due to a pool or placement group misconfiguration, e.g."
- << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
- << dendl;
- }
- if (ret < 0)
- return ret;
-
librados::IoCtx io_ctx;
- ret = rad->ioctx_create(pool.name.c_str(), io_ctx);
- if (ret < 0)
- return ret;
-
- ret = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
- if (ret < 0 && ret != -EOPNOTSUPP)
- return ret;
- return 0;
+ constexpr bool create = true;
+ return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
}
int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
return c->is_safe();
}
+// PutObj filter that buffers data so we don't try to compress tiny blocks.
+// libcurl reads in 16k at a time, and we need at least 64k to get a good
+// compression ratio.
+class RGWPutObj_Buffer : public RGWPutObj_Filter {
+  const unsigned buffer_size;  // flush threshold; power of two (see ctor)
+  bufferlist buffer;           // data accumulated but not yet passed on
+ public:
+  RGWPutObj_Buffer(RGWPutObjDataProcessor* next, unsigned buffer_size)
+    : RGWPutObj_Filter(next), buffer_size(buffer_size) {
+    assert(ISP2(buffer_size)); // must be power of 2
+  }
+
+  // Accumulates incoming data and only forwards it downstream in chunks
+  // aligned to buffer_size.  An empty bl (end-of-data) or *again==true
+  // (downstream asked to be re-driven) flushes whatever is buffered.
+  int handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj,
+                  bool *again) override {
+    if (*again || !bl.length()) {
+      // flush buffered data
+      return RGWPutObj_Filter::handle_data(buffer, ofs, phandle, pobj, again);
+    }
+    // transform offset to the beginning of the buffer
+    ofs = ofs - buffer.length();
+    buffer.claim_append(bl);
+    if (buffer.length() < buffer_size) {
+      *again = false; // don't come back until there's more data
+      return 0;
+    }
+    // forward only whole buffer_size-aligned chunks; the remainder stays
+    // buffered for the next call (or the final flush)
+    const auto count = P2ALIGN(buffer.length(), buffer_size);
+    buffer.splice(0, count, &bl);
+    return RGWPutObj_Filter::handle_data(bl, ofs, phandle, pobj, again);
+  }
+};
+
class RGWRadosPutObj : public RGWGetDataCB
{
CephContext* cct;
rgw_obj obj;
RGWPutObjDataProcessor *filter;
boost::optional<RGWPutObj_Compress>& compressor;
+ boost::optional<RGWPutObj_Buffer> buffering;
CompressorRef& plugin;
RGWPutObjProcessor_Atomic *processor;
RGWOpStateSingleOp *opstate;
if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
//do not compress if object is encrypted
compressor = boost::in_place(cct, plugin, filter);
- filter = &*compressor;
+ constexpr unsigned buffer_size = 512 * 1024;
+ buffering = boost::in_place(&*compressor, buffer_size);
+ filter = &*buffering;
}
return 0;
}
return 0;
}
+ int flush() {
+ bufferlist bl;
+ return put_data_and_throttle(filter, bl, 0, false);
+ }
+
bufferlist& get_extra_data() { return extra_data_bl; }
map<string, bufferlist>& get_attrs() { return src_attrs; }
if (ret < 0) {
goto set_err_state;
}
+ ret = cb.flush();
+ if (ret < 0) {
+ goto set_err_state;
+ }
if (compressor && compressor->is_compressed()) {
bufferlist tmp;
RGWCompressionInfo cs_info;
return ++max_bucket_id;
}
-RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
+RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
+ bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
{
- int use_cache = cct->_conf->rgw_cache_enabled;
RGWRados *store = NULL;
if (!use_cache) {
store = new RGWRados;
} else {
- store = new RGWCache<RGWRados>;
+ store = new RGWCache<RGWRados>;
}
if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
bool is_read_only() { return read_only; }
- bool syncs_from(const string& zone_id) {
+ bool syncs_from(const string& zone_id) const {
return (sync_from_all || sync_from.find(zone_id) != sync_from.end());
}
};
class RGWStoreManager {
public:
RGWStoreManager() {}
- static RGWRados *get_storage(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread) {
+ static RGWRados *get_storage(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads,
+ bool run_sync_thread, bool run_reshard_thread, bool use_cache = true) {
RGWRados *store = init_storage_provider(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread,
- run_reshard_thread);
+ run_reshard_thread, use_cache);
return store;
}
static RGWRados *get_raw_storage(CephContext *cct) {
RGWRados *store = init_raw_storage_provider(cct);
return store;
}
- static RGWRados *init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread);
+ static RGWRados *init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_metadata_cache);
static RGWRados *init_raw_storage_provider(CephContext *cct);
static void close_storage(RGWRados *store);
cct->_conf->rgw_enable_lc_threads,
cct->_conf->rgw_enable_quota_threads,
cct->_conf->rgw_run_sync_thread,
- cct->_conf->rgw_dynamic_resharding);
+ cct->_conf->rgw_dynamic_resharding,
+ cct->_conf->rgw_cache_enabled
+ );
ldout(cct, 1) << "Creating new store" << dendl;
get_data &= (!rgwx_stat);
}
- /* start gettorrent */
- bool is_torrent = s->info.args.exists(GET_TORRENT);
- bool torrent_flag = s->cct->_conf->rgw_torrent_flag;
- if (torrent_flag && is_torrent)
- {
- int ret = 0;
- ret = torrent.get_params();
- if (ret < 0)
- {
- return ret;
- }
+ if (s->info.args.exists(GET_TORRENT)) {
+ return torrent.get_params();
}
- /* end gettorrent */
-
return 0;
}
return;
}
- http_ret = rgw_bucket_sync_status(store, source_zone, bucket, &status);
+ // read the bucket instance info for num_shards
+ RGWObjectCtx ctx(store);
+ RGWBucketInfo info;
+ http_ret = store->get_bucket_instance_info(ctx, bucket, info, nullptr, nullptr);
+ if (http_ret < 0) {
+ ldout(s->cct, 4) << "failed to read bucket info: " << cpp_strerror(http_ret) << dendl;
+ return;
+ }
+ http_ret = rgw_bucket_sync_status(store, source_zone, info, &status);
}
void RGWOp_BILog_Status::send_response()
case RGW_OP_PUT_BUCKET_POLICY:
case RGW_OP_PUT_OBJ_TAGGING:
case RGW_OP_PUT_LC:
+ case RGW_OP_SET_REQUEST_PAYMENT:
break;
default:
dout(10) << "ERROR: AWS4 completion for this operation NOT IMPLEMENTED" << dendl;
};
+#if 0
class S3AuthFactory : public rgw::auth::RemoteApplier::Factory,
public rgw::auth::LocalApplier::Factory {
typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;
RGWRados* const store;
+ ImplicitTenants& implicit_tenant_context;
public:
- S3AuthFactory(RGWRados* const store)
- : store(store) {
+ S3AuthFactory(RGWRados* const store,
+ ImplicitTenants& implicit_tenant_context)
+ : store(store),
+ implicit_tenant_context(implicit_tenant_context) {
}
aplptr_t create_apl_remote(CephContext* const cct,
) const override {
return aplptr_t(
new rgw::auth::RemoteApplier(cct, store, std::move(acl_alg), info,
- cct->_conf->rgw_keystone_implicit_tenants));
+ implicit_tenant_context,
+ rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3));
}
aplptr_t create_apl_local(CephContext* const cct,
new rgw::auth::LocalApplier(cct, user_info, subuser));
}
};
+#endif
} /* namespace s3 */
RGWCORSConfiguration * const cors_config,
bool * const has_cors)
{
- string read_list, write_list;
-
- const char * const read_attr = s->info.env->get("HTTP_X_CONTAINER_READ");
- if (read_attr) {
- read_list = read_attr;
- }
- const char * const write_attr = s->info.env->get("HTTP_X_CONTAINER_WRITE");
- if (write_attr) {
- write_list = write_attr;
- }
+ const char * const read_list = s->info.env->get("HTTP_X_CONTAINER_READ");
+ const char * const write_list = s->info.env->get("HTTP_X_CONTAINER_WRITE");
*has_policy = false;
- if (read_attr || write_attr) {
+ if (read_list || write_list) {
RGWAccessControlPolicy_SWIFT swift_policy(s->cct);
const auto r = swift_policy.create(store,
s->user->user_id,
bool gen_key;
bool suspended;
bool system;
-
+ bool email_set;
bool quota_set;
int32_t max_buckets;
rgw_user uid(uid_str);
RESTArgs::get_string(s, "display-name", display_name, &display_name);
- RESTArgs::get_string(s, "email", email, &email);
+ RESTArgs::get_string(s, "email", email, &email, &email_set);
RESTArgs::get_string(s, "access-key", access_key, &access_key);
RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
RESTArgs::get_string(s, "user-caps", caps, &caps);
op_state.set_user_id(uid);
op_state.set_display_name(display_name);
- op_state.set_user_email(email);
+
+ if (email_set)
+ op_state.set_user_email(email);
+
op_state.set_caps(caps);
op_state.set_access_key(access_key);
op_state.set_secret_key(secret_key);
public rgw::auth::LocalApplier::Factory,
public rgw::auth::swift::TempURLApplier::Factory {
RGWRados* const store;
+ ImplicitTenants& implicit_tenant_context;
/* The engines. */
const rgw::auth::swift::TempURLEngine tempurl_engine;
rgw::auth::add_3rdparty(store, s->account_name,
rgw::auth::add_sysreq(cct, store, s,
rgw::auth::RemoteApplier(cct, store, std::move(extra_acl_strategy), info,
- cct->_conf->rgw_keystone_implicit_tenants)));
+ implicit_tenant_context,
+ rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_SWIFT)));
/* TODO(rzarzynski): replace with static_ptr. */
return aplptr_t(new decltype(apl)(std::move(apl)));
}
public:
DefaultStrategy(CephContext* const cct,
+ ImplicitTenants& implicit_tenant_context,
RGWRados* const store)
: store(store),
+ implicit_tenant_context(implicit_tenant_context),
tempurl_engine(cct,
store,
static_cast<rgw::auth::swift::TempURLApplier::Factory*>(this)),
string max_marker;
const std::string& period_marker; //< max marker stored in next period
- map<string, bufferlist> entries;
- map<string, bufferlist>::iterator iter;
+ std::set<std::string> entries;
+ std::set<std::string>::iterator iter;
string oid;
}
iter = entries.begin();
for (; iter != entries.end(); ++iter) {
- ldout(sync_env->cct, 20) << __func__ << ": full sync: " << iter->first << dendl;
+ marker = *iter;
+ ldout(sync_env->cct, 20) << __func__ << ": full sync: " << marker << dendl;
total_entries++;
- if (!marker_tracker->start(iter->first, total_entries, real_time())) {
- ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << iter->first << ". Duplicate entry?" << dendl;
+ if (!marker_tracker->start(marker, total_entries, real_time())) {
+ ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << marker << ". Duplicate entry?" << dendl;
} else {
// fetch remote and write locally
yield {
- RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, iter->first, iter->first, MDLOG_STATUS_COMPLETE, marker_tracker), false);
+ RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker), false);
// stack_to_pos holds a reference to the stack
- stack_to_pos[stack] = iter->first;
- pos_to_prev[iter->first] = marker;
+ stack_to_pos[stack] = marker;
+ pos_to_prev[marker] = marker;
}
}
- marker = iter->first;
}
collect_children();
} while ((int)entries.size() == max_entries && can_adjust_marker);
} else if (name == "x-amz-tagging") {
auto tags_bl = val.begin();
::decode(obj_tags, tags_bl);
+ } else if (name == "compression") {
+ RGWCompressionInfo cs_info;
+ auto vals_bl = val.begin();
+ decode(cs_info, vals_bl);
+ out_attrs[name] = cs_info.compression_type;
} else {
if (name != "pg_ver" &&
name != "source_zone" &&
}
string oid, key;
- map<string, bufferlist> m;
- set<string> obj_key;
get_obj_bucket_and_oid_loc(obj, oid, key);
- ldout(s->cct, 0) << "NOTICE: head obj oid= " << oid << dendl;
+ ldout(s->cct, 20) << "NOTICE: head obj oid= " << oid << dendl;
- obj_key.insert(RGW_OBJ_TORRENT);
- const int op_ret = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
- if (op_ret < 0)
- {
- ldout(s->cct, 0) << "ERROR: failed to omap_get_vals_by_keys op_ret = "
- << op_ret << dendl;
- return op_ret;
+ const set<string> obj_key{RGW_OBJ_TORRENT};
+ map<string, bufferlist> m;
+ const int r = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
+ if (r < 0) {
+ ldout(s->cct, 0) << "ERROR: omap_get_vals_by_keys failed: " << r << dendl;
+ return r;
}
-
- map<string, bufferlist>::iterator iter;
- for (iter = m.begin(); iter != m.end(); ++iter)
- {
- bufferlist bl_tmp = iter->second;
- char *pbuff = bl_tmp.c_str();
- bl.append(pbuff, bl_tmp.length());
+ if (m.size() != 1) {
+ ldout(s->cct, 0) << "ERROR: omap key " RGW_OBJ_TORRENT " not found" << dendl;
+ return -EINVAL;
}
+ bl.append(std::move(m.begin()->second));
dencode.bencode_end(bl);
bl_data = bl;
RGWStorageStats *arg_stats = NULL;
if (op_state.fetch_stats) {
int ret = store->get_user_stats(info.user_id, stats);
- if (ret < 0) {
+ if (ret < 0 && ret != -ENOENT) {
return ret;
}
--- /dev/null
+#!/usr/bin/env python
+
+# Assemble an integration branch: find all open PRs carrying a given GitHub
+# label, create a timestamped local branch, and merge each PR into it.
+# Usage: build-integration-branch <label>   (token read from ~/.github_token)
+
+from __future__ import print_function
+
+import json
+import os
+import requests
+from subprocess import call
+import sys
+import time
+try:
+    from urllib.parse import urljoin   # py3
+except ImportError:                    # py2 fallback; bare except would also
+    from urlparse import urljoin       # swallow SystemExit/KeyboardInterrupt
+
+label = sys.argv[1]
+repo = "ceph/ceph"
+
+with open(os.environ['HOME'] + '/.github_token', 'r') as myfile:
+    token = myfile.readline().strip()
+
+# get prs
+# NOTE(review): passing the token as an access_token query parameter is
+# deprecated by GitHub in favor of the "Authorization: token ..." header;
+# kept as-is to match the original behavior.
+baseurl = urljoin('https://api.github.com',
+                  'repos/{repo}/issues?labels={label}&access_token={token}')
+url = baseurl.format(
+    label=label,
+    repo=repo,
+    token=token)
+r = requests.get(url)
+assert(r.ok)
+j = json.loads(r.text or r.content)
+print("--- found %d issues tagged with %s" % (len(j), label))
+
+prs = []
+prtext = []
+for issue in j:
+    if 'pull_request' not in issue:
+        continue
+    r = requests.get(issue['pull_request']['url'] + '?access_token=' + token)
+    pr = json.loads(r.text or r.content)
+    prs.append(pr)
+    prtext.append(pr['html_url'] + ' - ' + pr['title'])
+print("--- queried %s prs" % len(prs))
+
+# name branch
+TIME_FORMAT = '%Y-%m-%d-%H%M'
+branch = label + "-" + time.strftime(TIME_FORMAT, time.localtime())
+print("branch %s" % branch)
+
+# assemble
+print('--- creating branch %s' % branch)
+r = call(['git', 'checkout', '-b', branch])
+assert not r
+for pr in prs:
+    print('--- pr %d --- pulling %s branch %s' % (
+        pr['number'],
+        pr['head']['repo']['clone_url'],
+        pr['head']['ref']))
+    r = call(['git', 'pull', '--no-edit',
+              pr['head']['repo']['clone_url'],
+              pr['head']['ref']])
+    assert not r
+print('--- done. these PRs were included:')
+print('\n'.join(prtext).encode('ascii', errors='ignore').decode())
+print('--- perhaps you want to: make && ctest -j12 && git push ci %s' % branch)
endif(FREEBSD)
endif(WITH_RBD)
if(WITH_RADOSGW)
- add_dependencies(tests radosgw-admin)
+ add_dependencies(tests radosgw radosgw-admin)
endif(WITH_RADOSGW)
if(NOT FREEBSD)
add_dependencies(tests ceph-detect-init)
host host0 {
id -1 # do not change unnecessarily
# weight 1.000
- alg straw
+ alg straw2
hash 0 # rjenkins1
item device0 weight 1.000
}
host host1 {
id -2 # do not change unnecessarily
# weight 1.000
- alg straw
+ alg straw2
hash 0 # rjenkins1
item device1 weight 1.000
}
host host2 {
id -5 # do not change unnecessarily
# weight 1.000
- alg straw
+ alg straw2
hash 0 # rjenkins1
item device2 weight 1.000
}
rack rack0 {
id -3 # do not change unnecessarily
# weight 3.000
- alg straw
+ alg straw2
hash 0 # rjenkins1
item host0 weight 1.000
item host1 weight 1.000
root root {
id -4 # do not change unnecessarily
# weight 4.000
- alg straw
+ alg straw2
hash 0 # rjenkins1
item rack0 weight 4.000
}
"type_id": 1,
"type_name": "host",
"weight": 65536,
- "alg": "straw",
+ "alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"type_id": 1,
"type_name": "host",
"weight": 65536,
- "alg": "straw",
+ "alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"type_id": 2,
"type_name": "rack",
"weight": 196608,
- "alg": "straw",
+ "alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"type_id": 3,
"type_name": "root",
"weight": 262144,
- "alg": "straw",
+ "alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"type_id": 1,
"type_name": "host",
"weight": 65536,
- "alg": "straw",
+ "alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"profile": "argonaut",
"optimal_tunables": 0,
"legacy_tunables": 1,
- "minimum_required_version": "argonaut",
+ "minimum_required_version": "hammer",
"require_feature_tunables": 0,
"require_feature_tunables2": 0,
"has_v2_rules": 0,
"require_feature_tunables3": 0,
"has_v3_rules": 0,
- "has_v4_buckets": 0,
+ "has_v4_buckets": 1,
"require_feature_tunables5": 0,
"has_v5_rules": 0
},
--max-buckets max number of buckets for a user
--admin set the admin flag on the user
--system set the system flag on the user
- --bucket=<bucket>
- --pool=<pool>
- --object=<object>
- --date=<date>
- --start-date=<date>
- --end-date=<date>
- --bucket-id=<bucket-id>
- --shard-id=<shard-id> optional for mdlog list
+ --bucket=<bucket> Specify the bucket name. Also used by the quota command.
+ --pool=<pool> Specify the pool name. Also used to scan for leaked rados objects.
+ --object=<object> object name
+ --date=<date> date in the format yyyy-mm-dd
+ --start-date=<date> start date in the format yyyy-mm-dd
+ --end-date=<date> end date in the format yyyy-mm-dd
+ --bucket-id=<bucket-id> bucket id
+ --shard-id=<shard-id> optional for:
+ mdlog list
+ data sync status
required for:
mdlog trim
replica mdlog get/delete
replica datalog get/delete
+ --max-entries=<entries> max entries for listing operations
--metadata-key=<key> key to retrieve metadata from with metadata get
--remote=<remote> zone or zonegroup id of remote gateway
--period=<id> period id
crush_choose_arg choose_args[maxbuckets];
memset(choose_args, '\0', sizeof(crush_choose_arg) * maxbuckets);
choose_args[-1-id].ids_size = 0;
- choose_args[-1-id].weight_set_size = 1;
+ choose_args[-1-id].weight_set_positions = 1;
choose_args[-1-id].weight_set = &weight_set;
crush_choose_arg_map arg_map;
arg_map.size = c.get_max_buckets();
CrushWrapper c_new;
c_new.decode(i);
ASSERT_EQ(1u, c_new.choose_args.size());
- ASSERT_EQ(1u, c_new.choose_args[caid].args[-1-id].weight_set_size);
+ ASSERT_EQ(1u, c_new.choose_args[caid].args[-1-id].weight_set_positions);
ASSERT_EQ(weights, c_new.choose_args[caid].args[-1-id].weight_set[0].weights[0]);
ASSERT_EQ(weight, c_new.get_bucket_item_weightf(id, 0));
}
-# unittest_mds_types
-add_executable(unittest_mds_types
- mds_types.cc
+if(${WITH_CEPHFS})
+
+ # unittest_mds_types
+ add_executable(unittest_mds_types
+ mds_types.cc
+ )
+ add_ceph_unittest(unittest_mds_types ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_mds_types)
+ target_link_libraries(unittest_mds_types global)
+
+ add_executable(ceph_test_trim_caps
+ test_trim_caps.cc
)
-add_ceph_unittest(unittest_mds_types ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_mds_types)
-target_link_libraries(unittest_mds_types global)
+ target_link_libraries(ceph_test_trim_caps ceph-common cephfs)
+ install(TARGETS ceph_test_trim_caps DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+endif(${WITH_CEPHFS})
--- /dev/null
+/*
+ * Crash reproducer for an MDS/client cap-trimming bug (built as
+ * ceph_test_trim_caps).  A parent and a forked child each mount CephFS;
+ * the child renames directories out from under the parent so that the
+ * parent holds caps on an inode with no linked dentry, then the parent
+ * idles to give the MDS a chance to trim caps.  Success == no crash.
+ * (Exact tracker issue not visible from here — confirm against the
+ * commit message this file was introduced with.)
+ */
+#define _FILE_OFFSET_BITS 64
+#include <features.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include <unistd.h>
+#include <include/cephfs/libcephfs.h>
+
+int main(int argc, char *argv[])
+{
+  char buf;
+  int pipefd[2];
+  int rc = pipe(pipefd);
+  assert(rc >= 0);
+
+  pid_t pid = fork();
+  assert(pid >= 0);
+  /* pipe is used one-way: parent signals child once setup is done */
+  if (pid == 0)
+    close(pipefd[1]);
+  else
+    close(pipefd[0]);
+
+  struct ceph_mount_info *cmount = NULL;
+
+  /* both processes mount independently so each holds its own caps */
+  ceph_create(&cmount, "admin");
+  ceph_conf_read_file(cmount, NULL);
+
+  int ret = ceph_mount(cmount, NULL);
+  assert(ret >= 0);
+
+  if (pid == 0) {
+    /* child: wait for parent to create "1/2" and "2" */
+    ret = read(pipefd[0], &buf, 1);
+    assert(ret == 1);
+
+    /* rename the original "1" away, then slide "2" into its place, so
+     * the parent's cached "1" inode loses its parent dentry */
+    ret = ceph_rename(cmount, "1", "3");
+    assert(ret >= 0);
+
+    ret = ceph_rename(cmount, "2", "1");
+    assert(ret >= 0);
+
+    ceph_unmount(cmount);
+    printf("child exits\n");
+  } else {
+    ret = ceph_mkdirs(cmount, "1/2", 0755);
+    assert(ret >= 0);
+
+    struct ceph_statx stx;
+    ret = ceph_statx(cmount, "1", &stx, 0, 0);
+    assert(ret >= 0);
+    uint64_t orig_ino = stx.stx_ino;
+
+
+    ret = ceph_mkdir(cmount, "2", 0755);
+    assert(ret >= 0);
+
+    /* signal child to perform the renames */
+    ret = write(pipefd[1], &buf, 1);
+    assert(ret == 1);
+
+    int wstatus;
+    ret = waitpid(pid, &wstatus, 0);
+    assert(ret >= 0);
+    assert(wstatus == 0);
+
+    // make origin '1' no parent dentry
+    ret = ceph_statx(cmount, "1", &stx, 0, 0);
+    assert(ret >= 0);
+    assert(orig_ino != stx.stx_ino);
+
+    // move root inode's cap_item to tail of session->caps
+    ret = ceph_statx(cmount, ".", &stx, 0, 0);
+    assert(ret >= 0);
+
+    /* idle with caps held; if the bug is present the MDS/client crashes
+     * while trimming caps during this window */
+    printf("waiting for crash\n");
+    sleep(60);
+  }
+  return 0;
+}
ceph_shutdown(cmount);
}
+// Verify directory nlink accounting: an empty directory reports nlink 2,
+// each immediate subdirectory adds 1 (but grandchildren do not), rmdir of
+// a child subtracts 1, and an unlinked-but-still-open directory reports
+// nlink 0 through its open fd.
+TEST(LibCephFS, StatDirNlink) {
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+  ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(ceph_mount(cmount, NULL), 0);
+
+  char test_dir1[256];
+  sprintf(test_dir1, "dir1_symlinks_%d", getpid());
+  ASSERT_EQ(ceph_mkdir(cmount, test_dir1, 0700), 0);
+
+  // keep an fd open so we can re-stat the dir after it is removed below
+  int fd = ceph_open(cmount, test_dir1, O_DIRECTORY|O_RDONLY, 0);
+  ASSERT_GT(fd, 0);
+  struct ceph_statx stx;
+  ASSERT_EQ(ceph_fstatx(cmount, fd, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+  ASSERT_EQ(stx.stx_nlink, 2);
+
+  {
+    // stat via a trailing "/." path resolves to the same directory
+    char test_dir2[256];
+    sprintf(test_dir2, "%s/.", test_dir1);
+    ASSERT_EQ(ceph_statx(cmount, test_dir2, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 2);
+  }
+
+  {
+    char test_dir2[256];
+    sprintf(test_dir2, "%s/1", test_dir1);
+    ASSERT_EQ(ceph_mkdir(cmount, test_dir2, 0700), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir2, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 2);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 3);
+    sprintf(test_dir2, "%s/2", test_dir1);
+    ASSERT_EQ(ceph_mkdir(cmount, test_dir2, 0700), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 4);
+    // a grandchild ("1/1") must not change the top directory's nlink
+    sprintf(test_dir2, "%s/1/1", test_dir1);
+    ASSERT_EQ(ceph_mkdir(cmount, test_dir2, 0700), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 4);
+    ASSERT_EQ(ceph_rmdir(cmount, test_dir2), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 4);
+    // removing each immediate child decrements nlink back toward 2
+    sprintf(test_dir2, "%s/1", test_dir1);
+    ASSERT_EQ(ceph_rmdir(cmount, test_dir2), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 3);
+    sprintf(test_dir2, "%s/2", test_dir1);
+    ASSERT_EQ(ceph_rmdir(cmount, test_dir2), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 2);
+  }
+
+  // after rmdir, the still-open directory reports nlink 0 via the fd
+  ASSERT_EQ(ceph_rmdir(cmount, test_dir1), 0);
+  ASSERT_EQ(ceph_fstatx(cmount, fd, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+  ASSERT_EQ(stx.stx_nlink, 0);
+
+  ceph_close(cmount, fd);
+
+  ceph_shutdown(cmount);
+}
+
TEST(LibCephFS, DoubleChmod) {
struct ceph_mount_info *cmount;
}
ASSERT_LT(n, 1024);
+ // make sure we have latest map that marked the pool full
+ test_data.m_cluster.wait_for_latest_osdmap();
+
// make sure we block without FULL_TRY
{
ObjectWriteOperation op;
// Ensure a non-power-of-two PG count to avoid only
// touching the easy path.
- std::string err_str = set_pg_num(&s_cluster, pool_name, 11);
- ASSERT_TRUE(err_str.empty());
+ ASSERT_TRUE(set_pg_num(&s_cluster, pool_name, 11).empty());
+ ASSERT_TRUE(set_pgp_num(&s_cluster, pool_name, 11).empty());
std::set<std::string> saw_obj;
rados_object_list_cursor c = rados_object_list_begin(ioctx);
// Ensure a non-power-of-two PG count to avoid only
// touching the easy path.
- std::string err_str = set_pg_num(&s_cluster, pool_name, 11);
- ASSERT_TRUE(err_str.empty());
+ ASSERT_TRUE(set_pg_num(&s_cluster, pool_name, 11).empty());
+ ASSERT_TRUE(set_pgp_num(&s_cluster, pool_name, 11).empty());
rados_object_list_cursor begin = rados_object_list_begin(ioctx);
rados_object_list_cursor end = rados_object_list_end(ioctx);
return ret;
}
-}
-
-std::string set_pg_num(
- rados_t *cluster, const std::string &pool_name, uint32_t pg_num)
-{
- // Wait for 'creating' to clear
- int r = wait_for_healthy(cluster);
- if (r != 0) {
- goto err;
+struct pool_op_error : std::exception {
+ string msg;
+ pool_op_error(const std::string& pool_name,
+ const std::string& func_name,
+ int err) {
+ std::ostringstream oss;
+ oss << func_name << "(" << pool_name << ") failed with error " << err;
+ msg = oss.str();
}
-
- // Adjust pg_num
- r = rados_pool_set(cluster, pool_name, "pg_num", stringify(pg_num));
- if (r != 0) {
- goto err;
+ const char* what() const noexcept override {
+ return msg.c_str();
}
+};
- // Wait for 'creating' to clear
- r = wait_for_healthy(cluster);
- if (r != 0) {
- goto err;
+template<typename Func>
+std::string with_healthy_cluster(rados_t* cluster,
+ const std::string& pool_name,
+ Func&& func)
+{
+ try {
+ // Wait for 'creating/backfilling' to clear
+ int r = wait_for_healthy(cluster);
+ if (r != 0) {
+ throw pool_op_error{pool_name, "wait_for_healthy", r};
+ }
+ func();
+ // Wait for 'creating/backfilling' to clear
+ r = wait_for_healthy(cluster);
+ if (r != 0) {
+ throw pool_op_error{pool_name, "wait_for_healthy", r};
+ }
+ } catch (const pool_op_error& e) {
+ rados_shutdown(*cluster);
+ return e.what();
}
-
return "";
+}
+}
+
+std::string set_pg_num(
+ rados_t *cluster, const std::string &pool_name, uint32_t pg_num)
+{
+ return with_healthy_cluster(cluster, pool_name, [&] {
+ // Adjust pg_num
+ int r = rados_pool_set(cluster, pool_name, "pg_num",
+ stringify(pg_num));
+ if (r != 0) {
+ throw pool_op_error{pool_name, "set_pg_num", r};
+ }
+ });
+}
-err:
- rados_shutdown(*cluster);
- std::ostringstream oss;
- oss << __func__ << "(" << pool_name << ") failed with error " << r;
- return oss.str();
+std::string set_pgp_num(
+ rados_t *cluster, const std::string &pool_name, uint32_t pgp_num)
+{
+ return with_healthy_cluster(cluster, pool_name, [&] {
+ // Adjust pgp_num
+ int r = rados_pool_set(cluster, pool_name, "pgp_num",
+ stringify(pgp_num));
+ if (r != 0) {
+ throw pool_op_error{pool_name, "set_pgp_num", r};
+ }
+ });
}
std::string set_pg_num(
rados_t *cluster, const std::string &pool_name, uint32_t pg_num);
+std::string set_pgp_num(
+ rados_t *cluster, const std::string &pool_name, uint32_t pgp_num);
}
}
+ void expect_writeback_cache_enabled(MockReplayImageCtx &mock_image_ctx,
+ bool enabled) {
+ EXPECT_CALL(mock_image_ctx, is_writeback_cache_enabled())
+ .WillRepeatedly(Return(enabled));
+ }
+
void when_process(MockJournalReplay &mock_journal_replay,
EventEntry &&event_entry, Context *on_ready,
Context *on_safe) {
MockJournalReplay mock_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
MockJournalReplay mock_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
MockJournalReplay mock_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
MockJournalReplay mock_compare_and_write_journal_replay(mock_image_ctx);
MockJournalReplay mock_mis_compare_and_write_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
MockJournalReplay mock_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
MockJournalReplay mock_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
MockJournalReplay mock_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
}
TEST_F(TestMockJournalReplay, Flush) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
MockJournalReplay mock_journal_replay(mock_image_ctx);
MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, true);
expect_op_work_queue(mock_image_ctx);
InSequence seq;
}
TEST_F(TestMockJournalReplay, MissingOpFinishEvent) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
}
TEST_F(TestMockJournalReplay, MissingOpFinishEventCancelOps) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
}
TEST_F(TestMockJournalReplay, UnknownOpFinishEvent) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
ASSERT_EQ(-ECANCELED, on_finish_safe.wait());
}
+TEST_F(TestMockJournalReplay, WritebackCacheDisabled) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockReplayImageCtx mock_image_ctx(*ictx);
+
+ MockExclusiveLock mock_exclusive_lock;
+ mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+ expect_accept_ops(mock_exclusive_lock, true);
+
+ MockJournalReplay mock_journal_replay(mock_image_ctx);
+ MockIoImageRequest mock_io_image_request;
+ expect_writeback_cache_enabled(mock_image_ctx, false);
+ expect_op_work_queue(mock_image_ctx);
+
+ InSequence seq;
+ io::AioCompletion *aio_comp;
+ C_SaferCond on_ready;
+ C_SaferCond on_safe;
+ expect_aio_discard(mock_io_image_request, &aio_comp, 123, 456, false);
+ when_process(mock_journal_replay,
+ EventEntry{AioDiscardEvent(123, 456, false)},
+ &on_ready, &on_safe);
+
+ when_complete(mock_image_ctx, aio_comp, 0);
+ ASSERT_EQ(0, on_ready.wait());
+ ASSERT_EQ(0, on_safe.wait());
+ ASSERT_EQ(0, when_shut_down(mock_journal_replay, false));
+}
+
} // namespace journal
} // namespace librbd
MOCK_CONST_METHOD0(get_stripe_count, uint64_t());
MOCK_CONST_METHOD0(get_stripe_period, uint64_t());
+ MOCK_CONST_METHOD0(is_writeback_cache_enabled, bool());
+
ImageCtx *image_ctx;
CephContext *cct;
PerfCounters *perfcounter;
* authorizer, false otherwise.
*/
bool ms_verify_authorizer(Connection *con, int peer_type,
- int protocol, bufferlist& authorizer,
- bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override {
+ int protocol, bufferlist& authorizer,
+ bufferlist& authorizer_reply,
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
/* always succeed */
isvalid = true;
return true;
virtual bool ms_verify_authorizer(Connection *con, int peer_type,
int protocol, bufferlist& authorizer,
bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
/* always succeed */
isvalid = true;
return true;
bool ms_handle_refused(Connection *con) override { return false; }
bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
isvalid = true;
return true;
}
}
bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
isvalid = true;
return true;
}
bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
isvalid = true;
return true;
}
bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
isvalid = true;
return true;
}
}
bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
bufferlist& authorizer, bufferlist& authorizer_reply,
- bool& isvalid, CryptoKey& session_key) override {
+ bool& isvalid, CryptoKey& session_key,
+ std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
isvalid = true;
return true;
}
break;
default:
+ cout << "bad op " << op << std::endl;
assert(0 == "bad op");
}
return ok;
}
hobject_t *obj = entry->touch_obj(obj_id);
- dout(0) << "do_touch " << entry->m_coll.to_str() << "/" << obj->oid.name << dendl;
+ dout(0) << "do_touch " << entry->m_coll << "/" << obj << dendl;
- _do_touch(entry->m_coll, *obj);
+ _do_touch(entry, *obj);
return true;
}
hobject_t *obj = entry->touch_obj(obj_id);
ceph_assert(obj);
- dout(0) << "do_remove " << entry->m_coll.to_str() << "/" << obj->oid.name << dendl;
+ dout(0) << "do_remove " << entry->m_coll << "/" << obj << dendl;
- _do_remove(entry->m_coll, *obj);
+ _do_remove(entry, *obj);
hobject_t *rmobj = entry->remove_obj(obj_id);
ceph_assert(rmobj);
delete rmobj;
gen_attrs(gen, &out);
dout(0) << "do_set_attrs " << out.size() << " entries" << dendl;
- _do_set_attrs(entry->m_coll, *obj, out);
+ _do_set_attrs(entry, *obj, out);
return true;
}
bufferlist bl;
_gen_random(gen, size, bl);
- dout(0) << "do_write " << entry->m_coll.to_str() << "/" << obj->oid.name
+ dout(0) << "do_write " << entry->m_coll << "/" << obj
<< " 0~" << size << dendl;
- _do_write(entry->m_coll, *obj, 0, bl.length(), bl);
+ _do_write(entry, *obj, 0, bl.length(), bl);
return true;
}
-bool DeterministicOpSequence::_prepare_clone(rngen_t& gen,
- coll_t& coll_ret, hobject_t& orig_obj_ret, hobject_t& new_obj_ret)
+bool DeterministicOpSequence::_prepare_clone(
+ rngen_t& gen,
+ coll_entry_t **entry_ret,
+ int *orig_obj_id,
+ hobject_t *orig_obj_ret,
+ int *new_obj_id,
+ hobject_t *new_obj_ret)
{
int coll_id = _gen_coll_id(gen);
coll_entry_t *entry = get_coll_at(coll_id);
ceph_assert(entry != NULL);
- if (entry->m_objects.size() >= 2) {
- dout(0) << "_prepare_clone coll " << entry->m_coll.to_str()
+ if (entry->m_objects.size() < 2) {
+ dout(0) << "_prepare_clone coll " << entry->m_coll
<< " doesn't have 2 or more objects" << dendl;
return false;
}
- int orig_obj_id = entry->get_random_obj_id(gen);
- hobject_t *orig_obj = entry->touch_obj(orig_obj_id);
+ *orig_obj_id = entry->get_random_obj_id(gen);
+ hobject_t *orig_obj = entry->touch_obj(*orig_obj_id);
ceph_assert(orig_obj);
- int id;
do {
- id = entry->get_random_obj_id(gen);
- } while (id == orig_obj_id);
- hobject_t *new_obj = entry->touch_obj(id);
+ *new_obj_id = entry->get_random_obj_id(gen);
+ } while (*new_obj_id == *orig_obj_id);
+ hobject_t *new_obj = entry->touch_obj(*new_obj_id);
ceph_assert(new_obj);
- coll_ret = entry->m_coll;
- orig_obj_ret = *orig_obj;
- new_obj_ret = *new_obj;
-
+ *entry_ret = entry;
+ *orig_obj_ret = *orig_obj;
+ *new_obj_ret = *new_obj;
return true;
}
bool DeterministicOpSequence::do_clone(rngen_t& gen)
{
- coll_t coll;
+ coll_entry_t *entry;
+ int orig_id, new_id;
hobject_t orig_obj, new_obj;
- if (!_prepare_clone(gen, coll, orig_obj, new_obj)) {
+ if (!_prepare_clone(gen, &entry, &orig_id, &orig_obj, &new_id, &new_obj)) {
return false;
}
- dout(0) << "do_clone " << coll.to_str() << "/" << orig_obj.oid.name
- << " => " << coll.to_str() << "/" << new_obj.oid.name << dendl;
+ dout(0) << "do_clone " << entry->m_coll << "/" << orig_obj
+ << " => " << entry->m_coll << "/" << new_obj << dendl;
- _do_clone(coll, orig_obj, new_obj);
+ _do_clone(entry, orig_obj, new_obj);
return true;
}
bool DeterministicOpSequence::do_clone_range(rngen_t& gen)
{
- coll_t coll;
+ coll_entry_t *entry;
+ int orig_id, new_id;
hobject_t orig_obj, new_obj;
- if (!_prepare_clone(gen, coll, orig_obj, new_obj)) {
+ if (!_prepare_clone(gen, &entry, &orig_id, &orig_obj, &new_id, &new_obj)) {
return false;
}
boost::uniform_int<> clone_len(1, bl.length());
size = (size_t) clone_len(gen);
- dout(0) << "do_clone_range " << coll.to_str() << "/" << orig_obj.oid.name
+ dout(0) << "do_clone_range " << entry->m_coll << "/" << orig_obj
<< " (0~" << size << ")"
- << " => " << coll.to_str() << "/" << new_obj.oid.name
+ << " => " << entry->m_coll << "/" << new_obj
<< " (0)" << dendl;
- _do_write_and_clone_range(coll, orig_obj, new_obj, 0, size, 0, bl);
+ _do_write_and_clone_range(entry, orig_obj, new_obj, 0, size, 0, bl);
return true;
}
-bool DeterministicOpSequence::_prepare_colls(rngen_t& gen,
- coll_entry_t* &orig_coll, coll_entry_t* &new_coll)
-{
- ceph_assert(m_collections_ids.size() > 1);
- int orig_coll_id = _gen_coll_id(gen);
- int new_coll_id;
- do {
- new_coll_id = _gen_coll_id(gen);
- } while (new_coll_id == orig_coll_id);
-
- dout(0) << "_prepare_colls from coll id " << orig_coll_id
- << " to coll id " << new_coll_id << dendl;
-
- orig_coll = get_coll_at(orig_coll_id);
- ceph_assert(orig_coll != NULL);
- new_coll = get_coll_at(new_coll_id);
- ceph_assert(new_coll != NULL);
-
- if (!orig_coll->m_objects.size()) {
- dout(0) << "_prepare_colls coll " << orig_coll->m_coll.to_str()
- << " has no objects to use" << dendl;
- return false;
- }
-
- return true;
-}
-
-
bool DeterministicOpSequence::do_coll_move(rngen_t& gen)
{
- coll_entry_t *orig_coll = NULL, *new_coll = NULL;
- if (!_prepare_colls(gen, orig_coll, new_coll))
- return false;
-
- ceph_assert(orig_coll && new_coll);
-
- boost::uniform_int<> obj_rng(0, orig_coll->m_objects.size()-1);
- int obj_pos = obj_rng(gen);
- int obj_key = -1;
- hobject_t *obj = orig_coll->get_obj_at(obj_pos, &obj_key);
- if (!obj) {
- dout(0) << "do_coll_move coll " << orig_coll->m_coll.to_str()
- << " has no object as pos #" << obj_pos << " (key " << obj_key << ")"
- << dendl;
- return false;
- }
- if (new_coll->check_for_obj(obj_key)) {
- dout(0) << "do_coll_move coll " << orig_coll->m_coll.to_str()
- << " already has object as pos #" << obj_pos << " (key " << obj_key << ")"
- << dendl;
+ coll_entry_t *entry;
+ int orig_id, new_id;
+ hobject_t orig_obj, new_obj;
+ if (!_prepare_clone(gen, &entry, &orig_id, &orig_obj, &new_id, &new_obj)) {
return false;
}
- dout(0) << "do_coll_move " << orig_coll->m_coll.to_str() << "/" << obj->oid.name
- << " => " << new_coll->m_coll.to_str() << "/" << obj->oid.name << dendl;
- new_coll->touch_obj(obj_key);
- orig_coll->remove_obj(obj_key);
+ dout(0) << "do_coll_move " << entry->m_coll << "/" << orig_obj
+ << " => " << entry->m_coll << "/" << new_obj << dendl;
+ entry->remove_obj(orig_id);
- _do_coll_move(orig_coll->m_coll, new_coll->m_coll, *obj);
+ _do_coll_move(entry, orig_obj, new_obj);
return true;
}
bool DeterministicOpSequence::do_coll_create(rngen_t& gen)
{
- boost::uniform_int<> pg_num_range(0, 512);
- int pg_num = pg_num_range(gen);
-
- // Assume there is 7 OSDs in total, the PGs are evenly distributed across those OSDs
- int pgs = pg_num / 7;
-
- boost::uniform_int<> num_objs_range(1, 1024);
- int num_objs = num_objs_range(gen);
-
- int pool_id = get_next_pool_id();
- std::set<int> pg_created;
- for (int i = 0; i < pgs; i++) {
- boost::uniform_int<> pg_range(0, pg_num - 1);
- int pg_id = pg_range(gen);
- if (pg_created.count(pg_id) > 0)
- continue;
- _do_coll_create(coll_t(spg_t(pg_t(pg_id,pool_id),shard_id_t::NO_SHARD)),
- (uint32_t) pg_num, (uint64_t) num_objs);
- pg_created.insert(pg_id);
- }
+ int i = m_collections.size();
+ coll_entry_t *entry = coll_create(i);
+ m_collections.insert(make_pair(i, entry));
+ m_collections_ids.push_back(i);
+
+ _do_coll_create(entry, 10, 10);
+
return true;
}
-void DeterministicOpSequence::_do_coll_create(coll_t cid, uint32_t pg_num, uint64_t num_objs)
+void DeterministicOpSequence::_do_coll_create(coll_entry_t *entry, uint32_t pg_num, uint64_t num_objs)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.create_collection(cid, 32);
+ t.create_collection(entry->m_coll, 32);
bufferlist hint;
- ::encode(pg_num, hint);
- ::encode(num_objs, hint);
- t.collection_hint(cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
- dout(0) << "Give collection: " << cid << " a hint, pg_num is: " << pg_num << ", num_objs is: "
- << num_objs << dendl;
+ encode(pg_num, hint);
+ encode(num_objs, hint);
+ t.collection_hint(entry->m_coll, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
+ dout(0) << "Give collection: " << entry->m_coll
+ << " a hint, pg_num is: " << pg_num << ", num_objs is: "
+ << num_objs << dendl;
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_touch(coll_t coll, hobject_t& obj)
+void DeterministicOpSequence::_do_touch(coll_entry_t *entry, hobject_t& obj)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.touch(coll, ghobject_t(obj));
+ t.touch(entry->m_coll, ghobject_t(obj));
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_remove(coll_t coll, hobject_t& obj)
+void DeterministicOpSequence::_do_remove(coll_entry_t *entry, hobject_t& obj)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.remove(coll, ghobject_t(obj));
+ t.remove(entry->m_coll, ghobject_t(obj));
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_set_attrs(coll_t coll,
+void DeterministicOpSequence::_do_set_attrs(coll_entry_t *entry,
hobject_t &obj,
const map<string, bufferlist> &attrs)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.omap_setkeys(coll, ghobject_t(obj), attrs);
+ t.omap_setkeys(entry->m_coll, ghobject_t(obj), attrs);
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_write(coll_t coll, hobject_t& obj,
+void DeterministicOpSequence::_do_write(coll_entry_t *entry, hobject_t& obj,
uint64_t off, uint64_t len, const bufferlist& data)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.write(coll, ghobject_t(obj), off, len, data);
+ t.write(entry->m_coll, ghobject_t(obj), off, len, data);
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_clone(coll_t coll, hobject_t& orig_obj,
+void DeterministicOpSequence::_do_clone(coll_entry_t *entry, hobject_t& orig_obj,
hobject_t& new_obj)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.clone(coll, ghobject_t(orig_obj), ghobject_t(new_obj));
+ t.clone(entry->m_coll, ghobject_t(orig_obj), ghobject_t(new_obj));
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_clone_range(coll_t coll,
+void DeterministicOpSequence::_do_clone_range(coll_entry_t *entry,
hobject_t& orig_obj, hobject_t& new_obj, uint64_t srcoff,
uint64_t srclen, uint64_t dstoff)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj),
+ t.clone_range(entry->m_coll, ghobject_t(orig_obj), ghobject_t(new_obj),
srcoff, srclen, dstoff);
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_write_and_clone_range(coll_t coll,
+void DeterministicOpSequence::_do_write_and_clone_range(coll_entry_t *entry,
hobject_t& orig_obj,
hobject_t& new_obj,
uint64_t srcoff,
{
ObjectStore::Transaction t;
note_txn(&t);
- t.write(coll, ghobject_t(orig_obj), srcoff, bl.length(), bl);
- t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj),
+ t.write(entry->m_coll, ghobject_t(orig_obj), srcoff, bl.length(), bl);
+ t.clone_range(entry->m_coll, ghobject_t(orig_obj), ghobject_t(new_obj),
srcoff, srclen, dstoff);
m_store->apply_transaction(&m_osr, std::move(t));
}
-void DeterministicOpSequence::_do_coll_move(coll_t orig_coll, coll_t new_coll,
- hobject_t& obj)
+void DeterministicOpSequence::_do_coll_move(coll_entry_t *entry,
+ hobject_t& orig_obj,
+ hobject_t& new_obj)
{
ObjectStore::Transaction t;
note_txn(&t);
- t.remove(new_coll, ghobject_t(obj));
- t.collection_move_rename(orig_coll, ghobject_t(obj), new_coll, ghobject_t(obj));
+ t.remove(entry->m_coll, ghobject_t(new_obj));
+ t.collection_move_rename(entry->m_coll, ghobject_t(orig_obj),
+ entry->m_coll, ghobject_t(new_obj));
m_store->apply_transaction(&m_osr, std::move(t));
}
DSOP_CLONE = 2,
DSOP_CLONE_RANGE = 3,
DSOP_OBJ_REMOVE = 4,
- DSOP_COLL_MOVE = 6,
- DSOP_SET_ATTRS = 7,
- DSOP_COLL_CREATE = 8,
+ DSOP_COLL_MOVE = 5,
+ DSOP_SET_ATTRS = 6,
+ DSOP_COLL_CREATE = 7,
DSOP_FIRST = DSOP_TOUCH,
DSOP_LAST = DSOP_COLL_CREATE,
bool do_set_attrs(rngen_t& gen);
bool do_coll_create(rngen_t& gen);
- virtual void _do_touch(coll_t coll, hobject_t& obj);
- virtual void _do_remove(coll_t coll, hobject_t& obj);
- virtual void _do_write(coll_t coll, hobject_t& obj, uint64_t off,
+ virtual void _do_touch(coll_entry_t *entry, hobject_t& obj);
+ virtual void _do_remove(coll_entry_t *entry, hobject_t& obj);
+ virtual void _do_write(coll_entry_t *entry, hobject_t& obj, uint64_t off,
uint64_t len, const bufferlist& data);
- virtual void _do_set_attrs(coll_t coll,
+ virtual void _do_set_attrs(coll_entry_t *entry,
hobject_t &obj,
const map<string, bufferlist> &attrs);
- virtual void _do_clone(coll_t coll, hobject_t& orig_obj, hobject_t& new_obj);
- virtual void _do_clone_range(coll_t coll, hobject_t& orig_obj,
+ virtual void _do_clone(coll_entry_t *entry, hobject_t& orig_obj, hobject_t& new_obj);
+ virtual void _do_clone_range(coll_entry_t *entry, hobject_t& orig_obj,
hobject_t& new_obj, uint64_t srcoff, uint64_t srclen, uint64_t dstoff);
- virtual void _do_write_and_clone_range(coll_t coll, hobject_t& orig_obj,
+ virtual void _do_write_and_clone_range(coll_entry_t *entry, hobject_t& orig_obj,
hobject_t& new_obj, uint64_t srcoff, uint64_t srclen,
uint64_t dstoff, bufferlist& bl);
- virtual void _do_coll_move(coll_t orig_coll, coll_t new_coll, hobject_t& obj);
- virtual void _do_coll_create(coll_t cid, uint32_t pg_num, uint64_t num_objs);
+ virtual void _do_coll_move(coll_entry_t *entry, hobject_t& orig_obj, hobject_t& new_obj);
+ virtual void _do_coll_create(coll_entry_t *entry, uint32_t pg_num, uint64_t num_objs);
int _gen_coll_id(rngen_t& gen);
int _gen_obj_id(rngen_t& gen);
void _print_status(int seq, int op);
private:
- bool _prepare_clone(rngen_t& gen, coll_t& coll_ret,
- hobject_t& orig_obj_ret, hobject_t& new_obj_ret);
- bool _prepare_colls(rngen_t& gen,
- coll_entry_t* &orig_coll, coll_entry_t* &new_coll);
+ bool _prepare_clone(
+ rngen_t& gen,
+ coll_entry_t **entry_ret,
+ int *orig_obj_id,
+ hobject_t *orig_obj_ret,
+ int *new_obj_id,
+ hobject_t *new_obj_ret);
};
std::map<std::string, bufferptr>::iterator a_it = a.begin();
for (; b_it != b.end(); ++b_it, ++a_it) {
if (b_it->first != a_it->first) {
- dout(0) << "diff_attrs name mismatch (verify: " << b_it->first
- << ", store: " << a_it->first << ")" << dendl;
+ cout << "diff_attrs name mismatch (verify: " << b_it->first
+ << ", store: " << a_it->first << ")" << std::endl;
ret = true;
continue;
}
if (!b_it->second.cmp(a_it->second)) {
- dout(0) << "diff_attrs contents mismatch on attr " << b_it->first << dendl;
+ cout << "diff_attrs contents mismatch on attr " << b_it->first << std::endl;
ret = true;
continue;
}
std::map<std::string, bufferlist>::iterator b_it = b.begin();
std::map<std::string, bufferlist>::iterator a_it = a.begin();
for (; b_it != b.end(); ++b_it, ++a_it) {
+ if (a_it == a.end()) {
+ cout << __func__ << " a reached end before b, a missing " << b_it->first
+ << std::endl;
+ ret = true;
+ break;
+ }
if (b_it->first != a_it->first) {
- dout(0) << "diff_attrs name mismatch (verify: " << b_it->first
- << ", store: " << a_it->first << ")" << dendl;
+ cout << "diff_attrs name mismatch (verify: " << b_it->first
+ << ", store: " << a_it->first << ")" << std::endl;
ret = true;
continue;
}
if (!(b_it->second == a_it->second)) {
- dout(0) << "diff_attrs contents mismatch on attr " << b_it->first << dendl;
+ cout << "diff_attrs contents mismatch on attr " << b_it->first << std::endl;
ret = true;
continue;
}
bool ret = false;
if (a.st_uid != b.st_uid) {
- dout(0) << "diff_objects_stat uid mismatch (A: "
- << a.st_uid << " != B: " << b.st_uid << ")" << dendl;
+ cout << "diff_objects_stat uid mismatch (A: "
+ << a.st_uid << " != B: " << b.st_uid << ")" << std::endl;
ret = true;
}
if (a.st_gid != b.st_gid) {
- dout(0) << "diff_objects_stat gid mismatch (A: "
- << a.st_gid << " != B: " << b.st_gid << ")" << dendl;
+ cout << "diff_objects_stat gid mismatch (A: "
+ << a.st_gid << " != B: " << b.st_gid << ")" << std::endl;
ret = true;
}
if (a.st_mode != b.st_mode) {
- dout(0) << "diff_objects_stat mode mismatch (A: "
- << a.st_mode << " != B: " << b.st_mode << ")" << dendl;
+ cout << "diff_objects_stat mode mismatch (A: "
+ << a.st_mode << " != B: " << b.st_mode << ")" << std::endl;
ret = true;
}
if (a.st_nlink != b.st_nlink) {
- dout(0) << "diff_objects_stat nlink mismatch (A: "
- << a.st_nlink << " != B: " << b.st_nlink << ")" << dendl;
+ cout << "diff_objects_stat nlink mismatch (A: "
+ << a.st_nlink << " != B: " << b.st_nlink << ")" << std::endl;
ret = true;
}
if (a.st_size != b.st_size) {
- dout(0) << "diff_objects_stat size mismatch (A: "
- << a.st_size << " != B: " << b.st_size << ")" << dendl;
+ cout << "diff_objects_stat size mismatch (A: "
+ << a.st_size << " != B: " << b.st_size << ")" << std::endl;
ret = true;
}
return ret;
bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t coll)
{
- dout(2) << __func__ << " coll " << coll << dendl;
-
bool ret = false;
int err;
err = b_store->collection_list(coll, ghobject_t(), ghobject_t::get_max(),
INT_MAX, &b_objects, NULL);
if (err < 0) {
- dout(0) << "diff_objects list on verify coll " << coll.to_str()
- << " returns " << err << dendl;
+ cout << "diff_objects list on verify coll " << coll.to_str()
+ << " returns " << err << std::endl;
return true;
}
err = a_store->collection_list(coll, ghobject_t(), ghobject_t::get_max(),
INT_MAX, &a_objects, NULL);
if (err < 0) {
- dout(0) << "diff_objects list on store coll " << coll.to_str()
- << " returns " << err << dendl;
+ cout << "diff_objects list on store coll " << coll.to_str()
+ << " returns " << err << std::endl;
return true;
}
if (b_objects.size() != a_objects.size()) {
- dout(0) << "diff_objects num objs mismatch (A: " << a_objects.size()
- << ", B: " << b_objects.size() << ")" << dendl;
+ cout << "diff_objects num objs mismatch (A: " << a_objects.size()
+ << ", B: " << b_objects.size() << ")" << std::endl;
ret = true;
}
for (; b_it != b_objects.end(); ++b_it, ++a_it) {
ghobject_t b_obj = *b_it, a_obj = *a_it;
if (b_obj.hobj.oid.name != a_obj.hobj.oid.name) {
- dout(0) << "diff_objects name mismatch on A object "
+ cout << "diff_objects name mismatch on A object "
<< coll << "/" << a_obj << " and B object "
- << coll << "/" << b_obj << dendl;
+ << coll << "/" << b_obj << std::endl;
ret = true;
continue;
}
struct stat b_stat, a_stat;
err = b_store->stat(coll, b_obj, &b_stat);
if (err < 0) {
- dout(0) << "diff_objects error stating B object "
- << coll.to_str() << "/" << b_obj.hobj.oid.name << dendl;
+ cout << "diff_objects error stating B object "
+ << coll.to_str() << "/" << b_obj.hobj.oid.name << std::endl;
ret = true;
}
err = a_store->stat(coll, a_obj, &a_stat);
if (err < 0) {
- dout(0) << "diff_objects error stating A object "
- << coll << "/" << a_obj << dendl;
+ cout << "diff_objects error stating A object "
+ << coll << "/" << a_obj << std::endl;
ret = true;
}
if (diff_objects_stat(a_stat, b_stat)) {
- dout(0) << "diff_objects stat mismatch on "
- << coll << "/" << b_obj << dendl;
+ cout << "diff_objects stat mismatch on "
+ << coll << "/" << b_obj << std::endl;
ret = true;
}
a_store->read(coll, a_obj, 0, a_stat.st_size, a_obj_bl);
if (!a_obj_bl.contents_equal(b_obj_bl)) {
- dout(0) << "diff_objects content mismatch on "
- << coll << "/" << b_obj << dendl;
+ cout << "diff_objects content mismatch on "
+ << coll << "/" << b_obj << std::endl;
ret = true;
}
std::map<std::string, bufferptr> a_obj_attrs_map, b_obj_attrs_map;
err = a_store->getattrs(coll, a_obj, a_obj_attrs_map);
if (err < 0) {
- dout(0) << "diff_objects getattrs on A object " << coll << "/" << a_obj
- << " returns " << err << dendl;
+ cout << "diff_objects getattrs on A object " << coll << "/" << a_obj
+ << " returns " << err << std::endl;
ret = true;
}
err = b_store->getattrs(coll, b_obj, b_obj_attrs_map);
if (err < 0) {
- dout(0) << "diff_objects getattrs on B object " << coll << "/" << b_obj
- << "returns " << err << dendl;
+ cout << "diff_objects getattrs on B object " << coll << "/" << b_obj
+ << "returns " << err << std::endl;
ret = true;
}
if (diff_attrs(b_obj_attrs_map, a_obj_attrs_map)) {
- dout(0) << "diff_objects attrs mismatch on A object "
+ cout << "diff_objects attrs mismatch on A object "
<< coll << "/" << a_obj << " and B object "
- << coll << "/" << b_obj << dendl;
+ << coll << "/" << b_obj << std::endl;
ret = true;
}
std::set<std::string> a_omap_keys, b_omap_keys;
err = a_store->omap_get_keys(coll, a_obj, &a_omap_keys);
if (err < 0) {
- dout(0) << "diff_objects getomap on A object " << coll << "/" << a_obj
- << " returns " << err << dendl;
+ cout << "diff_objects getomap on A object " << coll << "/" << a_obj
+ << " returns " << err << std::endl;
ret = true;
}
err = a_store->omap_get_values(coll, a_obj, a_omap_keys, &a_obj_omap);
if (err < 0) {
- dout(0) << "diff_objects getomap on A object " << coll << "/" << a_obj
- << " returns " << err << dendl;
+ cout << "diff_objects getomap on A object " << coll << "/" << a_obj
+ << " returns " << err << std::endl;
ret = true;
}
err = b_store->omap_get_keys(coll, b_obj, &b_omap_keys);
if (err < 0) {
- dout(0) << "diff_objects getomap on A object " << coll << "/" << b_obj
- << " returns " << err << dendl;
+ cout << "diff_objects getomap on A object " << coll << "/" << b_obj
+ << " returns " << err << std::endl;
ret = true;
}
err = b_store->omap_get_values(coll, b_obj, b_omap_keys, &b_obj_omap);
if (err < 0) {
- dout(0) << "diff_objects getomap on A object " << coll << "/" << b_obj
- << " returns " << err << dendl;
+ cout << "diff_objects getomap on A object " << coll << "/" << b_obj
+ << " returns " << err << std::endl;
ret = true;
}
if (diff_omap(a_obj_omap, b_obj_omap)) {
- dout(0) << "diff_objects omap mismatch on A object "
+ cout << "diff_objects omap mismatch on A object "
<< coll << "/" << a_obj << " and B object "
- << coll << "/" << b_obj << dendl;
+ << coll << "/" << b_obj << std::endl;
+ cout << "a: " << a_obj_omap << std::endl;
+ cout << "b: " << b_obj_omap << std::endl;
ret = true;
}
}
for (; it != b_coll_list.end(); ++it) {
coll_t b_coll = *it;
if (!a_store->collection_exists(b_coll)) {
- dout(0) << "diff B coll " << b_coll.to_str() << " DNE on A" << dendl;
+ cout << "diff B coll " << b_coll.to_str() << " DNE on A" << std::endl;
ret = true;
continue;
}
}
for (std::vector<coll_t>::iterator it = a_coll_list.begin();
it != a_coll_list.end(); ++it) {
- dout(0) << "diff A coll " << *it << " DNE on B" << dendl;
+ cout << "diff A coll " << *it << " DNE on B" << std::endl;
ret = true;
}
public:
explicit TestObjectStoreState(ObjectStore *store) :
m_next_coll_nr(0), m_num_objs_per_coll(10), m_num_objects(0),
- m_max_in_flight(0), m_finished_lock("Finished Lock"), m_next_pool(1) {
+ m_max_in_flight(0), m_finished_lock("Finished Lock"), m_next_pool(2) {
m_store.reset(store);
}
~TestObjectStoreState() {
echo
}
+echo $0 $*
+
die_on_missing_arg() {
if [[ "$2" == "" ]]; then
echo "$1: missing required parameter"
$tmp_name_a $tmp_name_a/journal \
--test-seed $seed --osd-journal-size 100 \
--filestore-kill-at $killat $tmp_opts_a \
- --log-file $tmp_name_a.fail --debug-filestore 20 || true
+ --log-file $tmp_name_a.fail --debug-filestore 20 --no-log-to-stderr || true
stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \
$tmp_name_a $tmp_name_a/journal \
--log-file $tmp_name_a.recover \
- --debug-filestore 20 --debug-journal 20`
+ --debug-filestore 20 --debug-journal 20 --no-log-to-stderr`
if [[ "`expr $stop_at - $stop_at 2>/dev/null`" != "0" ]]; then
echo "error: get-last-op returned '$stop_at'"
$v ceph_test_filestore_idempotent_sequence run-sequence-to \
$stop_at $tmp_name_b $tmp_name_b/journal \
--test-seed $seed --osd-journal-size 100 \
- --log-file $tmp_name_b.clean --debug-filestore 20 $tmp_opts_b
+ --log-file $tmp_name_b.clean --debug-filestore 20 --no-log-to-stderr \
+ $tmp_opts_b
if $v ceph_test_filestore_idempotent_sequence diff \
- $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal ; then
+ $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal --no-log-to-stderr --log-file $tmp_name_a.diff.log --debug-filestore 20 ; then
echo OK
else
echo "FAIL"
#!/bin/sh
+set -x
set -e
seed=$1
for f in `seq $from $to`
do
- if ! $mydir/run_seed_to.sh $seed $f; then
+ if ! $mydir/run_seed_to.sh -o 10 -e $seed $f; then
if [ -d "$dir" ]; then
echo copying evidence to $dir
cp -a . $dir
fi
exit 1
fi
-done
\ No newline at end of file
+done
eq(self.object.read(3), b'bar')
eq(self.object.read(3), b'baz')
+class TestIoCtxSelfManagedSnaps(object):
+ def setUp(self):
+ self.rados = Rados(conffile='')
+ self.rados.connect()
+ self.rados.create_pool('test_pool')
+ assert self.rados.pool_exists('test_pool')
+ self.ioctx = self.rados.open_ioctx('test_pool')
+
+ def tearDown(self):
+ cmd = {"prefix":"osd unset", "key":"noup"}
+ self.rados.mon_command(json.dumps(cmd), b'')
+ self.ioctx.close()
+ self.rados.delete_pool('test_pool')
+ self.rados.shutdown()
+
+ def test(self):
+ # cannot mix-and-match pool and self-managed snapshot mode
+ self.ioctx.set_self_managed_snap_write([])
+ self.ioctx.write('abc', b'abc')
+ snap_id_1 = self.ioctx.create_self_managed_snap()
+ self.ioctx.set_self_managed_snap_write([snap_id_1])
+
+ self.ioctx.write('abc', b'def')
+ snap_id_2 = self.ioctx.create_self_managed_snap()
+ self.ioctx.set_self_managed_snap_write([snap_id_1, snap_id_2])
+
+ self.ioctx.write('abc', b'ghi')
+
+ self.ioctx.rollback_self_managed_snap('abc', snap_id_1)
+ eq(self.ioctx.read('abc'), b'abc')
+
+ self.ioctx.rollback_self_managed_snap('abc', snap_id_2)
+ eq(self.ioctx.read('abc'), b'def')
+
+ self.ioctx.remove_self_managed_snap(snap_id_1)
+ self.ioctx.remove_self_managed_snap(snap_id_2)
+
class TestCommand(object):
def setUp(self):
ASSERT_EQ(-EREMOTEIO, ctx.wait());
}
+TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteNotTagOwner) {
+ create_local_image();
+
+ InSequence seq;
+
+ // lookup remote image tag class
+ cls::journal::Client client;
+ librbd::journal::ClientData client_data{
+ librbd::journal::ImageClientMeta{123}};
+ encode(client_data, client.data);
+ ::journal::MockJournaler mock_journaler;
+ expect_journaler_get_client(mock_journaler,
+ librbd::Journal<>::IMAGE_CLIENT_ID,
+ client, 0);
+
+ // open the remote image
+ librbd::MockJournal mock_journal;
+ librbd::MockTestImageCtx mock_remote_image_ctx(*m_remote_image_ctx);
+ MockOpenImageRequest mock_open_image_request;
+ expect_open_image(mock_open_image_request, m_remote_io_ctx,
+ mock_remote_image_ctx.id, mock_remote_image_ctx, 0);
+
+ // test if remote image is primary
+ MockIsPrimaryRequest mock_is_primary_request;
+ expect_is_primary(mock_is_primary_request, false, 0);
+
+ // open the local image
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
+ mock_local_image_ctx.journal = &mock_journal;
+ MockOpenLocalImageRequest mock_open_local_image_request;
+ expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
+ expect_is_resync_requested(mock_journal, false, 0);
+
+ expect_journal_get_tag_tid(mock_journal, 345);
+ expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::LOCAL_MIRROR_UUID,
+ librbd::Journal<>::ORPHAN_MIRROR_UUID,
+ true, 344, 0});
+
+ MockCloseImageRequest mock_close_image_request;
+ expect_close_image(mock_close_image_request, mock_local_image_ctx, 0);
+ expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
+
+ C_SaferCond ctx;
+ MockInstanceWatcher mock_instance_watcher;
+ cls::journal::ClientState client_state = cls::journal::CLIENT_STATE_CONNECTED;
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
+ mock_local_image_ctx.id};
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+ MockBootstrapRequest *request = create_request(
+ &mock_instance_watcher, mock_journaler, mock_local_image_ctx.id,
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &client_state, &mirror_peer_client_meta, &ctx);
+ request->send();
+ ASSERT_EQ(-EREMOTEIO, ctx.wait());
+}
+
TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
create_local_image();
// test if remote image is primary
MockIsPrimaryRequest mock_is_primary_request;
- expect_is_primary(mock_is_primary_request, true, 0);
+ expect_is_primary(mock_is_primary_request, false, 0);
// open the local image
librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
+ expect_journal_get_tag_tid(mock_journal, 345);
+ expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
+
// remote demotion / promotion event
Tags tags = {
{2, 123, encode_tag_data({librbd::Journal<>::LOCAL_MIRROR_UUID,
true, 4, 369})}
};
expect_journaler_get_tags(mock_journaler, 123, tags, 0);
- expect_journal_get_tag_tid(mock_journal, 345);
- expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
MockCloseImageRequest mock_close_image_request;
expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
+ expect_journal_get_tag_tid(mock_journal, 345);
+ expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::ORPHAN_MIRROR_UUID,
+ "remote mirror uuid", true, 4, 1});
+
// remote demotion / promotion event
Tags tags = {
{2, 123, encode_tag_data({librbd::Journal<>::LOCAL_MIRROR_UUID,
true, 7, 1})}
};
expect_journaler_get_tags(mock_journaler, 123, tags, 0);
- expect_journal_get_tag_tid(mock_journal, 345);
- expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::ORPHAN_MIRROR_UUID,
- "remote mirror uuid", true, 4, 1});
MockCloseImageRequest mock_close_image_request;
expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
+ expect_journal_get_tag_tid(mock_journal, 346);
+ expect_journal_get_tag_data(mock_journal,
+ {librbd::Journal<>::ORPHAN_MIRROR_UUID,
+ librbd::Journal<>::LOCAL_MIRROR_UUID,
+ true, 345, 1});
+
// remote demotion / promotion event
Tags tags = {
{2, 123, encode_tag_data({"local mirror uuid", "local mirror uuid",
true, 3, 1})}
};
expect_journaler_get_tags(mock_journaler, 123, tags, 0);
- expect_journal_get_tag_tid(mock_journal, 346);
- expect_journal_get_tag_data(mock_journal,
- {librbd::Journal<>::ORPHAN_MIRROR_UUID,
- librbd::Journal<>::LOCAL_MIRROR_UUID,
- true, 345, 1});
MockCloseImageRequest mock_close_image_request;
expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
+ expect_journal_get_tag_tid(mock_journal, 345);
+ expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::LOCAL_MIRROR_UUID,
+ librbd::Journal<>::ORPHAN_MIRROR_UUID,
+ true, 344, 0});
+
// remote demotion / promotion event
Tags tags = {
{2, 123, encode_tag_data({librbd::Journal<>::LOCAL_MIRROR_UUID,
true, 2, 1})}
};
expect_journaler_get_tags(mock_journaler, 123, tags, 0);
- expect_journal_get_tag_tid(mock_journal, 345);
- expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::LOCAL_MIRROR_UUID,
- librbd::Journal<>::ORPHAN_MIRROR_UUID,
- true, 344, 0});
MockCloseImageRequest mock_close_image_request;
expect_close_image(mock_close_image_request, mock_local_image_ctx, 0);
// resync is requested
expect_is_resync_requested(mock_journal, true, 0);
+ expect_journal_get_tag_tid(mock_journal, 345);
+ expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
MockCloseImageRequest mock_close_image_request;
expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
+ expect_journal_get_tag_tid(mock_journal, 345);
+ expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
+
// sync the remote image to the local image
MockImageSync mock_image_sync;
expect_image_sync(mock_image_sync, 0);
mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
+ expect_journal_get_tag_tid(mock_journal, 345);
+ expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
+
// sync the remote image to the local image
MockImageSync mock_image_sync;
expect_image_sync(mock_image_sync, 0);
}));
}
+ void expect_dir_get_name(librados::IoCtx &io_ctx,
+ const std::string &image_name, int r) {
+ bufferlist bl;
+ encode(image_name, bl);
+
+ EXPECT_CALL(get_mock_io_ctx(io_ctx),
+ exec(RBD_DIRECTORY, _, StrEq("rbd"), StrEq("dir_get_name"), _, _, _))
+ .WillOnce(DoAll(WithArg<5>(Invoke([bl](bufferlist *out_bl) {
+ *out_bl = bl;
+ })),
+ Return(r)));
+ }
+
void expect_mirror_image_get(librados::IoCtx &io_ctx,
cls::rbd::MirrorImageState state,
const std::string &global_id, int r) {
MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
0);
+ expect_dir_get_name(m_local_io_ctx, "local image name", 0);
expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
"global image id", 0);
expect_get_tag_owner(mock_journal, "local image id", "remote mirror uuid", 0);
std::string local_image_id;
+ std::string local_image_name;
std::string tag_owner;
C_SaferCond ctx;
auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
"global image id",
&local_image_id,
+ &local_image_name,
&tag_owner,
m_threads->work_queue,
&ctx);
ASSERT_EQ(0, ctx.wait());
ASSERT_EQ(std::string("local image id"), local_image_id);
+ ASSERT_EQ(std::string("local image name"), local_image_name);
ASSERT_EQ(std::string("remote mirror uuid"), tag_owner);
}
expect_get_mirror_image_id(mock_get_mirror_image_id_request, "", -EINVAL);
std::string local_image_id;
+ std::string local_image_name;
std::string tag_owner;
C_SaferCond ctx;
auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
"global image id",
&local_image_id,
+ &local_image_name,
&tag_owner,
m_threads->work_queue,
&ctx);
ASSERT_EQ(-EINVAL, ctx.wait());
}
+TEST_F(TestMockImageReplayerPrepareLocalImageRequest, DirGetNameError) {
+ InSequence seq;
+ MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
+ expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
+ 0);
+ expect_dir_get_name(m_local_io_ctx, "", -ENOENT);
+
+ std::string local_image_id;
+ std::string local_image_name;
+ std::string tag_owner;
+ C_SaferCond ctx;
+ auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
+ "global image id",
+ &local_image_id,
+ &local_image_name,
+ &tag_owner,
+ m_threads->work_queue,
+ &ctx);
+ req->send();
+
+ ASSERT_EQ(-ENOENT, ctx.wait());
+}
+
TEST_F(TestMockImageReplayerPrepareLocalImageRequest, MirrorImageError) {
InSequence seq;
MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
0);
+ expect_dir_get_name(m_local_io_ctx, "local image name", 0);
expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLED,
"", -EINVAL);
std::string local_image_id;
+ std::string local_image_name;
std::string tag_owner;
C_SaferCond ctx;
auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
"global image id",
&local_image_id,
+ &local_image_name,
&tag_owner,
m_threads->work_queue,
&ctx);
MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
0);
+ expect_dir_get_name(m_local_io_ctx, "local image name", 0);
expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
"global image id", 0);
-ENOENT);
std::string local_image_id;
+ std::string local_image_name;
std::string tag_owner;
C_SaferCond ctx;
auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
"global image id",
&local_image_id,
+ &local_image_name,
&tag_owner,
m_threads->work_queue,
&ctx);
TestImageReplayer()
: m_local_cluster(new librados::Rados()), m_watch_handle(0)
{
+ EXPECT_EQ(0, g_ceph_context->_conf->set_val("rbd_mirror_journal_commit_age",
+ "0.1"));
+
EXPECT_EQ("", connect_cluster_pp(*m_local_cluster.get()));
EXPECT_EQ(0, m_local_cluster->conf_set("rbd_cache", "false"));
EXPECT_EQ(0, m_local_cluster->conf_set("rbd_mirror_journal_poll_age", "1"));
EXPECT_EQ(0, m_remote_cluster.pool_delete(m_remote_pool_name.c_str()));
EXPECT_EQ(0, m_local_cluster->pool_delete(m_local_pool_name.c_str()));
+ EXPECT_EQ(0, g_ceph_context->_conf->set_val("rbd_mirror_journal_commit_age",
+ "5"));
}
template <typename ImageReplayerT = rbd::mirror::ImageReplayer<> >
cls::journal::ObjectPosition mirror_position;
for (int i = 0; i < 100; i++) {
- printf("m_replayer->flush()\n");
- C_SaferCond cond;
- m_replayer->flush(&cond);
- ASSERT_EQ(0, cond.wait());
get_commit_positions(&master_position, &mirror_position);
if (master_position == mirror_position) {
break;
struct PrepareLocalImageRequest<librbd::MockTestImageCtx> {
static PrepareLocalImageRequest* s_instance;
std::string *local_image_id = nullptr;
+ std::string *local_image_name = nullptr;
std::string *tag_owner = nullptr;
Context *on_finish = nullptr;
static PrepareLocalImageRequest* create(librados::IoCtx &,
const std::string &global_image_id,
std::string *local_image_id,
+ std::string *local_image_name,
std::string *tag_owner,
MockContextWQ *work_queue,
Context *on_finish) {
assert(s_instance != nullptr);
s_instance->local_image_id = local_image_id;
+ s_instance->local_image_name = local_image_name;
s_instance->tag_owner = tag_owner;
s_instance->on_finish = on_finish;
return s_instance;
void expect_send(MockPrepareLocalImageRequest &mock_request,
const std::string &local_image_id,
+ const std::string &local_image_name,
const std::string &tag_owner,
int r) {
EXPECT_CALL(mock_request, send())
- .WillOnce(Invoke([&mock_request, local_image_id, tag_owner, r]() {
+ .WillOnce(Invoke([&mock_request, local_image_id, local_image_name, tag_owner, r]() {
if (r == 0) {
*mock_request.local_image_id = local_image_id;
+ *mock_request.local_image_name = local_image_name;
*mock_request.tag_owner = tag_owner;
}
mock_request.on_finish->complete(r);
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "", 0);
+ mock_local_image_ctx.name, "", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
"remote image id", 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
- expect_send(mock_prepare_local_image_request, "", "", -ENOENT);
+ expect_send(mock_prepare_local_image_request, "", "", "", -ENOENT);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", -EINVAL);
+ mock_local_image_ctx.name, "remote mirror uuid", -EINVAL);
create_image_replayer(mock_threads, mock_image_deleter);
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
"", -ENOENT);
expect_schedule_image_delete(mock_image_deleter, "global image id", false);
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "some other mirror uuid", 0);
+ mock_local_image_ctx.name, "some other mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
"", -ENOENT);
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, -EINVAL);
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
InSequence seq;
expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
- "remote mirror uuid", 0);
+ mock_local_image_ctx.name, "remote mirror uuid", 0);
expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
m_remote_image_ctx->id, 0);
EXPECT_CALL(mock_remote_journaler, construct());
if target_zone == source_zone:
return None
- cmd = ['bucket', 'sync', 'status'] + target_zone.zone_args()
+ cmd = ['bucket', 'sync', 'markers'] + target_zone.zone_args()
cmd += ['--source-zone', source_zone.name]
cmd += ['--bucket', bucket_name]
while True:
assert(retcode == 2) # ENOENT
bucket_sync_status_json = bucket_sync_status_json.decode('utf-8')
- log.debug('current bucket sync status=%s', bucket_sync_status_json)
+ log.debug('current bucket sync markers=%s', bucket_sync_status_json)
sync_status = json.loads(bucket_sync_status_json)
markers={}
time.sleep(config.checkpoint_delay)
- assert False, 'finished bucket checkpoint for target_zone=%s source_zone=%s bucket=%s' % \
+ assert False, 'failed bucket checkpoint for target_zone=%s source_zone=%s bucket=%s' % \
(target_zone.name, source_zone.name, bucket_name)
def zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name):
if source_conn.zone == target_conn.zone:
continue
zone_bucket_checkpoint(target_conn.zone, source_conn.zone, bucket_name)
- target_conn.check_bucket_eq(source_conn, bucket_name)
+ for source_conn, target_conn in combinations(zonegroup_conns.zones, 2):
+ target_conn.check_bucket_eq(source_conn, bucket_name)
def set_master_zone(zone):
zone.modify(zone.cluster, ['--master'])
log.debug('version3 id=%s', v.version_id)
k.bucket.delete_key(obj, version_id=v.version_id)
- for source_conn, bucket in zone_bucket:
- for target_conn in zonegroup_conns.zones:
- if source_conn.zone == target_conn.zone:
- continue
- zone_bucket_checkpoint(target_conn.zone, source_conn.zone, bucket.name)
- check_bucket_eq(source_conn, target_conn, bucket)
+ for _, bucket in zone_bucket:
+ zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name)
def test_bucket_versioning():
buckets, zone_bucket = create_bucket_per_zone_in_realm()
using rgw::IAM::s3ListAllMyBuckets;
using rgw::IAM::s3ListBucket;
using rgw::IAM::s3ListBucket;
-using rgw::IAM::s3ListBucketMultiPartUploads;
+using rgw::IAM::s3ListBucketMultipartUploads;
using rgw::IAM::s3ListBucketVersions;
using rgw::IAM::s3ListMultipartUploadParts;
using rgw::IAM::s3None;
EXPECT_EQ(p->statements[2].action, (s3ListMultipartUploadParts |
s3ListBucket | s3ListBucketVersions |
s3ListAllMyBuckets |
- s3ListBucketMultiPartUploads |
+ s3ListBucketMultipartUploads |
s3GetObject | s3GetObjectVersion |
s3GetObjectAcl | s3GetObjectVersionAcl |
s3GetObjectTorrent |
auto s3allow = (s3ListMultipartUploadParts | s3ListBucket |
s3ListBucketVersions | s3ListAllMyBuckets |
- s3ListBucketMultiPartUploads | s3GetObject |
+ s3ListBucketMultipartUploads | s3GetObject |
s3GetObjectVersion | s3GetObjectAcl | s3GetObjectVersionAcl |
s3GetObjectTorrent | s3GetObjectVersionTorrent |
s3GetAccelerateConfiguration | s3GetBucketAcl |
bufferlist bl;
le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
-
+
cout << "writing EResetJournal entry" << std::endl;
- C_SaferCond cond;
journaler->append_entry(bl);
- journaler->flush(&cond);
- return cond.wait();
+ int ret;
+ {
+ C_SaferCond cond;
+ journaler->flush(&cond);
+ ret = cond.wait();
+ if (ret < 0)
+ return ret;
+ }
+ {
+ // wait until all journal prezero ops are done
+ C_SaferCond cond;
+ journaler->wait_for_prezero(&cond);
+ cond.wait();
+ }
+
+ return ret;
}
#define RBD_DIFF_ZERO 'z'
#define RBD_DIFF_END 'e'
+#define RBD_SNAP_PROTECTION_STATUS 'p'
+
#define RBD_EXPORT_IMAGE_ORDER 'O'
#define RBD_EXPORT_IMAGE_FEATURES 'T'
#define RBD_EXPORT_IMAGE_STRIPE_UNIT 'U'
#define RBD_EXPORT_IMAGE_STRIPE_COUNT 'C'
+#define RBD_EXPORT_IMAGE_META 'M'
#define RBD_EXPORT_IMAGE_END 'E'
enum SnapshotPresence {
::encode(tag, bl);
std::string to(endsnapname);
if (export_format == 2) {
- len = to.length() + 4;
- ::encode(len, bl);
+ len = to.length() + 4;
+ ::encode(len, bl);
}
::encode(to, bl);
}
+ if (endsnapname && export_format == 2) {
+ tag = RBD_SNAP_PROTECTION_STATUS;
+ encode(tag, bl);
+ bool is_protected = false;
+ r = image.snap_is_protected(endsnapname, &is_protected);
+ if (r < 0) {
+ return r;
+ }
+ len = 8;
+ encode(len, bl);
+ encode(is_protected, bl);
+ }
+
tag = RBD_DIFF_IMAGE_SIZE;
::encode(tag, bl);
uint64_t endsize = info.size;
int m_fd;
};
+const uint32_t MAX_KEYS = 64;
+
static int do_export_v2(librbd::Image& image, librbd::image_info_t &info, int fd,
uint64_t period, int max_concurrent_ops, utils::ProgressContext &pc)
{
::encode(length, bl);
::encode(stripe_count, bl);
+  // retrieve all image metadata key/value pairs, paging through
+  // metadata_list() in chunks of MAX_KEYS
+ std::map<std::string, string> imagemetas;
+ std::string last_key;
+ bool more_results = true;
+ while (more_results) {
+ std::map<std::string, bufferlist> pairs;
+ r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+ if (r < 0) {
+ std::cerr << "failed to retrieve metadata of image : " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ if (!pairs.empty()) {
+ last_key = pairs.rbegin()->first;
+
+ for (auto kv : pairs) {
+ std::string key = kv.first;
+ std::string val(kv.second.c_str(), kv.second.length());
+ imagemetas[key] = val;
+ }
+ }
+ more_results = (pairs.size() == MAX_KEYS);
+ }
+
+  // encode each image-meta key/value pair into the export header
+ for (std::map<std::string, string>::iterator it = imagemetas.begin();
+ it != imagemetas.end(); ++it) {
+ string key = it->first;
+ string value = it->second;
+
+ tag = RBD_EXPORT_IMAGE_META;
+ length = key.length() + value.length() + 4 * 2;
+ ::encode(tag, bl);
+ ::encode(length, bl);
+ ::encode(key, bl);
+ ::encode(value, bl);
+ }
+
// encode end tag
tag = RBD_EXPORT_IMAGE_END;
::encode(tag, bl);
string from;
r = utils::read_string(idiffctx->fd, 4096, &from); // 4k limit to make sure we don't get a garbage string
if (r < 0) {
+ std::cerr << "rbd: failed to decode start snap name" << std::endl;
return r;
}
bool exists;
r = idiffctx->image->snap_exists2(from.c_str(), &exists);
if (r < 0) {
+ std::cerr << "rbd: failed to query start snap state" << std::endl;
return r;
}
string to;
r = utils::read_string(idiffctx->fd, 4096, &to); // 4k limit to make sure we don't get a garbage string
if (r < 0) {
+ std::cerr << "rbd: failed to decode end snap name" << std::endl;
return r;
}
bool exists;
r = idiffctx->image->snap_exists2(to.c_str(), &exists);
if (r < 0) {
+ std::cerr << "rbd: failed to query end snap state" << std::endl;
return r;
}
if (exists) {
- std::cerr << "end snapshot '" << to << "' already exists, aborting" << std::endl;
+ std::cerr << "end snapshot '" << to << "' already exists, aborting"
+ << std::endl;
return -EEXIST;
}
return 0;
}
+static int get_snap_protection_status(ImportDiffContext *idiffctx,
+ bool *is_protected)
+{
+ int r;
+ char buf[sizeof(__u8)];
+ r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode snap protection status" << std::endl;
+ return r;
+ }
+
+ *is_protected = (buf[0] != 0);
+ idiffctx->update_progress();
+
+ return 0;
+}
+
static int do_image_resize(ImportDiffContext *idiffctx)
{
int r;
uint64_t end_size;
r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
if (r < 0) {
+ std::cerr << "rbd: failed to decode image size" << std::endl;
return r;
}
char buf[16];
r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
if (r < 0) {
+ std::cerr << "rbd: failed to decode IO length" << std::endl;
return r;
}
bufferptr bp = buffer::create(buffer_length);
r = safe_read_exact(idiffctx->fd, bp.c_str(), buffer_length);
if (r < 0) {
+ std::cerr << "rbd: failed to decode write data" << std::endl;
return r;
}
{
int r;
char buf[banner.size() + 1];
+ memset(buf, 0, sizeof(buf));
r = safe_read_exact(fd, buf, banner.size());
if (r < 0) {
+ std::cerr << "rbd: failed to decode diff banner" << std::endl;
return r;
}
buf[banner.size()] = '\0';
if (strcmp(buf, banner.c_str())) {
- std::cerr << "invalid banner '" << buf << "', expected '" << banner << "'" << std::endl;
+ std::cerr << "rbd: invalid or unexpected diff banner" << std::endl;
return -EINVAL;
}
uint64_t len = min<uint64_t>(length, sizeof(buf));
while (len > 0) {
r = safe_read_exact(fd, buf, len);
- if (r < 0)
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode skipped tag data" << std::endl;
return r;
+ }
length -= len;
len = min<uint64_t>(length, sizeof(buf));
}
r = safe_read_exact(fd, &read_tag, sizeof(read_tag));
if (r < 0) {
+ std::cerr << "rbd: failed to decode tag" << std::endl;
return r;
}
char buf[sizeof(uint64_t)];
r = safe_read_exact(fd, buf, sizeof(buf));
if (r < 0) {
+ std::cerr << "rbd: failed to decode tag length" << std::endl;
return r;
}
struct stat stat_buf;
r = ::fstat(fd, &stat_buf);
if (r < 0) {
+ std::cerr << "rbd: failed to stat specified diff file" << std::endl;
return r;
}
size = (uint64_t)stat_buf.st_size;
// begin image import
std::string tosnap;
+ bool is_protected = false;
ImportDiffContext idiffctx(&image, fd, size, no_progress);
while (r == 0) {
__u8 tag;
r = do_image_snap_from(&idiffctx);
} else if (tag == RBD_DIFF_TO_SNAP) {
r = do_image_snap_to(&idiffctx, &tosnap);
+ } else if (tag == RBD_SNAP_PROTECTION_STATUS) {
+ r = get_snap_protection_status(&idiffctx, &is_protected);
} else if (tag == RBD_DIFF_IMAGE_SIZE) {
r = do_image_resize(&idiffctx);
} else if (tag == RBD_DIFF_WRITE || tag == RBD_DIFF_ZERO) {
int temp_r = idiffctx.throttle.wait_for_ret();
r = (r < 0) ? r : temp_r; // preserve original error
if (r == 0 && tosnap.length()) {
- idiffctx.image->snap_create(tosnap.c_str());
+ r = idiffctx.image->snap_create(tosnap.c_str());
+ if (r == 0 && is_protected) {
+ r = idiffctx.image->snap_protect(tosnap.c_str());
+ }
}
idiffctx.finish(r);
r = do_import_diff(rados, image, path.c_str(),
vm[at::NO_PROGRESS].as<bool>(), sparse_size);
+ if (r == -EDOM) {
+ r = -EBADMSG;
+ }
if (r < 0) {
cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl;
return r;
r = safe_read_exact(fd, buf, sizeof(buf));
if (r < 0) {
+ std::cerr << "rbd: failed to decode image option" << std::endl;
return r;
}
return 0;
}
-static int do_import_header(int fd, int import_format, uint64_t &size, librbd::ImageOptions& opts)
+static int do_import_metadata(int import_format, librbd::Image& image,
+ const std::map<std::string, std::string> &imagemetas)
+{
+ int r = 0;
+
+  // v1 export format carries no image metadata, so nothing to import
+ if (import_format == 1) {
+ return 0;
+ }
+
+ for (std::map<std::string, std::string>::const_iterator it = imagemetas.begin();
+ it != imagemetas.end(); ++it) {
+ r = image.metadata_set(it->first, it->second);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int decode_imagemeta(int fd, uint64_t length, std::map<std::string, std::string>* imagemetas)
+{
+ int r;
+ string key;
+ string value;
+
+ r = utils::read_string(fd, length, &key);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode metadata key" << std::endl;
+ return r;
+ }
+
+ r = utils::read_string(fd, length, &value);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode metadata value" << std::endl;
+ return r;
+ }
+
+ (*imagemetas)[key] = value;
+ return 0;
+}
+
+static int do_import_header(int fd, int import_format, uint64_t &size, librbd::ImageOptions& opts,
+ std::map<std::string, std::string>* imagemetas)
{
// There is no header in v1 image.
if (import_format == 1) {
r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_UNIT, opts);
} else if (tag == RBD_EXPORT_IMAGE_STRIPE_COUNT) {
r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_COUNT, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_META) {
+ r = decode_imagemeta(fd, length, imagemetas);
} else {
std::cerr << "rbd: invalid tag in image properties zone: " << tag << "Skip it."
<< std::endl;
char buf[sizeof(uint64_t)];
r = safe_read_exact(fd, buf, sizeof(buf));
if (r < 0) {
+ std::cerr << "rbd: failed to decode diff count" << std::endl;
return r;
}
bufferlist bl;
int fd, r;
struct stat stat_buf;
utils::ProgressContext pc("Importing image", no_progress);
+ std::map<std::string, std::string> imagemetas;
assert(imgname);
#endif
}
- r = do_import_header(fd, import_format, size, opts);
+ r = do_import_header(fd, import_format, size, opts, &imagemetas);
if (r < 0) {
std::cerr << "rbd: import header failed." << std::endl;
goto done;
goto err;
}
+ r = do_import_metadata(import_format, image, imagemetas);
+ if (r < 0) {
+ std::cerr << "rbd: failed to import image-meta" << std::endl;
+ goto err;
+ }
+
if (import_format == 1) {
r = do_import_v1(fd, image, size, imgblklen, pc, sparse_size);
} else {
}
bool call(Formatter *f, stringstream *ss) override {
- C_SaferCond cond;
- this->replayer->flush(&cond);
- int r = cond.wait();
- if (r < 0) {
- *ss << "flush: " << cpp_strerror(r);
- return false;
- }
+ this->replayer->flush();
return true;
}
};
m_local(local),
m_local_mirror_uuid(local_mirror_uuid),
m_local_pool_id(local_pool_id),
- m_global_image_id(global_image_id),
+ m_global_image_id(global_image_id), m_local_image_name(global_image_id),
m_lock("rbd::mirror::ImageReplayer " + stringify(local_pool_id) + " " +
global_image_id),
m_progress_cxt(this),
Context *ctx = create_context_callback<
ImageReplayer, &ImageReplayer<I>::handle_prepare_local_image>(this);
auto req = PrepareLocalImageRequest<I>::create(
- m_local_ioctx, m_global_image_id, &m_local_image_id,
+ m_local_ioctx, m_global_image_id, &m_local_image_id, &m_local_image_name,
&m_local_image_tag_owner, m_threads->work_queue, ctx);
req->send();
}
} else if (r < 0) {
on_start_fail(r, "error preparing local image for replay");
return;
+ } else {
+ reregister_admin_socket_hook();
}
// local image doesn't exist or is non-primary
return;
}
- on_name_changed();
-
update_mirror_image_status(false, boost::none);
init_remote_journaler();
}
template <typename I>
void ImageReplayer<I>::allocate_local_tag() {
- dout(20) << dendl;
+ dout(15) << dendl;
std::string mirror_uuid = m_replay_tag_data.mirror_uuid;
- if (mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID ||
- mirror_uuid == m_local_mirror_uuid) {
+ if (mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
mirror_uuid = m_remote_image.mirror_uuid;
+ } else if (mirror_uuid == m_local_mirror_uuid) {
+ mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
} else if (mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
- dout(5) << "encountered image demotion: stopping" << dendl;
- Mutex::Locker locker(m_lock);
- m_stop_requested = true;
+ // handle possible edge condition where daemon can failover and
+ // the local image has already been promoted/demoted
+ auto local_tag_data = m_local_journal->get_tag_data();
+ if (local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+ (local_tag_data.predecessor.commit_valid &&
+ local_tag_data.predecessor.mirror_uuid ==
+ librbd::Journal<>::LOCAL_MIRROR_UUID)) {
+ dout(15) << "skipping stale demotion event" << dendl;
+ handle_process_entry_safe(m_replay_entry, 0);
+ handle_replay_ready();
+ return;
+ } else {
+ dout(5) << "encountered image demotion: stopping" << dendl;
+ Mutex::Locker locker(m_lock);
+ m_stop_requested = true;
+ }
}
librbd::journal::TagPredecessor predecessor(m_replay_tag_data.predecessor);
predecessor.mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
}
- dout(20) << "mirror_uuid=" << mirror_uuid << ", "
- << "predecessor_mirror_uuid=" << predecessor.mirror_uuid << ", "
- << "replay_tag_tid=" << m_replay_tag_tid << ", "
- << "replay_tag_data=" << m_replay_tag_data << dendl;
+ dout(15) << "mirror_uuid=" << mirror_uuid << ", "
+ << "predecessor=" << predecessor << ", "
+ << "replay_tag_tid=" << m_replay_tag_tid << dendl;
Context *ctx = create_context_callback<
ImageReplayer, &ImageReplayer<I>::handle_allocate_local_tag>(this);
m_local_journal->allocate_tag(mirror_uuid, predecessor, ctx);
template <typename I>
void ImageReplayer<I>::handle_allocate_local_tag(int r) {
- dout(20) << "r=" << r << dendl;
+ dout(15) << "r=" << r << ", "
+ << "tag_tid=" << m_local_journal->get_tag_tid() << dendl;
if (r < 0) {
derr << "failed to allocate journal tag: " << cpp_strerror(r) << dendl;
dout(20) << dendl;
assert(r == 0);
- on_name_changed();
+ bool update_status = false;
+ {
+ RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock);
+ if (m_local_image_name != m_local_image_ctx->name) {
+ m_local_image_name = m_local_image_ctx->name;
+ update_status = true;
+ }
+ }
+
+ if (update_status) {
+ reschedule_update_status_task(0);
+ }
// attempt to process the next event
handle_replay_ready();
template <typename I>
void ImageReplayer<I>::finish_mirror_image_status_update() {
+ reregister_admin_socket_hook();
+
Context *on_finish = nullptr;
{
Mutex::Locker locker(m_lock);
return;
}
- dout(20) << "registered asok hook: " << m_name << dendl;
+ dout(15) << "registered asok hook: " << m_name << dendl;
asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name,
this);
int r = asok_hook->register_commands();
template <typename I>
void ImageReplayer<I>::unregister_admin_socket_hook() {
- dout(20) << dendl;
+ dout(15) << dendl;
AdminSocketHook *asok_hook = nullptr;
{
}
template <typename I>
-void ImageReplayer<I>::on_name_changed() {
+void ImageReplayer<I>::reregister_admin_socket_hook() {
{
Mutex::Locker locker(m_lock);
- std::string name = m_local_ioctx.get_pool_name() + "/" +
- m_local_image_ctx->name;
+ auto name = m_local_ioctx.get_pool_name() + "/" + m_local_image_name;
if (m_name == name) {
return;
}
int64_t m_local_pool_id;
std::string m_local_image_id;
std::string m_global_image_id;
+ std::string m_local_image_name;
std::string m_name;
mutable Mutex m_lock;
void register_admin_socket_hook();
void unregister_admin_socket_hook();
-
- void on_name_changed();
+ void reregister_admin_socket_hook();
};
} // namespace mirror
namespace mirror {
using namespace image_sync;
+using librbd::util::create_async_context_callback;
using librbd::util::create_context_callback;
using librbd::util::unique_lock_name;
dout(20) << dendl;
- Context *ctx = create_context_callback<
- ImageSync<I>, &ImageSync<I>::handle_notify_sync_request>(this);
+ m_lock.Lock();
+ if (m_canceled) {
+ m_lock.Unlock();
+ BaseRequest::finish(-ECANCELED);
+ return;
+ }
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_notify_sync_request>(this));
m_instance_watcher->notify_sync_request(m_local_image_ctx->id, ctx);
+ m_lock.Unlock();
}
template <typename I>
void ImageSync<I>::handle_notify_sync_request(int r) {
dout(20) << ": r=" << r << dendl;
+ m_lock.Lock();
+ if (r == 0 && m_canceled) {
+ r = -ECANCELED;
+ }
+ m_lock.Unlock();
+
if (r < 0) {
BaseRequest::finish(r);
return;
template <typename I>
void InstanceWatcher<I>::notify_sync_complete(const std::string &sync_id) {
- dout(20) << "sync_id=" << sync_id << dendl;
-
Mutex::Locker locker(m_lock);
+ notify_sync_complete(m_lock, sync_id);
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_sync_complete(const Mutex&,
+ const std::string &sync_id) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+ assert(m_lock.is_locked());
auto it = m_inflight_sync_reqs.find(sync_id);
assert(it != m_inflight_sync_reqs.end());
Context *on_start = nullptr;
{
Mutex::Locker locker(m_lock);
-
assert(sync_ctx->req != nullptr);
assert(sync_ctx->on_start != nullptr);
std::swap(sync_ctx->on_start, on_start);
sync_ctx->req = nullptr;
+
+ if (r == -ECANCELED) {
+ notify_sync_complete(m_lock, sync_ctx->sync_id);
+ }
}
on_start->complete(r == -ECANCELED ? r : 0);
-
- if (r == -ECANCELED) {
- notify_sync_complete(sync_ctx->sync_id);
- }
}
template <typename I>
bool unsuspend_notify_request(C_NotifyInstanceRequest *req);
void unsuspend_notify_requests();
+ void notify_sync_complete(const Mutex& lock, const std::string &sync_id);
void handle_notify_sync_request(C_SyncRequest *sync_ctx, int r);
void handle_notify_sync_complete(C_SyncRequest *sync_ctx, int r);
}
if (m_leader_watcher) {
m_leader_watcher->shut_down();
- m_leader_watcher.reset();
}
if (m_instance_watcher) {
m_instance_watcher->shut_down();
- m_instance_watcher.reset();
}
if (m_instance_replayer) {
m_instance_replayer->shut_down();
- m_instance_replayer.reset();
}
+ m_leader_watcher.reset();
+ m_instance_watcher.reset();
+ m_instance_replayer.reset();
+
assert(!m_local_pool_watcher);
assert(!m_remote_pool_watcher);
m_local_rados.reset();
m_cond.WaitInterval(m_lock, utime_t(1, 0));
}
}
+
+ m_instance_replayer->stop();
}
void PoolReplayer::print_status(Formatter *f, stringstream *ss)
void BootstrapRequest<I>::handle_is_primary(int r) {
dout(20) << ": r=" << r << dendl;
- if (r < 0) {
+ if (r == -ENOENT) {
+ dout(5) << ": remote image is not mirrored" << dendl;
+ m_ret_val = -EREMOTEIO;
+ close_remote_image();
+ return;
+ } else if (r < 0) {
derr << ": error querying remote image primary status: " << cpp_strerror(r)
<< dendl;
m_ret_val = r;
}
if (!m_primary) {
- dout(5) << ": remote image is not primary -- skipping image replay"
- << dendl;
- m_ret_val = -EREMOTEIO;
- update_client_state();
- return;
+ if (m_local_image_id.empty()) {
+ // no local image and remote isn't primary -- don't sync it
+ dout(5) << ": remote image is not primary -- not syncing"
+ << dendl;
+ m_ret_val = -EREMOTEIO;
+ close_remote_image();
+ return;
+ } else if (m_client_meta->state !=
+ librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
+ // ensure we attempt to re-sync to remote if it's re-promoted
+ dout(5) << ": remote image is not primary -- sync interrupted"
+ << dendl;
+ m_ret_val = -EREMOTEIO;
+ update_client_state();
+ return;
+ }
}
if (!m_client_meta->image_id.empty()) {
}
if (m_local_image_id.empty()) {
+ // prepare to create local image
update_client_image();
return;
}
template <typename I>
void BootstrapRequest<I>::update_client_state() {
- if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
- // state already set for replaying upon failover
- close_remote_image();
- return;
- }
-
dout(20) << dendl;
update_progress("UPDATE_CLIENT_STATE");
I *local_image_ctx = (*m_local_image_ctx);
{
- RWLock::RLocker snap_locker(local_image_ctx->snap_lock);
+ local_image_ctx->snap_lock.get_read();
if (local_image_ctx->journal == nullptr) {
+ local_image_ctx->snap_lock.put_read();
+
derr << ": local image does not support journaling" << dendl;
m_ret_val = -EINVAL;
close_local_image();
r = (*m_local_image_ctx)->journal->is_resync_requested(m_do_resync);
if (r < 0) {
+ local_image_ctx->snap_lock.put_read();
+
derr << ": failed to check if a resync was requested" << dendl;
m_ret_val = r;
close_local_image();
return;
}
+
+ m_local_tag_tid = local_image_ctx->journal->get_tag_tid();
+ m_local_tag_data = local_image_ctx->journal->get_tag_data();
+ dout(10) << ": local tag=" << m_local_tag_tid << ", "
+ << "local tag data=" << m_local_tag_data << dendl;
+ local_image_ctx->snap_lock.put_read();
+ }
+
+ if (m_local_tag_data.mirror_uuid != m_remote_mirror_uuid && !m_primary) {
+ // if the local mirror is not linked to the (now) non-primary image,
+ // stop the replay. Otherwise, we ignore that the remote is non-primary
+ // so that we can replay the demotion
+ dout(5) << ": remote image is not primary -- skipping image replay"
+ << dendl;
+ m_ret_val = -EREMOTEIO;
+ close_local_image();
+ return;
}
if (*m_do_resync) {
update_progress("REGISTER_CLIENT");
- librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
- m_local_image_id};
+ assert(m_local_image_id.empty());
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta;
mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
librbd::journal::ClientData client_data{mirror_peer_client_meta};
}
*m_client_state = cls::journal::CLIENT_STATE_CONNECTED;
- *m_client_meta = librbd::journal::MirrorPeerClientMeta(m_local_image_id);
+ *m_client_meta = librbd::journal::MirrorPeerClientMeta();
m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
is_primary();
// At this point, the local image was existing, non-primary, and replaying;
// and the remote image is primary. Attempt to link the local image's most
// recent tag to the remote image's tag chain.
- uint64_t local_tag_tid;
- librbd::journal::TagData local_tag_data;
- I *local_image_ctx = (*m_local_image_ctx);
- {
- RWLock::RLocker snap_locker(local_image_ctx->snap_lock);
- if (local_image_ctx->journal == nullptr) {
- derr << ": local image does not support journaling" << dendl;
- m_ret_val = -EINVAL;
- close_local_image();
- return;
- }
-
- local_tag_tid = local_image_ctx->journal->get_tag_tid();
- local_tag_data = local_image_ctx->journal->get_tag_data();
- dout(20) << ": local tag " << local_tag_tid << ": "
- << local_tag_data << dendl;
- }
-
bool remote_tag_data_valid = false;
librbd::journal::TagData remote_tag_data;
boost::optional<uint64_t> remote_orphan_tag_tid =
// decode the remote tags
for (auto &remote_tag : m_remote_tags) {
- if (local_tag_data.predecessor.commit_valid &&
- local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
- local_tag_data.predecessor.tag_tid > remote_tag.tid) {
- dout(20) << ": skipping processed predecessor remote tag "
+ if (m_local_tag_data.predecessor.commit_valid &&
+ m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
+ m_local_tag_data.predecessor.tag_tid > remote_tag.tid) {
+ dout(15) << ": skipping processed predecessor remote tag "
<< remote_tag.tid << dendl;
continue;
}
dout(10) << ": decoded remote tag " << remote_tag.tid << ": "
<< remote_tag_data << dendl;
- if (!local_tag_data.predecessor.commit_valid) {
+ if (!m_local_tag_data.predecessor.commit_valid) {
// newly synced local image (no predecessor) replays from the first tag
if (remote_tag_data.mirror_uuid != librbd::Journal<>::LOCAL_MIRROR_UUID) {
dout(20) << ": skipping non-primary remote tag" << dendl;
break;
}
- if (local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+ if (m_local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
// demotion last available local epoch
- if (remote_tag_data.mirror_uuid == local_tag_data.mirror_uuid &&
+ if (remote_tag_data.mirror_uuid == m_local_tag_data.mirror_uuid &&
remote_tag_data.predecessor.commit_valid &&
remote_tag_data.predecessor.tag_tid ==
- local_tag_data.predecessor.tag_tid) {
+ m_local_tag_data.predecessor.tag_tid) {
// demotion matches remote epoch
if (remote_tag_data.predecessor.mirror_uuid == m_local_mirror_uuid &&
- local_tag_data.predecessor.mirror_uuid ==
+ m_local_tag_data.predecessor.mirror_uuid ==
librbd::Journal<>::LOCAL_MIRROR_UUID) {
// local demoted and remote has matching event
dout(20) << ": found matching local demotion tag" << dendl;
continue;
}
- if (local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
+ if (m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
remote_tag_data.predecessor.mirror_uuid ==
librbd::Journal<>::LOCAL_MIRROR_UUID) {
// remote demoted and local has matching event
}
if (remote_tag_data_valid &&
- local_tag_data.mirror_uuid == m_remote_mirror_uuid) {
- dout(20) << ": local image is in clean replay state" << dendl;
+ m_local_tag_data.mirror_uuid == m_remote_mirror_uuid) {
+ dout(10) << ": local image is in clean replay state" << dendl;
} else if (reconnect_orphan) {
dout(20) << ": remote image was demoted/promoted" << dendl;
} else {
#include "include/rados/librados.hpp"
#include "common/Mutex.h"
#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
#include "librbd/journal/TypeTraits.h"
#include "tools/rbd_mirror/BaseRequest.h"
#include "tools/rbd_mirror/types.h"
int m_ret_val = 0;
ImageSync<ImageCtxT> *m_image_sync = nullptr;
+ uint64_t m_local_tag_tid = 0;
+ librbd::journal::TagData m_local_tag_data;
+
bufferlist m_out_bl;
void get_remote_tag_class();
return;
} else if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLING) {
dout(5) << ": image mirroring is being disabled" << dendl;
- *m_primary = false;
+ r = -ENOENT;
} else {
derr << ": image mirroring is disabled" << dendl;
r = -EINVAL;
derr << ": failed to decode image mirror state: " << cpp_strerror(r)
<< dendl;
}
+ } else if (r == -ENOENT) {
+ dout(5) << ": image is not mirrored" << dendl;
} else {
derr << ": failed to retrieve image mirror state: " << cpp_strerror(r)
<< dendl;
void OpenLocalImageRequest<I>::handle_is_primary(int r) {
dout(20) << ": r=" << r << dendl;
- if (r < 0) {
+ if (r == -ENOENT) {
+ dout(5) << ": local image is not mirrored" << dendl;
+ send_close_image(r);
+ return;
+ } else if (r < 0) {
derr << ": error querying local image primary status: " << cpp_strerror(r)
<< dendl;
send_close_image(r);
return;
}
+ get_local_image_name();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_local_image_name() {
+ dout(20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::dir_get_name_start(&op, *m_local_image_id);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_local_image_name>(this);
+ int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op, &m_out_bl);
+ assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_local_image_name(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r == 0) {
+ bufferlist::iterator it = m_out_bl.begin();
+ r = librbd::cls_client::dir_get_name_finish(&it, m_local_image_name);
+ }
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ derr << "failed to retrieve image name: " << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+ return;
+ }
+
get_mirror_state();
}
static PrepareLocalImageRequest *create(librados::IoCtx &io_ctx,
const std::string &global_image_id,
std::string *local_image_id,
+ std::string *local_image_name,
std::string *tag_owner,
ContextWQ *work_queue,
Context *on_finish) {
return new PrepareLocalImageRequest(io_ctx, global_image_id, local_image_id,
- tag_owner, work_queue, on_finish);
+ local_image_name, tag_owner, work_queue,
+ on_finish);
}
PrepareLocalImageRequest(librados::IoCtx &io_ctx,
const std::string &global_image_id,
std::string *local_image_id,
+ std::string *local_image_name,
std::string *tag_owner,
ContextWQ *work_queue,
Context *on_finish)
: m_io_ctx(io_ctx), m_global_image_id(global_image_id),
- m_local_image_id(local_image_id), m_tag_owner(tag_owner),
- m_work_queue(work_queue), m_on_finish(on_finish) {
+ m_local_image_id(local_image_id), m_local_image_name(local_image_name),
+ m_tag_owner(tag_owner), m_work_queue(work_queue), m_on_finish(on_finish) {
}
void send();
* GET_LOCAL_IMAGE_ID
* |
* v
+ * GET_LOCAL_IMAGE_NAME
+ * |
+ * v
* GET_MIRROR_STATE
* |
* v
librados::IoCtx &m_io_ctx;
std::string m_global_image_id;
std::string *m_local_image_id;
+ std::string *m_local_image_name;
std::string *m_tag_owner;
ContextWQ *m_work_queue;
Context *m_on_finish;
void get_local_image_id();
void handle_get_local_image_id(int r);
+ void get_local_image_name();
+ void handle_get_local_image_name(int r);
+
void get_mirror_state();
void handle_get_mirror_state(int r);
cerr << err << std::endl;
return r;
}
-
if (forker.is_parent()) {
- global_init_postfork_start(g_ceph_context);
if (forker.parent_wait(err) != 0) {
return -ENXIO;
}
return 0;
}
+ global_init_postfork_start(g_ceph_context);
}
common_init_finish(g_ceph_context);
cout << cfg->devpath << std::endl;
if (g_conf->daemonize) {
- forker.daemonize();
- global_init_postfork_start(g_ceph_context);
global_init_postfork_finish(g_ceph_context);
+ forker.daemonize();
}
{