RELEASE=5.1
PACKAGE=ceph
-VER=12.2.4
+VER=12.2.5
DEBREL=pve1
SRCDIR=ceph
cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.2.4)
+set(VERSION 12.2.5)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
message(STATUS "Looking for openssl anyways, because radosgw selected")
find_package(OpenSSL)
endif()
+# https://curl.haxx.se/docs/install.html mentions the
+# configure flags for various ssl backends
+ execute_process(
+ COMMAND
+ "sh" "-c"
+ "curl-config --configure | grep with-ssl"
+ RESULT_VARIABLE NO_CURL_SSL_LINK
+ ERROR_VARIABLE CURL_CONFIG_ERRORS
+ )
+ if (CURL_CONFIG_ERRORS)
+ message(WARNING "unable to run curl-config; rgw cannot make ssl requests to external systems reliably")
+ endif()
+ find_package(OpenSSL)
if (OPENSSL_FOUND)
+ if (NOT NO_CURL_SSL_LINK)
+ message(STATUS "libcurl is linked with openssl: explicitly setting locks")
+ set(WITH_CURL_OPENSSL ON)
+ endif() # CURL_SSL_LINK
execute_process(
COMMAND
"sh" "-c"
message(STATUS "crypto soname: ${LIBCRYPTO_SONAME}")
else()
message(WARNING "ssl not found: rgw civetweb may fail to dlopen libssl libcrypto")
- endif()
+ endif() # OPENSSL_FOUND
endif (WITH_RADOSGW)
#option for CephFS
Files: doc/*
Copyright: (c) 2010-2012 New Dream Network and contributors
-License: Creative Commons Attribution-ShareAlike (CC BY-SA)
+License: Creative Commons Attribution Share Alike 3.0 (CC-BY-SA-3.0)
Files: bin/git-archive-all.sh
License: GPL3
and refuses to start as consistency and untrimmable cache issues may
develop. The new option client_die_on_failed_dentry_invalidate (default:
true) may be turned off to allow the client to proceed (dangerous!).
+
+12.2.5
+------
+
+- *CephFS*:
+
+ * Upgrading an MDS cluster to 12.2.3+ will result in all active MDS
+ exiting due to feature incompatibilities once an upgraded MDS comes online
+ (even as standby). Operators may ignore the error messages and continue
+ upgrading/restarting or follow this upgrade sequence:
+
+ Reduce the number of ranks to 1 (`ceph fs set <fs_name> max_mds 1`),
+ deactivate all other ranks (`ceph mds deactivate <fs_name>:<n>`), shutdown
+ standbys leaving the one active MDS, upgrade the single active MDS, then
+ upgrade/start standbys. Finally, restore the previous max_mds.
+
+ See also: https://tracker.ceph.com/issues/23172
+
+* *rados list-inconsistent-obj format changes:*
+
+ * Various error strings have been improved. For example, the "oi" or "oi_attr"
+ in errors which stands for object info is now "info" (e.g. oi_attr_missing is
+ now info_missing).
+
+ * The object's "selected_object_info" is now in json format instead of string.
+
+ * The attribute errors (attr_value_mismatch, attr_name_mismatch) only apply to user
+ attributes. Only user attributes are output and have the internal leading underscore
+ stripped.
+
+ * If there are hash information errors (hinfo_missing, hinfo_corrupted,
+ hinfo_inconsistency) then "hashinfo" is added with the json format of the
+ information. If the information is corrupt then "hashinfo" is a string
+ containing the value.
+
+ * If there are snapset errors (snapset_missing, snapset_corrupted,
+ snapset_inconsistency) then "snapset" is added with the json format of the
+ information. If the information is corrupt then "snapset" is a string containing
+ the value.
+
+ * If there are object information errors (info_missing, info_corrupted,
+ obj_size_info_mismatch, object_info_inconsistency) then "object_info" is added
+ with the json format of the information instead of a string. If the information
+ is corrupt then "object_info" is a string containing the value.
+
+* *rados list-inconsistent-snapset format changes:*
+
+ * Various error strings have been improved. For example, the "ss_attr" in
+ errors which stands for snapset info is now "snapset" (e.g. ss_attr_missing is
+ now snapset_missing). The error snapset_mismatch has been renamed to snapset_error
+ to better reflect what it means.
+
+ * The head snapset information is output in json format as "snapset." This means that
+ even when there are no head errors, the head object will be output when any shard
+ has an error. This head object is there to show the snapset that was used in
+ determining errors.
+
Most of Ceph is licensed under the LGPL version 2.1. Some
miscellaneous code is under BSD-style license or is public domain.
The documentation is licensed under Creative Commons
-Attribution-ShareAlike (CC BY-SA). There are a handful of headers
+Attribution Share Alike 3.0 (CC-BY-SA-3.0). There are a handful of headers
included here that are licensed under the GPL. Please see the file
COPYING for a full inventory of licenses by file.
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
-pkgver=12.2.4
+pkgver=12.2.5
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
xmlstarlet
yasm
"
-source="ceph-12.2.4.tar.bz2"
+source="ceph-12.2.5.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
-builddir=$srcdir/ceph-12.2.4
+builddir=$srcdir/ceph-12.2.5
build() {
export CEPH_BUILD_VIRTUALENV=$builddir
# main package definition
#################################################################################
Name: ceph
-Version: 12.2.4
+Version: 12.2.5
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
%global _epoch_prefix %{?epoch:%{epoch}:}
Summary: User space components of the Ceph file system
-License: LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and MIT
+License: LGPL-2.1 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and MIT
%if 0%{?suse_version}
Group: System/Filesystems
%endif
URL: http://ceph.com/
-Source0: http://ceph.com/download/ceph-12.2.4.tar.bz2
+Source0: http://ceph.com/download/ceph-12.2.5.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
BuildRequires: python-Werkzeug
BuildRequires: python-numpy-devel
%endif
+BuildRequires: python-coverage
BuildRequires: python-pecan
BuildRequires: socat
%endif
# common
#################################################################################
%prep
-%autosetup -p1 -n ceph-12.2.4
+%autosetup -p1 -n ceph-12.2.5
%build
%if 0%{with cephfs_java}
%global _epoch_prefix %{?epoch:%{epoch}:}
Summary: User space components of the Ceph file system
-License: LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and MIT
+License: LGPL-2.1 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and MIT
%if 0%{?suse_version}
Group: System/Filesystems
%endif
BuildRequires: python-Werkzeug
BuildRequires: python-numpy-devel
%endif
+BuildRequires: python-coverage
BuildRequires: python-pecan
BuildRequires: socat
%endif
+ceph (12.2.5-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Mon, 23 Apr 2018 16:18:32 +0000
+
ceph (12.2.4-1) stable; urgency=medium
* New upstream release
libbabeltrace-ctf-dev,
libbabeltrace-dev,
libblkid-dev (>= 2.17),
- libcurl4-gnutls-dev,
+ libcurl4-openssl-dev,
libexpat1-dev,
libfuse-dev,
libgoogle-perftools-dev [i386 amd64 arm64],
pkg-config,
python (>= 2.7),
python-all-dev,
+ python-coverage,
python-cherrypy3,
- python-jinja2,
python-nose,
python-pecan,
python-prettytable,
Architecture: linux-any
Depends: ceph-base (= ${binary:Version}),
python-cherrypy3,
+ python-jinja2,
python-openssl,
python-pecan,
python-werkzeug,
Files: doc/*
Copyright: (c) 2010-2012 New Dream Network and contributors
-License: Creative Commons Attribution-ShareAlike (CC BY-SA)
+License: Creative Commons Attribution Share Alike 3.0 (CC-BY-SA-3.0)
Files: src/mount/canonicalize.c
Copyright: Copyright (C) 1993 Rick Sladkey <jrs@world.std.com>
+Upgrading the MDS Cluster
+=========================
+
+Currently the MDS cluster does not have built-in versioning or file system
+flags to support seamless upgrades of the MDSs without potentially causing
+assertions or other faults due to incompatible messages or other functional
+differences. For this reason, it's necessary during any cluster upgrade to
+reduce the number of active MDS for a file system to one first so that two
+active MDS do not communicate with different versions. Further, it's also
+necessary to take standbys offline as any new CompatSet flags will propagate
+via the MDSMap to all MDS and cause older MDS to suicide.
+
+The proper sequence for upgrading the MDS cluster is:
+
+1. Reduce the number of ranks to 1:
+
+::
+
+ ceph fs set <fs_name> max_mds 1
+
+2. Deactivate all non-zero ranks, from the highest rank to the lowest, while waiting for each MDS to finish stopping:
+
+::
+
+ ceph mds deactivate <fs_name>:<n>
+ ceph status # wait for MDS to finish stopping
+
+3. Take all standbys offline, e.g. using systemctl:
+
+::
+
+ systemctl stop ceph-mds.target
+ ceph status # confirm only one MDS is online and is active
+
+4. Upgrade the single active MDS, e.g. using systemctl:
+
+::
+
+ systemctl restart ceph-mds.target
+
+5. Upgrade/start the standby daemons.
+
+6. Restore the previous max_mds for your cluster:
+
+::
+
+ ceph fs set <fs_name> max_mds <old_max_mds>
+
Upgrading pre-Firefly filesystems past Jewel
============================================
import os
project = u'Ceph'
-copyright = u'2016, Red Hat, Inc, and contributors. Licensed under Creative Commons BY-SA'
+copyright = u'2016, Red Hat, Inc, and contributors. Licensed under Creative Commons Attribution Share Alike 3.0 (CC-BY-SA-3.0)'
version = 'dev'
release = 'dev'
Example::
ceph.wal_uuid=A58D1C68-0D6E-4CB3-8E99-B261AD47CC39
+
+
+``vdo``
+-------
+A VDO-enabled device is detected when device is getting prepared, and then
+stored for later checks when activating. This affects mount options by
+appending the ``discard`` mount flag, regardless of mount flags being used.
+
+Example for an enabled VDO device::
+
+ ceph.vdo=1
:Author: Loic Dachary
:Author: Nathan Cutler
-:License: Creative Commons Attribution-ShareAlike (CC BY-SA)
+:License: Creative Commons Attribution Share Alike 3.0 (CC-BY-SA-3.0)
.. note:: The old (pre-2016) developer documentation has been moved to :doc:`/dev/index-old`.
items must be in `pem` form. Because the combined file contains the
secret key, it should be protected from unauthorized access.
-To configure ssl operation, append ``s`` to the port number. Currently
-it is not possible to configure the radosgw to listen on both
-http and https, you must pick only one. So::
+To configure ssl operation, append ``s`` to the port number. For example::
[client.rgw.gateway-node1]
rgw_frontends = civetweb port=443s ssl_certificate=/etc/ceph/private/keyandcert.pem
+.. versionadded :: Luminous
+
+Furthermore, civetweb can be made to bind to multiple ports, by separating them
+with ``+`` in the configuration. This allows for use cases where both ssl and
+non-ssl connections are hosted by a single rgw instance. For example::
+
+ [client.rgw.gateway-node1]
+ rgw_frontends = civetweb port=80+443s ssl_certificate=/etc/ceph/private/keyandcert.pem
+
+
Migrating from Apache to Civetweb
---------------------------------
auth service required = cephx
auth client required = cephx
osd journal size = 1024
- osd pool default size = 2
- osd pool default min size = 1
+ osd pool default size = 3
+ osd pool default min size = 2
osd pool default pg num = 333
osd pool default pgp num = 333
osd crush chooseleaf type = 1
Short Form
----------
-Ceph provides the ``ceph-disk`` utility, which can prepare a disk, partition or
-directory for use with Ceph. The ``ceph-disk`` utility creates the OSD ID by
-incrementing the index. Additionally, ``ceph-disk`` will add the new OSD to the
-CRUSH map under the host for you. Execute ``ceph-disk -h`` for CLI details.
-The ``ceph-disk`` utility automates the steps of the `Long Form`_ below. To
+Ceph provides the ``ceph-volume`` utility, which can prepare a logical volume, disk, or partition
+for use with Ceph. The ``ceph-volume`` utility creates the OSD ID by
+incrementing the index. Additionally, ``ceph-volume`` will add the new OSD to the
+CRUSH map under the host for you. Execute ``ceph-volume -h`` for CLI details.
+The ``ceph-volume`` utility automates the steps of the `Long Form`_ below. To
create the first two OSDs with the short form procedure, execute the following
on ``node2`` and ``node3``:
+bluestore
+^^^^^^^^^
+#. Create the OSD. ::
+
+ ssh {node-name}
+ sudo ceph-volume lvm create --data {data-path}
+
+ For example::
+
+ ssh node1
+ sudo ceph-volume lvm create --data /dev/hdd1
+
+Alternatively, the creation process can be split in two phases (prepare, and
+activate):
#. Prepare the OSD. ::
ssh {node-name}
- sudo ceph-disk prepare --cluster {cluster-name} --cluster-uuid {uuid} {data-path} [{journal-path}]
+ sudo ceph-volume lvm prepare --data {data-path} {data-path}
For example::
ssh node1
- sudo ceph-disk prepare --cluster ceph --cluster-uuid a7f64266-0894-4f1e-a635-d0aeaca0e993 --fs-type ext4 /dev/hdd1
+ sudo ceph-volume lvm prepare --data /dev/hdd1
+
+ Once prepared, the ``ID`` and ``FSID`` of the prepared OSD are required for
+ activation. These can be obtained by listing OSDs in the current server::
+ sudo ceph-volume lvm list
#. Activate the OSD::
- sudo ceph-disk activate {data-path} [--activate-key {path}]
+ sudo ceph-volume lvm activate {ID} {FSID}
+
+ For example::
+
+ sudo ceph-volume lvm activate 0 a7f64266-0894-4f1e-a635-d0aeaca0e993
+
+
+filestore
+^^^^^^^^^
+#. Create the OSD. ::
+
+ ssh {node-name}
+ sudo ceph-volume lvm create --filestore --data {data-path} --journal {journal-path}
+
+ For example::
+
+ ssh node1
+ sudo ceph-volume lvm create --filestore --data /dev/hdd1 --journal /dev/hdd2
+
+Alternatively, the creation process can be split in two phases (prepare, and
+activate):
+
+#. Prepare the OSD. ::
+
+ ssh {node-name}
+ sudo ceph-volume lvm prepare --filestore --data {data-path} --journal {journal-path}
For example::
- sudo ceph-disk activate /dev/hdd1
+ ssh node1
+ sudo ceph-volume lvm prepare --filestore --data /dev/hdd1 --journal /dev/hdd2
+
+ Once prepared, the ``ID`` and ``FSID`` of the prepared OSD are required for
+ activation. These can be obtained by listing OSDs in the current server::
+
+ sudo ceph-volume lvm list
+
+#. Activate the OSD::
+
+ sudo ceph-volume lvm activate --filestore {ID} {FSID}
+
+ For example::
- **Note:** Use the ``--activate-key`` argument if you do not have a copy
- of ``/var/lib/ceph/bootstrap-osd/{cluster}.keyring`` on the Ceph Node.
+ sudo ceph-volume lvm activate --filestore 0 a7f64266-0894-4f1e-a635-d0aeaca0e993
Long Form
[ -a | --add-key *base64_key* ]
[ --cap *subsystem* *capability* ]
[ --caps *capfile* ]
+ [ --mode *mode* ]
Description
will set all of capabilities associated with a given key, for all subsystems
+ .. option:: --mode *mode*
+
+ will set the desired file mode to the keyring e.g: 0644, defaults to 0600
+
Capabilities
============
Example
=======
-To create a new keyring containing a key for client.foo::
+To create a new keyring containing a key for client.foo with a 0644 file mode::
- ceph-authtool -C -n client.foo --gen-key keyring
+ ceph-authtool -C -n client.foo --gen-key keyring --mode 0644
To associate some capabilities with the key (namely, the ability to
mount a Ceph filesystem)::
Usage::
- ceph-volume lvm activate --filestore <osd id> <osd fsid>
+ ceph-volume lvm activate --bluestore <osd id> <osd fsid>
Optional Arguments:
* [-h, --help] show the help message and exit
-* [--bluestore] filestore objectstore (not yet implemented)
-* [--filestore] filestore objectstore (current default)
+* [--auto-detect-objectstore] Automatically detect the objectstore by inspecting
+ the OSD
+* [--bluestore] bluestore objectstore (default)
+* [--filestore] filestore objectstore
+* [--all] Activate all OSDs found in the system
+* [--no-systemd] Skip creating and enabling systemd units and starting of OSD
+ services
**prepare**
-Prepares a logical volume to be used as an OSD and journal using a ``filestore`` setup
-(``bluestore`` support is planned). It will not create or modify the logical volumes
+Prepares a logical volume to be used as an OSD and journal using a ``filestore``
+or ``bluestore`` (default) setup. It will not create or modify the logical volumes
except for adding extra metadata.
Usage::
* [-h, --help] show the help message and exit
* [--journal JOURNAL] A logical group name, path to a logical volume, or path to a device
-* [--journal-size GB] Size (in GB) A logical group name or a path to a logical volume
* [--bluestore] Use the bluestore objectstore (default)
+* [--block.wal] Path to a bluestore block.wal logical volume or partition
+* [--block.db] Path to a bluestore block.db logical volume or partition
* [--filestore] Use the filestore objectstore
* [--dmcrypt] Enable encryption for the underlying OSD devices
* [--osd-id OSD_ID] Reuse an existing OSD id
* [--osd-fsid OSD_FSID] Reuse an existing OSD fsid
+* [--crush-device-class] Define a CRUSH device class to assign the OSD to
Required arguments:
The single-call process unifies exactly what ``prepare`` and ``activate`` do,
with the convenience of doing it all at once. Flags and general usage are
-equivalent to those of the ``prepare`` subcommand.
+equivalent to those of the ``prepare`` and ``activate`` subcommand.
**trigger**
This subcommand is not meant to be used directly, and it is used by systemd so
Positional arguments:
-* <DEVICE> Either in the form of ``vg/lv`` for logical volumes or
- ``/path/to/sda1`` for regular devices.
+* <DEVICE> Either in the form of ``vg/lv`` for logical volumes,
+ ``/path/to/sda1`` or ``/path/to/sda`` for regular devices.
**zap**
Positional arguments:
-* <DEVICE> Either in the form of ``vg/lv`` for logical volumes or
- ``/path/to/sda1`` for regular devices.
+* <DEVICE> Either in the form of ``vg/lv`` for logical volumes,
+ ``/path/to/sda1`` or ``/path/to/sda`` for regular devices.
simple
:command:`zone list`
List all zones set on this cluster.
+:command:`sync error list`
+ list sync error.
+
+:command:`sync error trim`
+ trim sync error.
+
:command:`pool add`
Add an existing pool for data placement.
daemon is currently active, you may want to set up a load balancer
front-end to direct traffic to whichever manager endpoint is
available.
+
+Available methods
+-----------------
+
+You can navigate to the ``/doc`` endpoint for full list of available
+endpoints and HTTP methods implemented for each endpoint.
+
+For example, if you want to use the PATCH method of the ``/osd/<id>``
+endpoint to set the state ``up`` of the OSD id ``1``, you can use the
+following curl command::
+
+ echo -En '{"up": true}' | curl --request PATCH --data @- --silent --insecure --user <user> 'https://<ceph-mgr>:<port>/osd/1'
+
+or you can use python to do so::
+
+ $ python
+ >> import requests
+ >> result = requests.patch(
+ 'https://<ceph-mgr>:<port>/osd/1',
+ json={"up": True},
+ auth=("<user>", "<password>")
+ )
+ >> print result.json()
+
+Some of the other endpoints implemented in the *restful* module include
+
+* ``/config/cluster``: **GET**
+* ``/config/osd``: **GET**, **PATCH**
+* ``/crush/rule``: **GET**
+* ``/mon``: **GET**
+* ``/osd``: **GET**
+* ``/pool``: **GET**, **POST**
+* ``/pool/<arg>``: **DELETE**, **GET**, **PATCH**
+* ``/request``: **DELETE**, **GET**, **POST**
+* ``/request/<arg>``: **DELETE**, **GET**
+* ``/server``: **GET**
+
+The ``/request`` endpoint
+-------------------------
+
+You can use the ``/request`` endpoint to poll the state of a request
+you scheduled with any **DELETE**, **POST** or **PATCH** method. These
+methods are by default asynchronous since it may take longer for them
+to finish execution. You can modify this behaviour by appending
+``?wait=1`` to the request url. The returned request will then always
+be completed.
+
+The **POST** method of the ``/request`` method provides a passthrough
+for the ceph mon commands as defined in ``src/mon/MonCommands.h``.
+Let's consider the following command::
+
+ COMMAND("osd ls " \
+ "name=epoch,type=CephInt,range=0,req=false", \
+ "show all OSD ids", "osd", "r", "cli,rest")
+
+The **prefix** is **osd ls**. The optional argument's name is **epoch**
+and it is of type ``CephInt``, i.e. ``integer``. This means that you
+need to do the following **POST** request to schedule the command::
+
+ $ python
+ >> import requests
+ >> result = requests.post(
+ 'https://<ceph-mgr>:<port>/request',
+ json={'prefix': 'osd ls', 'epoch': 0},
+ auth=("<user>", "<password>")
+ )
+ >> print result.json()
Configuration
-------------
-Two configuration keys are mandatory for the module to work:
+Two configuration keys are vital for the module to work:
-- mgr/zabbix/zabbix_host
-- mgr/zabbix/identifier
+- zabbix_host
+- identifier (optional)
The parameter *zabbix_host* controls the hostname of the Zabbix server to which
*zabbix_sender* will send the items. This can be a IP-Address if required by
when sending items to Zabbix. This should match the name of the *Host* in
your Zabbix server.
+When the *identifier* parameter is not configured the ceph-<fsid> of the cluster
+will be used when sending data to Zabbix.
+
+This would for example be *ceph-c4d32a99-9e80-490f-bd3a-1d22d8a7d354*
+
Additional configuration keys which can be configured and their default values:
- mgr/zabbix/zabbix_port: 10051
]
},
"selected_object_info": {
- "type": "string"
+ "type": "object",
+ "description": "Selected object information",
+ "additionalProperties": true
},
"union_shard_errors": {
"description": "Union of all shard errors",
"missing",
"stat_error",
"read_error",
- "data_digest_mismatch_oi",
- "omap_digest_mismatch_oi",
- "size_mismatch_oi",
+ "data_digest_mismatch_info",
+ "omap_digest_mismatch_info",
+ "size_mismatch_info",
"ec_hash_error",
"ec_size_error",
- "oi_attr_missing",
- "oi_attr_corrupted",
- "obj_size_oi_mismatch",
- "ss_attr_missing",
- "ss_attr_corrupted"
+ "info_missing",
+ "info_corrupted",
+ "obj_size_info_mismatch",
+ "snapset_missing",
+ "snapset_corrupted",
+ "hinfo_missing",
+ "hinfo_corrupted"
]
},
"minItems": 0,
"size_mismatch",
"attr_value_mismatch",
"attr_name_mismatch",
- "snapset_inconsistency"
+ "snapset_inconsistency",
+ "hinfo_inconsistency"
]
},
"minItems": 0,
"type": "object",
"properties": {
"object_info": {
- "type": "string"
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "description": "Object information",
+ "additionalProperties": true
+ }
+ ]
+ },
+ "snapset": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "description": "Snap set information",
+ "additionalProperties": true
+ }
+ ]
+ },
+ "hashinfo": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "description": "Erasure code hash information",
+ "additionalProperties": true
+ }
+ ]
},
"shard": {
"type": "integer"
"missing",
"stat_error",
"read_error",
- "data_digest_mismatch_oi",
- "omap_digest_mismatch_oi",
- "size_mismatch_oi",
+ "data_digest_mismatch_info",
+ "omap_digest_mismatch_info",
+ "size_mismatch_info",
"ec_hash_error",
"ec_size_error",
- "oi_attr_missing",
- "oi_attr_corrupted",
- "obj_size_oi_mismatch",
- "ss_attr_missing",
- "ss_attr_corrupted"
+ "info_missing",
+ "info_corrupted",
+ "obj_size_info_mismatch",
+ "snapset_missing",
+ "snapset_corrupted",
+ "hinfo_missing",
+ "hinfo_corrupted"
]
},
"minItems": 0,
"value",
"Base64"
],
- "additionalProperties": false,
- "minItems": 1
+ "additionalProperties": false
}
}
},
+ "additionalProperties": false,
"required": [
"osd",
"primary",
"type": "array",
"items": {
"enum": [
- "ss_attr_missing",
- "ss_attr_corrupted",
- "oi_attr_missing",
- "oi_attr_corrupted",
- "snapset_mismatch",
+ "snapset_missing",
+ "snapset_corrupted",
+ "info_missing",
+ "info_corrupted",
+ "snapset_error",
"head_mismatch",
"headless",
"size_mismatch",
"clone_missing"
]
},
- "minItems": 1,
+ "minItems": 0,
"uniqueItems": true
},
"missing": {
#. A small partition is formatted with XFS and contains basic metadata
for the OSD. This *data directory* includes information about the
OSD (its identifier, which cluster it belongs to, and its private
- keyring.
+ keyring).
#. The rest of the device is normally a large partition occupying the
rest of the device that is managed directly by BlueStore contains
It is possible to use a smaller checksum value by truncating the
checksum to two or one byte, reducing the metadata overhead. The
trade-off is that the probability that a random error will not be
-detected is higher with a smaller checksum, going from about one if
-four billion with a 32-bit (4 byte) checksum to one is 65,536 for a
+detected is higher with a smaller checksum, going from about one in
+four billion with a 32-bit (4 byte) checksum to one in 65,536 for a
16-bit (2 byte) checksum or one in 256 for an 8-bit (1 byte) checksum.
The smaller checksum values can be used by selecting `crc32c_16` or
`crc32c_8` as the checksum algorithm.
and for erasure coded pools (which rely on cloning to implement
efficient two-phase commits).
-For more information, see :doc:`bluestore-config-ref`.
+For more information, see :doc:`bluestore-config-ref` and :doc:`/rados/operations/bluestore-migration`.
FileStore
---------
--- /dev/null
+=====================
+ BlueStore Migration
+=====================
+
+Each OSD can run either BlueStore or FileStore, and a single Ceph
+cluster can contain a mix of both. Users who have previously deployed
+FileStore are likely to want to transition to BlueStore in order to
+take advantage of the improved performance and robustness. There are
+several strategies for making such a transition.
+
+An individual OSD cannot be converted in place in isolation, however:
+BlueStore and FileStore are simply too different for that to be
+practical. "Conversion" will rely either on the cluster's normal
+replication and healing support or tools and strategies that copy OSD
+content from an old (FileStore) device to a new (BlueStore) one.
+
+
+Deploy new OSDs with BlueStore
+==============================
+
+Any new OSDs (e.g., when the cluster is expanded) can be deployed
+using BlueStore. This is the default behavior so no specific change
+is needed.
+
+Similarly, any OSDs that are reprovisioned after replacing a failed drive
+can use BlueStore.
+
+Convert existing OSDs
+=====================
+
+Mark out and replace
+--------------------
+
+The simplest approach is to mark out each device in turn, wait for the
+data to rereplicate across the cluster, reprovision the OSD, and mark
+it back in again. It is simple and easy to automate. However, it requires
+more data migration than should be necessary, so it is not optimal.
+
+#. Identify a FileStore OSD to replace::
+
+ ID=<osd-id-number>
+ DEVICE=<disk-device>
+
+ You can tell whether a given OSD is FileStore or BlueStore with::
+
+ ceph osd metadata $ID | grep osd_objectstore
+
+ You can get a current count of filestore vs bluestore with::
+
+ ceph osd count-metadata osd_objectstore
+
+#. Mark the filestore OSD out::
+
+ ceph osd out $ID
+
+#. Wait for the data to migrate off the OSD in question::
+
+ while ! ceph osd safe-to-destroy $ID ; do sleep 60 ; done
+
+#. Stop the OSD::
+
+ systemctl kill ceph-osd@$ID
+
+#. Make note of which device this OSD is using::
+
+ mount | grep /var/lib/ceph/osd/ceph-$ID
+
+#. Unmount the OSD::
+
+ umount /var/lib/ceph/osd/ceph-$ID
+
+#. Destroy the OSD data. Be *EXTREMELY CAREFUL* as this will destroy
+ the contents of the device; be certain the data on the device is
+ not needed (i.e., that the cluster is healthy) before proceeding. ::
+
+ ceph-volume lvm zap $DEVICE
+
+#. Tell the cluster the OSD has been destroyed (and a new OSD can be
+ reprovisioned with the same ID)::
+
+ ceph osd destroy $ID --yes-i-really-mean-it
+
+#. Reprovision a BlueStore OSD in its place with the same OSD ID.
+ This requires you to identify which device to wipe based on what you saw
+ mounted above. BE CAREFUL! ::
+
+ ceph-volume lvm create --bluestore --data $DEVICE --osd-id $ID
+
+#. Repeat.
+
+You can allow the refilling of the replacement OSD to happen
+concurrently with the draining of the next OSD, or follow the same
+procedure for multiple OSDs in parallel, as long as you ensure the
+cluster is fully clean (all data has all replicas) before destroying
+any OSDs. Failure to do so will reduce the redundancy of your data
+and increase the risk of (or potentially even cause) data loss.
+
+Advantages:
+
+* Simple.
+* Can be done on a device-by-device basis.
+* No spare devices or hosts are required.
+
+Disadvantages:
+
+* Data is copied over the network twice: once to some other OSD in the
+ cluster (to maintain the desired number of replicas), and then again
+ back to the reprovisioned BlueStore OSD.
+
+
+Whole host replacement
+----------------------
+
+If you have a spare host in the cluster, or have sufficient free space
+to evacuate an entire host in order to use it as a spare, then the
+conversion can be done on a host-by-host basis with each stored copy of
+the data migrating only once.
+
+First, you need an empty host that has no data. There are two ways to do this: either by starting with a new, empty host that isn't yet part of the cluster, or by offloading data from an existing host that is in the cluster.
+
+Use a new, empty host
+^^^^^^^^^^^^^^^^^^^^^
+
+Ideally the host should have roughly the
+same capacity as other hosts you will be converting (although it
+doesn't strictly matter). ::
+
+ NEWHOST=<empty-host-name>
+
+Add the host to the CRUSH hierarchy, but do not attach it to the root::
+
+ ceph osd crush add-bucket $NEWHOST host
+
+Make sure the ceph packages are installed.
+
+Use an existing host
+^^^^^^^^^^^^^^^^^^^^
+
+If you would like to use an existing host
+that is already part of the cluster, and there is sufficient free
+space on that host so that all of its data can be migrated off,
+then you can instead do::
+
+ OLDHOST=<existing-cluster-host-to-offload>
+ ceph osd crush unlink $OLDHOST default
+
+where "default" is the immediate ancestor in the CRUSH map. (For
+smaller clusters with unmodified configurations this will normally
+be "default", but it might also be a rack name.) You should now
+see the host at the top of the OSD tree output with no parent::
+
+ $ bin/ceph osd tree
+ ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -5 0 host oldhost
+ 10 ssd 1.00000 osd.10 up 1.00000 1.00000
+ 11 ssd 1.00000 osd.11 up 1.00000 1.00000
+ 12 ssd 1.00000 osd.12 up 1.00000 1.00000
+ -1 3.00000 root default
+ -2 3.00000 host foo
+ 0 ssd 1.00000 osd.0 up 1.00000 1.00000
+ 1 ssd 1.00000 osd.1 up 1.00000 1.00000
+ 2 ssd 1.00000 osd.2 up 1.00000 1.00000
+ ...
+
+If everything looks good, jump directly to the "Wait for data
+migration to complete" step below and proceed from there to clean up
+the old OSDs.
+
+Migration process
+^^^^^^^^^^^^^^^^^
+
+If you're using a new host, start at step #1. For an existing host,
+jump to step #5 below.
+
+#. Provision new BlueStore OSDs for all devices::
+
+ ceph-volume lvm create --bluestore --data /dev/$DEVICE
+
+#. Verify OSDs join the cluster with::
+
+ ceph osd tree
+
+ You should see the new host ``$NEWHOST`` with all of the OSDs beneath
+ it, but the host should *not* be nested beneath any other node in
+ hierarchy (like ``root default``). For example, if ``newhost`` is
+ the empty host, you might see something like::
+
+ $ bin/ceph osd tree
+ ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -5 0 host newhost
+ 10 ssd 1.00000 osd.10 up 1.00000 1.00000
+ 11 ssd 1.00000 osd.11 up 1.00000 1.00000
+ 12 ssd 1.00000 osd.12 up 1.00000 1.00000
+ -1 3.00000 root default
+ -2 3.00000 host oldhost1
+ 0 ssd 1.00000 osd.0 up 1.00000 1.00000
+ 1 ssd 1.00000 osd.1 up 1.00000 1.00000
+ 2 ssd 1.00000 osd.2 up 1.00000 1.00000
+ ...
+
+#. Identify the first target host to convert ::
+
+ OLDHOST=<existing-cluster-host-to-convert>
+
+#. Swap the new host into the old host's position in the cluster::
+
+ ceph osd crush swap-bucket $NEWHOST $OLDHOST
+
+ At this point all data on ``$OLDHOST`` will start migrating to OSDs
+ on ``$NEWHOST``. If there is a difference in the total capacity of
+ the old and new hosts you may also see some data migrate to or from
+ other nodes in the cluster, but as long as the hosts are similarly
+ sized this will be a relatively small amount of data.
+
+#. Wait for data migration to complete::
+
+ while ! ceph osd safe-to-destroy $(ceph osd ls-tree $OLDHOST); do sleep 60 ; done
+
+#. Stop all old OSDs on the now-empty ``$OLDHOST``::
+
+ ssh $OLDHOST
+ systemctl kill ceph-osd.target
+ umount /var/lib/ceph/osd/ceph-*
+
+#. Destroy and purge the old OSDs::
+
+ for osd in `ceph osd ls-tree $OLDHOST`; do
+ ceph osd purge $osd --yes-i-really-mean-it
+ done
+
+#. Wipe the old OSD devices. This requires you to identify which
+ devices are to be wiped manually (BE CAREFUL!). For each device,::
+
+ ceph-volume lvm zap $DEVICE
+
+#. Use the now-empty host as the new host, and repeat::
+
+ NEWHOST=$OLDHOST
+
+Advantages:
+
+* Data is copied over the network only once.
+* Converts an entire host's OSDs at once.
+* Can parallelize to converting multiple hosts at a time.
+* No spare devices are required on each host.
+
+Disadvantages:
+
+* A spare host is required.
+* An entire host's worth of OSDs will be migrating data at a time. This
+ is likely to impact overall cluster performance.
+* All migrated data still makes one full hop over the network.
+
+
+Per-OSD device copy
+-------------------
+
+A single logical OSD can be converted by using the ``copy`` function
+of ``ceph-objectstore-tool``. This requires that the host have a free
+device (or devices) to provision a new, empty BlueStore OSD. For
+example, if each host in your cluster has 12 OSDs, then you'd need a
+13th available device so that each OSD can be converted in turn before the
+old device is reclaimed to convert the next OSD.
+
+Caveats:
+
+* This strategy requires that a blank BlueStore OSD be prepared
+ without allocating a new OSD ID, something that the ``ceph-volume``
+ tool doesn't support. More importantly, the setup of *dmcrypt* is
+ closely tied to the OSD identity, which means that this approach
+ does not work with encrypted OSDs.
+
+* The device must be manually partitioned.
+
+* Tooling not implemented!
+
+* Not documented!
+
+Advantages:
+
+* Little or no data migrates over the network during the conversion.
+
+Disadvantages:
+
+* Tooling not fully implemented.
+* Process not documented.
+* Each host must have a spare or empty device.
+* The OSD is offline during the conversion, which means new writes will
+ be written to only a subset of the OSDs. This increases the risk of data
+ loss due to a subsequent failure. (However, if there is a failure before
+ conversion is complete, the original FileStore OSD can be started to provide
+ access to its original data.)
add-or-rm-osds
add-or-rm-mons
+ bluestore-migration
Command Reference <control>
*Creating*
Ceph is still creating the placement group.
+*Activating*
+ The placement group is peered but not yet active.
+
*Active*
Ceph will process requests to the placement group.
*Forced-Recovery*
High recovery priority of that PG is enforced by user.
-*Backfill*
+*Recovery-wait*
+ The placement group is waiting in line to start recovery.
+
+*Recovery-toofull*
+ A recovery operation is waiting because the destination OSD is over its
+ full ratio.
+
+*Recovery-unfound*
+ Recovery stopped due to unfound objects.
+
+*Backfilling*
Ceph is scanning and synchronizing the entire contents of a placement group
instead of inferring what contents need to be synchronized from the logs of
- recent operations. *Backfill* is a special case of recovery.
+ recent operations. Backfill is a special case of recovery.
*Forced-Backfill*
High backfill priority of that PG is enforced by user.
-*Wait-backfill*
+*Backfill-wait*
The placement group is waiting in line to start backfill.
*Backfill-toofull*
A backfill operation is waiting because the destination OSD is over its
full ratio.
+*Backfill-unfound*
+ Backfill stopped due to unfound objects.
+
*Incomplete*
Ceph detects that a placement group is missing information about
writes that may have occurred, or does not have any healthy
The placement group has peered, but cannot serve client IO due to not having
enough copies to reach the pool's configured min_size parameter. Recovery
may occur in this state, so the pg may heal up to min_size eventually.
+
+*Snaptrim*
+ Trimming snaps.
+
+*Snaptrim-wait*
+ Queued to trim snaps.
+
+*Snaptrim-error*
+ Error stopped trimming snaps.
"size_mismatch"
],
"union_shard_errors": [
- "data_digest_mismatch_oi",
- "size_mismatch_oi"
+ "data_digest_mismatch_info",
+ "size_mismatch_info"
],
"selected_object_info": "0:602f83fe:::foo:head(16'1 client.4110.0:1 dirty|data_digest|omap_digest s 968 uv 1 dd e978e67f od ffffffff alloc_hint [0 0 0])",
"shards": [
{
"osd": 2,
"errors": [
- "data_digest_mismatch_oi",
- "size_mismatch_oi"
+ "data_digest_mismatch_info",
+ "size_mismatch_info"
],
"size": 0,
"omap_digest": "0xffffffff",
``oi`` indicate a comparison with ``selected_object_info``. Look at the
``shards`` array to determine which shard has which error(s).
- * ``data_digest_mismatch_oi``: the digest stored in the object-info is not
+ * ``data_digest_mismatch_info``: the digest stored in the object-info is not
``0xffffffff``, which is calculated from the shard read from OSD.2
- * ``size_mismatch_oi``: the size stored in the object-info is different
+ * ``size_mismatch_info``: the size stored in the object-info is different
from the one read from OSD.2. The latter is 0.
You can repair the inconsistent placement group by executing::
as encoded in the corresponding read operation.
+Set Quota for an Individual Bucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To set a quota, the user must have ``buckets`` capability set with ``write``
+permission. ::
+
+ PUT /admin/bucket?quota&uid=<uid>&bucket=<bucket-name>
+
+The content must include a JSON representation of the quota settings
+as mentioned in Set Bucket Quota section above.
+
Standard Error Responses
instances or all radosgw-admin commands can be put into the ``[global]`` or the
``[client]`` section to avoid specifying instance-name.
+``rgw frontends``
+
+:Description: Configures the HTTP frontend(s). The configuration for multiple
+ frontends can be provided in a comma-delimited list. Each frontend
+ configuration may include a list of options separated by spaces,
+ where each option is in the form "key=value" or "key". See
+ `HTTP Frontends`_ for more on supported options.
+
+:Type: String
+:Default: ``civetweb port=7480``
+
``rgw data``
:Description: Sets the location of the data files for Ceph Object Gateway.
.. _Rados cluster handles: ../../rados/api/librados-intro/#step-2-configuring-a-cluster-handle
.. _Barbican: ../barbican
.. _Encryption: ../encryption
+.. _HTTP Frontends: ../frontends
--- /dev/null
+==============
+HTTP Frontends
+==============
+
+.. contents::
+
+The Ceph Object Gateway supports two embedded HTTP frontend libraries
+that can be configured with ``rgw_frontends``.
+
+Beast
+=====
+
+.. versionadded:: Luminous
+
+The ``beast`` frontend uses the Boost.Beast library for HTTP parsing
+and the Boost.Asio library for asynchronous network i/o.
+
+Options
+-------
+
+``port``
+
+:Description: Sets the listening port number.
+
+:Type: Integer
+:Default: ``80``
+
+
+Civetweb
+========
+
+.. versionadded:: Firefly
+
+The ``civetweb`` frontend uses the Civetweb HTTP library, which is a
+fork of Mongoose.
+
+
+Options
+-------
+
+``port``
+
+:Description: Sets the listening port number. For SSL-enabled ports, add an
+ ``s`` suffix like ``443s``. To bind a specific IPv4 or IPv6
+ address, use the form ``address:port``. Multiple endpoints
+ can be separated by ``+`` as in ``127.0.0.1:8000+443s``.
+
+:Type: String
+:Default: ``7480``
+
+
+``num_threads``
+
+:Description: Sets the number of threads spawned by Civetweb to handle
+ incoming HTTP connections. This effectively limits the number
+ of concurrent connections that the frontend can service.
+
+:Type: Integer
+:Default: ``rgw_thread_pool_size``
+
+
+``request_timeout_ms``
+
+:Description: The amount of time in milliseconds that Civetweb will wait
+ for more incoming data before giving up.
+
+:Type: Integer
+:Default: ``30000``
+
+
+``ssl_certificate``
+
+:Description: Path to the SSL certificate file used for SSL-enabled ports.
+
+:Type: String
+:Default: None
+
+
+A complete list of supported options can be found in the `Civetweb User Manual`_.
+
+
+Generic Options
+===============
+
+Some frontend options are generic and supported by all frontends:
+
+``prefix``
+
+:Description: A prefix string that is inserted into the URI of all
+ requests. For example, a swift-only frontend could supply
+ a uri prefix of ``/swift``.
+
+:Type: String
+:Default: None
+
+
+.. _Civetweb User Manual: https://civetweb.github.io/civetweb/UserManual.html
:maxdepth: 1
Manual Install w/Civetweb <../../install/install-ceph-gateway>
+ HTTP Frontends <frontends>
Multisite Configuration <multisite>
Configuring Pools <pools>
Config Reference <config-ref>
import os
project = u'Ceph'
-copyright = u'2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA'
+copyright = u'2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons Attribution Share Alike 3.0 (CC-BY-SA-3.0)'
version = 'dev'
release = 'dev'
--- /dev/null
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
--- /dev/null
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [client.0]
+- [client.1]
+openstack:
+- volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
--- /dev/null
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [client.0]
+- [client.1]
+- [client.2]
+- [client.3]
+openstack:
+- volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
--- /dev/null
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ osd shutdown pgref assert: true
#######################################################################
+calc() { awk "BEGIN{print $*}"; }
+
##
# Return a list of numbers that are increasingly larger and whose
# total is **timeout** seconds. It can be used to have short sleep
local i
local total="0"
i=$first_step
- while test "$(echo $total + $i \<= $timeout | bc -l)" = "1"; do
- echo -n "$i "
- total=$(echo $total + $i | bc -l)
- i=$(echo $i \* 2 | bc -l)
+ while test "$(calc $total + $i \<= $timeout)" = "1"; do
+ echo -n "$(calc $i) "
+ total=$(calc $total + $i)
+ i=$(calc $i \* 2)
done
- if test "$(echo $total \< $timeout | bc -l)" = "1"; then
- echo -n $(echo $timeout - $total | bc -l)
+ if test "$(calc $total \< $timeout)" = "1"; then
+ echo -n "$(calc $timeout - $total) "
fi
$trace && shopt -s -o xtrace
}
function test_get_timeout_delays() {
test "$(get_timeout_delays 1)" = "1 " || return 1
- test "$(get_timeout_delays 5)" = "1 2 2" || return 1
- test "$(get_timeout_delays 6)" = "1 2 3" || return 1
+ test "$(get_timeout_delays 5)" = "1 2 2 " || return 1
+ test "$(get_timeout_delays 6)" = "1 2 3 " || return 1
test "$(get_timeout_delays 7)" = "1 2 4 " || return 1
- test "$(get_timeout_delays 8)" = "1 2 4 1" || return 1
- test "$(get_timeout_delays 1 .1)" = ".1 .2 .4 .3" || return 1
- test "$(get_timeout_delays 1.5 .1)" = ".1 .2 .4 .8 " || return 1
- test "$(get_timeout_delays 5 .1)" = ".1 .2 .4 .8 1.6 1.9" || return 1
- test "$(get_timeout_delays 6 .1)" = ".1 .2 .4 .8 1.6 2.9" || return 1
- test "$(get_timeout_delays 6.3 .1)" = ".1 .2 .4 .8 1.6 3.2 " || return 1
- test "$(get_timeout_delays 20 .1)" = ".1 .2 .4 .8 1.6 3.2 6.4 7.3" || return 1
+ test "$(get_timeout_delays 8)" = "1 2 4 1 " || return 1
+ test "$(get_timeout_delays 1 .1)" = "0.1 0.2 0.4 0.3 " || return 1
+ test "$(get_timeout_delays 1.5 .1)" = "0.1 0.2 0.4 0.8 " || return 1
+ test "$(get_timeout_delays 5 .1)" = "0.1 0.2 0.4 0.8 1.6 1.9 " || return 1
+ test "$(get_timeout_delays 6 .1)" = "0.1 0.2 0.4 0.8 1.6 2.9 " || return 1
+ test "$(get_timeout_delays 6.3 .1)" = "0.1 0.2 0.4 0.8 1.6 3.2 " || return 1
+ test "$(get_timeout_delays 20 .1)" = "0.1 0.2 0.4 0.8 1.6 3.2 6.4 7.3 " || return 1
}
#######################################################################
#
function run_in_background() {
local pid_variable=$1
- shift;
+ shift
# Execute the command and prepend the output with its pid
# We enforce to return the exit status of the command and not the awk one.
- ("$@" |& awk '{ a[i++] = $0 }END{for (i = 0; i in a; ++i) { print "'$$': " a[i]} }'; return ${PIPESTATUS[0]}) >&2 &
+ ("$@" |& sed 's/^/'$$': /'; return "${PIPESTATUS[0]}") >&2 &
eval "$pid_variable+=\" $!\""
}
+function save_stdout {
+ local out="$1"
+ shift
+ "$@" > "$out"
+}
+
function test_run_in_background() {
local pids
run_in_background pids sleep 1
shift
export CEPH_MON="127.0.0.1:7105" # git grep '\<7105\>' : there must be only one
- export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
+ export CEPH_ARGS
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
ceph osd pool delete 黄 黄 --yes-i-really-really-mean-it
}
+function TEST_pool_create_rep_expected_num_objects() {
+ local dir=$1
+ setup $dir || return 1
+
+ # disable pg dir merge
+ CEPH_ARGS+="--filestore-merge-threshold=-10 "
+ export CEPH_ARGS
+ run_mon $dir a || return 1
+ run_osd $dir 0 || return 1
+
+ ceph osd pool create rep_expected_num_objects 64 64 replicated replicated_rule 100000 || return 1
+ # wait for pg dir creating
+ sleep 5
+ ret=$(find ${dir}/0/current/1.0_head/ | grep DIR | wc -l)
+ if [ "$ret" -le 2 ];
+ then
+ return 1
+ else
+ echo "TEST_pool_create_rep_expected_num_objects PASS"
+ fi
+}
+
main osd-pool-create "$@"
# Local Variables:
--- /dev/null
+#!/usr/bin/env bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2018 Red Hat <contact@redhat.com>
+#
+# Author: Josh Durgin <jdurgin@redhat.com>
+# Author: David Zafman <dzafman@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+ local dir=$1
+ shift
+
+ export CEPH_MON="127.0.0.1:7100" # git grep '\<7100\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ setup $dir || return 1
+ $func $dir || return 1
+ teardown $dir || return 1
+ done
+}
+
+PGID=
+
+function test_log_size()
+{
+ local PGID=$1
+ local EXPECTED=$2
+ ceph tell osd.\* flush_pg_stats
+ sleep 3
+ ceph pg $PGID query | jq .info.stats.log_size
+ ceph pg $PGID query | jq .info.stats.log_size | grep "${EXPECTED}"
+}
+
+function setup_log_test() {
+ local dir=$1
+ local which=$2
+
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ run_osd $dir 2 || return 1
+
+ ceph osd pool create test 1 1 || true
+ POOL_ID=$(ceph osd dump --format json | jq '.pools[] | select(.pool_name == "test") | .pool')
+ PGID="${POOL_ID}.0"
+
+ ceph tell osd.\* injectargs -- --osd-min-pg-log-entries 20 || return 1
+ ceph tell osd.\* injectargs -- --osd-max-pg-log-entries 30 || return 1
+ ceph tell osd.\* injectargs -- --osd-pg-log-trim-min 10 || return 1
+ ceph tell osd.\* injectargs -- --osd-pg-log-dups-tracked 10 || return 1
+
+ touch foo
+ for i in $(seq 1 20)
+ do
+ rados -p test put foo foo || return 1
+ done
+
+ test_log_size $PGID 20 || return 1
+
+ rados -p test rm foo || return 1
+
+ # generate error entries
+ for i in $(seq 1 20)
+ do
+ rados -p test rm foo
+ done
+
+ # log should have been trimmed down to min_entries with one extra
+ test_log_size $PGID 21 || return 1
+}
+
+function TEST_repro_long_log1()
+{
+ local dir=$1
+
+ setup_log_test $dir || return 1
+ # regular write should trim the log
+ rados -p test put foo foo || return 1
+ test_log_size $PGID 22 || return 1
+}
+
+function TEST_repro_long_log2()
+{
+ local dir=$1
+
+ setup_log_test $dir || return 1
+ local PRIMARY=$(ceph pg $PGID query | jq '.info.stats.up_primary')
+ kill_daemons $dir TERM osd.$PRIMARY || return 1
+ CEPH_ARGS="--osd-max-pg-log-entries=2 --no-mon-config" ceph-objectstore-tool --data-path $dir/$PRIMARY --pgid $PGID --op trim-pg-log || return 1
+ run_osd $dir $PRIMARY || return 1
+ wait_for_clean || return 1
+ test_log_size $PGID 2 || return 1
+}
+
+function TEST_trim_max_entries()
+{
+ local dir=$1
+
+ setup_log_test $dir || return 1
+
+ ceph tell osd.\* injectargs -- --osd-min-pg-log-entries 1
+ ceph tell osd.\* injectargs -- --osd-pg-log-trim-min 2
+ ceph tell osd.\* injectargs -- --osd-pg-log-trim-max 4
+
+ # adding log entries, should only trim 4 and add one each time
+ rados -p test rm foo
+ test_log_size $PGID 17
+ rados -p test rm foo
+ test_log_size $PGID 14
+ rados -p test rm foo
+ test_log_size $PGID 11
+ rados -p test rm foo
+ test_log_size $PGID 8
+ rados -p test rm foo
+ test_log_size $PGID 5
+ rados -p test rm foo
+ test_log_size $PGID 2
+
+ # below trim_min
+ rados -p test rm foo
+ test_log_size $PGID 3
+ rados -p test rm foo
+ test_log_size $PGID 4
+
+ rados -p test rm foo
+ test_log_size $PGID 2
+}
+
+main repro-long-log "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && ../qa/run-standalone.sh repro_long_log.sh"
+# End:
pids=""
for pg in $(seq 0 $(expr $PGS - 1))
do
- run_in_background pids pg_scrub $poolid.$(echo "{ obase=16; $pg }" | bc | tr '[:upper:]' '[:lower:]')
+ run_in_background pids pg_scrub $poolid.$(printf "%x" $pg)
done
ceph pg dump pgs
wait_background pids
# Set to "yes" in order to ignore diff errors and save results to update test
getjson="no"
-# Ignore the epoch and filter out the attr '_' value because it has date information and won't match
-if [ "$(jq --version 2>&1 | awk '{ print $3}')" = "1.3" ]; then # Not sure all versions that apply here
- jqfilter='.inconsistents | (.[].shards[].attrs[] | select(.name == "_") | .value) |= "----Stripped-by-test----"'
-else
- jqfilter='.inconsistents | (.[].shards[].attrs[]? | select(.name == "_") | .value) |= "----Stripped-by-test----"'
-fi
-sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
+# Filter out mtime and local_mtime dates, version, prior_version and last_reqid (client) from any object_info.
+jqfilter='def walk(f):
+ . as $in
+ | if type == "object" then
+ reduce keys[] as $key
+ ( {}; . + { ($key): ($in[$key] | walk(f)) } ) | f
+ elif type == "array" then map( walk(f) ) | f
+ else f
+ end;
+walk(if type == "object" then del(.mtime) else . end)
+| walk(if type == "object" then del(.local_mtime) else . end)
+| walk(if type == "object" then del(.last_reqid) else . end)
+| walk(if type == "object" then del(.version) else . end)
+| walk(if type == "object" then del(.prior_version) else . end)
+| walk(if type == "object" then del(.redirect_target) else . end)
+| walk(if type == "object" then del(.legacy_snaps) else . end)'
-# Remove items are not consistent across runs, the pg interval and client
-sedfilter='s/\([ ]*\"\(selected_\)*object_info\":.*head[(]\)[^[:space:]]* [^[:space:]]* \(.*\)/\1\3/'
+sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
function run() {
local dir=$1
# Get epoch for repair-get requests
epoch=$(jq .epoch $dir/json)
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+ jq "$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
{
"inconsistents": [
{
"primary": false
},
{
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 1454963827,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'58",
+ "prior_version": "21'3",
+ "last_reqid": "osd.1.0:57",
+ "user_version": 3,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xf5fba2c6",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 9,
"errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 1454963827,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'58",
+ "prior_version": "21'3",
+ "last_reqid": "osd.1.0:57",
+ "user_version": 3,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:19.804040",
+ "local_mtime": "2018-04-05 14:33:19.804839",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xf5fba2c6",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"errors": [
"size_mismatch"
"primary": true
}
],
- "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ12",
+ "key": "",
+ "snapid": -2,
+ "hash": 3920199997,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'56",
+ "prior_version": "43'36",
+ "last_reqid": "osd.1.0:55",
+ "user_version": 36,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x067f306a",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"stat_error"
],
"primary": true
}
],
- "selected_object_info": "3:d60617f9:::ROBJ13:head(47'55 osd.0.0:54 dirty|omap|data_digest|omap_digest s 7 uv 39 dd 2ddbf8f5 od 6441854d alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ13",
+ "key": "",
+ "snapid": -2,
+ "hash": 2682806379,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'59",
+ "prior_version": "45'39",
+ "last_reqid": "osd.1.0:58",
+ "user_version": 39,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x6441854d",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"stat_error"
],
{
"shards": [
{
+ "object_info": "bad-val",
"size": 7,
- "attrs": [
- {
- "Base64": false,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
"errors": [
- "oi_attr_corrupted"
+ "info_corrupted"
],
"osd": 0,
"primary": false
},
{
"size": 7,
- "attrs": [
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
"errors": [
- "oi_attr_missing"
+ "info_missing"
],
"osd": 1,
"primary": true
}
],
"union_shard_errors": [
- "oi_attr_missing",
- "oi_attr_corrupted"
+ "info_missing",
+ "info_corrupted"
],
"errors": [],
"object": {
{
"shards": [
{
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ15",
+ "key": "",
+ "snapid": -2,
+ "hash": 504996876,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
},
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
+ "version": "51'49",
+ "prior_version": "49'45",
+ "last_reqid": "osd.1.0:48",
+ "user_version": 45,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:29.498969",
+ "local_mtime": "2018-04-05 14:33:29.499890",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2d2a4d6e",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 7,
"errors": [],
"osd": 0,
"primary": false
},
{
- "attrs": [
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
"size": 7,
"errors": [
- "oi_attr_missing"
+ "info_missing"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ15",
+ "key": "",
+ "snapid": -2,
+ "hash": 504996876,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'49",
+ "prior_version": "49'45",
+ "last_reqid": "osd.1.0:48",
+ "user_version": 45,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2d2a4d6e",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "oi_attr_missing"
+ "info_missing"
],
"errors": [],
"object": {
},
"shards": [
{
- "attrs": [
- {
- "Base64": true,
- "name": "_",
- "value": ""
- }
- ],
"errors": [
- "ss_attr_missing"
+ "snapset_missing"
],
"osd": 0,
"primary": false,
"size": 7
},
{
- "attrs": [
- {
- "Base64": true,
- "name": "_",
- "value": ""
- },
- {
- "Base64": false,
- "name": "snapset",
- "value": "bad-val"
- }
- ],
"errors": [
- "ss_attr_corrupted"
+ "snapset_corrupted"
],
"osd": 1,
"primary": true,
+ "snapset": "bad-val",
"size": 7
}
],
"union_shard_errors": [
- "ss_attr_missing",
- "ss_attr_corrupted"
+ "snapset_missing",
+ "snapset_corrupted"
]
},
{
"primary": true
}
],
- "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ3",
+ "key": "",
+ "snapid": -2,
+ "hash": 625845583,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'61",
+ "prior_version": "25'9",
+ "last_reqid": "osd.1.0:60",
+ "user_version": 9,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x00b35dfd",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"missing"
],
"shards": [
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "bad-val",
- "name": "_key1-ROBJ8"
+ "name": "key1-ROBJ8"
},
{
"Base64": false,
"value": "val2-ROBJ8",
- "name": "_key2-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-ROBJ8"
}
],
"size": 7,
},
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-ROBJ8",
- "name": "_key1-ROBJ8"
+ "name": "key1-ROBJ8"
},
{
"Base64": false,
"value": "val3-ROBJ8",
- "name": "_key3-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key3-ROBJ8"
}
],
"size": 7,
"primary": true
}
],
- "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 dirty|omap|data_digest|omap_digest s 7 uv 66 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ8",
+ "key": "",
+ "snapid": -2,
+ "hash": 2359695969,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "79'66",
+ "prior_version": "79'65",
+ "last_reqid": "client.4554.0:1",
+ "user_version": 66,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xd6be81dc",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [],
"errors": [
"attr_value_mismatch",
{
"shards": [
{
- "object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 67 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])",
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ9",
+ "key": "",
+ "snapid": -2,
+ "hash": 537189375,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "95'67",
+ "prior_version": "51'64",
+ "last_reqid": "client.4649.0:1",
+ "user_version": 67,
+ "size": 1,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2b63260d",
+ "omap_digest": "0x2eecc539",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 1,
"errors": [],
"osd": 0,
"primary": false
},
{
- "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])",
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ9",
+ "key": "",
+ "snapid": -2,
+ "hash": 537189375,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'64",
+ "prior_version": "37'27",
+ "last_reqid": "osd.1.0:63",
+ "user_version": 27,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:25.352485",
+ "local_mtime": "2018-04-05 14:33:25.353746",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2eecc539",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 1,
"errors": [
- "obj_size_oi_mismatch"
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 67 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ9",
+ "key": "",
+ "snapid": -2,
+ "hash": 537189375,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "95'67",
+ "prior_version": "51'64",
+ "last_reqid": "client.4649.0:1",
+ "user_version": 67,
+ "size": 1,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2b63260d",
+ "omap_digest": "0x2eecc539",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "obj_size_oi_mismatch"
+ "obj_size_info_mismatch"
],
"errors": [
"object_info_inconsistency"
}
EOF
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save1.json
fi
- if which jsonschema > /dev/null;
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
then
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
fi
# Get epoch for repair-get requests
epoch=$(jq .epoch $dir/json)
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+ jq "$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
{
"inconsistents": [
{
"primary": false
},
{
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 1454963827,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'58",
+ "prior_version": "21'3",
+ "last_reqid": "osd.1.0:57",
+ "user_version": 3,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:19.804040",
+ "local_mtime": "2018-04-05 14:33:19.804839",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xf5fba2c6",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"data_digest": "0x2d4a11c2",
"omap_digest": "0xf5fba2c6",
"size": 9,
"errors": [
- "data_digest_mismatch_oi",
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "data_digest_mismatch_info",
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 1454963827,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'58",
+ "prior_version": "21'3",
+ "last_reqid": "osd.1.0:57",
+ "user_version": 3,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:19.804040",
+ "local_mtime": "2018-04-05 14:33:19.804839",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xf5fba2c6",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "data_digest_mismatch_oi",
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "data_digest_mismatch_info",
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"errors": [
"data_digest_mismatch",
"omap_digest": "0xa8dd5adc",
"size": 7,
"errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"osd": 0,
"primary": false
"omap_digest": "0xa8dd5adc",
"size": 7,
"errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:b1f19cbd:::ROBJ10:head(47'51 osd.0.0:50 dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "alloc_hint_flags": 0,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "last_reqid": "osd.0.0:50",
+ "local_mtime": "2018-04-05 14:33:26.762368",
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "mtime": "2018-04-05 14:33:26.762368",
+ "oid": {
+ "hash": 3174666125,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ10",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xc2025a24",
+ "prior_version": "41'33",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 30,
+ "version": "47'51",
+ "watchers": {}
+ },
"union_shard_errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"errors": [],
"object": {
"primary": true
}
],
- "selected_object_info": "3:87abbf36:::ROBJ11:head(47'48 osd.0.0:47 dirty|omap|data_digest|omap_digest s 7 uv 33 dd 2ddbf8f5 od a03cef03 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ11",
+ "key": "",
+ "snapid": -2,
+ "hash": 1828574689,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'52",
+ "prior_version": "41'33",
+ "last_reqid": "osd.1.0:51",
+ "user_version": 33,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:26.761286",
+ "local_mtime": "2018-04-05 14:33:26.762368",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xa03cef03",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"read_error"
],
"primary": true
}
],
- "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ12",
+ "key": "",
+ "snapid": -2,
+ "hash": 3920199997,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'56",
+ "prior_version": "43'36",
+ "last_reqid": "osd.1.0:55",
+ "user_version": 36,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:27.460958",
+ "local_mtime": "2018-04-05 14:33:27.462109",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x067f306a",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"stat_error"
],
{
"shards": [
{
- "attrs": [
- {
- "Base64": false,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
+ "object_info": "bad-val",
"data_digest": "0x2ddbf8f5",
"omap_digest": "0x4f14f849",
"size": 7,
"errors": [
- "oi_attr_corrupted"
+ "info_corrupted"
],
"osd": 0,
"primary": false
},
{
- "attrs": [
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
"data_digest": "0x2ddbf8f5",
"omap_digest": "0x4f14f849",
"size": 7,
"errors": [
- "oi_attr_missing"
+ "info_missing"
],
"osd": 1,
"primary": true
}
],
"union_shard_errors": [
- "oi_attr_missing",
- "oi_attr_corrupted"
+ "info_missing",
+ "info_corrupted"
],
"errors": [],
"object": {
{
"shards": [
{
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ15",
+ "key": "",
+ "snapid": -2,
+ "hash": 504996876,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
},
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
+ "version": "51'49",
+ "prior_version": "49'45",
+ "last_reqid": "osd.1.0:48",
+ "user_version": 45,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:29.498969",
+ "local_mtime": "2018-04-05 14:33:29.499890",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2d2a4d6e",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"data_digest": "0x2ddbf8f5",
"omap_digest": "0x2d2a4d6e",
"size": 7,
"primary": false
},
{
- "attrs": [
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
"data_digest": "0x2ddbf8f5",
"omap_digest": "0x2d2a4d6e",
"size": 7,
"errors": [
- "oi_attr_missing"
+ "info_missing"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ15",
+ "key": "",
+ "snapid": -2,
+ "hash": 504996876,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'49",
+ "prior_version": "49'45",
+ "last_reqid": "osd.1.0:48",
+ "user_version": 45,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:29.498969",
+ "local_mtime": "2018-04-05 14:33:29.499890",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2d2a4d6e",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "oi_attr_missing"
+ "info_missing"
],
"errors": [],
"object": {
},
"shards": [
{
- "attrs": [
- {
- "Base64": true,
- "name": "_",
- "value": ""
- }
- ],
"data_digest": "0x2ddbf8f5",
"errors": [
- "ss_attr_missing"
+ "snapset_missing"
],
"omap_digest": "0x8b699207",
"osd": 0,
"size": 7
},
{
- "attrs": [
- {
- "Base64": true,
- "name": "_",
- "value": ""
- },
- {
- "Base64": false,
- "name": "snapset",
- "value": "bad-val"
- }
- ],
+ "snapset": "bad-val",
"data_digest": "0x2ddbf8f5",
"errors": [
- "ss_attr_corrupted"
+ "snapset_corrupted"
],
"omap_digest": "0x8b699207",
"osd": 1,
}
],
"union_shard_errors": [
- "ss_attr_missing",
- "ss_attr_corrupted"
+ "snapset_missing",
+ "snapset_corrupted"
]
},
{
"omap_digest": "0xf8e11918",
"size": 7,
"errors": [
- "data_digest_mismatch_oi"
+ "data_digest_mismatch_info"
],
"osd": 0,
"primary": false
"primary": true
}
],
- "selected_object_info": "3:e97ce31e:::ROBJ2:head(47'56 osd.0.0:55 dirty|omap|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od f8e11918 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ2",
+ "key": "",
+ "snapid": -2,
+ "hash": 2026323607,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'60",
+ "prior_version": "23'6",
+ "last_reqid": "osd.1.0:59",
+ "user_version": 6,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:20.498756",
+ "local_mtime": "2018-04-05 14:33:20.499704",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xf8e11918",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "data_digest_mismatch_oi"
+ "data_digest_mismatch_info"
],
"errors": [
"data_digest_mismatch"
"primary": true
}
],
- "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ3",
+ "key": "",
+ "snapid": -2,
+ "hash": 625845583,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'61",
+ "prior_version": "25'9",
+ "last_reqid": "osd.1.0:60",
+ "user_version": 9,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:21.189382",
+ "local_mtime": "2018-04-05 14:33:21.190446",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x00b35dfd",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"missing"
],
"omap_digest": "0xd7178dfe",
"size": 7,
"errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"osd": 0,
"primary": false
"primary": true
}
],
- "selected_object_info": "3:f4981d31:::ROBJ4:head(47'58 osd.0.0:57 dirty|omap|data_digest|omap_digest s 7 uv 12 dd 2ddbf8f5 od e2d46ea4 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ4",
+ "key": "",
+ "snapid": -2,
+ "hash": 2360875311,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'62",
+ "prior_version": "27'12",
+ "last_reqid": "osd.1.0:61",
+ "user_version": 12,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:21.862313",
+ "local_mtime": "2018-04-05 14:33:21.863261",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xe2d46ea4",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"errors": [
"omap_digest_mismatch"
"omap_digest": "0x06cac8f6",
"size": 7,
"errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:f4bfd4d1:::ROBJ5:head(47'59 osd.0.0:58 dirty|omap|data_digest|omap_digest s 7 uv 15 dd 2ddbf8f5 od 1a862a41 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ5",
+ "key": "",
+ "snapid": -2,
+ "hash": 2334915887,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'63",
+ "prior_version": "29'15",
+ "last_reqid": "osd.1.0:62",
+ "user_version": 15,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:22.589300",
+ "local_mtime": "2018-04-05 14:33:22.590376",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x1a862a41",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"errors": [
"omap_digest_mismatch"
"omap_digest": "0x689ee887",
"size": 7,
"errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"osd": 0,
"primary": false
"primary": true
}
],
- "selected_object_info": "3:a53c12e8:::ROBJ6:head(47'50 osd.0.0:49 dirty|omap|data_digest|omap_digest s 7 uv 18 dd 2ddbf8f5 od 179c919f alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ6",
+ "key": "",
+ "snapid": -2,
+ "hash": 390610085,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'54",
+ "prior_version": "31'18",
+ "last_reqid": "osd.1.0:53",
+ "user_version": 18,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:23.289188",
+ "local_mtime": "2018-04-05 14:33:23.290130",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x179c919f",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"errors": [
"omap_digest_mismatch"
"omap_digest": "0x6a73cc07",
"size": 7,
"errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"osd": 1,
"primary": true
}
],
- "selected_object_info": "3:8b55fa4b:::ROBJ7:head(47'49 osd.0.0:48 dirty|omap|data_digest|omap_digest s 7 uv 21 dd 2ddbf8f5 od efced57a alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ7",
+ "key": "",
+ "snapid": -2,
+ "hash": 3529485009,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'53",
+ "prior_version": "33'21",
+ "last_reqid": "osd.1.0:52",
+ "user_version": 21,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:23.979658",
+ "local_mtime": "2018-04-05 14:33:23.980731",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xefced57a",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "omap_digest_mismatch_oi"
+ "omap_digest_mismatch_info"
],
"errors": [
"omap_digest_mismatch"
"shards": [
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "bad-val",
- "name": "_key1-ROBJ8"
+ "name": "key1-ROBJ8"
},
{
"Base64": false,
"value": "val2-ROBJ8",
- "name": "_key2-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-ROBJ8"
}
],
"data_digest": "0x2ddbf8f5",
},
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-ROBJ8",
- "name": "_key1-ROBJ8"
+ "name": "key1-ROBJ8"
},
{
"Base64": false,
"value": "val3-ROBJ8",
- "name": "_key3-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key3-ROBJ8"
}
],
"data_digest": "0x2ddbf8f5",
"primary": true
}
],
- "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 dirty|omap|data_digest|omap_digest s 7 uv 66 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ8",
+ "key": "",
+ "snapid": -2,
+ "hash": 2359695969,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "79'66",
+ "prior_version": "79'65",
+ "last_reqid": "client.4554.0:1",
+ "user_version": 66,
+ "size": 7,
+ "mtime": "2018-04-05 14:34:05.598688",
+ "local_mtime": "2018-04-05 14:34:05.599698",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xd6be81dc",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [],
"errors": [
"attr_value_mismatch",
{
"shards": [
{
- "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])",
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ9",
+ "key": "",
+ "snapid": -2,
+ "hash": 537189375,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "51'64",
+ "prior_version": "37'27",
+ "last_reqid": "osd.1.0:63",
+ "user_version": 27,
+ "size": 7,
+ "mtime": "2018-04-05 14:33:25.352485",
+ "local_mtime": "2018-04-05 14:33:25.353746",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2eecc539",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"data_digest": "0x1f26fb26",
"omap_digest": "0x2eecc539",
"size": 3,
"errors": [
- "obj_size_oi_mismatch"
+ "obj_size_info_mismatch"
],
"osd": 0,
"primary": false
},
{
- "object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 68 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])",
+ "object_info": {
+ "oid": {
+ "oid": "ROBJ9",
+ "key": "",
+ "snapid": -2,
+ "hash": 537189375,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "119'68",
+ "prior_version": "51'64",
+ "last_reqid": "client.4834.0:1",
+ "user_version": 68,
+ "size": 3,
+ "mtime": "2018-04-05 14:35:01.500659",
+ "local_mtime": "2018-04-05 14:35:01.502117",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x1f26fb26",
+ "omap_digest": "0x2eecc539",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"data_digest": "0x1f26fb26",
"omap_digest": "0x2eecc539",
"size": 3,
"primary": true
}
],
- "selected_object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 68 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ9",
+ "key": "",
+ "snapid": -2,
+ "hash": 537189375,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "119'68",
+ "prior_version": "51'64",
+ "last_reqid": "client.4834.0:1",
+ "user_version": 68,
+ "size": 3,
+ "mtime": "2018-04-05 14:35:01.500659",
+ "local_mtime": "2018-04-05 14:35:01.502117",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x1f26fb26",
+ "omap_digest": "0x2eecc539",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "obj_size_oi_mismatch"
+ "obj_size_info_mismatch"
],
"errors": [
"object_info_inconsistency"
}
EOF
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save2.json
fi
- if which jsonschema > /dev/null;
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
then
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
fi
local dir=$1
local allow_overwrites=$2
local poolname=ecpool
- local total_objs=5
+ local total_objs=7
setup $dir || return 1
run_mon $dir a || return 1
objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
;;
+ 6)
+ objectstore_tool $dir 0 $objname rm-attr hinfo_key || return 1
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 1 $objname set-attr hinfo_key $dir/bad-val || return 1
+ ;;
+
+ 7)
+ local payload=MAKETHISDIFFERENTFROMOTHEROBJECTS
+ echo $payload > $dir/DIFFERENT
+ rados --pool $poolname put $objname $dir/DIFFERENT || return 1
+
+ # Get hinfo_key from EOBJ1
+ objectstore_tool $dir 0 EOBJ1 get-attr hinfo_key > $dir/hinfo
+ objectstore_tool $dir 0 $objname set-attr hinfo_key $dir/hinfo || return 1
+ rm -f $dir/hinfo
+ ;;
+
esac
done
# Get epoch for repair-get requests
epoch=$(jq .epoch $dir/json)
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+ jq "$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
{
"inconsistents": [
{
"primary": false
},
{
+ "object_info": {
+ "oid": {
+ "oid": "EOBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 560836233,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "27'1",
+ "prior_version": "0'0",
+ "last_reqid": "client.4184.0:1",
+ "user_version": 1,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 9,
"shard": 0,
"errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
"primary": false
}
],
- "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 560836233,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "27'1",
+ "prior_version": "0'0",
+ "last_reqid": "client.4184.0:1",
+ "user_version": 1,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"errors": [
"size_mismatch"
"primary": false
}
],
- "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ3",
+ "key": "",
+ "snapid": -2,
+ "hash": 3125668237,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "39'3",
+ "prior_version": "0'0",
+ "last_reqid": "client.4252.0:1",
+ "user_version": 3,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"missing"
],
"shards": [
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "bad-val",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-EOBJ4"
}
],
"size": 2048,
"errors": [],
"size": 2048,
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-EOBJ4"
}
]
},
"errors": [],
"size": 2048,
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val3-EOBJ4",
- "name": "_key3-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key3-EOBJ4"
}
]
}
],
- "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ4",
+ "key": "",
+ "snapid": -2,
+ "hash": 1618759290,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "45'6",
+ "prior_version": "45'5",
+ "last_reqid": "client.4294.0:1",
+ "user_version": 6,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [],
"errors": [
"attr_value_mismatch",
"primary": false
},
{
+ "object_info": {
+ "oid": {
+ "oid": "EOBJ5",
+ "key": "",
+ "snapid": -2,
+ "hash": 2918945441,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "59'7",
+ "prior_version": "0'0",
+ "last_reqid": "client.4382.0:1",
+ "user_version": 7,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 4096,
"shard": 0,
"errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
"primary": false
}
],
- "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ5",
+ "key": "",
+ "snapid": -2,
+ "hash": 2918945441,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "59'7",
+ "prior_version": "0'0",
+ "last_reqid": "client.4382.0:1",
+ "user_version": 7,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"errors": [
"size_mismatch"
"nspace": "",
"name": "EOBJ5"
}
- }
- ],
- "epoch": 0
-}
-EOF
-
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
- if test $getjson = "yes"
- then
- jq '.' $dir/json > save3.json
- fi
-
- if which jsonschema > /dev/null;
- then
- jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
- fi
-
- pg_deep_scrub $pg
-
+ },
+ {
+ "errors": [],
+ "object": {
+ "locator": "",
+ "name": "EOBJ6",
+ "nspace": "",
+ "snap": "head",
+ "version": 8
+ },
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ6",
+ "key": "",
+ "snapid": -2,
+ "hash": 3050890866,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "65'8",
+ "prior_version": "0'0",
+ "last_reqid": "client.4418.0:1",
+ "user_version": 8,
+ "size": 7,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "errors": [
+ "hinfo_missing"
+ ],
+ "osd": 0,
+ "primary": false,
+ "shard": 2,
+ "size": 2048
+ },
+ {
+ "errors": [
+ "hinfo_corrupted"
+ ],
+ "osd": 1,
+ "primary": true,
+ "shard": 0,
+ "hashinfo": "bad-val",
+ "size": 2048
+ },
+ {
+ "errors": [],
+ "osd": 2,
+ "primary": false,
+ "shard": 1,
+ "size": 2048,
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 80717615,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 80717615,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ }
+ ],
+ "union_shard_errors": [
+ "hinfo_missing",
+ "hinfo_corrupted"
+ ]
+ },
+ {
+ "errors": [
+ "hinfo_inconsistency"
+ ],
+ "object": {
+ "locator": "",
+ "name": "EOBJ7",
+ "nspace": "",
+ "snap": "head",
+ "version": 10
+ },
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ7",
+ "key": "",
+ "snapid": -2,
+ "hash": 3258066308,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "75'10",
+ "prior_version": "75'9",
+ "last_reqid": "client.4482.0:1",
+ "user_version": 10,
+ "size": 34,
+ "mtime": "",
+ "local_mtime": "",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x136e4e27",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 80717615,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 80717615,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ },
+ "errors": [],
+ "osd": 0,
+ "primary": false,
+ "shard": 2,
+ "size": 2048
+ },
+ {
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 1534350760,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 1534350760,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ },
+ "errors": [],
+ "osd": 1,
+ "primary": true,
+ "shard": 0,
+ "size": 2048
+ },
+ {
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 1534350760,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 1534350760,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ },
+ "errors": [],
+ "osd": 2,
+ "primary": false,
+ "shard": 1,
+ "size": 2048
+ }
+ ],
+ "union_shard_errors": []
+ }
+ ],
+ "epoch": 0
+}
+EOF
+
+ jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ jq '.' $dir/json > save3.json
+ fi
+
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
+ fi
+
+ pg_deep_scrub $pg
+
rados list-inconsistent-pg $poolname > $dir/json || return 1
# Check pg count
test $(jq '. | length' $dir/json) = "1" || return 1
if [ "$allow_overwrites" = "true" ]
then
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+ jq "$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
{
"inconsistents": [
{
"primary": false
},
{
+ "object_info": {
+ "oid": {
+ "oid": "EOBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 560836233,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "27'1",
+ "prior_version": "0'0",
+ "last_reqid": "client.4184.0:1",
+ "user_version": 1,
+ "size": 7,
+ "mtime": "2018-04-05 14:31:33.837147",
+ "local_mtime": "2018-04-05 14:31:33.840763",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 9,
"shard": 0,
"errors": [
"read_error",
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
"primary": false
}
],
- "selected_object_info": "3:9175b684:::EOBJ1:head(27'1 client.4155.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 560836233,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "27'1",
+ "prior_version": "0'0",
+ "last_reqid": "client.4184.0:1",
+ "user_version": 1,
+ "size": 7,
+ "mtime": "2018-04-05 14:31:33.837147",
+ "local_mtime": "2018-04-05 14:31:33.840763",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"read_error",
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"errors": [
"size_mismatch"
"primary": false
}
],
- "selected_object_info": "3:b197b25d:::EOBJ3:head(41'3 client.4199.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ3",
+ "key": "",
+ "snapid": -2,
+ "hash": 3125668237,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "39'3",
+ "prior_version": "0'0",
+ "last_reqid": "client.4252.0:1",
+ "user_version": 3,
+ "size": 7,
+ "mtime": "2018-04-05 14:31:46.841145",
+ "local_mtime": "2018-04-05 14:31:46.844996",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"missing"
],
"shards": [
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "bad-val",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-EOBJ4"
}
],
"data_digest": "0x00000000",
},
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-EOBJ4"
}
],
"data_digest": "0x00000000",
},
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val3-EOBJ4",
- "name": "_key3-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key3-EOBJ4"
}
],
"data_digest": "0x00000000",
"primary": false
}
],
- "selected_object_info": "3:5e723e06:::EOBJ4:head(48'6 client.4223.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ4",
+ "key": "",
+ "snapid": -2,
+ "hash": 1618759290,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "45'6",
+ "prior_version": "45'5",
+ "last_reqid": "client.4294.0:1",
+ "user_version": 6,
+ "size": 7,
+ "mtime": "2018-04-05 14:31:54.663622",
+ "local_mtime": "2018-04-05 14:31:54.664527",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [],
"errors": [
"attr_value_mismatch",
{
"data_digest": "0x00000000",
"omap_digest": "0xffffffff",
+ "object_info": {
+ "oid": {
+ "oid": "EOBJ5",
+ "key": "",
+ "snapid": -2,
+ "hash": 2918945441,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "59'7",
+ "prior_version": "0'0",
+ "last_reqid": "client.4382.0:1",
+ "user_version": 7,
+ "size": 7,
+ "mtime": "2018-04-05 14:32:12.929161",
+ "local_mtime": "2018-04-05 14:32:12.934707",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 4096,
"errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"shard": 0,
"osd": 1,
"primary": false
}
],
- "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4288.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ5",
+ "key": "",
+ "snapid": -2,
+ "hash": 2918945441,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "59'7",
+ "prior_version": "0'0",
+ "last_reqid": "client.4382.0:1",
+ "user_version": 7,
+ "size": 7,
+ "mtime": "2018-04-05 14:32:12.929161",
+ "local_mtime": "2018-04-05 14:32:12.934707",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"errors": [
"size_mismatch"
"nspace": "",
"name": "EOBJ5"
}
+ },
+ {
+ "object": {
+ "name": "EOBJ6",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "version": 8
+ },
+ "errors": [],
+ "union_shard_errors": [
+ "read_error",
+ "hinfo_missing",
+ "hinfo_corrupted"
+ ],
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ6",
+ "key": "",
+ "snapid": -2,
+ "hash": 3050890866,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "65'8",
+ "prior_version": "0'0",
+ "last_reqid": "client.4418.0:1",
+ "user_version": 8,
+ "size": 7,
+ "mtime": "2018-04-05 14:32:20.634116",
+ "local_mtime": "2018-04-05 14:32:20.637999",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "osd": 0,
+ "primary": false,
+ "shard": 2,
+ "errors": [
+ "read_error",
+ "hinfo_missing"
+ ],
+ "size": 2048
+ },
+ {
+ "osd": 1,
+ "primary": true,
+ "shard": 0,
+ "errors": [
+ "read_error",
+ "hinfo_corrupted"
+ ],
+ "size": 2048,
+ "hashinfo": "bad-val"
+ },
+ {
+ "osd": 2,
+ "primary": false,
+ "shard": 1,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x00000000",
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 80717615,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 80717615,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ }
+ ]
+ },
+ {
+ "object": {
+ "name": "EOBJ7",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "version": 10
+ },
+ "errors": [
+ "hinfo_inconsistency"
+ ],
+ "union_shard_errors": [],
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ7",
+ "key": "",
+ "snapid": -2,
+ "hash": 3258066308,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "75'10",
+ "prior_version": "75'9",
+ "last_reqid": "client.4482.0:1",
+ "user_version": 10,
+ "size": 34,
+ "mtime": "2018-04-05 14:32:33.058782",
+ "local_mtime": "2018-04-05 14:32:33.059679",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x136e4e27",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "osd": 0,
+ "primary": false,
+ "shard": 2,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x00000000",
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 80717615,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 80717615,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ },
+ {
+ "osd": 1,
+ "primary": true,
+ "shard": 0,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x00000000",
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 1534350760,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 1534350760,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ },
+ {
+ "osd": 2,
+ "primary": false,
+ "shard": 1,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x00000000",
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 1534350760,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 1534350760,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ }
+ ]
}
],
"epoch": 0
else
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+ jq "$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
{
"inconsistents": [
{
"primary": false
},
{
+ "object_info": {
+ "oid": {
+ "oid": "EOBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 560836233,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "27'1",
+ "prior_version": "0'0",
+ "last_reqid": "client.4192.0:1",
+ "user_version": 1,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:10.688009",
+ "local_mtime": "2018-04-05 14:30:10.691774",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 9,
"shard": 0,
"errors": [
"read_error",
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
"primary": false
}
],
- "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 560836233,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "27'1",
+ "prior_version": "0'0",
+ "last_reqid": "client.4192.0:1",
+ "user_version": 1,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:10.688009",
+ "local_mtime": "2018-04-05 14:30:10.691774",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"read_error",
- "size_mismatch_oi",
- "obj_size_oi_mismatch"
+ "size_mismatch_info",
+ "obj_size_info_mismatch"
],
"errors": [
"size_mismatch"
"primary": false
}
],
- "selected_object_info": "3:9babd184:::EOBJ2:head(29'2 client.4217.0:1 dirty|data_digest|omap_digest s 7 uv 2 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ2",
+ "key": "",
+ "snapid": -2,
+ "hash": 562812377,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "33'2",
+ "prior_version": "0'0",
+ "last_reqid": "client.4224.0:1",
+ "user_version": 2,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:14.152945",
+ "local_mtime": "2018-04-05 14:30:14.154014",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"ec_hash_error"
],
"primary": false
}
],
- "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ3",
+ "key": "",
+ "snapid": -2,
+ "hash": 3125668237,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "39'3",
+ "prior_version": "0'0",
+ "last_reqid": "client.4258.0:1",
+ "user_version": 3,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:18.875544",
+ "local_mtime": "2018-04-05 14:30:18.880153",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
"missing"
],
"shards": [
{
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "bad-val",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-EOBJ4"
}
],
"data_digest": "0x04cfa72f",
"omap_digest": "0xffffffff",
"data_digest": "0x04cfa72f",
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key2-EOBJ4"
}
]
},
"omap_digest": "0xffffffff",
"data_digest": "0x04cfa72f",
"attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
{
"Base64": false,
"value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
+ "name": "key1-EOBJ4"
},
{
"Base64": false,
"value": "val3-EOBJ4",
- "name": "_key3-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
+ "name": "key3-EOBJ4"
}
]
}
],
- "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ4",
+ "key": "",
+ "snapid": -2,
+ "hash": 1618759290,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "45'6",
+ "prior_version": "45'5",
+ "last_reqid": "client.4296.0:1",
+ "user_version": 6,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:22.271983",
+ "local_mtime": "2018-04-05 14:30:22.272840",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [],
"errors": [
"attr_value_mismatch",
"primary": false
},
{
+ "object_info": {
+ "oid": {
+ "oid": "EOBJ5",
+ "key": "",
+ "snapid": -2,
+ "hash": 2918945441,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "59'7",
+ "prior_version": "0'0",
+ "last_reqid": "client.4384.0:1",
+ "user_version": 7,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:35.162395",
+ "local_mtime": "2018-04-05 14:30:35.166390",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"size": 4096,
"shard": 0,
"errors": [
- "size_mismatch_oi",
+ "size_mismatch_info",
"ec_size_error",
- "obj_size_oi_mismatch"
+ "obj_size_info_mismatch"
],
"osd": 1,
"primary": true
"primary": false
}
],
- "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ5",
+ "key": "",
+ "snapid": -2,
+ "hash": 2918945441,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "59'7",
+ "prior_version": "0'0",
+ "last_reqid": "client.4384.0:1",
+ "user_version": 7,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:35.162395",
+ "local_mtime": "2018-04-05 14:30:35.166390",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"union_shard_errors": [
- "size_mismatch_oi",
+ "size_mismatch_info",
"ec_size_error",
- "obj_size_oi_mismatch"
+ "obj_size_info_mismatch"
],
"errors": [
"size_mismatch"
"nspace": "",
"name": "EOBJ5"
}
+ },
+ {
+ "object": {
+ "name": "EOBJ6",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "version": 8
+ },
+ "errors": [],
+ "union_shard_errors": [
+ "read_error",
+ "hinfo_missing",
+ "hinfo_corrupted"
+ ],
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ6",
+ "key": "",
+ "snapid": -2,
+ "hash": 3050890866,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "65'8",
+ "prior_version": "0'0",
+ "last_reqid": "client.4420.0:1",
+ "user_version": 8,
+ "size": 7,
+ "mtime": "2018-04-05 14:30:40.914673",
+ "local_mtime": "2018-04-05 14:30:40.917705",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "osd": 0,
+ "primary": false,
+ "shard": 2,
+ "errors": [
+ "read_error",
+ "hinfo_missing"
+ ],
+ "size": 2048
+ },
+ {
+ "osd": 1,
+ "primary": true,
+ "shard": 0,
+ "errors": [
+ "read_error",
+ "hinfo_corrupted"
+ ],
+ "size": 2048,
+ "hashinfo": "bad-val"
+ },
+ {
+ "osd": 2,
+ "primary": false,
+ "shard": 1,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x04cfa72f",
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 80717615,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 80717615,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ }
+ ]
+ },
+ {
+ "object": {
+ "name": "EOBJ7",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "version": 10
+ },
+ "errors": [
+ "hinfo_inconsistency"
+ ],
+ "union_shard_errors": [
+ "ec_hash_error"
+ ],
+ "selected_object_info": {
+ "oid": {
+ "oid": "EOBJ7",
+ "key": "",
+ "snapid": -2,
+ "hash": 3258066308,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "75'10",
+ "prior_version": "75'9",
+ "last_reqid": "client.4486.0:1",
+ "user_version": 10,
+ "size": 34,
+ "mtime": "2018-04-05 14:30:50.995009",
+ "local_mtime": "2018-04-05 14:30:50.996112",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest",
+ "omap_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x136e4e27",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "osd": 0,
+ "primary": false,
+ "shard": 2,
+ "errors": [
+ "ec_hash_error"
+ ],
+ "size": 2048,
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 80717615,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 80717615,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ },
+ {
+ "osd": 1,
+ "primary": true,
+ "shard": 0,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x5b7455a8",
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 1534350760,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 1534350760,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ },
+ {
+ "osd": 2,
+ "primary": false,
+ "shard": 1,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x5b7455a8",
+ "hashinfo": {
+ "cumulative_shard_hashes": [
+ {
+ "hash": 1534350760,
+ "shard": 0
+ },
+ {
+ "hash": 1534491824,
+ "shard": 1
+ },
+ {
+ "hash": 1534350760,
+ "shard": 2
+ }
+ ],
+ "total_chunk_size": 2048
+ }
+ }
+ ]
}
],
"epoch": 0
fi
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save${num}.json
fi
- if which jsonschema > /dev/null;
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
then
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
fi
rados list-inconsistent-obj $pg > $dir/json || return 1
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+ jq "$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
{
"epoch": 34,
"inconsistents": [
"snapset_inconsistency"
],
"union_shard_errors": [],
- "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(27'8 client.4143.0:1 dirty|omap|data_digest s 21 uv 8 dd 53acb008 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ1",
+ "key": "",
+ "snapid": -2,
+ "hash": 1454963827,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "24'8",
+ "prior_version": "21'3",
+ "last_reqid": "client.4195.0:1",
+ "user_version": 8,
+ "size": 21,
+ "mtime": "2018-04-05 14:35:43.286117",
+ "local_mtime": "2018-04-05 14:35:43.288990",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x53acb008",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"shards": [
{
"osd": 0,
"primary": false,
"errors": [],
"size": 21,
- "snapset": "1=[1]:{1=[1]}"
+ "snapset": {
+ "clones": [
+ {
+ "overlap": "[]",
+ "size": 7,
+ "snap": 1,
+ "snaps": [
+ 1
+ ]
+ }
+ ],
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ }
+ }
},
{
"osd": 1,
"primary": true,
"errors": [],
"size": 21,
- "snapset": "0=[]:[]+stray_clone_snaps={1=[1]}"
+ "snapset": {
+ "clones": [],
+ "head_exists": 0,
+ "snap_context": {
+ "seq": 0,
+ "snaps": []
+ }
+ }
}
]
},
"snapset_inconsistency"
],
"union_shard_errors": [],
- "selected_object_info": "3:e97ce31e:::ROBJ2:head(31'10 client.4155.0:1 dirty|omap|data_digest s 21 uv 10 dd 53acb008 alloc_hint [0 0 0])",
+ "selected_object_info": {
+ "oid": {
+ "oid": "ROBJ2",
+ "key": "",
+ "snapid": -2,
+ "hash": 2026323607,
+ "max": 0,
+ "pool": 3,
+ "namespace": ""
+ },
+ "version": "28'10",
+ "prior_version": "23'6",
+ "last_reqid": "client.4223.0:1",
+ "user_version": 10,
+ "size": 21,
+ "mtime": "2018-04-05 14:35:48.326856",
+ "local_mtime": "2018-04-05 14:35:48.328097",
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest"
+ ],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x53acb008",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "alloc_hint_flags": 0,
+ "manifest": {
+ "type": 0
+ },
+ "watchers": {}
+ },
"shards": [
{
"osd": 0,
"primary": false,
"errors": [],
"size": 21,
- "snapset": "0=[]:[]+stray_clone_snaps={1=[1]}"
+ "snapset": {
+ "clones": [],
+ "head_exists": 0,
+ "snap_context": {
+ "seq": 0,
+ "snaps": []
+ }
+ }
},
{
"osd": 1,
"primary": true,
"errors": [],
"size": 21,
- "snapset": "1=[1]:{1=[1]}"
+ "snapset": {
+ "clones": [
+ {
+ "overlap": "[]",
+ "size": 7,
+ "snap": 1,
+ "snaps": [
+ 1
+ ]
+ }
+ ],
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ }
+ }
}
]
}
}
EOF
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save6.json
fi
- if which jsonschema > /dev/null;
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
then
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
fi
#
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+# Test development and debugging
+# Set to "yes" to ignore diff errors and save the results so the test expectations can be updated
+getjson="no"
+
function run() {
local dir=$1
shift
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
+ setup $dir || return 1
$func $dir || return 1
+ teardown $dir || return 1
done
}
function TEST_scrub_snaps() {
local dir=$1
local poolname=test
+ local OBJS=15
+ local OSDS=1
TESTDATA="testdata.$$"
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
+ run_mon $dir a --osd_pool_default_size=$OSDS || return 1
run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
-
- create_rbd_pool || return 1
- wait_for_clean || return 1
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd || return 1
+ done
# Create a pool with a single pg
create_pool $poolname 1 1
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
- for i in `seq 1 15`
+ for i in `seq 1 $OBJS`
do
rados -p $poolname put obj${i} $TESTDATA
done
+ local primary=$(get_primary $poolname obj1)
SNAP=1
rados -p $poolname mksnap snap${SNAP}
dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
kill_daemons $dir TERM osd || return 1
- # Don't need to ceph_objectstore_tool function because osd stopped
+ # Don't need to use ceph_objectstore_tool() function because osd stopped
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj1)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" --force remove
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj1)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" --force remove
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":2)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":2)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":1)"
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":1)"
OBJ5SAVE="$JSON"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":4)"
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":4)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=18
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj3)"
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj3)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=15
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj4 | grep \"snapid\":7)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj4 | grep \"snapid\":7)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj2)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" rm-attr snapset
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj2)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" rm-attr snapset
# Create a clone which isn't in snapset and doesn't have object info
JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
dd if=/dev/urandom of=$TESTDATA bs=256 count=7
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
rm -f $TESTDATA
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj6)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj7)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset corrupt
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj8)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset seq
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj9)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_size
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj10)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_overlap
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj11)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clones
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj12)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset head
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj13)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset snaps
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj14)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset size
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj6)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj7)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset corrupt
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj8)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset seq
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj9)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clone_size
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj10)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clone_overlap
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj11)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clones
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj12)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset head
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj13)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset snaps
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj14)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset size
echo "garbage" > $dir/bad
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj15)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-attr snapset $dir/bad
+ JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj15)"
+ ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-attr snapset $dir/bad
rm -f $dir/bad
- run_osd $dir 0 || return 1
- create_rbd_pool || return 1
- wait_for_clean || return 1
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd || return 1
+ done
local pgid="${poolid}.0"
if ! pg_scrub "$pgid" ; then
test $(jq -r '.[0]' $dir/json) = $pgid || return 1
rados list-inconsistent-snapset $pgid > $dir/json || return 1
- test $(jq '.inconsistents | length' $dir/json) = "21" || return 1
local jqfilter='.inconsistents'
local sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
},
{
"errors": [
- "oi_attr_missing",
+ "info_missing",
"headless"
],
"snap": 7,
"nspace": "",
"name": "obj5"
},
+ {
+ "name": "obj10",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "????",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ },
+ "errors": []
+ },
{
"extra clones": [
1
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj11"
+ "name": "obj11",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": []
+ }
},
{
"errors": [
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj12"
+ "name": "obj12",
+ "snapset": {
+ "head_exists": 0,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ }
+ },
+ {
+ "name": "obj14",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1033,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ },
+ "errors": []
},
{
"errors": [
- "ss_attr_corrupted"
+ "snapset_corrupted"
],
"snap": "head",
"locator": "",
4
],
"errors": [
- "ss_attr_missing",
+ "snapset_missing",
"extra_clones"
],
"snap": "head",
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj3"
+ "name": "obj3",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 3,
+ "snaps": [
+ 3,
+ 2,
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ },
+ {
+ "snap": 3,
+ "size": 256,
+ "overlap": "[]",
+ "snaps": [
+ 3,
+ 2
+ ]
+ }
+ ]
+ }
},
{
"missing": [
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj4"
+ "name": "obj4",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 7,
+ "snaps": [
+ 7,
+ 6,
+ 5,
+ 4,
+ 3,
+ 2,
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 7,
+ "size": 1032,
+ "overlap": "[]",
+ "snaps": [
+ 7,
+ 6,
+ 5,
+ 4,
+ 3,
+ 2,
+ 1
+ ]
+ }
+ ]
+ }
},
{
"missing": [
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj5"
+ "name": "obj5",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 6,
+ "snaps": [
+ 6,
+ 5,
+ 4,
+ 3,
+ 2,
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ },
+ {
+ "snap": 2,
+ "size": 256,
+ "overlap": "[]",
+ "snaps": [
+ 2
+ ]
+ },
+ {
+ "snap": 4,
+ "size": 512,
+ "overlap": "[]",
+ "snaps": [
+ 4,
+ 3
+ ]
+ },
+ {
+ "snap": 6,
+ "size": 1024,
+ "overlap": "[]",
+ "snaps": [
+ 6,
+ 5
+ ]
+ }
+ ]
+ }
},
{
"extra clones": [
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj6"
+ "name": "obj6",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": []
+ }
},
{
"extra clones": [
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj7"
+ "name": "obj7",
+ "snapset": {
+ "head_exists": 0,
+ "snap_context": {
+ "seq": 0,
+ "snaps": []
+ },
+ "clones": []
+ }
},
{
"errors": [
- "snapset_mismatch"
+ "snapset_error"
],
"snap": "head",
"locator": "",
"nspace": "",
- "name": "obj8"
+ "name": "obj8",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 0,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": 1032,
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ }
+ },
+ {
+ "name": "obj9",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "snapset": {
+ "head_exists": 1,
+ "snap_context": {
+ "seq": 1,
+ "snaps": [
+ 1
+ ]
+ },
+ "clones": [
+ {
+ "snap": 1,
+ "size": "????",
+ "overlap": "[]",
+ "snaps": [
+ 1
+ ]
+ }
+ ]
+ },
+ "errors": []
}
],
"epoch": 20
EOF
jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || return 1
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ jq '.' $dir/json > save1.json
+ fi
- if which jsonschema > /dev/null;
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
then
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
fi
for err_string in "${err_strings[@]}"
do
- if ! grep "$err_string" $dir/osd.0.log > /dev/null;
+ if ! grep "$err_string" $dir/osd.${primary}.log > /dev/null;
then
echo "Missing log message '$err_string'"
ERRORS=$(expr $ERRORS + 1)
fi
done
- teardown $dir || return 1
-
if [ $ERRORS != "0" ];
then
echo "TEST FAILED WITH $ERRORS ERRORS"
--- /dev/null
+#!/usr/bin/env bash
+#
+# Copyright (C) 2018 Red Hat <contact@redhat.com>
+#
+# Author: David Zafman <dzafman@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+ local dir=$1
+ shift
+
+ export CEPH_MON="127.0.0.1:7138" # git grep '\<7138\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ $func $dir || return 1
+ done
+}
+
+function TEST_scrub_test() {
+ local dir=$1
+ local poolname=test
+ local OSDS=3
+ local objects=15
+
+ TESTDATA="testdata.$$"
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=3 || return 1
+ run_mgr $dir x || return 1
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd || return 1
+ done
+
+ # Create a pool with a single pg
+ create_pool $poolname 1 1
+ wait_for_clean || return 1
+ poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')
+
+ dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+ for i in `seq 1 $objects`
+ do
+ rados -p $poolname put obj${i} $TESTDATA
+ done
+ rm -f $TESTDATA
+
+ local primary=$(get_primary $poolname obj1)
+ local otherosd=$(get_not_primary $poolname obj1)
+ if [ "$otherosd" = "2" ];
+ then
+ local anotherosd="0"
+ else
+ local anotherosd="2"
+ fi
+
+ objectstore_tool $dir $anotherosd obj1 set-bytes /etc/fstab
+
+ local pgid="${poolid}.0"
+ pg_deep_scrub "$pgid" || return 1
+
+ ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1
+ test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1
+
+ ceph osd out $primary
+ wait_for_clean || return 1
+
+ pg_deep_scrub "$pgid" || return 1
+
+ test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1
+ test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "2" || return 1
+ ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1
+
+ ceph osd in $primary
+ wait_for_clean || return 1
+
+ repair "$pgid" || return 1
+ wait_for_clean || return 1
+
+ # This sets up the test after we've repaired with previous primary has old value
+ test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "2" || return 1
+ ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1
+
+ ceph osd out $primary
+ wait_for_clean || return 1
+
+ test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "0" || return 1
+ test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "0" || return 1
+ test "$(ceph pg $pgid query | jq '.peer_info[1].stats.stat_sum.num_scrub_errors')" = "0" || return 1
+ ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1
+
+ teardown $dir || return 1
+}
+
+main osd-scrub-test "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && \
+# qa/standalone/scrub/osd-scrub-test.sh"
# Specify a bad --op command
cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD)
- ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, dump-import)")
+ ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, dump-import, trim-pg-log)")
# Provide just the object param not a command
cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD)
- Scrub error on inode
- Behind on trimming
- Metadata damage detected
+ - bad backtrace on inode
- overall HEALTH_
- (MDS_TRIM)
conf:
- cephfs_test_runner:
modules:
- tasks.cephfs.test_scrub_checks
+ - tasks.cephfs.test_scrub
log-whitelist:
- OSD full dropping all updates
- OSD near full
+ - pausewr flag
- failsafe engaged, dropping updates
- failsafe disengaged, no longer dropping
- is full \(reached quota
+ - POOL_FULL
+ - POOL_BACKFILLFULL
conf:
mon:
mon osd nearfull ratio: 0.6
--- /dev/null
+../../../../cephfs/clusters/1-mds-1-client.yaml
\ No newline at end of file
+++ /dev/null
-../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/log-config.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/osd-asserts.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/1-mds-2-client.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, mgr.x, mds.a, osd.0, osd.1]
-- [mon.b, mon.c, osd.2, osd.3]
-- [client.0]
-- [client.1]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/log-config.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/osd-asserts.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/1-mds-4-client.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, osd.2, osd.3, mds.a, mds.c, client.2]
-- [mgr.x, osd.4, osd.5, osd.6, osd.7, mds.b, mds.d, client.3]
-- [client.0]
-- [client.1]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/log-config.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/osd-asserts.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
overrides:
ceph:
+ cephfs_ec_profile:
+ - disabled
log-whitelist:
- OSD full dropping all updates
- OSD near full
+ - pausewr flag
- failsafe engaged, dropping updates
- failsafe disengaged, no longer dropping
- is full \(reached quota
+ - POOL_FULL
+ - POOL_BACKFILLFULL
conf:
+ mon:
+ mon osd nearfull ratio: 0.6
+ mon osd backfillfull ratio: 0.6
+ mon osd full ratio: 0.7
osd:
osd mon report interval max: 5
osd objectstore: memstore
- memstore device bytes: 100000000
+ osd failsafe full ratio: 1.0
+ memstore device bytes: 200000000
tasks:
- cephfs_test_runner:
--- /dev/null
+../../../../cephfs/clusters/1-mds-1-client.yaml
\ No newline at end of file
+++ /dev/null
-../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/log-config.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/osd-asserts.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+overrides:
+ ceph:
+ log-whitelist:
+ - \(MON_DOWN\)
tasks:
- install:
- ceph:
- \(OSDMAP_FLAGS\)
- \(OSD_
- \(PG_
- - \(OBJECT_DEGRADED\)
+ - \(OBJECT_
- \(POOL_APP_NOT_ENABLED\)
conf:
osd:
- \(OSDMAP_FLAGS\)
- \(OSD_
- \(PG_
- - \(OBJECT_DEGRADED\)
+ - \(OBJECT_
- \(POOL_APP_NOT_ENABLED\)
conf:
osd:
clone: true
type: block
disks: 3
+ time_wait: 120
test: http://git.ceph.com/?p={repo};a=blob_plain;hb={branch};f=qa/run_xfstests_qemu.sh
exclude_arch: armv7l
conf:
client:
debug rgw: 20
- rgw:
- frontend: civetweb
rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
rgw crypt require ssl: false
rgw:
- frontend: civetweb
compression type: random
size: 30
# reluctantely :( hard-coded machine type
# it will override command line args with teuthology-suite
-machine_type: vps
+machine_type: ovh
roles:
- - mon.a
- mds.a
+++ /dev/null
-../../../../../distros/all/centos_7.3.yaml
\ No newline at end of file
+++ /dev/null
-../../../../../distros/all/ubuntu_14.04.yaml
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes, using one of them as a client,
- with a separate client-only node.
- Use xfs beneath the osds.
- install ceph/jewel v10.2.0 point version
- run workload and upgrade-sequence in parallel
- install ceph/jewel latest version
- run workload and upgrade-sequence in parallel
- install ceph/-x version (jewel or kraken)
- run workload and upgrade-sequence in parallel
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- - scrub
- - osd_map_max_advance
- - wrongly marked
- - overall HEALTH_
- - \(MGR_DOWN\)
- - \(OSD_
- - \(PG_
- - \(CACHE_
- fs: xfs
- conf:
- global:
- mon warn on pool no app: false
- mon:
- mon debug unsafe allow tier with nonempty snaps: true
- osd:
- osd map max advance: 1000
- osd map cache size: 1100
-roles:
-- - mon.a
- - mds.a
- - osd.0
- - osd.1
- - osd.2
- - mgr.x
-- - mon.b
- - mon.c
- - osd.3
- - osd.4
- - osd.5
- - client.0
-- - client.1
-openstack:
-- volumes: # attached to each instance
- count: 3
- size: 30 # GB
-tasks:
-- print: "**** v10.2.0 about to install"
-- install:
- tag: v10.2.0
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
-- print: "**** done v10.2.0 install"
-- ceph:
- fs: xfs
- skip_mgr_daemons: true
- add_osds_to_crush: true
-- print: "**** done ceph xfs"
-- sequential:
- - workload
-- print: "**** done workload v10.2.0"
-- install.upgrade:
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- mon.a:
- branch: jewel
- mon.b:
- branch: jewel
- # Note that client.a IS NOT upgraded at this point
- #client.1:
- #branch: jewel
-- parallel:
- - workload_jewel
- - upgrade-sequence_jewel
-- print: "**** done parallel jewel branch"
-- install.upgrade:
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- client.1:
- branch: jewel
-- print: "**** done branch: jewel install.upgrade on client.1"
-- install.upgrade:
- mon.a:
- mon.b:
-- print: "**** done branch: -x install.upgrade on mon.a and mon.b"
-- parallel:
- - workload_x
- - upgrade-sequence_x
-- print: "**** done parallel -x branch"
-- exec:
- osd.0:
- - ceph osd set-require-min-compat-client luminous
-# Run librados tests on the -x upgraded cluster
-- install.upgrade:
- client.1:
-- workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test-upgrade-v11.0.0.sh
- - cls
-- print: "**** done final test on -x cluster"
-#######################
-workload:
- sequential:
- - workunit:
- clients:
- client.0:
- - suites/blogbench.sh
-workload_jewel:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test.sh
- - cls
- env:
- CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
- - print: "**** done rados/test.sh & cls workload_jewel"
- - sequential:
- - rgw: [client.0]
- - print: "**** done rgw workload_jewel"
- - s3tests:
- client.0:
- force-branch: ceph-jewel
- rgw_server: client.0
- scan_for_encryption_keys: false
- - print: "**** done s3tests workload_jewel"
-upgrade-sequence_jewel:
- sequential:
- - print: "**** done branch: jewel install.upgrade"
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - ceph.restart: [osd.0]
- - sleep:
- duration: 30
- - ceph.restart: [osd.1]
- - sleep:
- duration: 30
- - ceph.restart: [osd.2]
- - sleep:
- duration: 30
- - ceph.restart: [osd.3]
- - sleep:
- duration: 30
- - ceph.restart: [osd.4]
- - sleep:
- duration: 30
- - ceph.restart: [osd.5]
- - sleep:
- duration: 60
- - ceph.restart: [mon.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.b]
- - sleep:
- duration: 60
- - ceph.restart: [mon.c]
- - sleep:
- duration: 60
- - print: "**** done ceph.restart all jewel branch mds/osd/mon"
-workload_x:
- sequential:
- - workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test-upgrade-v11.0.0-noec.sh
- - cls
- env:
- CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
- - print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x NOT upgraded client"
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rados/test-upgrade-v11.0.0-noec.sh
- - cls
- - print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x upgraded client"
- - rgw: [client.1]
- - print: "**** done rgw workload_x"
- - s3tests:
- client.1:
- force-branch: ceph-jewel
- rgw_server: client.1
- scan_for_encryption_keys: false
- - print: "**** done s3tests workload_x"
-upgrade-sequence_x:
- sequential:
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.b]
- - sleep:
- duration: 60
- - ceph.restart: [mon.c]
- - sleep:
- duration: 60
- - ceph.restart: [osd.0]
- - sleep:
- duration: 30
- - ceph.restart: [osd.1]
- - sleep:
- duration: 30
- - ceph.restart: [osd.2]
- - sleep:
- duration: 30
- - ceph.restart: [osd.3]
- - sleep:
- duration: 30
- - ceph.restart: [osd.4]
- - sleep:
- duration: 30
- - ceph.restart:
- daemons: [osd.5]
- wait-for-healthy: false
- wait-for-up-osds: true
- - exec:
- mgr.x:
- - mkdir -p /var/lib/ceph/mgr/ceph-x
- - ceph auth get-or-create-key mgr.x mon 'allow profile mgr'
- - ceph auth export mgr.x > /var/lib/ceph/mgr/ceph-x/keyring
- - ceph.restart:
- daemons: [mgr.x]
- wait-for-healthy: false
- - exec:
- osd.0:
- - ceph osd require-osd-release luminous
- - ceph.healthy:
- - print: "**** done ceph.restart all -x branch mds/osd/mon"
workload_luminous:
full_sequential:
- workunit:
- branch: luminous
+ tag: v12.2.2
clients:
client.1:
- rados/test.sh
branch: luminous
clients:
client.1:
- - rados/test.sh
+ - rados/test-upgrade-to-mimic.sh
- cls
- - print: "**** done rados/test.sh & cls workload_x NOT upgraded client"
+ - print: "**** done rados/test-upgrade-to-mimic.sh & cls workload_x NOT upgraded client"
- workunit:
branch: luminous
clients:
client.0:
- - rados/test.sh
+ - rados/test-upgrade-to-mimic.sh
- cls
- print: "**** done rados/test.sh & cls workload_x upgraded client"
- rgw: [client.1]
else:
return True
- def rados(self, args, pool=None, namespace=None, stdin_data=None):
+ def rados(self, args, pool=None, namespace=None, stdin_data=None,
+ stdin_file=None):
"""
Call into the `rados` CLI from an MDS
"""
args = ([os.path.join(self._prefix, "rados"), "-p", pool] +
(["--namespace", namespace] if namespace else []) +
args)
+
+ if stdin_file is not None:
+ args = ["bash", "-c", "cat " + stdin_file + " | " + " ".join(args)]
+
p = remote.run(
args=args,
stdin=stdin_data,
print "writing some data through which we expect to succeed"
bytes = 0
f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
- bytes += os.write(f, 'a' * 4096)
+ bytes += os.write(f, 'a' * 512 * 1024)
os.fsync(f)
print "fsync'ed data successfully, will now attempt to fill fs"
# from write
full = False
- for n in range(0, {fill_mb}):
+ for n in range(0, int({fill_mb} * 0.9)):
bytes += os.write(f, 'x' * 1024 * 1024)
- print "wrote bytes via buffered write, may repeat"
- print "done writing bytes"
+ print "wrote {{0}} bytes via buffered write, may repeat".format(bytes)
+ print "done writing {{0}} bytes".format(bytes)
# OK, now we should sneak in under the full condition
# due to the time it takes the OSDs to report to the
os.fsync(f)
print "successfully fsync'ed prior to getting full state reported"
- # Now wait for the full flag to get set so that our
- # next flush IO will fail
- time.sleep(30)
+ # buffered write, add more dirty data to the buffer
+ print "starting buffered write"
+ try:
+ for n in range(0, int({fill_mb} * 0.2)):
+ bytes += os.write(f, 'x' * 1024 * 1024)
+ print "sleeping a bit as we've exceeded 90% of our expected full ratio"
+ time.sleep({full_wait})
+ except OSError:
+ pass;
- # A buffered IO, should succeed
- print "starting buffered write we expect to succeed"
- os.write(f, 'x' * 4096)
print "wrote, now waiting 30s and then doing a close we expect to fail"
# Wait long enough for a background flush that should fail
# from write
full = False
- for n in range(0, {fill_mb} + 1):
+ for n in range(0, int({fill_mb} * 1.1)):
try:
bytes += os.write(f, 'x' * 1024 * 1024)
print "wrote bytes via buffered write, moving on to fsync"
else:
print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))
- if n > {fill_mb} * 0.8:
+ if n > {fill_mb} * 0.9:
# Be cautious in the last region where we expect to hit
# the full condition, so that we don't overshoot too dramatically
- print "sleeping a bit as we've exceeded 80% of our expected full ratio"
+ print "sleeping a bit as we've exceeded 90% of our expected full ratio"
time.sleep({full_wait})
if not full:
super(TestClusterFull, self).setUp()
if self.pool_capacity is None:
- # This is a hack to overcome weird fluctuations in the reported
- # `max_avail` attribute of pools that sometimes occurs in between
- # tests (reason as yet unclear, but this dodges the issue)
- TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
- TestClusterFull.fill_mb = int(1.05 * (self.pool_capacity / (1024.0 * 1024.0)))
+ max_avail = self.fs.get_pool_df(self._data_pool_name())['max_avail']
+ full_ratio = float(self.fs.get_config("mon_osd_full_ratio", service_type="mon"))
+ TestClusterFull.pool_capacity = int(max_avail * full_ratio)
+ TestClusterFull.fill_mb = (self.pool_capacity / (1024 * 1024))
def is_full(self):
return self.fs.is_full()
--- /dev/null
+"""
+Test CephFS scrub (distinct from OSD scrub) functionality
+"""
+import logging
+import os
+import traceback
+from collections import namedtuple
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(object):
+    """
+    Base class for a scrub-test workload: write some files via a client
+    mount, damage the filesystem metadata in an interesting way, then
+    validate that scrub repaired things.  Subclasses override write() and
+    validate(), and may override damage().
+    """
+    def __init__(self, filesystem, mount):
+        # Client mount used to generate the workload files
+        self._mount = mount
+        # Filesystem under test (used for rados / admin-socket access)
+        self._filesystem = filesystem
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them. Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        # Record (rather than raise) an assertion failure, so validate()
+        # can report every mismatch instead of stopping at the first one.
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from. By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        objects = self._filesystem.rados(["ls"]).split("\n")
+        for o in objects:
+            self._filesystem.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        # Flush the MDS journal so everything written has been persisted
+        # into the metadata pool before damage/validation runs.
+        self._filesystem.mds_asok(["flush", "journal"])
+
+
+class BacktraceWorkload(Workload):
+    """
+    Single file, single directory, wipe the backtrace and check it.
+    """
+    def write(self):
+        # One 6MB file inside one directory is enough to give the file
+        # a non-trivial backtrace (an ancestor chain).
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+
+    def validate(self):
+        st = self._mount.stat("subdir/sixmegs")
+        # Flush the journal so the backtrace has been written out before
+        # we read it back from the data pool.
+        self._filesystem.mds_asok(["flush", "journal"])
+        bt = self._filesystem.read_backtrace(st['st_ino'])
+        # ancestors[0] is the inode's own dentry name
+        parent = bt['ancestors'][0]['dname']
+        self.assert_equal(parent, "sixmegs")
+        return self._errors
+
+    def damage(self):
+        st = self._mount.stat("subdir/sixmegs")
+        self._filesystem.mds_asok(["flush", "journal"])
+        # Wipe the "parent" xattr (the backtrace) on the file's first
+        # data object.
+        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")
+
+
+class DupInodeWorkload(Workload):
+    """
+    Duplicate an inode and try scrubbing it twice.
+    """
+
+    def write(self):
+        self._mount.run_shell(["mkdir", "parent"])
+        self._mount.run_shell(["mkdir", "parent/child"])
+        self._mount.write_n_mb("parent/parentfile", 6)
+        self._mount.write_n_mb("parent/child/childfile", 6)
+
+    def damage(self):
+        # Copy the omap value of parentfile's dentry into a new "shadow"
+        # dentry in the same dirfrag object, producing two dentries that
+        # reference the same inode.  10000000000.00000000 is the dirfrag
+        # object for the first directory created ("parent").
+        temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
+        self._mount.umount()
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._filesystem.mds_stop()
+        self._filesystem.rados(["getomapval", "10000000000.00000000",
+                                "parentfile_head", temp_bin_path])
+        self._filesystem.rados(["setomapval", "10000000000.00000000",
+                                "shadow_head"], stdin_file=temp_bin_path)
+        # Required so the MDS will agree to load the now-invalid metadata
+        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
+        self._filesystem.mds_restart()
+        self._filesystem.wait_for_daemons()
+
+    def validate(self):
+        self._filesystem.mds_asok(["scrub_path", "/", "recursive", "repair"])
+        # After repair the daemons should still be healthy
+        self.assert_equal(self._filesystem.are_daemons_healthy(), True)
+        return self._errors
+
+
+class TestScrub(CephFSTestCase):
+    MDSS_REQUIRED = 1
+
+    def _scrub(self, workload, workers=1):
+        """
+        Run the given workload: write its files, apply the workload's
+        damage, then run a recursive repair scrub and check that the
+        workload's validation passes.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # These verification options would make the MDS flag the
+        # deliberately-damaged metadata, so turn them off here (they
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        self.fs.mds_asok(["scrub_path", "/", "recursive", "repair"])
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            # Log every error, then fail on the first one so the failure
+            # message carries a concrete backtrace.
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_scrub_backtrace(self):
+        self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+
+    def test_scrub_dup_inode(self):
+        self._scrub(DupInodeWorkload(self.fs, self.mount_a))
self._selftest_plugin("zabbix")
def test_prometheus(self):
+ self._assign_ports("prometheus", "server_port", min_port=8100)
self._selftest_plugin("prometheus")
def test_influx(self):
import logging
import os
import yaml
+import time
from teuthology import misc as teuthology
from teuthology import contextutil
cachemode=cachemode,
),
])
+ time_wait = client_config.get('time_wait', 0)
log.info('starting qemu...')
procs.append(
log.info('waiting for qemu tests to finish...')
run.wait(procs)
+ if time_wait > 0:
+ log.debug('waiting {time_wait} sec for workloads detect finish...'.format(
+ time_wait=time_wait));
+ time.sleep(time_wait)
+
log.debug('checking that qemu tests succeeded...')
for client in config.iterkeys():
(remote,) = ctx.cluster.only(client).remotes.keys()
for client in config:
ctx.cluster.only(client).run(
args=[
- 'git', 'clone',
+ 'git',
+ 'clone',
+ '--branch',
+ 'ceph-luminous',
teuth_config.ceph_git_base_url + 'swift.git',
'{tdir}/swift'.format(tdir=testdir),
],
--- /dev/null
+#!/bin/bash -ex
+
+# Run the librados API test binaries (ceph_test_rados_*), in parallel by
+# default.  Pass --serial to run them one at a time instead.
+parallel=1
+[ "$1" = "--serial" ] && parallel=0
+
+# Only colorize gtest output when stdout is a terminal
+color=""
+[ -t 1 ] && color="--gtest_color=yes"
+
+# Kill any still-running child tests when the script exits
+function cleanup() {
+    pkill -P $$ || true
+}
+trap cleanup EXIT ERR HUP INT QUIT
+
+# Map of test entry -> background pid
+declare -A pids
+
+# Entries may carry extra gtest arguments (see api_c_read_operations)
+for f in \
+    api_aio api_io api_list api_lock api_misc \
+    api_tier api_pool api_snapshots api_stat api_watch_notify api_cmd \
+    api_service \
+    api_c_write_operations \
+    'api_c_read_operations --gtest_filter=-CReadOpsTest.Exec' \
+    list_parallel \
+    open_pools_parallel \
+    delete_pools_parallel \
+    watch_notify
+do
+    if [ $parallel -eq 1 ]; then
+        # Right-padded label so interleaved parallel output lines up
+        r=`printf '%25s' $f`
+        # First word only: the binary/log name without gtest arguments
+        ff=`echo $f | awk '{print $1}'`
+        # pipefail so the test's exit status survives the tee/sed pipeline
+        bash -o pipefail -exc "ceph_test_rados_$f $color 2>&1 | tee ceph_test_rados_$ff.log | sed \"s/^/$r: /\"" &
+        pid=$!
+        echo "test $f on pid $pid"
+        pids[$f]=$pid
+    else
+        ceph_test_rados_$f
+    fi
+done
+
+ret=0
+if [ $parallel -eq 1 ]; then
+# Wait for every background test; report each failure but keep waiting
+# for the rest so all logs are complete.
+for t in "${!pids[@]}"
+do
+    pid=${pids[$t]}
+    if ! wait $pid
+    then
+        echo "error in $t ($pid)"
+        ret=1
+    fi
+done
+fi
+
+exit $ret
#!/bin/bash -ex
STACK_BRANCH=stable/pike
+TEMPEST_BRANCH=17.2.0
STACK_USER=${STACK_USER:-stack}
STACK_GROUP=${STACK_GROUP:-stack}
cd devstack
cp ${STACK_HOME_PATH}/local.conf .
+export TEMPEST_BRANCH=${TEMPEST_BRANCH}
export PYTHONUNBUFFERED=true
export PROJECTS="openstack/devstack-plugin-ceph openstack/devstack-plugin-mariadb"
$(rbd_watch_out_file ${image}) $(rbd_watch_asok ${image})
}
-wait_for_clean
-
pool="rbd"
image=testimg$$
ceph_admin="ceph --admin-daemon $(rbd_watch_asok ${image})"
function run() {
local install_cmd
local which_pkg="which"
+ source /etc/os-release
if test -f /etc/redhat-release ; then
- source /etc/os-release
if ! type bc > /dev/null 2>&1 ; then
echo "Please install bc and re-run."
exit 1
else
install_cmd="yum install -y"
fi
- else
+ elif type zypper > /dev/null 2>&1 ; then
+ install_cmd="zypper --gpg-auto-import-keys --non-interactive install --no-recommends"
+ elif type apt-get > /dev/null 2>&1 ; then
+ install_cmd="apt-get install -y"
which_pkg="debianutils"
fi
- type apt-get > /dev/null 2>&1 && install_cmd="apt-get install -y"
- type zypper > /dev/null 2>&1 && install_cmd="zypper --gpg-auto-import-keys --non-interactive install"
-
if ! type sudo > /dev/null 2>&1 ; then
echo "Please install sudo and re-run. This script assumes it is running"
echo "as a normal user with the ability to run commands as root via sudo."
$DRY_RUN sudo $install_cmd ccache jq $which_pkg
else
echo "WARNING: Don't know how to install packages" >&2
+ echo "This probably means distribution $ID is not supported by run-make-check.sh" >&2
fi
if test -f ./install-deps.sh ; then
-52085d5249a80c5f5121a76d6288429f35e4e77b
-v12.2.4
+cad919881333ac92274171586c827e01f554a70a
+v12.2.5
add_custom_target(ceph-detect-init
COMMAND
${CMAKE_SOURCE_DIR}/src/tools/setup-virtualenv.sh ${CEPH_DETECT_INIT_VIRTUALENV} &&
- ${CEPH_DETECT_INIT_VIRTUALENV}/bin/pip install --no-index --use-wheel --find-links=file:${CMAKE_SOURCE_DIR}/src/ceph-detect-init/wheelhouse -e .
+ ${CEPH_DETECT_INIT_VIRTUALENV}/bin/pip install --no-index --find-links=file:${CMAKE_SOURCE_DIR}/src/ceph-detect-init/wheelhouse -e .
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/ceph-detect-init
COMMENT "ceph-detect-init is being created")
add_dependencies(tests ceph-detect-init)
usedevelop = true
deps =
{env:NO_INDEX:}
- --use-wheel
--find-links=file://{toxinidir}/wheelhouse
-r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
add_custom_target(ceph-disk
COMMAND
${CMAKE_SOURCE_DIR}/src/tools/setup-virtualenv.sh ${CEPH_DISK_VIRTUALENV} &&
- ${CEPH_DISK_VIRTUALENV}/bin/pip install --no-index --use-wheel --find-links=file:${CMAKE_SOURCE_DIR}/src/ceph-disk/wheelhouse -e .
+ ${CEPH_DISK_VIRTUALENV}/bin/pip install --no-index --find-links=file:${CMAKE_SOURCE_DIR}/src/ceph-disk/wheelhouse -e .
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/ceph-disk
COMMENT "ceph-disk is being created")
add_dependencies(tests ceph-disk)
usedevelop = true
deps =
{env:NO_INDEX:}
- --use-wheel
--find-links=file://{toxinidir}/wheelhouse
-r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
../ceph-detect-init
[testenv:py27]
-sitepackages=True
+basepython = python2.7
passenv = CEPH_ROOT CEPH_BIN CEPH_LIB CEPH_BUILD_VIRTUALENV
changedir = {env:CEPH_BUILD_DIR}
commands = coverage run --append --source=ceph_disk {envbindir}/py.test -vv {toxinidir}/tests/test_main.py
that prefixes tags with ``ceph.`` and uses ``=`` for assignment, and provides
set of utilities for interacting with LVM.
"""
+import logging
+import os
from ceph_volume import process
from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
+logger = logging.getLogger(__name__)
+
def _output_parser(output, fields):
"""
return tag_mapping
+def _vdo_parents(devices):
+    """
+    It is possible we didn't get a logical volume, or a mapper path, but
+    a device like /dev/sda2, to resolve this, we must look at all the slaves of
+    every single device in /sys/block and if any of those devices is related to
+    VDO devices, then we can add the parent
+    """
+    parent_devices = []
+    for parent in os.listdir('/sys/block'):
+        # NOTE(review): assumes every /sys/block/<dev> has a slaves/
+        # directory; a device without one would raise OSError here -- confirm
+        for slave in os.listdir('/sys/block/%s/slaves' % parent):
+            if slave in devices:
+                # record both the /dev path and the bare kernel name
+                parent_devices.append('/dev/%s' % parent)
+                parent_devices.append(parent)
+    return parent_devices
+
+
+def _vdo_slaves(vdo_names):
+    """
+    find all the slaves associated with each vdo name (from realpath) by going
+    into /sys/block/<realpath>/slaves
+    """
+    devices = []
+    for vdo_name in vdo_names:
+        mapper_path = '/dev/mapper/%s' % vdo_name
+        # skip names that have no device-mapper node
+        if not os.path.exists(mapper_path):
+            continue
+        # resolve the realpath and realname of the vdo mapper
+        vdo_realpath = os.path.realpath(mapper_path)
+        vdo_realname = vdo_realpath.split('/')[-1]
+        slaves_path = '/sys/block/%s/slaves' % vdo_realname
+        if not os.path.exists(slaves_path):
+            continue
+        # record every alias we might later be asked to match against:
+        # realpath, mapper path, kernel name, and each slave (both forms)
+        devices.append(vdo_realpath)
+        devices.append(mapper_path)
+        devices.append(vdo_realname)
+        for slave in os.listdir(slaves_path):
+            devices.append('/dev/%s' % slave)
+            devices.append(slave)
+    return devices
+
+
+def _is_vdo(path):
+    """
+    A VDO device can be composed from many different devices, go through each
+    one of those devices and its slaves (if any) and correlate them back to
+    /dev/mapper and their realpaths, and then check if they appear as part of
+    /sys/kvdo/<name>/statistics
+
+    From the realpath of a logical volume, determine if it is a VDO device or
+    not, by correlating it to the presence of the name in
+    /sys/kvdo/<name>/statistics and all the previously captured devices
+    """
+    # no kvdo sysfs tree means no VDO devices on this system
+    if not os.path.isdir('/sys/kvdo'):
+        return False
+    realpath = os.path.realpath(path)
+    realpath_name = realpath.split('/')[-1]
+    devices = []
+    vdo_names = set()
+    # get all the vdo names
+    for dirname in os.listdir('/sys/kvdo/'):
+        if os.path.isdir('/sys/kvdo/%s/statistics' % dirname):
+            vdo_names.add(dirname)
+
+    # find all the slaves associated with each vdo name (from realpath) by
+    # going into /sys/block/<realpath>/slaves
+    devices.extend(_vdo_slaves(vdo_names))
+
+    # Find all possible parents, looking into slaves that are related to VDO
+    devices.extend(_vdo_parents(devices))
+
+    # match by the path as given, its realpath, or the bare kernel name
+    return any([
+        path in devices,
+        realpath in devices,
+        realpath_name in devices])
+
+
+def is_vdo(path):
+    """
+    Detect if a path is backed by VDO, proxying the actual call to _is_vdo so
+    that we can prevent an exception breaking OSD creation. If an exception is
+    raised, it will get captured and logged to file, while returning
+    a ``False``.
+
+    Returns the strings '1' (is VDO) or '0' (is not, or detection failed),
+    suitable for storing directly as an LVM tag value.
+    """
+    try:
+        if _is_vdo(path):
+            return '1'
+        return '0'
+    except Exception:
+        # best-effort: never let VDO detection abort OSD creation
+        logger.exception('Unable to properly detect device as VDO: %s', path)
+        return '0'
+
+
def get_api_vgs():
"""
Return the list of group volumes available in the system using flags to
include common metadata associated with them
- Command and sample delimeted output, should look like::
+ Command and sample delimited output should look like::
- $ vgs --noheadings --separator=';' \
+ $ vgs --noheadings --readonly --separator=';' \
-o vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free
ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m
osd_vg;3;1;0;wz--n-;29.21g;9.21g
"""
fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
stdout, stderr, returncode = process.call(
- ['vgs', '--noheadings', '--separator=";"', '-o', fields]
+ ['vgs', '--noheadings', '--readonly', '--separator=";"', '-o', fields]
)
return _output_parser(stdout, fields)
Return the list of logical volumes available in the system using flags to include common
metadata associated with them
- Command and delimeted output, should look like::
+ Command and delimited output should look like::
- $ lvs --noheadings --separator=';' -o lv_tags,lv_path,lv_name,vg_name
+ $ lvs --noheadings --readonly --separator=';' -o lv_tags,lv_path,lv_name,vg_name
;/dev/ubuntubox-vg/root;root;ubuntubox-vg
;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
"""
fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
stdout, stderr, returncode = process.call(
- ['lvs', '--noheadings', '--separator=";"', '-o', fields]
+ ['lvs', '--noheadings', '--readonly', '--separator=";"', '-o', fields]
)
return _output_parser(stdout, fields)
This will only return physical volumes set up to work with LVM.
- Command and delimeted output, should look like::
+ Command and delimited output should look like::
- $ pvs --noheadings --separator=';' -o pv_name,pv_tags,pv_uuid
+ $ pvs --noheadings --readonly --separator=';' -o pv_name,pv_tags,pv_uuid
/dev/sda1;;
/dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
fields = 'pv_name,pv_tags,pv_uuid,vg_name'
stdout, stderr, returncode = process.call(
- ['pvs', '--no-heading', '--separator=";"', '-o', fields]
+ ['pvs', '--no-heading', '--readonly', '--separator=";"', '-o', fields]
)
return _output_parser(stdout, fields)
"""
Removes a volume group.
"""
- fail_msg = "Unable to remove vg %s".format(vg_name)
+ fail_msg = "Unable to remove vg %s" % vg_name
process.run(
[
'vgremove',
"""
Removes a physical volume.
"""
- fail_msg = "Unable to remove vg %s".format(pv_name)
+ fail_msg = "Unable to remove vg %s" % pv_name
process.run(
[
'pvremove',
terminal_verbose=True,
)
if returncode != 0:
- raise RuntimeError("Unable to remove %s".format(path))
+ raise RuntimeError("Unable to remove %s" % path)
return True
def _purge(self):
"""
- Deplete all the items in the list, used internally only so that we can
+ Delete all the items in the list, used internally only so that we can
dynamically allocate the items when filtering without the concern of
messing up the contents
"""
from ceph_volume.util import encryption as encryption_utils
from ceph_volume.systemd import systemctl
from ceph_volume.api import lvm as api
+from .listing import direct_report
logger = logging.getLogger(__name__)
-def activate_filestore(lvs):
+def activate_filestore(lvs, no_systemd=False):
# find the osd
osd_lv = lvs.get(lv_tags={'ceph.type': 'data'})
if not osd_lv:
raise RuntimeError('Unable to find a data LV for filestore activation')
is_encrypted = osd_lv.tags.get('ceph.encrypted', '0') == '1'
+ is_vdo = osd_lv.tags.get('ceph.vdo', '0')
osd_id = osd_lv.tags['ceph.osd_id']
conf.cluster = osd_lv.tags['ceph.cluster_name']
source = '/dev/mapper/%s' % osd_lv.lv_uuid
else:
source = osd_lv.lv_path
+
# mount the osd
destination = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
if not system.device_is_mounted(source, destination=destination):
- process.run(['mount', '-v', source, destination])
+ prepare_utils.mount_osd(source, osd_id, is_vdo=is_vdo)
# always re-do the symlink regardless if it exists, so that the journal
# device path that may have changed can be mapped correctly every time
# make sure that the journal has proper permissions
system.chown(osd_journal)
- # enable the ceph-volume unit for this OSD
- systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
+ if no_systemd is False:
+ # enable the ceph-volume unit for this OSD
+ systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
- # start the OSD
- systemctl.start_osd(osd_id)
+ # start the OSD
+ systemctl.start_osd(osd_id)
terminal.success("ceph-volume lvm activate successful for osd ID: %s" % osd_id)
return None
-def activate_bluestore(lvs):
+def activate_bluestore(lvs, no_systemd=False):
# find the osd
osd_lv = lvs.get(lv_tags={'ceph.type': 'block'})
+ if not osd_lv:
+ raise RuntimeError('could not find a bluestore OSD to activate')
is_encrypted = osd_lv.tags.get('ceph.encrypted', '0') == '1'
dmcrypt_secret = None
osd_id = osd_lv.tags['ceph.osd_id']
process.run(['ln', '-snf', wal_device_path, destination])
system.chown(wal_device_path)
- # enable the ceph-volume unit for this OSD
- systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
+ if no_systemd is False:
+ # enable the ceph-volume unit for this OSD
+ systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
- # start the OSD
- systemctl.start_osd(osd_id)
+ # start the OSD
+ systemctl.start_osd(osd_id)
terminal.success("ceph-volume lvm activate successful for osd ID: %s" % osd_id)
self.argv = argv
@decorators.needs_root
- def activate(self, args):
+ def activate_all(self, args):
+ listed_osds = direct_report()
+ osds = {}
+ for osd_id, devices in listed_osds.items():
+ # the metadata for all devices in each OSD will contain
+ # the FSID which is required for activation
+ for device in devices:
+ fsid = device.get('tags', {}).get('ceph.osd_fsid')
+ if fsid:
+ osds[fsid] = osd_id
+ break
+ if not osds:
+ terminal.warning('Was unable to find any OSDs to activate')
+ terminal.warning('Verify OSDs are present with "ceph-volume lvm list"')
+ return
+ for osd_fsid, osd_id in osds.items():
+ if systemctl.osd_is_active(osd_id):
+ terminal.warning(
+ 'OSD ID %s FSID %s process is active. Skipping activation' % (osd_id, osd_fsid)
+ )
+ else:
+ terminal.info('Activating OSD ID %s FSID %s' % (osd_id, osd_fsid))
+ self.activate(args, osd_id=osd_id, osd_fsid=osd_fsid)
+
+ @decorators.needs_root
+ def activate(self, args, osd_id=None, osd_fsid=None):
+ """
+ :param args: The parsed arguments coming from the CLI
+ :param osd_id: When activating all, this gets populated with an existing OSD ID
+ :param osd_fsid: When activating all, this gets populated with an existing OSD FSID
+ """
+ osd_id = osd_id if osd_id is not None else args.osd_id
+ osd_fsid = osd_fsid if osd_fsid is not None else args.osd_fsid
+
lvs = api.Volumes()
# filter them down for the OSD ID and FSID we need to activate
- if args.osd_id and args.osd_fsid:
- lvs.filter(lv_tags={'ceph.osd_id': args.osd_id, 'ceph.osd_fsid': args.osd_fsid})
- elif args.osd_fsid and not args.osd_id:
- lvs.filter(lv_tags={'ceph.osd_fsid': args.osd_fsid})
+ if osd_id and osd_fsid:
+ lvs.filter(lv_tags={'ceph.osd_id': osd_id, 'ceph.osd_fsid': osd_fsid})
+ elif osd_fsid and not osd_id:
+ lvs.filter(lv_tags={'ceph.osd_fsid': osd_fsid})
if not lvs:
- raise RuntimeError('could not find osd.%s with fsid %s' % (args.osd_id, args.osd_fsid))
+ raise RuntimeError('could not find osd.%s with fsid %s' % (osd_id, osd_fsid))
# This argument is only available when passed in directly or via
# systemd, not when ``create`` is being used
if getattr(args, 'auto_detect_objectstore', False):
logger.info('unable to find a journal associated with the OSD, assuming bluestore')
return activate_bluestore(lvs)
if args.bluestore:
- activate_bluestore(lvs)
+ activate_bluestore(lvs, no_systemd=args.no_systemd)
elif args.filestore:
- activate_filestore(lvs)
- terminal.success("ceph-volume lvm activate successful for osd ID: %s" % args.osd_id)
+ activate_filestore(lvs, no_systemd=args.no_systemd)
def main(self):
sub_command_help = dedent("""
The lvs associated with the OSD need to have been prepared previously,
so that all needed tags and metadata exist.
+ When migrating OSDs, or a multiple-osd activation is needed, the
+ ``--all`` flag can be used instead of the individual ID and FSID:
+
+ ceph-volume lvm activate --all
+
""")
parser = argparse.ArgumentParser(
prog='ceph-volume lvm activate',
parser.add_argument(
'--bluestore',
action='store_true',
- help='filestore objectstore (not yet implemented)',
+ help='bluestore objectstore (default)',
)
parser.add_argument(
'--filestore',
action='store_true',
- help='filestore objectstore (current default)',
+ help='filestore objectstore',
+ )
+ parser.add_argument(
+ '--all',
+ dest='activate_all',
+ action='store_true',
+ help='Activate all OSDs found in the system',
+ )
+ parser.add_argument(
+ '--no-systemd',
+ dest='no_systemd',
+ action='store_true',
+ help='Skip creating and enabling systemd units and starting OSD services',
)
if len(self.argv) == 0:
print(sub_command_help)
# cause both to be True
if not args.bluestore and not args.filestore:
args.bluestore = True
- self.activate(args)
+ if args.activate_all:
+ self.activate_all(args)
+ else:
+ self.activate(args)
help='Enable device encryption via dm-crypt',
)
+ parser.add_argument(
+ '--no-systemd',
+ dest='no_systemd',
+ action='store_true',
+ help='Skip creating and enabling systemd units and starting OSD services when activating',
+ )
+
# Do not parse args, so that consumers can do something before the args get
# parsed triggering argparse behavior
return parser
from ceph_volume import decorators
from ceph_volume.util import disk
from ceph_volume.api import lvm as api
+from ceph_volume.exceptions import MultipleLVsError
logger = logging.getLogger(__name__)
print(''.join(output))
+def direct_report():
+    """
+    Other non-cli consumers of listing information will want to consume the
+    report without the need to parse arguments or other flags. This helper
+    bypasses the need to deal with the class interface which is meant for cli
+    handling.
+
+    Returns the same structure as ``List.full_report()``.
+    """
+    # empty argv: we only want the reporting machinery, not CLI parsing
+    _list = List([])
+    # this is crucial: make sure that all paths will reflect current
+    # information. In the case of a system that has migrated, the disks will
+    # have changed paths
+    _list.update()
+    return _list.full_report()
+
+
class List(object):
help = 'list logical volumes and devices associated with Ceph'
"""
Generate a report for a single device. This can be either a logical
volume in the form of vg/lv or a device with an absolute path like
- /dev/sda1
+ /dev/sda1 or /dev/sda
"""
lvs = api.Volumes()
report = {}
lv = api.get_lv_from_argument(device)
+
+ # check if there was a pv created with the
+ # name of device
+ pv = api.get_pv(pv_name=device)
+ if pv and not lv:
+ try:
+ lv = api.get_lv(vg_name=pv.vg_name)
+ except MultipleLVsError:
+ lvs.filter(vg_name=pv.vg_name)
+ return self.full_report(lvs=lvs)
+
if lv:
try:
_id = lv.tags['ceph.osd_id']
)
return report
- def full_report(self):
+ def full_report(self, lvs=None):
"""
Generate a report for all the logical volumes and associated devices
that have been previously prepared by Ceph
"""
- lvs = api.Volumes()
+ if lvs is None:
+ lvs = api.Volumes()
report = {}
for lv in lvs:
try:
device = prepare_dmcrypt(key, device, 'data', tags)
journal = prepare_dmcrypt(key, journal, 'journal', tags)
+ # vdo detection
+ is_vdo = api.is_vdo(device)
# create the directory
prepare_utils.create_osd_path(osd_id)
# format the device
prepare_utils.format_device(device)
# mount the data device
- prepare_utils.mount_osd(device, osd_id)
+ prepare_utils.mount_osd(device, osd_id, is_vdo=is_vdo)
# symlink the journal
prepare_utils.link_journal(journal, osd_id)
# get the latest monmap
if device_name is None:
return '', '', tags
tags['ceph.type'] = device_type
+ tags['ceph.vdo'] = api.is_vdo(device_name)
lv = self.get_lv(device_name)
if lv:
uuid = lv.lv_uuid
"""
if disk.is_partition(arg) or disk.is_device(arg):
# we must create a vg, and then a single lv
- vg_name = "ceph-%s" % cluster_fsid
- if api.get_vg(vg_name=vg_name):
- # means we already have a group for this, make a different one
- # XXX this could end up being annoying for an operator, maybe?
- vg_name = "ceph-%s" % str(uuid.uuid4())
+ vg_name = "ceph-%s" % str(uuid.uuid4())
api.create_vg(vg_name, arg)
lv_name = "osd-%s-%s" % (device_type, osd_fsid)
return api.create_lv(
tags['ceph.data_uuid'] = data_lv.lv_uuid
tags['ceph.cephx_lockbox_secret'] = cephx_lockbox_secret
tags['ceph.encrypted'] = encrypted
+ tags['ceph.vdo'] = api.is_vdo(data_lv.lv_path)
journal_device, journal_uuid, tags = self.setup_device('journal', args.journal, tags)
tags['ceph.block_uuid'] = block_lv.lv_uuid
tags['ceph.cephx_lockbox_secret'] = cephx_lockbox_secret
tags['ceph.encrypted'] = encrypted
+ tags['ceph.vdo'] = api.is_vdo(block_lv.lv_path)
wal_device, wal_uuid, tags = self.setup_device('wal', args.block_wal, tags)
db_device, db_uuid, tags = self.setup_device('db', args.block_db, tags)
logger = logging.getLogger(__name__)
+def which(executable):
+    """
+    Proxy function to ceph_volume.util.system.which because the ``system``
+    module does import ``process`` -- a module-level import here would
+    create a circular import, hence the function-scope import below.
+    """
+    from ceph_volume.util import system
+    return system.which(executable)
+
+
def log_output(descriptor, message, terminal_logging, logfile_logging):
"""
log output to both the logger and the terminal if terminal_logging is
def obfuscate(command_, on=None):
"""
Certain commands that are useful to log might contain information that
- should be replaced by '*' like when creating OSDs and the keyryings are
+ should be replaced by '*' like when creating OSDs and the keyrings are
being passed, which should not be logged.
:param on: A string (will match a flag) or an integer (will match an index)
stop_on_error = kw.pop('stop_on_error', True)
command_msg = obfuscate(command, kw.pop('obfuscate', None))
fail_msg = kw.pop('fail_msg', None)
+ executable = which(command.pop(0))
+ command.insert(0, executable)
logger.info(command_msg)
terminal.write(command_msg)
terminal_logging = kw.pop('terminal_logging', True)
:param terminal_verbose: Log command output to terminal, defaults to False, and
it is forcefully set to True if a return code is non-zero
:param logfile_verbose: Log stderr/stdout output to log file. Defaults to True
+ :param verbose_on_failure: On a non-zero exit status, it will forcefully set logging ON for
+ the terminal. Defaults to True
"""
terminal_verbose = kw.pop('terminal_verbose', False)
logfile_verbose = kw.pop('logfile_verbose', True)
+ verbose_on_failure = kw.pop('verbose_on_failure', True)
show_command = kw.pop('show_command', False)
+ executable = which(command.pop(0))
+ command.insert(0, executable)
command_msg = "Running command: %s" % ' '.join(command)
stdin = kw.pop('stdin', None)
logger.info(command_msg)
if returncode != 0:
# set to true so that we can log the stderr/stdout that callers would
- # do anyway
- terminal_verbose = True
+ # do anyway as long as verbose_on_failure is set (defaults to True)
+ if verbose_on_failure:
+ terminal_verbose = True
+ # logfiles aren't disruptive visually, unlike the terminal, so this
+ # should always be on when there is a failure
logfile_verbose = True
# the following can get a messed up order in the log if the system call
process.run(['systemctl', 'mask', unit])
+def is_active(unit):
+    # True if systemd reports the unit as active (``systemctl is-active``
+    # exits 0).  verbose_on_failure=False: an inactive unit is an expected
+    # outcome here, not an error worth echoing to the terminal.
+    out, err, rc = process.call(
+        ['systemctl', 'is-active', unit],
+        verbose_on_failure=False
+    )
+    return rc == 0
+
+
def start_osd(id_):
return start(osd_unit % id_)
return disable(osd_unit % id_)
+def osd_is_active(id_):
+    # Check whether the OSD systemd unit for this ID is currently active
+    return is_active(osd_unit % id_)
+
+
def enable_volume(id_, fsid, device_type='lvm'):
return enable(volume_unit % (device_type, id_, fsid))
# systemctl allows using a glob like '*' for masking, but there was a bug
# in that it wouldn't allow this for service templates. This means that
# masking ceph-disk@* will not work, so we must link the service directly.
- # /etc/systemd takes precendence regardless of the location of the unit
+ # /etc/systemd takes precedence regardless of the location of the unit
process.run(
['ln', '-sf', '/dev/null', '/etc/systemd/system/ceph-disk@.service']
)
+import os
import pytest
from ceph_volume import process, exceptions
from ceph_volume.api import lvm as api
api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'})
data_tag = ['lvchange', '--addtag', 'ceph.data_device=/path', '/path']
assert capture.calls[2]['args'][0] == data_tag
+
+
+#
+# The following tests are pretty gnarly. VDO detection is very convoluted and
+# involves correlating information from device mappers, realpaths, slaves of
+# those mappers, and parents or related mappers. This makes it very hard to
+# patch nicely or keep tests short and readable. These tests are trying to
+# ensure correctness, the better approach will be to do some functional testing
+# with VDO.
+#
+
+
+@pytest.fixture
+def disable_kvdo_path(monkeypatch):
+    # Force os.path.isdir to False so _is_vdo sees no /sys/kvdo tree
+    monkeypatch.setattr('os.path.isdir', lambda x: False)
+
+
+@pytest.fixture
+def enable_kvdo_path(monkeypatch):
+    # Force os.path.isdir to True so _is_vdo believes /sys/kvdo exists
+    monkeypatch.setattr('os.path.isdir', lambda x: True)
+
+
+# Stub for os.listdir
+
+
+class ListDir(object):
+    """
+    Stub for os.listdir: listings of registered (fake) paths are redirected
+    to their mapped real directories.  Unregistered paths raise KeyError.
+    """
+
+    def __init__(self, paths):
+        self.paths = paths
+        self._normalize_paths()
+        # keep a reference to the real os.listdir before it is monkeypatched
+        self.listdir = os.listdir
+
+    def _normalize_paths(self):
+        # strip trailing slashes from both keys and values
+        # NOTE(review): inserts keys while iterating self.paths.items();
+        # fine on Python 2 (items() returns a list), would break under
+        # Python 3's dict views -- confirm before porting
+        for k, v in self.paths.items():
+            self.paths[k.rstrip('/')] = v.rstrip('/')
+
+    def add(self, original, fake):
+        # register one more fake -> real directory mapping
+        self.paths[original.rstrip('/')] = fake.rstrip('/')
+
+    def __call__(self, path):
+        # list the real directory registered for the requested fake path
+        return self.listdir(self.paths[path.rstrip('/')])
+
+
+@pytest.fixture(scope='function')
+def listdir(monkeypatch):
+    """
+    Fixture factory: monkeypatches os.listdir with a ListDir stub that maps
+    fake paths to real (tmpdir-backed) directories.
+    """
+    def apply(paths=None, stub=None):
+        # build a stub from paths when one wasn't provided
+        if not stub:
+            stub = ListDir(paths)
+        # NOTE(review): when stub was just built from paths, this re-adds
+        # the same mappings; harmless since add() overwrites in place
+        if paths:
+            for original, fake in paths.items():
+                stub.add(original, fake)
+
+        monkeypatch.setattr('os.listdir', stub)
+    return apply
+
+
+@pytest.fixture(scope='function')
+def makedirs(tmpdir):
+    # Factory that creates (possibly nested) directories under this test's
+    # tmpdir and returns their absolute path; ``create.base`` exposes the
+    # tmpdir root itself.
+    def create(directory):
+        path = os.path.join(str(tmpdir), directory)
+        os.makedirs(path)
+        return path
+    create.base = str(tmpdir)
+    return create
+
+
+class TestIsVdo(object):
+    # Tests for api._is_vdo / api.is_vdo.  The sysfs walking is stubbed out
+    # via the listdir/makedirs fixtures and monkeypatched helpers, since the
+    # real code reads /sys/kvdo and /sys/block.
+
+    def test_no_vdo_dir(self, disable_kvdo_path):
+        # without /sys/kvdo, detection short-circuits to False
+        assert api._is_vdo('/path') is False
+
+    def test_exceptions_return_false(self, monkeypatch):
+        # any exception inside _is_vdo must be swallowed and reported as '0'
+        def throw():
+            raise Exception()
+        monkeypatch.setattr('ceph_volume.api.lvm._is_vdo', throw)
+        assert api.is_vdo('/path') == '0'
+
+    def test_is_vdo_returns_a_string(self, monkeypatch):
+        # the public wrapper returns tag-friendly strings, not booleans
+        monkeypatch.setattr('ceph_volume.api.lvm._is_vdo', lambda x: True)
+        assert api.is_vdo('/path') == '1'
+
+    def test_kvdo_dir_no_devices(self, makedirs, enable_kvdo_path, listdir, monkeypatch):
+        # /sys/kvdo exists but no slaves/parents correlate -> not VDO
+        kvdo_path = makedirs('sys/kvdo')
+        listdir(paths={'/sys/kvdo': kvdo_path})
+        monkeypatch.setattr('ceph_volume.api.lvm._vdo_slaves', lambda x: [])
+        monkeypatch.setattr('ceph_volume.api.lvm._vdo_parents', lambda x: [])
+        assert api._is_vdo('/dev/mapper/vdo0') is False
+
+    def test_vdo_slaves_found_and_matched(self, makedirs, enable_kvdo_path, listdir, monkeypatch):
+        # a slave device matching the queried path marks it as VDO
+        kvdo_path = makedirs('sys/kvdo')
+        listdir(paths={'/sys/kvdo': kvdo_path})
+        monkeypatch.setattr('ceph_volume.api.lvm._vdo_slaves', lambda x: ['/dev/dm-3'])
+        monkeypatch.setattr('ceph_volume.api.lvm._vdo_parents', lambda x: [])
+        assert api._is_vdo('/dev/dm-3') is True
+
+    def test_vdo_parents_found_and_matched(self, makedirs, enable_kvdo_path, listdir, monkeypatch):
+        # a parent device matching the queried path also marks it as VDO
+        kvdo_path = makedirs('sys/kvdo')
+        listdir(paths={'/sys/kvdo': kvdo_path})
+        monkeypatch.setattr('ceph_volume.api.lvm._vdo_slaves', lambda x: [])
+        monkeypatch.setattr('ceph_volume.api.lvm._vdo_parents', lambda x: ['/dev/dm-4'])
+        assert api._is_vdo('/dev/dm-4') is True
+
+
+class TestVdoSlaves(object):
+    # Tests for api._vdo_slaves, driven by fake /sys/block trees
+
+    def test_slaves_are_not_found(self, makedirs, listdir, monkeypatch):
+        # an empty slaves/ dir still yields the mapper path and vdo name
+        slaves_path = makedirs('sys/block/vdo0/slaves')
+        listdir(paths={'/sys/block/vdo0/slaves': slaves_path})
+        monkeypatch.setattr('ceph_volume.api.lvm.os.path.exists', lambda x: True)
+        result = sorted(api._vdo_slaves(['vdo0']))
+        assert '/dev/mapper/vdo0' in result
+        assert 'vdo0' in result
+
+    def test_slaves_are_found(self, makedirs, listdir, monkeypatch):
+        # a dm-4 entry under slaves/ is reported in both /dev and bare form
+        slaves_path = makedirs('sys/block/vdo0/slaves')
+        makedirs('sys/block/vdo0/slaves/dm-4')
+        makedirs('dev/mapper/vdo0')
+        listdir(paths={'/sys/block/vdo0/slaves': slaves_path})
+        monkeypatch.setattr('ceph_volume.api.lvm.os.path.exists', lambda x: True)
+        result = sorted(api._vdo_slaves(['vdo0']))
+        assert '/dev/dm-4' in result
+        assert 'dm-4' in result
+
+
+class TestVDOParents(object):
+    # Tests for api._vdo_parents, driven by fake /sys/block trees
+
+    def test_parents_are_found(self, makedirs, listdir):
+        # dm-4 has slave dm-3, so querying dm-3 reports dm-4 as a parent
+        block_path = makedirs('sys/block')
+        slaves_path = makedirs('sys/block/dm-4/slaves')
+        makedirs('sys/block/dm-4/slaves/dm-3')
+        listdir(paths={
+            '/sys/block/dm-4/slaves': slaves_path,
+            '/sys/block': block_path})
+        result = api._vdo_parents(['dm-3'])
+        assert '/dev/dm-4' in result
+        assert 'dm-4' in result
+
+    def test_parents_are_not_found(self, makedirs, listdir):
+        # dm-4's only slave is dm-5, so dm-3 has no parents
+        block_path = makedirs('sys/block')
+        slaves_path = makedirs('sys/block/dm-4/slaves')
+        makedirs('sys/block/dm-4/slaves/dm-5')
+        listdir(paths={
+            '/sys/block/dm-4/slaves': slaves_path,
+            '/sys/block': block_path})
+        result = api._vdo_parents(['dm-3'])
+        assert result == []
import pytest
from ceph_volume.devices.lvm import activate
from ceph_volume.api import lvm as api
+from ceph_volume.tests.conftest import Capture
class Args(object):
# default flags
self.bluestore = False
self.filestore = False
+ self.no_systemd = False
self.auto_detect_objectstore = None
for k, v in kw.items():
setattr(self, k, v)
with pytest.raises(RuntimeError):
activate.Activate([]).activate(args)
+ def test_filestore_no_systemd(self, is_root, volumes, monkeypatch, capture):
+ fake_enable = Capture()
+ fake_start_osd = Capture()
+ monkeypatch.setattr('ceph_volume.util.system.device_is_mounted', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.util.system.chown', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.process.run', lambda *a, **kw: True)
+ monkeypatch.setattr(activate.systemctl, 'enable_volume', fake_enable)
+ monkeypatch.setattr(activate.systemctl, 'start_osd', fake_start_osd)
+ JournalVolume = api.Volume(
+ lv_name='journal',
+ lv_path='/dev/vg/journal',
+ lv_uuid='000',
+ lv_tags=','.join([
+ "ceph.cluster_name=ceph", "ceph.journal_device=/dev/vg/journal",
+ "ceph.journal_uuid=000", "ceph.type=journal",
+ "ceph.osd_id=0","ceph.osd_fsid=1234"])
+ )
+ DataVolume = api.Volume(
+ lv_name='data',
+ lv_path='/dev/vg/data',
+ lv_tags="ceph.cluster_name=ceph,ceph.journal_device=/dev/vg/journal,ceph.journal_uuid=000,ceph.type=data,ceph.osd_id=0,ceph.osd_fsid=1234")
+ volumes.append(DataVolume)
+ volumes.append(JournalVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ args = Args(osd_id=None, osd_fsid='1234', no_systemd=True, filestore=True)
+ activate.Activate([]).activate(args)
+ assert fake_enable.calls == []
+ assert fake_start_osd.calls == []
+
+ def test_filestore_systemd(self, is_root, volumes, monkeypatch, capture):
+ fake_enable = Capture()
+ fake_start_osd = Capture()
+ monkeypatch.setattr('ceph_volume.util.system.device_is_mounted', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.util.system.chown', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.process.run', lambda *a, **kw: True)
+ monkeypatch.setattr(activate.systemctl, 'enable_volume', fake_enable)
+ monkeypatch.setattr(activate.systemctl, 'start_osd', fake_start_osd)
+ JournalVolume = api.Volume(
+ lv_name='journal',
+ lv_path='/dev/vg/journal',
+ lv_uuid='000',
+ lv_tags=','.join([
+ "ceph.cluster_name=ceph", "ceph.journal_device=/dev/vg/journal",
+ "ceph.journal_uuid=000", "ceph.type=journal",
+ "ceph.osd_id=0","ceph.osd_fsid=1234"])
+ )
+ DataVolume = api.Volume(
+ lv_name='data',
+ lv_path='/dev/vg/data',
+ lv_tags="ceph.cluster_name=ceph,ceph.journal_device=/dev/vg/journal,ceph.journal_uuid=000,ceph.type=data,ceph.osd_id=0,ceph.osd_fsid=1234")
+ volumes.append(DataVolume)
+ volumes.append(JournalVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ args = Args(osd_id=None, osd_fsid='1234', no_systemd=False, filestore=True)
+ activate.Activate([]).activate(args)
+ assert fake_enable.calls != []
+ assert fake_start_osd.calls != []
+
+ def test_bluestore_no_systemd(self, is_root, volumes, monkeypatch, capture):
+ fake_enable = Capture()
+ fake_start_osd = Capture()
+ monkeypatch.setattr('ceph_volume.util.system.path_is_mounted', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.util.system.chown', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.process.run', lambda *a, **kw: True)
+ monkeypatch.setattr(activate.systemctl, 'enable_volume', fake_enable)
+ monkeypatch.setattr(activate.systemctl, 'start_osd', fake_start_osd)
+ DataVolume = api.Volume(
+ lv_name='data',
+ lv_path='/dev/vg/data',
+ lv_tags="ceph.cluster_name=ceph,,ceph.journal_uuid=000,ceph.type=block,ceph.osd_id=0,ceph.osd_fsid=1234")
+ volumes.append(DataVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ args = Args(osd_id=None, osd_fsid='1234', no_systemd=True, bluestore=True)
+ activate.Activate([]).activate(args)
+ assert fake_enable.calls == []
+ assert fake_start_osd.calls == []
+
+ def test_bluestore_systemd(self, is_root, volumes, monkeypatch, capture):
+ fake_enable = Capture()
+ fake_start_osd = Capture()
+ monkeypatch.setattr('ceph_volume.util.system.path_is_mounted', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.util.system.chown', lambda *a, **kw: True)
+ monkeypatch.setattr('ceph_volume.process.run', lambda *a, **kw: True)
+ monkeypatch.setattr(activate.systemctl, 'enable_volume', fake_enable)
+ monkeypatch.setattr(activate.systemctl, 'start_osd', fake_start_osd)
+ DataVolume = api.Volume(
+ lv_name='data',
+ lv_path='/dev/vg/data',
+ lv_tags="ceph.cluster_name=ceph,,ceph.journal_uuid=000,ceph.type=block,ceph.osd_id=0,ceph.osd_fsid=1234")
+ volumes.append(DataVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ args = Args(osd_id=None, osd_fsid='1234', no_systemd=False, bluestore=True)
+ activate.Activate([]).activate(args)
+ assert fake_enable.calls != []
+ assert fake_start_osd.calls != []
+
class TestActivateFlags(object):
parsed_args = capture.calls[0]['args'][0]
assert parsed_args.filestore is False
assert parsed_args.bluestore is True
+
+
+class TestActivateAll(object):
+
+ def test_does_not_detect_osds(self, capsys, is_root, capture, monkeypatch):
+ monkeypatch.setattr('ceph_volume.devices.lvm.activate.direct_report', lambda: {})
+ args = ['--all']
+ activation = activate.Activate(args)
+ activation.main()
+ out, err = capsys.readouterr()
+ assert 'Was unable to find any OSDs to activate' in out
+ assert 'Verify OSDs are present with ' in out
+
+ def test_detects_running_osds(self, capsys, is_root, capture, monkeypatch):
+ monkeypatch.setattr('ceph_volume.devices.lvm.activate.direct_report', lambda: direct_report)
+ monkeypatch.setattr('ceph_volume.devices.lvm.activate.systemctl.osd_is_active', lambda x: True)
+ args = ['--all']
+ activation = activate.Activate(args)
+ activation.main()
+ out, err = capsys.readouterr()
+ assert 'a8789a96ce8b process is active. Skipping activation' in out
+ assert 'b8218eaa1634 process is active. Skipping activation' in out
+
+ def test_detects_osds_to_activate(self, is_root, capture, monkeypatch):
+ monkeypatch.setattr('ceph_volume.devices.lvm.activate.direct_report', lambda: direct_report)
+ monkeypatch.setattr('ceph_volume.devices.lvm.activate.systemctl.osd_is_active', lambda x: False)
+ args = ['--all']
+ activation = activate.Activate(args)
+ activation.activate = capture
+ activation.main()
+ calls = sorted(capture.calls, key=lambda x: x['kwargs']['osd_id'])
+ assert calls[0]['kwargs']['osd_id'] == '0'
+ assert calls[0]['kwargs']['osd_fsid'] == '957d22b7-24ce-466a-9883-b8218eaa1634'
+ assert calls[1]['kwargs']['osd_id'] == '1'
+ assert calls[1]['kwargs']['osd_fsid'] == 'd0f3e4ad-e52a-4520-afc0-a8789a96ce8b'
+
+#
+# Activate All fixture
+#
+
+direct_report = {
+ "0": [
+ {
+ "lv_name": "osd-block-957d22b7-24ce-466a-9883-b8218eaa1634",
+ "lv_path": "/dev/ceph-d4962338-46ff-4cd5-8ea6-c033dbdc5b44/osd-block-957d22b7-24ce-466a-9883-b8218eaa1634",
+ "lv_tags": "ceph.block_device=/dev/ceph-d4962338-46ff-4cd5-8ea6-c033dbdc5b44/osd-block-957d22b7-24ce-466a-9883-b8218eaa1634,ceph.block_uuid=6MixOd-2Q1I-f8K3-PPOq-UJGV-L3A0-0XwUm4,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=d4962338-46ff-4cd5-8ea6-c033dbdc5b44,ceph.cluster_name=ceph,ceph.crush_device_class=None,ceph.encrypted=0,ceph.osd_fsid=957d22b7-24ce-466a-9883-b8218eaa1634,ceph.osd_id=0,ceph.type=block",
+ "lv_uuid": "6MixOd-2Q1I-f8K3-PPOq-UJGV-L3A0-0XwUm4",
+ "name": "osd-block-957d22b7-24ce-466a-9883-b8218eaa1634",
+ "path": "/dev/ceph-d4962338-46ff-4cd5-8ea6-c033dbdc5b44/osd-block-957d22b7-24ce-466a-9883-b8218eaa1634",
+ "tags": {
+ "ceph.block_device": "/dev/ceph-d4962338-46ff-4cd5-8ea6-c033dbdc5b44/osd-block-957d22b7-24ce-466a-9883-b8218eaa1634",
+ "ceph.block_uuid": "6MixOd-2Q1I-f8K3-PPOq-UJGV-L3A0-0XwUm4",
+ "ceph.cephx_lockbox_secret": "",
+ "ceph.cluster_fsid": "d4962338-46ff-4cd5-8ea6-c033dbdc5b44",
+ "ceph.cluster_name": "ceph",
+ "ceph.crush_device_class": "None",
+ "ceph.encrypted": "0",
+ "ceph.osd_fsid": "957d22b7-24ce-466a-9883-b8218eaa1634",
+ "ceph.osd_id": "0",
+ "ceph.type": "block"
+ },
+ "type": "block",
+ "vg_name": "ceph-d4962338-46ff-4cd5-8ea6-c033dbdc5b44"
+ }
+ ],
+ "1": [
+ {
+ "lv_name": "osd-block-d0f3e4ad-e52a-4520-afc0-a8789a96ce8b",
+ "lv_path": "/dev/ceph-7538bcf0-f155-4d3f-a9fd-d8b15905e532/osd-block-d0f3e4ad-e52a-4520-afc0-a8789a96ce8b",
+ "lv_tags": "ceph.block_device=/dev/ceph-7538bcf0-f155-4d3f-a9fd-d8b15905e532/osd-block-d0f3e4ad-e52a-4520-afc0-a8789a96ce8b,ceph.block_uuid=1igwLb-ZlmV-eLgp-hapx-c1Hr-M5gz-sHjnyW,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=d4962338-46ff-4cd5-8ea6-c033dbdc5b44,ceph.cluster_name=ceph,ceph.crush_device_class=None,ceph.encrypted=0,ceph.osd_fsid=d0f3e4ad-e52a-4520-afc0-a8789a96ce8b,ceph.osd_id=1,ceph.type=block",
+ "lv_uuid": "1igwLb-ZlmV-eLgp-hapx-c1Hr-M5gz-sHjnyW",
+ "name": "osd-block-d0f3e4ad-e52a-4520-afc0-a8789a96ce8b",
+ "path": "/dev/ceph-7538bcf0-f155-4d3f-a9fd-d8b15905e532/osd-block-d0f3e4ad-e52a-4520-afc0-a8789a96ce8b",
+ "tags": {
+ "ceph.block_device": "/dev/ceph-7538bcf0-f155-4d3f-a9fd-d8b15905e532/osd-block-d0f3e4ad-e52a-4520-afc0-a8789a96ce8b",
+ "ceph.block_uuid": "1igwLb-ZlmV-eLgp-hapx-c1Hr-M5gz-sHjnyW",
+ "ceph.cephx_lockbox_secret": "",
+ "ceph.cluster_fsid": "d4962338-46ff-4cd5-8ea6-c033dbdc5b44",
+ "ceph.cluster_name": "ceph",
+ "ceph.crush_device_class": "None",
+ "ceph.encrypted": "0",
+ "ceph.osd_fsid": "d0f3e4ad-e52a-4520-afc0-a8789a96ce8b",
+ "ceph.osd_id": "1",
+ "ceph.type": "block"
+ },
+ "type": "block",
+ "vg_name": "ceph-7538bcf0-f155-4d3f-a9fd-d8b15905e532"
+ }
+ ]
+}
client.vm.provider :libvirt do |lv|
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
rgw.vm.provider :libvirt do |lv|
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
nfs.vm.provider :libvirt do |lv|
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
mds.vm.provider :libvirt do |lv|
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
mds.vm.provider "parallels" do |prl|
rbd_mirror.vm.provider :libvirt do |lv|
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
rbd_mirror.vm.provider "parallels" do |prl|
iscsi_gw.vm.provider :libvirt do |lv|
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
iscsi_gw.vm.provider "parallels" do |prl|
mon.vm.provider :libvirt do |lv|
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
end
lv.memory = MEMORY
lv.random_hostname = true
+ lv.nic_model_type = "e1000"
end
# Parallels
name: ceph-osd@2
state: stopped
- - name: destroy osd.2
+ - name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
- - name: zap /dev/sdd1
+ - name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.2 using /dev/sdd1
+ - name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume lvm create --bluestore --data /dev/sdd1 --osd-id 2"
environment:
CEPH_VOLUME_DEBUG: 1
name: ceph-osd@0
state: stopped
- - name: destroy osd.0
+ - name: destroy osd.0
command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
- - name: zap test_group/data-lv1
+ - name: zap test_group/data-lv1
command: "ceph-volume lvm zap test_group/data-lv1"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.0 using test_group/data-lv1
+ - name: redeploy osd.0 using test_group/data-lv1
command: "ceph-volume lvm create --bluestore --data test_group/data-lv1 --osd-id 0"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+ - name: zap test_group/data-lv1
+ command: "ceph-volume lvm zap test_group/data-lv1"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: prepare osd.0 using test_group/data-lv1
+ command: "ceph-volume lvm prepare --bluestore --data test_group/data-lv1 --osd-id 0"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: activate all to start the previously prepared osd.0
+ command: "ceph-volume lvm activate --all"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
name: ceph-osd@2
state: stopped
- - name: destroy osd.2
+ - name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
- - name: zap /dev/sdd1
+ - name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: zap /dev/sdd2
+ - name: zap /dev/sdd2
command: "ceph-volume lvm zap /dev/sdd2 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.2 using /dev/sdd1
+ - name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+ - name: zap test_group/data-lv1
+ command: "ceph-volume lvm zap test_group/data-lv1"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: zap /dev/sdc1
+ command: "ceph-volume lvm zap /dev/sdc1 --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: prepare osd.0 again using test_group/data-lv1
+ command: "ceph-volume lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: activate all to start the previously prepared osd.0
+ command: "ceph-volume lvm activate --filestore --all"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
name: ceph-osd@2
state: stopped
- - name: destroy osd.2
+ - name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
- - name: zap /dev/sdd1
+ - name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.2 using /dev/sdd1
+ - name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume lvm create --bluestore --data /dev/sdd1 --osd-id 2"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+ - name: zap test_group/data-lv1
+ command: "ceph-volume lvm zap test_group/data-lv1"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: prepare osd.0 again using test_group/data-lv1
+ command: "ceph-volume lvm prepare --bluestore --data test_group/data-lv1 --osd-id 0"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: activate all to start the previously prepared osd.0
+ command: "ceph-volume lvm activate --all"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
name: ceph-osd@2
state: stopped
- - name: destroy osd.2
+ - name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
- - name: zap /dev/sdd1
+ - name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: zap /dev/sdd2
+ - name: zap /dev/sdd2
command: "ceph-volume lvm zap /dev/sdd2 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.2 using /dev/sdd1
+ - name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+ - name: zap test_group/data-lv1
+ command: "ceph-volume lvm zap test_group/data-lv1"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: zap /dev/sdc1
+ command: "ceph-volume lvm zap /dev/sdc1 --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: prepare osd.0 again using test_group/data-lv1
+ command: "ceph-volume lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: activate all to start the previously prepared osd.0
+ command: "ceph-volume lvm activate --filestore --all"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
passenv=*
setenv=
ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
+ ANSIBLE_ACTION_PLUGINS = {envdir}/tmp/ceph-ansible/plugins/actions
ANSIBLE_STDOUT_CALLBACK = debug
ANSIBLE_RETRY_FILES_ENABLED = False
ANSIBLE_SSH_RETRIES = 5
# test cluster state using ceph-ansible tests
testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
- # reboot all vms
- vagrant reload --no-provision
+ # reboot all vms - attempt
+ bash {toxinidir}/../scripts/vagrant_reload.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox}
# retest to ensure cluster came back up correctly after rebooting
testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
command: "ceph-volume lvm create --bluestore --data test_group/data-lv1 --osd-id 0"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+ - name: zap test_group/data-lv1
+ command: "ceph-volume lvm zap test_group/data-lv1"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: prepare osd.0 using test_group/data-lv1
+ command: "ceph-volume lvm prepare --bluestore --data test_group/data-lv1 --osd-id 0"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: activate all to start the previously prepared osd.0
+ command: "ceph-volume lvm activate --all"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
name: ceph-osd@2
state: stopped
- - name: destroy osd.2
+ - name: destroy osd.2
command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
- - name: zap /dev/sdd1
+ - name: zap /dev/sdd1
command: "ceph-volume lvm zap /dev/sdd1 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: zap /dev/sdd2
+ - name: zap /dev/sdd2
command: "ceph-volume lvm zap /dev/sdd2 --destroy"
environment:
CEPH_VOLUME_DEBUG: 1
- - name: redeploy osd.2 using /dev/sdd1
+ - name: redeploy osd.2 using /dev/sdd1
command: "ceph-volume lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+ - name: destroy osd.0
+ command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+ - name: zap test_group/data-lv1
+ command: "ceph-volume lvm zap test_group/data-lv1"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: zap /dev/sdc1
+ command: "ceph-volume lvm zap /dev/sdc1 --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: prepare osd.0 again using test_group/data-lv1
+ command: "ceph-volume lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: activate all to start the previously prepared osd.0
+ command: "ceph-volume lvm activate --filestore --all"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
--- /dev/null
+#!/bin/bash
+
+# vagrant-libvirt has a common behavior where it times out when "reloading" vms. Instead
+# of calling `vagrant reload` attempt to halt everything, and then start everything, which gives
+# this script the ability to try the `vagrant up` again in case of failure
+#
+
+vagrant halt
+# This should not really be needed, but in case of a possible race condition between halt
+# and up, it might improve things
+sleep 5
+
+
+retries=0
+until [ $retries -ge 5 ]
+do
+ echo "Attempting to start VMs. Attempts: $retries"
+ timeout 10m vagrant up "$@" && break
+ retries=$[$retries+1]
+ sleep 5
+done
passenv=*
setenv=
ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
+ ANSIBLE_ACTION_PLUGINS = {envdir}/tmp/ceph-ansible/plugins/actions
ANSIBLE_STDOUT_CALLBACK = debug
ANSIBLE_RETRY_FILES_ENABLED = False
ANSIBLE_SSH_RETRIES = 5
ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml
# reboot all vms
- vagrant reload --no-provision
+ bash {toxinidir}/../scripts/vagrant_reload.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox}
# wait 2 minutes for services to be ready
sleep 120
--- /dev/null
+import pytest
+from ceph_volume.tests.conftest import Factory
+from ceph_volume import process
+
+
+@pytest.fixture
+def mock_call(monkeypatch):
+ """
+    Monkeypatches subprocess.Popen as used by process.call, so that a caller can add behavior to the response
+ """
+ def apply(stdout=None, stderr=None, returncode=0):
+ stdout_stream = Factory(read=lambda: stdout)
+ stderr_stream = Factory(read=lambda: stderr)
+ return_value = Factory(
+ stdout=stdout_stream,
+ stderr=stderr_stream,
+ wait=lambda: returncode,
+ communicate=lambda x: (stdout, stderr, returncode)
+ )
+
+ monkeypatch.setattr(
+ 'ceph_volume.process.subprocess.Popen',
+ lambda *a, **kw: return_value)
+
+ return apply
+
+
+class TestCall(object):
+
+ def test_stderr_terminal_and_logfile(self, mock_call, caplog, capsys):
+ mock_call(stdout='stdout\n', stderr='some stderr message\n')
+ process.call(['ls'], terminal_verbose=True)
+ out, err = capsys.readouterr()
+ log_lines = [line[-1] for line in caplog.record_tuples]
+ assert 'Running command: ' in log_lines[0]
+ assert 'ls' in log_lines[0]
+ assert 'stderr some stderr message' in log_lines[-1]
+ assert 'some stderr message' in out
+
+ def test_stderr_terminal_and_logfile_off(self, mock_call, caplog, capsys):
+ mock_call(stdout='stdout\n', stderr='some stderr message\n')
+ process.call(['ls'], terminal_verbose=False)
+ out, err = capsys.readouterr()
+ log_lines = [line[-1] for line in caplog.record_tuples]
+ assert 'Running command: ' in log_lines[0]
+ assert 'ls' in log_lines[0]
+ assert 'stderr some stderr message' in log_lines[-1]
+ assert out == ''
+
+ def test_verbose_on_failure(self, mock_call, caplog, capsys):
+ mock_call(stdout='stdout\n', stderr='stderr\n', returncode=1)
+ process.call(['ls'], terminal_verbose=False, logfile_verbose=False)
+ out, err = capsys.readouterr()
+ log_lines = '\n'.join([line[-1] for line in caplog.record_tuples])
+ assert 'Running command: ' in log_lines
+ assert 'ls' in log_lines
+ assert 'stderr' in log_lines
+ assert 'stdout: stdout' in out
+
+ def test_silent_verbose_on_failure(self, mock_call, caplog, capsys):
+ mock_call(stdout='stdout\n', stderr='stderr\n', returncode=1)
+ process.call(['ls'], verbose_on_failure=False)
+ out, err = capsys.readouterr()
+ log_lines = '\n'.join([line[-1] for line in caplog.record_tuples])
+ assert 'Running command: ' in log_lines
+ assert 'ls' in log_lines
+ assert 'stderr' in log_lines
+ assert out == ''
@pytest.mark.parametrize("flags", ceph_conf_mount_values)
def test_normalize_lists(self, flags):
- result = prepare._normalize_mount_flags(flags)
- assert result == 'rw,auto,exec'
+ result = sorted(prepare._normalize_mount_flags(flags).split(','))
+ assert ','.join(result) == 'auto,exec,rw'
@pytest.mark.parametrize("flags", string_mount_values)
def test_normalize_strings(self, flags):
- result = prepare._normalize_mount_flags(flags)
- assert result == 'rw,auto,exec'
+ result = sorted(prepare._normalize_mount_flags(flags).split(','))
+ assert ','.join(result) == 'auto,exec,rw'
+
+ @pytest.mark.parametrize("flags", ceph_conf_mount_values)
+ def test_normalize_extra_flags(self, flags):
+ result = prepare._normalize_mount_flags(flags, extras=['discard'])
+ assert sorted(result.split(',')) == ['auto', 'discard', 'exec', 'rw']
+
+ @pytest.mark.parametrize("flags", ceph_conf_mount_values)
+ def test_normalize_duplicate_extra_flags(self, flags):
+ result = prepare._normalize_mount_flags(flags, extras=['rw', 'discard'])
+ assert sorted(result.split(',')) == ['auto', 'discard', 'exec', 'rw']
+
+ @pytest.mark.parametrize("flags", string_mount_values)
+ def test_normalize_strings_flags(self, flags):
+ result = sorted(prepare._normalize_mount_flags(flags, extras=['discard']).split(','))
+ assert ','.join(result) == 'auto,discard,exec,rw'
+
+ @pytest.mark.parametrize("flags", string_mount_values)
+ def test_normalize_strings_duplicate_flags(self, flags):
+ result = sorted(prepare._normalize_mount_flags(flags, extras=['discard','rw']).split(','))
+ assert ','.join(result) == 'auto,discard,exec,rw'
def test_is_not_binary(self, tmpfile):
binary_path = tmpfile(contents='asd\n\nlkjh0')
assert system.is_binary(binary_path) is False
+
+
+class TestWhich(object):
+
+ def test_executable_exists_but_is_not_file(self, monkeypatch):
+ monkeypatch.setattr(system.os.path, 'isfile', lambda x: False)
+ monkeypatch.setattr(system.os.path, 'exists', lambda x: True)
+ assert system.which('exedir') == 'exedir'
+
+ def test_executable_does_not_exist(self, monkeypatch):
+ monkeypatch.setattr(system.os.path, 'isfile', lambda x: False)
+ monkeypatch.setattr(system.os.path, 'exists', lambda x: False)
+ assert system.which('exedir') == 'exedir'
+
+ def test_executable_exists_as_file(self, monkeypatch):
+ monkeypatch.setattr(system.os.path, 'isfile', lambda x: True)
+ monkeypatch.setattr(system.os.path, 'exists', lambda x: True)
+ assert system.which('ceph') == '/usr/local/bin/ceph'
+
+ def test_warnings_when_executable_isnt_matched(self, monkeypatch, capsys):
+ monkeypatch.setattr(system.os.path, 'isfile', lambda x: True)
+ monkeypatch.setattr(system.os.path, 'exists', lambda x: False)
+ system.which('exedir')
+ stdout, stderr = capsys.readouterr()
+ assert 'Absolute path not found for executable: exedir' in stdout
+ assert 'Ensure $PATH environment variable contains common executable locations' in stdout
return _lsblk_parser(' '.join(out))
-def _lsblk_type(device):
- """
- Helper function that will use the ``TYPE`` label output of ``lsblk`` to determine
- if a device is a partition or disk.
- It does not process the output to return a boolean, but it does process it to return the
- """
- out, err, rc = process.call(
- ['blkid', '-s', 'PARTUUID', '-o', 'value', device]
- )
- return ' '.join(out).strip()
-
-
def is_device(dev):
"""
Boolean to determine if a given device is a block device (**not**
process.run(command)
-def _normalize_mount_flags(flags):
+def _normalize_mount_flags(flags, extras=None):
"""
Mount flag options have to be a single string, separated by a comma. If the
flags are separated by spaces, or with commas and spaces in ceph.conf, the
[" rw ,", "exec"]
:param flags: A list of flags, or a single string of mount flags
+ :param extras: Extra set of mount flags, useful when custom devices like VDO need
+ ad-hoc mount configurations
"""
+ # Instead of using set(), we append to this new list here, because set()
+ # will create an arbitrary order on the items that is made worst when
+ # testing with tools like tox that includes a randomizer seed. By
+ # controlling the order, it is easier to correctly assert the expectation
+ unique_flags = []
if isinstance(flags, list):
+ if extras:
+ flags.extend(extras)
+
# ensure that spaces and commas are removed so that they can join
- # correctly
- return ','.join([f.strip().strip(',') for f in flags if f])
+ # correctly, remove duplicates
+ for f in flags:
+ if f and f not in unique_flags:
+ unique_flags.append(f.strip().strip(','))
+ return ','.join(unique_flags)
# split them, clean them, and join them back again
flags = flags.strip().split(' ')
- return ','.join(
- [f.strip().strip(',') for f in flags if f]
- )
-
-
-def mount_osd(device, osd_id):
+ if extras:
+ flags.extend(extras)
+
+ # remove possible duplicates
+ for f in flags:
+ if f and f not in unique_flags:
+ unique_flags.append(f.strip().strip(','))
+ flags = ','.join(unique_flags)
+ # Before returning, split them again, since strings can be mashed up
+ # together, preventing removal of duplicate entries
+ return ','.join(set(flags.split(',')))
+
+
+def mount_osd(device, osd_id, **kw):
+ extras = []
+ is_vdo = kw.get('is_vdo', '0')
+ if is_vdo == '1':
+ extras = ['discard']
destination = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
command = ['mount', '-t', 'xfs', '-o']
flags = conf.ceph.get_list(
default=constants.mount.get('xfs'),
split=' ',
)
- command.append(_normalize_mount_flags(flags))
+ command.append(
+ _normalize_mount_flags(flags, extras=extras)
+ )
command.append(device)
command.append(destination)
process.run(command)
import platform
import tempfile
import uuid
-from ceph_volume import process
+from ceph_volume import process, terminal
from . import as_string
logger = logging.getLogger(__name__)
+mlogger = terminal.MultiLogger(__name__)
# TODO: get these out of here and into a common area for others to consume
if platform.system() == 'FreeBSD':
return str(uuid.uuid4())
+def which(executable):
+ """find the location of an executable"""
+ locations = (
+ '/usr/local/bin',
+ '/bin',
+ '/usr/bin',
+ '/usr/local/sbin',
+ '/usr/sbin',
+ '/sbin',
+ )
+
+ for location in locations:
+ executable_path = os.path.join(location, executable)
+ if os.path.exists(executable_path) and os.path.isfile(executable_path):
+ return executable_path
+ mlogger.warning('Absolute path not found for executable: %s', executable)
+ mlogger.warning('Ensure $PATH environment variable contains common executable locations')
+ # fallback to just returning the argument as-is, to prevent a hard fail,
+ # and hoping that the system might have the executable somewhere custom
+ return executable
+
+
def get_ceph_user_ids():
"""
Return the id and gid of the ceph user
return; // guard if at end of func
if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
- (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
- _release(in);
+ (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
+ if (_release(in))
+ used &= ~CEPH_CAP_FILE_CACHE;
+ }
if (!in->cap_snaps.empty())
flush_snaps(in);
ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
// invalidate our userspace inode cache
- if (cct->_conf->client_oc)
+ if (cct->_conf->client_oc) {
objectcacher->release_set(&in->oset);
+ if (!objectcacher->set_is_empty(&in->oset))
+ lderr(cct) << "failed to invalidate cache for " << *in << dendl;
+ }
_schedule_invalidate_callback(in, 0, 0);
}
{
if (unmounting)
return;
- if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) {
- for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
- p != root->dir->dentries.end();
- ++p) {
- if (p->second->inode)
- _schedule_invalidate_dentry_callback(p->second, false);
+ if (can_invalidate_dentries) {
+ if (dentry_invalidate_cb && root->dir) {
+ for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
+ p != root->dir->dentries.end();
+ ++p) {
+ if (p->second->inode)
+ _schedule_invalidate_dentry_callback(p->second, false);
+ }
}
} else if (remount_cb) {
// Hacky:
<< " seq " << dn->lease_seq
<< dendl;
- if (!dn->inode || dn->inode->caps_issued_mask(mask)) {
+ if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
// is dn lease valid?
utime_t now = ceph_clock_now();
if (dn->lease_mds >= 0 &&
<< " vs lease_gen " << dn->lease_gen << dendl;
}
// dir lease?
- if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
+ if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
if (dn->cap_shared_gen == dir->shared_gen &&
- (!dn->inode || dn->inode->caps_issued_mask(mask)))
+ (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
goto hit_dn;
if (!dn->inode && (dir->flags & I_COMPLETE)) {
ldout(cct, 10) << "_lookup concluded ENOENT locally for "
}
} else {
// can we conclude ENOENT locally?
- if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
+ if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
(dir->flags & I_COMPLETE)) {
ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
return -ENOENT;
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
- bool yes = in->caps_issued_mask(mask);
+ bool yes = in->caps_issued_mask(mask, true);
ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
if (yes && !force)
}
if (mask & CEPH_SETATTR_MTIME) {
req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
- req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
+ req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_WR;
}
if (mask & CEPH_SETATTR_ATIME) {
ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
return -EFBIG;
}
- req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
+ req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_WR;
}
req->head.args.setattr.mask = mask;
req->head.args.readdir.frag = fg;
req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
if (dirp->last_name.length()) {
- req->path2.set_path(dirp->last_name.c_str());
+ req->path2.set_path(dirp->last_name);
} else if (dirp->hash_order()) {
req->head.args.readdir.offset_hash = dirp->offset_high();
}
if (diri->dn_set.empty())
in = diri;
else
- in = diri->get_first_parent()->inode;
+ in = diri->get_first_parent()->dir->parent_inode;
int r;
- r = _getattr(diri, caps, dirp->perms);
+ r = _getattr(in, caps, dirp->perms);
if (r < 0)
return r;
<< dendl;
if (dirp->inode->snapid != CEPH_SNAPDIR &&
dirp->inode->is_complete_and_ordered() &&
- dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
+ dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
if (err != -EAGAIN)
return err;
unsigned mask = statx_to_mask(flags, want);
int r = 0;
- if (mask && !f->inode->caps_issued_mask(mask)) {
+ if (mask && !f->inode->caps_issued_mask(mask, true)) {
r = _getattr(f->inode, mask, perms);
if (r < 0) {
ldout(cct, 3) << "fstatx exit on error!" << dendl;
int res = 0;
unsigned mask = statx_to_mask(flags, want);
- if (mask && !in->caps_issued_mask(mask))
+ if (mask && !in->caps_issued_mask(mask, true))
res = _ll_getattr(in, mask, perms);
if (res == 0)
c |= it->second->issued;
i |= it->second->implemented;
}
+
+ // exclude caps issued by a non-auth MDS that are being revoked by
+ // the auth MDS. The non-auth MDS should be revoking/exporting
+ // these caps, but the message is delayed.
+ if (auth_cap)
+ c &= ~auth_cap->implemented | auth_cap->issued;
+
if (implemented)
*implemented = i;
return c;
touch_cap(caps[mds]);
}
-bool Inode::caps_issued_mask(unsigned mask)
+/**
+ * caps_issued_mask - check whether we have all of the caps in the mask
+ * @mask: mask to check against
+ * @allow_impl: whether the caller can also use caps that are implemented but not issued
+ *
+ * This is the bog standard "check whether we have the required caps" operation.
+ * Typically, we only check against the capset that is currently "issued".
+ * In other words, we ignore caps that have been revoked but not yet released.
+ *
+ * Some callers (particularly those doing attribute retrieval) can also make
+ * use of the full set of "implemented" caps to satisfy requests from the
+ * cache.
+ *
+ * Those callers should refrain from taking new references to implemented
+ * caps!
+ */
+bool Inode::caps_issued_mask(unsigned mask, bool allow_impl)
{
int c = snap_caps;
+ int i = 0;
+
if ((c & mask) == mask)
return true;
// prefer auth cap
return true;
}
c |= it->second->issued;
+ i |= it->second->implemented;
}
}
+
+ if (allow_impl)
+ c |= i;
+
if ((c & mask) == mask) {
// bah.. touch them all
for (map<mds_rank_t,Cap*>::iterator it = caps.begin();
int caps_issued(int *implemented = 0) const;
void touch_cap(Cap *cap);
void try_touch_cap(mds_rank_t mds);
- bool caps_issued_mask(unsigned mask);
+ bool caps_issued_mask(unsigned mask, bool allow_impl=false);
int caps_used();
int caps_file_wanted();
int caps_wanted();
void set_retry_attempt(int a) { head.num_retry = a; }
void set_filepath(const filepath& fp) { path = fp; }
void set_filepath2(const filepath& fp) { path2 = fp; }
- void set_string2(const char *s) { path2.set_path(s, 0); }
+ void set_string2(const char *s) { path2.set_path(boost::string_view(s), 0); }
void set_caller_perms(const UserPerm& _perms) {
perms.shallow_copy(_perms);
head.caller_uid = perms.uid();
uint64_t size = st.st_size;
dout(0) << "file " << filename << " size is " << size << dendl;
- inode_t inode;
+ inode_t<> inode;
memset(&inode, 0, sizeof(inode));
inode.ino = st.st_ino;
ret = client->fdescribe_layout(fd, &inode.layout);
CLS_METHOD_RD | CLS_METHOD_WR,
group_image_remove, &h_group_image_remove);
cls_register_cxx_method(h_class, "group_image_list",
- CLS_METHOD_RD | CLS_METHOD_WR,
+ CLS_METHOD_RD,
group_image_list, &h_group_image_list);
cls_register_cxx_method(h_class, "group_image_set",
CLS_METHOD_RD | CLS_METHOD_WR,
rgw_bucket_entry_ver ver;
ver.epoch = (op.olh_epoch ? op.olh_epoch : olh.get_epoch());
- real_time mtime = real_clock::now(); /* mtime has no real meaning in instance removal context */
+ real_time mtime = obj.mtime(); /* mtime has no real meaning in instance removal context */
ret = log_index_operation(hctx, op.key, CLS_RGW_OP_UNLINK_INSTANCE, op.op_tag,
mtime, ver,
CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker,
if (key.compare(end_key) > 0) {
key_iter = key;
+ if (truncated) {
+ *truncated = false;
+ }
return 0;
}
string user_key;
bool truncated_status = false;
+ assert(truncated != nullptr);
+
if (!by_user) {
usage_record_prefix_by_time(end, end_key);
} else {
if (ret < 0)
return ret;
- if (truncated) {
- *truncated = truncated_status;
- }
+ *truncated = truncated_status;
map<string, bufferlist>::iterator iter = keys.begin();
if (iter == keys.end())
if (!by_user && key.compare(end_key) >= 0) {
CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+ *truncated = false;
key_iter = key;
return 0;
}
if (by_user && key.compare(0, user_key.size(), user_key) != 0) {
CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+ *truncated = false;
key_iter = key;
return 0;
}
continue;
/* keys are sorted by epoch, so once we're past end we're done */
- if (e.epoch >= end)
+ if (e.epoch >= end) {
+ *truncated = false;
return 0;
+ }
ret = cb(hctx, key, e, param);
if (ret < 0)
static int usage_log_trim_cb(cls_method_context_t hctx, const string& key, rgw_usage_log_entry& entry, void *param)
{
+ bool *found = (bool *)param;
+ if (found) {
+ *found = true;
+ }
string key_by_time;
string key_by_user;
string iter;
bool more;
+ bool found = false;
#define MAX_USAGE_TRIM_ENTRIES 128
- ret = usage_iterate_range(hctx, op.start_epoch, op.end_epoch, op.user, iter, MAX_USAGE_TRIM_ENTRIES, &more, usage_log_trim_cb, NULL);
+ ret = usage_iterate_range(hctx, op.start_epoch, op.end_epoch, op.user, iter, MAX_USAGE_TRIM_ENTRIES, &more, usage_log_trim_cb, (void *)&found);
if (ret < 0)
return ret;
- if (!more && iter.empty())
+ if (!more && !found)
return -ENODATA;
return 0;
return 0;
}
+/// A method to reset the user.buckets header stats in accordance with the
+/// values seen in the user.buckets omap keys. This will not be equivalent to
+/// --sync-stats, which requires comparing the values with the actual bucket
+/// meta stats supplied by RGW
+static int cls_user_reset_stats(cls_method_context_t hctx, bufferlist *in, bufferlist *out /*ignore*/)
+{
+ cls_user_reset_stats_op op;
+
+ try {
+ auto bliter = in->begin();
+ ::decode(op, bliter);
+ } catch (buffer::error& err) {
+ CLS_LOG(0, "ERROR: cls_user_reset_op(): failed to decode op");
+ return -EINVAL;
+ }
+ cls_user_header header;
+ bool truncated = false;
+ string from_index, prefix;
+ do {
+ map<string, bufferlist> keys;
+ int rc = cls_cxx_map_get_vals(hctx, from_index, prefix, MAX_ENTRIES, &keys, &truncated);
+
+ if (rc < 0)
+ return rc;
+
+ for (const auto&kv : keys){
+ cls_user_bucket_entry e;
+ try {
+ auto bl = kv.second;
+ auto bliter = bl.begin();
+ decode(e, bliter);
+ } catch (buffer::error& err) {
+ CLS_LOG(0, "ERROR: failed to decode bucket entry for %s", kv.first.c_str());
+ return -EIO;
+ }
+ add_header_stats(&header.stats, e);
+ }
+ } while (truncated);
+
+ bufferlist bl;
+ header.last_stats_update = op.time;
+ ::encode(header, bl);
+
+ return cls_cxx_map_write_header(hctx, &bl);
+}
+
CLS_INIT(user)
{
CLS_LOG(1, "Loaded user class!");
cls_method_handle_t h_user_remove_bucket;
cls_method_handle_t h_user_list_buckets;
cls_method_handle_t h_user_get_header;
+ cls_method_handle_t h_user_reset_stats;
cls_register("user", &h_class);
cls_register_cxx_method(h_class, "remove_bucket", CLS_METHOD_RD | CLS_METHOD_WR, cls_user_remove_bucket, &h_user_remove_bucket);
cls_register_cxx_method(h_class, "list_buckets", CLS_METHOD_RD, cls_user_list_buckets, &h_user_list_buckets);
cls_register_cxx_method(h_class, "get_header", CLS_METHOD_RD, cls_user_get_header, &h_user_get_header);
-
+ cls_register_cxx_method(h_class, "reset_user_stats", CLS_METHOD_RD | CLS_METHOD_WR, cls_user_reset_stats, &h_user_reset_stats);
return;
}
op.exec("user", "get_header", inbl, new ClsUserGetHeaderCtx(header, NULL, pret));
}
+void cls_user_reset_stats(librados::ObjectWriteOperation &op)
+{
+ bufferlist inbl;
+ cls_user_reset_stats_op call;
+ call.time = real_clock::now();
+ ::encode(call, inbl);
+ op.exec("user", "reset_user_stats", inbl);
+}
+
int cls_user_get_header_async(IoCtx& io_ctx, string& oid, RGWGetUserHeader_CB *ctx)
{
bufferlist in, out;
int *pret);
void cls_user_get_header(librados::ObjectReadOperation& op, cls_user_header *header, int *pret);
int cls_user_get_header_async(librados::IoCtx& io_ctx, string& oid, RGWGetUserHeader_CB *ctx);
+void cls_user_reset_stats(librados::ObjectWriteOperation& op);
#endif
};
WRITE_CLASS_ENCODER(cls_user_get_header_op)
+struct cls_user_reset_stats_op {
+ real_time time;
+ cls_user_reset_stats_op() {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(time, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator& bl) {
+ DECODE_START(1, bl);
+ ::decode(time, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<cls_user_reset_stats_op*>& ls);
+};
+WRITE_CLASS_ENCODER(cls_user_reset_stats_op);
+
struct cls_user_get_header_ret {
cls_user_header header;
Formatter::~Formatter() { }
-Formatter *Formatter::create(const std::string &type,
- const std::string& default_type,
- const std::string& fallback)
+Formatter *Formatter::create(boost::string_view type,
+ boost::string_view default_type,
+ boost::string_view fallback)
{
- std::string mytype = type;
+ std::string mytype(type);
if (mytype == "")
- mytype = default_type;
+ mytype = std::string(default_type);
if (mytype == "json")
return new JSONFormatter(false);
m_ss << " ";
}
-void JSONFormatter::print_quoted_string(const std::string& s)
+void JSONFormatter::print_quoted_string(boost::string_view s)
{
- int len = escape_json_attr_len(s.c_str(), s.size());
+ int len = escape_json_attr_len(s.data(), s.size());
char escaped[len];
- escape_json_attr(s.c_str(), s.size(), escaped);
+ escape_json_attr(s.data(), s.size(), escaped);
m_ss << '\"' << escaped << '\"';
}
m_ss << foo;
}
-void JSONFormatter::dump_string(const char *name, const std::string& s)
+void JSONFormatter::dump_string(const char *name, boost::string_view s)
{
print_name(name);
print_quoted_string(s);
m_ss << "\n";
}
-void XMLFormatter::dump_string(const char *name, const std::string& s)
+void XMLFormatter::dump_string(const char *name, boost::string_view s)
{
std::string e(name);
std::transform(e.begin(), e.end(), e.begin(),
[this](char c) { return this->to_lower_underscore(c); });
print_spaces();
- m_ss << "<" << e << ">" << escape_xml_str(s.c_str()) << "</" << e << ">";
+ m_ss << "<" << e << ">" << escape_xml_str(s.data()) << "</" << e << ">";
if (m_pretty)
m_ss << "\n";
}
-void XMLFormatter::dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs)
+void XMLFormatter::dump_string_with_attrs(const char *name, boost::string_view s, const FormatterAttrs& attrs)
{
std::string e(name);
std::transform(e.begin(), e.end(), e.begin(),
std::string attrs_str;
get_attrs_str(&attrs, attrs_str);
print_spaces();
- m_ss << "<" << e << attrs_str << ">" << escape_xml_str(s.c_str()) << "</" << e << ">";
+ m_ss << "<" << e << attrs_str << ">" << escape_xml_str(s.data()) << "</" << e << ">";
if (m_pretty)
m_ss << "\n";
}
}
}
-std::string XMLFormatter::escape_xml_str(const char *str)
+std::string XMLFormatter::escape_xml_str(boost::string_view str)
{
- int len = escape_xml_attr_len(str);
+ size_t len = escape_xml_attr_len(str.data());
std::vector<char> escaped(len, '\0');
- escape_xml_attr(str, &escaped[0]);
+ escape_xml_attr(str.data(), &escaped[0]);
return std::string(&escaped[0]);
}
m_ss.str("");
}
-void TableFormatter::dump_string(const char *name, const std::string& s)
+void TableFormatter::dump_string(const char *name, boost::string_view s)
{
finish_pending_string();
size_t i = m_vec_index(name);
m_ss.str("");
}
-void TableFormatter::dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs)
+void TableFormatter::dump_string_with_attrs(const char *name, boost::string_view s, const FormatterAttrs& attrs)
{
finish_pending_string();
size_t i = m_vec_index(name);
#include "include/int_types.h"
#include "include/buffer_fwd.h"
+#include <boost/utility/string_view.hpp>
+
#include <deque>
#include <list>
#include <vector>
class Formatter {
public:
- static Formatter *create(const std::string& type,
- const std::string& default_type,
- const std::string& fallback);
- static Formatter *create(const std::string& type,
- const std::string& default_type) {
+ static Formatter *create(boost::string_view type,
+ boost::string_view default_type,
+ boost::string_view fallback);
+ static Formatter *create(boost::string_view type,
+ boost::string_view default_type) {
return create(type, default_type, "");
}
- static Formatter *create(const std::string& type) {
+ static Formatter *create(boost::string_view type) {
return create(type, "json-pretty", "");
}
virtual void dump_unsigned(const char *name, uint64_t u) = 0;
virtual void dump_int(const char *name, int64_t s) = 0;
virtual void dump_float(const char *name, double d) = 0;
- virtual void dump_string(const char *name, const std::string& s) = 0;
+ virtual void dump_string(const char *name, boost::string_view s) = 0;
virtual void dump_bool(const char *name, bool b)
{
dump_format_unquoted(name, "%s", (b ? "true" : "false"));
{
open_object_section(name);
}
- virtual void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs)
+ virtual void dump_string_with_attrs(const char *name, boost::string_view s, const FormatterAttrs& attrs)
{
dump_string(name, s);
}
void dump_unsigned(const char *name, uint64_t u) override;
void dump_int(const char *name, int64_t u) override;
void dump_float(const char *name, double d) override;
- void dump_string(const char *name, const std::string& s) override;
+ void dump_string(const char *name, boost::string_view s) override;
std::ostream& dump_stream(const char *name) override;
void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
int get_len() const override;
bool m_pretty;
void open_section(const char *name, bool is_array);
- void print_quoted_string(const std::string& s);
+ void print_quoted_string(boost::string_view s);
void print_name(const char *name);
void print_comma(json_formatter_stack_entry_d& entry);
void finish_pending_string();
void dump_unsigned(const char *name, uint64_t u) override;
void dump_int(const char *name, int64_t u) override;
void dump_float(const char *name, double d) override;
- void dump_string(const char *name, const std::string& s) override;
+ void dump_string(const char *name, boost::string_view s) override;
std::ostream& dump_stream(const char *name) override;
void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
int get_len() const override;
/* with attrs */
void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs) override;
void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs) override;
- void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs) override;
+ void dump_string_with_attrs(const char *name, boost::string_view s, const FormatterAttrs& attrs) override;
protected:
void open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs);
void finish_pending_string();
void print_spaces();
- static std::string escape_xml_str(const char *str);
+ static std::string escape_xml_str(boost::string_view str);
void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str);
char to_lower_underscore(char c) const;
void dump_unsigned(const char *name, uint64_t u) override;
void dump_int(const char *name, int64_t u) override;
void dump_float(const char *name, double d) override;
- void dump_string(const char *name, const std::string& s) override;
+ void dump_string(const char *name, boost::string_view s) override;
void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
- void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs) override;
+ void dump_string_with_attrs(const char *name, boost::string_view s, const FormatterAttrs& attrs) override;
std::ostream& dump_stream(const char *name) override;
int get_len() const override;
dump_template(name, d);
}
-void HTMLFormatter::dump_string(const char *name, const std::string& s)
+void HTMLFormatter::dump_string(const char *name, boost::string_view s)
{
- dump_template(name, escape_xml_str(s.c_str()));
+ dump_template(name, escape_xml_str(s.data()));
}
-void HTMLFormatter::dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs)
+void HTMLFormatter::dump_string_with_attrs(const char *name, boost::string_view s, const FormatterAttrs& attrs)
{
std::string e(name);
std::string attrs_str;
get_attrs_str(&attrs, attrs_str);
print_spaces();
- m_ss << "<li>" << e << ": " << escape_xml_str(s.c_str()) << attrs_str << "</li>";
+ m_ss << "<li>" << e << ": " << escape_xml_str(s.data()) << attrs_str << "</li>";
if (m_pretty)
m_ss << "\n";
}
void dump_unsigned(const char *name, uint64_t u) override;
void dump_int(const char *name, int64_t u) override;
void dump_float(const char *name, double d) override;
- void dump_string(const char *name, const std::string& s) override;
+ void dump_string(const char *name, boost::string_view s) override;
std::ostream& dump_stream(const char *name) override;
void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
/* with attrs */
- void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs) override;
+ void dump_string_with_attrs(const char *name, boost::string_view s, const FormatterAttrs& attrs) override;
private:
template <typename T> void dump_template(const char *name, T arg);
static void generic_usage(bool is_server)
{
- cout << "\
- --conf/-c FILE read configuration from the given configuration file\n\
- --id/-i ID set ID portion of my name\n\
- --name/-n TYPE.ID set name\n\
- --cluster NAME set cluster name (default: ceph)\n\
- --setuser USER set uid to user or uid (and gid to user's gid)\n\
- --setgroup GROUP set gid to group or gid\n\
- --version show version and quit\n\
-" << std::endl;
+ cout <<
+ " --conf/-c FILE read configuration from the given configuration file" << std::endl <<
+ (is_server ?
+ " --id/-i ID set ID portion of my name" :
+ " --id ID set ID portion of my name") << std::endl <<
+ " --name/-n TYPE.ID set name" << std::endl <<
+ " --cluster NAME set cluster name (default: ceph)" << std::endl <<
+ " --setuser USER set uid to user or uid (and gid to user's gid)" << std::endl <<
+ " --setgroup GROUP set gid to group or gid" << std::endl <<
+ " --version show version and quit" << std::endl
+ << std::endl;
if (is_server) {
- cout << "\
- -d run in foreground, log to stderr.\n\
- -f run in foreground, log to usual location.\n";
- cout << "\
- --debug_ms N set message debug level (e.g. 1)\n";
+ cout <<
+ " -d run in foreground, log to stderr" << std::endl <<
+ " -f run in foreground, log to usual location" << std::endl <<
+ std::endl <<
+ " --debug_ms N set message debug level (e.g. 1)" << std::endl;
}
cout.flush();
return time_point(seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec));
}
+ static time_point zero() {
+ return time_point::min();
+ }
+
// A monotonic clock's timepoints are only meaningful to the
// computer on which they were generated. Thus having an
// optional skew is meaningless.
u_char nsbuf[NS_PACKETSZ];
int len;
-
+ int family = cct->_conf->ms_bind_ipv6 ? AF_INET6 : AF_INET;
+ int type = cct->_conf->ms_bind_ipv6 ? ns_t_aaaa : ns_t_a;
#ifdef HAVE_RES_NQUERY
- len = resolv_h->res_nquery(*res, hostname.c_str(), ns_c_in, ns_t_a, nsbuf, sizeof(nsbuf));
+ len = resolv_h->res_nquery(*res, hostname.c_str(), ns_c_in, type, nsbuf, sizeof(nsbuf));
#else
{
# ifndef HAVE_THREAD_SAFE_RES_QUERY
Mutex::Locker l(lock);
# endif
- len = resolv_h->res_query(hostname.c_str(), ns_c_in, ns_t_a, nsbuf, sizeof(nsbuf));
+ len = resolv_h->res_query(hostname.c_str(), ns_c_in, type, nsbuf, sizeof(nsbuf));
}
#endif
if (len < 0) {
char addr_buf[64];
memset(addr_buf, 0, sizeof(addr_buf));
- inet_ntop(AF_INET, ns_rr_rdata(rr), addr_buf, sizeof(addr_buf));
+ inet_ntop(family, ns_rr_rdata(rr), addr_buf, sizeof(addr_buf));
if (!addr->parse(addr_buf)) {
lderr(cct) << "failed to parse address '" << (const char *)ns_rr_rdata(rr)
<< "'" << dendl;
#define SGL_QUOTE_XESCAPE "'"
#define DBL_QUOTE_XESCAPE """
-int escape_xml_attr_len(const char *buf)
+size_t escape_xml_attr_len(const char *buf)
{
const char *b;
- int ret = 0;
+ size_t ret = 0;
for (b = buf; *b; ++b) {
unsigned char c = *b;
switch (c) {
#define TAB_JESCAPE "\\t"
#define NEWLINE_JESCAPE "\\n"
-int escape_json_attr_len(const char *buf, int src_len)
+size_t escape_json_attr_len(const char *buf, size_t src_len)
{
const char *b;
- int ret = 0;
- int i;
+ size_t i, ret = 0;
for (i = 0, b = buf; i < src_len; ++i, ++b) {
unsigned char c = *b;
switch (c) {
return ret;
}
-void escape_json_attr(const char *buf, int src_len, char *out)
+void escape_json_attr(const char *buf, size_t src_len, char *out)
{
char *o = out;
const char *b;
- int i;
+ size_t i;
for (i = 0, b = buf; i < src_len; ++i, ++b) {
unsigned char c = *b;
switch (c) {
#ifndef CEPH_RGW_ESCAPE_H
#define CEPH_RGW_ESCAPE_H
+#include <stdlib.h>
+
#ifdef __cplusplus
extern "C" {
#endif
/* Returns the length of a buffer that would be needed to escape 'buf'
* as an XML attrribute
*/
-int escape_xml_attr_len(const char *buf);
+size_t escape_xml_attr_len(const char *buf);
/* Escapes 'buf' as an XML attribute. Assumes that 'out' is at least long
* enough to fit the output. You can find out the required length by calling
/* Returns the length of a buffer that would be needed to escape 'buf'
* as an JSON attrribute
*/
-int escape_json_attr_len(const char *buf, int src_len);
+size_t escape_json_attr_len(const char *buf, size_t src_len);
/* Escapes 'buf' as an JSON attribute. Assumes that 'out' is at least long
* enough to fit the output. You can find out the required length by calling
* escape_json_attr_len first.
*/
-void escape_json_attr(const char *buf, int src_len, char *out);
+void escape_json_attr(const char *buf, size_t src_len, char *out);
/* Note: we escape control characters. Although the XML spec doesn't actually
* require this, Amazon does it in their XML responses.
continue;
struct in6_addr *cur = &((struct sockaddr_in6*)addrs->ifa_addr)->sin6_addr;
+ if (IN6_IS_ADDR_LINKLOCAL(cur))
+ continue;
netmask_ipv6(cur, prefix_len, &temp);
if (IN6_ARE_ADDR_EQUAL(&temp, &want))
OPTION(osd_pg_log_dups_tracked, OPT_U32) // how many versions back to track combined in both pglog's regular + dup logs
OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT) // max entries factor before force recovery
OPTION(osd_pg_log_trim_min, OPT_U32)
+OPTION(osd_pg_log_trim_max, OPT_U32)
OPTION(osd_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy
OPTION(osd_command_max_records, OPT_INT)
OPTION(osd_max_pg_blocked_by, OPT_U32) // max peer osds to report that are blocking our progress
OPTION(osd_debug_op_order, OPT_BOOL)
OPTION(osd_debug_verify_missing_on_start, OPT_BOOL)
OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64)
-OPTION(osd_debug_verify_snaps_on_info, OPT_BOOL)
+OPTION(osd_debug_verify_snaps, OPT_BOOL)
OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL)
OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL)
OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE)
OPTION(bluestore_cache_size_ssd, OPT_U64)
OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE)
OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE)
-OPTION(bluestore_cache_kv_max, OPT_U64) // limit the maximum amount of cache for the kv store
+OPTION(bluestore_cache_kv_max, OPT_INT) // limit the maximum amount of cache for the kv store
OPTION(bluestore_kvbackend, OPT_STR)
OPTION(bluestore_allocator, OPT_STR) // stupid | bitmap
OPTION(bluestore_freelist_blocks_per_key, OPT_INT)
.set_long_description("OSD will refuse to instantiate PG if the number of PG it serves exceeds this number.")
.add_see_also("mon_max_pg_per_osd"),
+ Option("osd_pg_log_trim_max", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description("maximum number of entries to remove at once from the PG log")
+ .add_service("osd")
+ .add_see_also("osd_min_pg_log_entries")
+ .add_see_also("osd_max_pg_log_entries"),
+
Option("osd_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(30)
.set_description(""),
.set_default(0)
.set_description(""),
- Option("osd_debug_verify_snaps_on_info", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ Option("osd_debug_verify_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
.set_description("Preallocated buffer for inline shards"),
Option("bluestore_cache_trim_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(.2)
+ .set_default(.05)
.set_description("How frequently we trim the bluestore cache"),
Option("bluestore_cache_trim_max_skip_pinned", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(.99)
.set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),
- Option("bluestore_cache_kv_max", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("bluestore_cache_kv_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(512_M)
- .set_description("Max memory (bytes) to devote to kv database (rocksdb)"),
+ .set_description("Max memory (bytes) to devote to kv database (rocksdb)")
+ .set_long_description("A negative value means using bluestore_cache_meta_ratio "
+ "and bluestore_cache_kv_ratio instead of calculating these ratios using "
+ "bluestore_cache_size_* and bluestore_cache_kv_max."),
Option("bluestore_kvbackend", Option::TYPE_STR, Option::LEVEL_DEV)
.set_default("rocksdb")
.set_description(""),
Option("mds_log_max_segments", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
- .set_default(30)
+ .set_default(128)
.set_description(""),
Option("mds_bal_export_pin", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(100)
.set_description("minimum number of capabilities a client may hold"),
- Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.8)
.set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"),
+ Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. Allows testing repair tools."),
});
}
errors |= inc_snapset_t::HEADLESS_CLONE;
}
-void inconsistent_snapset_wrapper::set_ss_attr_missing()
+void inconsistent_snapset_wrapper::set_snapset_missing()
{
errors |= inc_snapset_t::SNAPSET_MISSING;
}
-void inconsistent_snapset_wrapper::set_oi_attr_missing()
+void inconsistent_snapset_wrapper::set_info_missing()
{
- errors |= inc_snapset_t::OI_MISSING;
+ errors |= inc_snapset_t::INFO_MISSING;
}
-void inconsistent_snapset_wrapper::set_ss_attr_corrupted()
+void inconsistent_snapset_wrapper::set_snapset_corrupted()
{
errors |= inc_snapset_t::SNAPSET_CORRUPTED;
}
-void inconsistent_snapset_wrapper::set_oi_attr_corrupted()
+void inconsistent_snapset_wrapper::set_info_corrupted()
{
- errors |= inc_snapset_t::OI_CORRUPTED;
+ errors |= inc_snapset_t::INFO_CORRUPTED;
}
void inconsistent_snapset_wrapper::set_clone_missing(snapid_t snap)
clones.push_back(snap);
}
-void inconsistent_snapset_wrapper::set_snapset_mismatch()
+void inconsistent_snapset_wrapper::set_snapset_error()
{
- errors |= inc_snapset_t::SNAP_MISMATCH;
+ errors |= inc_snapset_t::SNAP_ERROR;
}
void inconsistent_snapset_wrapper::set_head_mismatch()
void inconsistent_snapset_wrapper::encode(bufferlist& bl) const
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(errors, bl);
::encode(object, bl);
::encode(clones, bl);
::encode(missing, bl);
+ ::encode(ss_bl, bl);
ENCODE_FINISH(bl);
}
void inconsistent_snapset_wrapper::decode(bufferlist::iterator& bp)
{
- DECODE_START(1, bp);
+ DECODE_START(2, bp);
::decode(errors, bp);
::decode(object, bp);
::decode(clones, bp);
::decode(missing, bp);
+ if (struct_v >= 2) {
+ ::decode(ss_bl, bp);
+ }
DECODE_FINISH(bp);
}
void set_missing() {
errors |= err_t::SHARD_MISSING;
}
- void set_omap_digest_mismatch_oi() {
- errors |= err_t::OMAP_DIGEST_MISMATCH_OI;
+ void set_omap_digest_mismatch_info() {
+ errors |= err_t::OMAP_DIGEST_MISMATCH_INFO;
}
- void set_size_mismatch_oi() {
- errors |= err_t::SIZE_MISMATCH_OI;
+ void set_size_mismatch_info() {
+ errors |= err_t::SIZE_MISMATCH_INFO;
}
- void set_data_digest_mismatch_oi() {
- errors |= err_t::DATA_DIGEST_MISMATCH_OI;
+ void set_data_digest_mismatch_info() {
+ errors |= err_t::DATA_DIGEST_MISMATCH_INFO;
}
void set_read_error() {
errors |= err_t::SHARD_READ_ERR;
void set_ec_size_mismatch() {
errors |= err_t::SHARD_EC_SIZE_MISMATCH;
}
- void set_oi_attr_missing() {
- errors |= err_t::OI_ATTR_MISSING;
+ void set_info_missing() {
+ errors |= err_t::INFO_MISSING;
}
- void set_oi_attr_corrupted() {
- errors |= err_t::OI_ATTR_CORRUPTED;
+ void set_info_corrupted() {
+ errors |= err_t::INFO_CORRUPTED;
}
- void set_ss_attr_missing() {
- errors |= err_t::SS_ATTR_MISSING;
+ void set_snapset_missing() {
+ errors |= err_t::SNAPSET_MISSING;
}
- void set_ss_attr_corrupted() {
- errors |= err_t::SS_ATTR_CORRUPTED;
+ void set_snapset_corrupted() {
+ errors |= err_t::SNAPSET_CORRUPTED;
}
- void set_obj_size_oi_mismatch() {
- errors |= err_t::OBJ_SIZE_OI_MISMATCH;
+ void set_obj_size_info_mismatch() {
+ errors |= err_t::OBJ_SIZE_INFO_MISMATCH;
+ }
+ void set_hinfo_missing() {
+ errors |= err_t::HINFO_MISSING;
+ }
+ void set_hinfo_corrupted() {
+ errors |= err_t::HINFO_CORRUPTED;
}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bp);
void set_snapset_inconsistency() {
errors |= obj_err_t::SNAPSET_INCONSISTENCY;
}
+ void set_hinfo_inconsistency() {
+ errors |= obj_err_t::HINFO_INCONSISTENCY;
+ }
void add_shard(const pg_shard_t& pgs, const shard_info_wrapper& shard);
void set_auth_missing(const hobject_t& hoid,
const map<pg_shard_t, ScrubMap*>&,
void set_headless();
// soid claims that it is a head or a snapdir, but its SS_ATTR
// is missing.
- void set_ss_attr_missing();
- void set_oi_attr_missing();
- void set_ss_attr_corrupted();
- void set_oi_attr_corrupted();
+ void set_snapset_missing();
+ void set_info_missing();
+ void set_snapset_corrupted();
+ void set_info_corrupted();
// snapset with missing clone
void set_clone_missing(snapid_t);
// Clones that are there
void set_clone(snapid_t);
// the snapset is not consistent with itself
- void set_snapset_mismatch();
+ void set_snapset_error();
// soid.snap inconsistent with snapset
void set_head_mismatch();
void set_size_mismatch();
* Copyright 2014 Cloudius Systems
*/
/*
- * C++2014 dependencies removed. Uses of std::string_view adapted to
+ * C++2014 dependencies removed. Uses of boost::string_view adapted to
* boost::string_ref. Matt Benjamin <mbenjamin@redhat.com>
*/
lderr(cct) << "distro_detect - /etc/os-release is required" << dendl;
}
- for (const char* rk: {"distro", "distro_version"}) {
+ for (const char* rk: {"distro", "distro_description"}) {
if (m->find(rk) == m->end())
lderr(cct) << "distro_detect - can't detect " << rk << dendl;
}
return false;
}
- ldout(cct, 1) << "check_item_loc item " << item << " loc " << loc << dendl;
+ ldout(cct, 2) << __func__ << " item " << item << " loc " << loc << dendl;
return false;
}
return b->size;
}
+int CrushWrapper::get_rule_failure_domain(int rule_id)
+{
+ crush_rule *rule = get_rule(rule_id);
+ if (IS_ERR(rule)) {
+ return -ENOENT;
+ }
+ int type = 0; // default to osd-level
+ for (unsigned s = 0; s < rule->len; ++s) {
+ if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN ||
+ rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP ||
+ rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+ rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) &&
+ rule->steps[s].arg2 > type) {
+ type = rule->steps[s].arg2;
+ }
+ }
+ return type;
+}
+
int CrushWrapper::_get_leaves(int id, list<int> *leaves)
{
assert(leaves);
*/
int get_children(int id, list<int> *children);
+ /**
+ * get failure-domain type of a specific crush rule
+ * @param rule_id crush rule id
+ * @return type of failure-domain or a negative errno on error.
+ */
+ int get_rule_failure_domain(int rule_id);
+
/**
* enumerate leaves(devices) of given node
*
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_BTREE_INTERVAL_SET_H
-#define CEPH_BTREE_INTERVAL_SET_H
-
-#include <iterator>
-#include <map>
-#include <ostream>
-
-#include "encoding.h"
-
-#ifndef MIN
-# define MIN(a,b) ((a)<=(b) ? (a):(b))
-#endif
-#ifndef MAX
-# define MAX(a,b) ((a)>=(b) ? (a):(b))
-#endif
-
-#include "cpp-btree/btree_map.h"
-#include "assert.h"
-#include "encoding_btree.h"
-
-template<typename T,
- typename Alloc = std::allocator<std::pair<const T, T>>>
-class btree_interval_set {
- public:
-
- typedef btree::btree_map<T,T, std::less<T>, Alloc> map_t;
-
- class const_iterator;
-
- class iterator : public std::iterator <std::forward_iterator_tag, T>
- {
- public:
- explicit iterator(typename map_t::iterator iter)
- : _iter(iter)
- { }
-
- // For the copy constructor and assignment operator, the compiler-generated functions, which
- // perform simple bitwise copying, should be fine.
-
- bool operator==(const iterator& rhs) const {
- return (_iter == rhs._iter);
- }
-
- bool operator!=(const iterator& rhs) const {
- return (_iter != rhs._iter);
- }
-
- // Dereference this iterator to get a pair.
- std::pair < T, T > &operator*() {
- return *_iter;
- }
-
- // Return the interval start.
- T get_start() const {
- return _iter->first;
- }
-
- // Return the interval length.
- T get_len() const {
- return _iter->second;
- }
-
- // Set the interval length.
- void set_len(T len) {
- _iter->second = len;
- }
-
- // Preincrement
- iterator &operator++()
- {
- ++_iter;
- return *this;
- }
-
- // Postincrement
- iterator operator++(int)
- {
- iterator prev(_iter);
- ++_iter;
- return prev;
- }
-
- friend class btree_interval_set<T>::const_iterator;
-
- protected:
- typename map_t::iterator _iter;
- friend class btree_interval_set<T>;
- };
-
- class const_iterator : public std::iterator <std::forward_iterator_tag, T>
- {
- public:
- explicit const_iterator(typename map_t::const_iterator iter)
- : _iter(iter)
- { }
-
- const_iterator(const iterator &i)
- : _iter(i._iter)
- { }
-
- // For the copy constructor and assignment operator, the compiler-generated functions, which
- // perform simple bitwise copying, should be fine.
-
- bool operator==(const const_iterator& rhs) const {
- return (_iter == rhs._iter);
- }
-
- bool operator!=(const const_iterator& rhs) const {
- return (_iter != rhs._iter);
- }
-
- // Dereference this iterator to get a pair.
- std::pair < T, T > operator*() const {
- return *_iter;
- }
-
- // Return the interval start.
- T get_start() const {
- return _iter->first;
- }
-
- // Return the interval length.
- T get_len() const {
- return _iter->second;
- }
-
- // Preincrement
- const_iterator &operator++()
- {
- ++_iter;
- return *this;
- }
-
- // Postincrement
- const_iterator operator++(int)
- {
- const_iterator prev(_iter);
- ++_iter;
- return prev;
- }
-
- protected:
- typename map_t::const_iterator _iter;
- };
-
- btree_interval_set() : _size(0) {}
-
- int num_intervals() const
- {
- return m.size();
- }
-
- typename btree_interval_set<T,Alloc>::iterator begin() {
- return typename btree_interval_set<T,Alloc>::iterator(m.begin());
- }
-
- typename btree_interval_set<T,Alloc>::iterator lower_bound(T start) {
- return typename btree_interval_set<T,Alloc>::iterator(find_inc_m(start));
- }
-
- typename btree_interval_set<T,Alloc>::iterator end() {
- return typename btree_interval_set<T,Alloc>::iterator(m.end());
- }
-
- typename btree_interval_set<T,Alloc>::const_iterator begin() const {
- return typename btree_interval_set<T,Alloc>::const_iterator(m.begin());
- }
-
- typename btree_interval_set<T,Alloc>::const_iterator lower_bound(T start) const {
- return typename btree_interval_set<T,Alloc>::const_iterator(find_inc(start));
- }
-
- typename btree_interval_set<T,Alloc>::const_iterator end() const {
- return typename btree_interval_set<T,Alloc>::const_iterator(m.end());
- }
-
- // helpers
- private:
- typename map_t::const_iterator find_inc(T start) const {
- typename map_t::const_iterator p = m.lower_bound(start); // p->first >= start
- if (p != m.begin() &&
- (p == m.end() || p->first > start)) {
- p--; // might overlap?
- if (p->first + p->second <= start)
- p++; // it doesn't.
- }
- return p;
- }
-
- typename map_t::iterator find_inc_m(T start) {
- typename map_t::iterator p = m.lower_bound(start);
- if (p != m.begin() &&
- (p == m.end() || p->first > start)) {
- p--; // might overlap?
- if (p->first + p->second <= start)
- p++; // it doesn't.
- }
- return p;
- }
-
- typename map_t::const_iterator find_adj(T start) const {
- typename map_t::const_iterator p = m.lower_bound(start);
- if (p != m.begin() &&
- (p == m.end() || p->first > start)) {
- p--; // might touch?
- if (p->first + p->second < start)
- p++; // it doesn't.
- }
- return p;
- }
-
- typename map_t::iterator find_adj_m(T start) {
- typename map_t::iterator p = m.lower_bound(start);
- if (p != m.begin() &&
- (p == m.end() || p->first > start)) {
- p--; // might touch?
- if (p->first + p->second < start)
- p++; // it doesn't.
- }
- return p;
- }
-
- public:
- bool operator==(const btree_interval_set& other) const {
- return _size == other._size && m == other.m;
- }
-
- int64_t size() const {
- return _size;
- }
-
- void encode(bufferlist& bl) const {
- ::encode(m, bl);
- }
- void encode_nohead(bufferlist& bl) const {
- ::encode_nohead(m, bl);
- }
- void decode(bufferlist::iterator& bl) {
- ::decode(m, bl);
- _size = 0;
- for (typename map_t::const_iterator p = m.begin();
- p != m.end();
- p++)
- _size += p->second;
- }
- void decode_nohead(int n, bufferlist::iterator& bl) {
- ::decode_nohead(n, m, bl);
- _size = 0;
- for (typename map_t::const_iterator p = m.begin();
- p != m.end();
- p++)
- _size += p->second;
- }
-
- void clear() {
- m.clear();
- _size = 0;
- }
-
- bool contains(T i, T *pstart=0, T *plen=0) const {
- typename map_t::const_iterator p = find_inc(i);
- if (p == m.end()) return false;
- if (p->first > i) return false;
- if (p->first+p->second <= i) return false;
- assert(p->first <= i && p->first+p->second > i);
- if (pstart)
- *pstart = p->first;
- if (plen)
- *plen = p->second;
- return true;
- }
- bool contains(T start, T len) const {
- typename map_t::const_iterator p = find_inc(start);
- if (p == m.end()) return false;
- if (p->first > start) return false;
- if (p->first+p->second <= start) return false;
- assert(p->first <= start && p->first+p->second > start);
- if (p->first+p->second < start+len) return false;
- return true;
- }
- bool intersects(T start, T len) const {
- btree_interval_set a;
- a.insert(start, len);
- btree_interval_set i;
- i.intersection_of( *this, a );
- if (i.empty()) return false;
- return true;
- }
-
- // outer range of set
- bool empty() const {
- return m.empty();
- }
- T range_start() const {
- assert(!empty());
- typename map_t::const_iterator p = m.begin();
- return p->first;
- }
- T range_end() const {
- assert(!empty());
- typename map_t::const_iterator p = m.end();
- p--;
- return p->first+p->second;
- }
-
- // interval start after p (where p not in set)
- bool starts_after(T i) const {
- assert(!contains(i));
- typename map_t::const_iterator p = find_inc(i);
- if (p == m.end()) return false;
- return true;
- }
- T start_after(T i) const {
- assert(!contains(i));
- typename map_t::const_iterator p = find_inc(i);
- return p->first;
- }
-
- // interval end that contains start
- T end_after(T start) const {
- assert(contains(start));
- typename map_t::const_iterator p = find_inc(start);
- return p->first+p->second;
- }
-
- void insert(T val) {
- insert(val, 1);
- }
-
- void insert(T start, T len, T *pstart=0, T *plen=0) {
- //cout << "insert " << start << "~" << len << endl;
- assert(len > 0);
- _size += len;
- typename map_t::iterator p = find_adj_m(start);
- if (p == m.end()) {
- m[start] = len; // new interval
- if (pstart)
- *pstart = start;
- if (plen)
- *plen = len;
- } else {
- if (p->first < start) {
-
- if (p->first + p->second != start) {
- //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl;
- ceph_abort();
- }
-
- p->second += len; // append to end
-
- typename map_t::iterator n = p;
- n++;
- if (pstart)
- *pstart = p->first;
- if (n != m.end() &&
- start+len == n->first) { // combine with next, too!
- p->second += n->second;
- if (plen)
- *plen = p->second;
- m.erase(n);
- } else {
- if (plen)
- *plen = p->second;
- }
- } else {
- if (start+len == p->first) {
- if (pstart)
- *pstart = start;
- if (plen)
- *plen = len + p->second;
- T plen = p->second;
- m.erase(p);
- m[start] = len + plen; // append to front
- } else {
- assert(p->first > start+len);
- if (pstart)
- *pstart = start;
- if (plen)
- *plen = len;
- m[start] = len; // new interval
- }
- }
- }
- }
-
- void swap(btree_interval_set<T>& other) {
- m.swap(other.m);
- std::swap(_size, other._size);
- }
-
- void erase(iterator &i) {
- _size -= i.get_len();
- assert(_size >= 0);
- m.erase(i._iter);
- }
-
- void erase(T val) {
- erase(val, 1);
- }
-
- void erase(T start, T len) {
- typename map_t::iterator p = find_inc_m(start);
-
- _size -= len;
- assert(_size >= 0);
-
- assert(p != m.end());
- assert(p->first <= start);
-
- T before = start - p->first;
- assert(p->second >= before+len);
- T after = p->second - before - len;
-
- if (before)
- p->second = before; // shorten bit before
- else
- m.erase(p);
- if (after)
- m[start+len] = after;
- }
-
-
- void subtract(const btree_interval_set &a) {
- for (typename map_t::const_iterator p = a.m.begin();
- p != a.m.end();
- p++)
- erase(p->first, p->second);
- }
-
- void insert(const btree_interval_set &a) {
- for (typename map_t::const_iterator p = a.m.begin();
- p != a.m.end();
- p++)
- insert(p->first, p->second);
- }
-
-
- void intersection_of(const btree_interval_set &a, const btree_interval_set &b) {
- assert(&a != this);
- assert(&b != this);
- clear();
-
- typename map_t::const_iterator pa = a.m.begin();
- typename map_t::const_iterator pb = b.m.begin();
-
- while (pa != a.m.end() && pb != b.m.end()) {
- // passing?
- if (pa->first + pa->second <= pb->first)
- { pa++; continue; }
- if (pb->first + pb->second <= pa->first)
- { pb++; continue; }
- T start = MAX(pa->first, pb->first);
- T en = MIN(pa->first+pa->second, pb->first+pb->second);
- assert(en > start);
- insert(start, en-start);
- if (pa->first+pa->second > pb->first+pb->second)
- pb++;
- else
- pa++;
- }
- }
- void intersection_of(const btree_interval_set& b) {
- btree_interval_set a;
- swap(a);
- intersection_of(a, b);
- }
-
- void union_of(const btree_interval_set &a, const btree_interval_set &b) {
- assert(&a != this);
- assert(&b != this);
- clear();
-
- //cout << "union_of" << endl;
-
- // a
- m = a.m;
- _size = a._size;
-
- // - (a*b)
- btree_interval_set ab;
- ab.intersection_of(a, b);
- subtract(ab);
-
- // + b
- insert(b);
- return;
- }
- void union_of(const btree_interval_set &b) {
- btree_interval_set a;
- swap(a);
- union_of(a, b);
- }
-
- bool subset_of(const btree_interval_set &big) const {
- for (typename map_t::const_iterator i = m.begin();
- i != m.end();
- i++)
- if (!big.contains(i->first, i->second)) return false;
- return true;
- }
-
- /*
- * build a subset of @other, starting at or after @start, and including
- * @len worth of values, skipping holes. e.g.,
- * span_of([5~10,20~5], 8, 5) -> [8~2,20~3]
- */
- void span_of(const btree_interval_set &other, T start, T len) {
- clear();
- typename map_t::const_iterator p = other.find_inc(start);
- if (p == other.m.end())
- return;
- if (p->first < start) {
- if (p->first + p->second < start)
- return;
- if (p->first + p->second < start + len) {
- T howmuch = p->second - (start - p->first);
- insert(start, howmuch);
- len -= howmuch;
- p++;
- } else {
- insert(start, len);
- return;
- }
- }
- while (p != other.m.end() && len > 0) {
- if (p->second < len) {
- insert(p->first, p->second);
- len -= p->second;
- p++;
- } else {
- insert(p->first, len);
- return;
- }
- }
- }
-
-private:
- // data
- int64_t _size;
- map_t m; // map start -> len
-};
-
-
-template<class T, class A>
-inline std::ostream& operator<<(std::ostream& out, const btree_interval_set<T,A> &s) {
- out << "[";
- const char *prequel = "";
- for (auto i = s.begin();
- i != s.end();
- ++i)
- {
- out << prequel << i.get_start() << "~" << i.get_len();
- prequel = ",";
- }
- out << "]";
- return out;
-}
-
-template<class T,typename A>
-inline void encode(const btree_interval_set<T,A>& s, bufferlist& bl)
-{
- s.encode(bl);
-}
-template<class T,typename A>
-inline void decode(btree_interval_set<T,A>& s, bufferlist::iterator& p)
-{
- s.decode(p);
-}
-
-#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_BTREE_MAP_H
+#define CEPH_INCLUDE_BTREE_MAP_H
+
+#include "include/cpp-btree/btree.h"
+#include "include/cpp-btree/btree_map.h"
+#include "include/assert.h" // cpp-btree uses system assert, blech
+#include "include/encoding.h"
+
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl, uint64_t features)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl, features);
+ encode(p->second, bl, features);
+ }
+}
+template<class T, class U>
+inline void decode(btree::btree_map<T,U>& m, bufferlist::iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+template<class T, class U>
+inline void encode_nohead(const btree::btree_map<T,U>& m, bufferlist& bl)
+{
+ for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+template<class T, class U>
+inline void decode_nohead(int n, btree::btree_map<T,U>& m, bufferlist::iterator& p)
+{
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+
+#endif
#ifndef CEPH_COMPACT_MAP_H
#define CEPH_COMPACT_MAP_H
+#include "buffer.h"
+#include "encoding.h"
+
#include <map>
+#include <memory>
template <class Key, class T, class Map>
class compact_map_base {
protected:
- Map *map;
+ std::unique_ptr<Map> map;
void alloc_internal() {
if (!map)
- map = new Map;
+ map.reset(new Map);
}
void free_internal() {
- if (map) {
- delete map;
- map = 0;
- }
+ map.reset();
}
template <class It>
class const_iterator_base {
const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i)
: const_iterator_base<typename Map::const_reverse_iterator>(m, i) { }
};
- compact_map_base() : map(0) {}
- compact_map_base(const compact_map_base& o) : map(0) {
+ compact_map_base(const compact_map_base& o) {
if (o.map) {
alloc_internal();
*map = *o.map;
}
}
- ~compact_map_base() { delete map; }
+ compact_map_base() {}
+ ~compact_map_base() {}
bool empty() const {
return !map || map->empty();
size_t count (const Key& k) const {
return map ? map->count(k) : 0;
}
- void erase (iterator p) {
+ iterator erase (iterator p) {
if (map) {
assert(this == p.map);
- map->erase(p.it);
- if (map->empty())
- free_internal();
+ auto it = map->erase(p.it);
+ if (map->empty()) {
+ free_internal();
+ return iterator(this);
+ } else {
+ return iterator(this, it);
+ }
+ } else {
+ return iterator(this);
}
}
size_t erase (const Key& k) {
free_internal();
}
void swap(compact_map_base& o) {
- Map *tmp = map;
- map = o.map;
- o.map = tmp;
+ map.swap(o.map);
}
compact_map_base& operator=(const compact_map_base& o) {
if (o.map) {
alloc_internal();
return iterator(this, map->insert(val));
}
+ template <class... Args>
+ std::pair<iterator,bool> emplace ( Args&&... args ) {
+ alloc_internal();
+ auto em = map->emplace(std::forward<Args>(args)...);
+ return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+ }
iterator begin() {
if (!map)
return iterator(this);
m.decode(p);
}
-template <class Key, class T>
-class compact_map : public compact_map_base<Key, T, std::map<Key,T> > {
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_map : public compact_map_base<Key, T, std::map<Key,T,Compare,Alloc> > {
public:
T& operator[](const Key& k) {
this->alloc_internal();
}
};
-template <class Key, class T>
-inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T>& m)
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T, Compare, Alloc>& m)
{
out << "{";
- for (typename compact_map<Key, T>::const_iterator it = m.begin();
- it != m.end();
- ++it) {
- if (it != m.begin())
+ bool first = true;
+ for (const auto &p : m) {
+ if (!first)
out << ",";
- out << it->first << "=" << it->second;
+ out << p.first << "=" << p.second;
+ first = false;
}
out << "}";
return out;
}
-template <class Key, class T>
-class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T> > {
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T,Compare,Alloc> > {
};
-template <class Key, class T>
-inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T>& m)
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T, Compare, Alloc>& m)
{
out << "{{";
- for (typename compact_map<Key, T>::const_iterator it = m.begin(); !it.end(); ++it) {
- if (it != m.begin())
+ bool first = true;
+ for (const auto &p : m) {
+ if (!first)
out << ",";
- out << it->first << "=" << it->second;
+ out << p.first << "=" << p.second;
+ first = false;
}
out << "}}";
return out;
#ifndef CEPH_COMPACT_SET_H
#define CEPH_COMPACT_SET_H
-#include <stdlib.h>
-
-#include "include/buffer.h"
-#include "include/encoding.h"
+#include "buffer.h"
+#include "encoding.h"
+#include <memory>
+#include <set>
template <class T, class Set>
class compact_set_base {
protected:
- Set *set;
+ std::unique_ptr<Set> set;
void alloc_internal() {
if (!set)
- set = new Set;
+ set.reset(new Set);
}
void free_internal() {
- if (set) {
- delete set;
- set = 0;
- }
+ set.reset();
}
template <class It>
class iterator_base {
}
};
- compact_set_base() : set(0) {}
- compact_set_base(const compact_set_base& o) : set(0) {
+ compact_set_base() {}
+ compact_set_base(const compact_set_base& o) {
if (o.set) {
alloc_internal();
*set = *o.set;
}
}
- ~compact_set_base() { delete set; }
+ ~compact_set_base() {}
bool empty() const {
size_t count(const T& t) const {
return set ? set->count(t) : 0;
}
- void erase (iterator p) {
+ iterator erase (iterator p) {
if (set) {
assert(this == p.set);
- set->erase(p.it);
- if (set->empty())
- free_internal();
+ auto it = set->erase(p.it);
+ if (set->empty()) {
+ free_internal();
+ return iterator(this);
+ } else {
+ return iterator(this, it);
+ }
+ } else {
+ return iterator(this);
}
}
size_t erase (const T& t) {
return 0;
size_t r = set->erase(t);
if (set->empty())
- free_internal();
+ free_internal();
return r;
}
void clear() {
free_internal();
}
void swap(compact_set_base& o) {
- Set *tmp = set;
- set = o.set;
- o.set = tmp;
+ set.swap(o.set);
}
compact_set_base& operator=(const compact_set_base& o) {
if (o.set) {
std::pair<typename Set::iterator,bool> r = set->insert(t);
return std::make_pair(iterator(this, r.first), r.second);
}
+ template <class... Args>
+ std::pair<iterator,bool> emplace ( Args&&... args ) {
+ alloc_internal();
+ auto em = set->emplace(std::forward<Args>(args)...);
+ return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+ }
+
iterator begin() {
if (!set)
return iterator(this);
m.decode(p);
}
-template <class T>
-class compact_set : public compact_set_base<T, std::set<T> > {
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+class compact_set : public compact_set_base<T, std::set<T, Compare, Alloc> > {
};
-template <class T>
-inline std::ostream& operator<<(std::ostream& out, const compact_set<T>& s)
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+inline std::ostream& operator<<(std::ostream& out, const compact_set<T,Compare,Alloc>& s)
{
- for (typename compact_set<T>::const_iterator it = s.begin();
- it != s.end(); ++it) {
- if (it != s.begin())
+ bool first = true;
+ for (auto &v : s) {
+ if (!first)
out << ",";
- out << it->first << "=" << it->second;
+ out << v;
+ first = false;
}
return out;
}
/* define if radosgw's beast frontend enabled */
#cmakedefine WITH_RADOSGW_BEAST_FRONTEND
+/* define if radosgw has openssl support */
+#cmakedefine WITH_CURL_OPENSSL
+
/* define if HAVE_THREAD_SAFE_RES_QUERY */
#cmakedefine HAVE_THREAD_SAFE_RES_QUERY
#ifndef CEPH_ENCODING_H
#define CEPH_ENCODING_H
+#include <set>
+#include <map>
+#include <deque>
+#include <vector>
+#include <string>
+#include <boost/utility/string_view.hpp>
+#include <tuple>
+#include <boost/optional/optional_io.hpp>
+#include <boost/tuple/tuple.hpp>
+
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+
#include "include/int_types.h"
#include "include/memory.h"
// string
-inline void encode(const std::string& s, bufferlist& bl, uint64_t features=0)
+inline void encode(boost::string_view s, bufferlist& bl, uint64_t features=0)
{
__u32 len = s.length();
encode(len, bl);
if (len)
bl.append(s.data(), len);
}
+inline void encode(const std::string& s, bufferlist& bl, uint64_t features=0)
+{
+ return encode(boost::string_view(s), bl, features);
+}
inline void decode(std::string& s, bufferlist::iterator& p)
{
__u32 len;
p.copy(len, s);
}
-inline void encode_nohead(const std::string& s, bufferlist& bl)
+inline void encode_nohead(boost::string_view s, bufferlist& bl)
{
bl.append(s.data(), s.length());
}
+inline void encode_nohead(const std::string& s, bufferlist& bl)
+{
+ encode_nohead(boost::string_view(s), bl);
+}
inline void decode_nohead(int len, std::string& s, bufferlist::iterator& p)
{
s.clear();
// const char* (encode only, string compatible)
inline void encode(const char *s, bufferlist& bl)
{
- __u32 len = strlen(s);
- encode(len, bl);
- if (len)
- bl.append(s, len);
+ encode(boost::string_view(s, strlen(s)), bl);
}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_INCLUDE_ENCODING_BTREE_H
-#define CEPH_INCLUDE_ENCODING_BTREE_H
-
-#include "include/cpp-btree/btree_map.h"
-
-template<class T, class U>
-inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl)
-{
- __u32 n = (__u32)(m.size());
- encode(n, bl);
- for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
- encode(p->first, bl);
- encode(p->second, bl);
- }
-}
-template<class T, class U>
-inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl, uint64_t features)
-{
- __u32 n = (__u32)(m.size());
- encode(n, bl);
- for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
- encode(p->first, bl, features);
- encode(p->second, bl, features);
- }
-}
-template<class T, class U>
-inline void decode(btree::btree_map<T,U>& m, bufferlist::iterator& p)
-{
- __u32 n;
- decode(n, p);
- m.clear();
- while (n--) {
- T k;
- decode(k, p);
- decode(m[k], p);
- }
-}
-template<class T, class U>
-inline void encode_nohead(const btree::btree_map<T,U>& m, bufferlist& bl)
-{
- for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
- encode(p->first, bl);
- encode(p->second, bl);
- }
-}
-template<class T, class U>
-inline void decode_nohead(int n, btree::btree_map<T,U>& m, bufferlist::iterator& p)
-{
- m.clear();
- while (n--) {
- T k;
- decode(k, p);
- decode(m[k], p);
- }
-}
-
-#endif
#include <iosfwd>
#include <string>
+#include <boost/utility/string_view.hpp>
#include <vector>
using namespace std;
public:
filepath() : ino(0), encoded(false) { }
+ filepath(boost::string_view s, inodeno_t i) : ino(i), path(s), encoded(false) { }
filepath(const string& s, inodeno_t i) : ino(i), path(s), encoded(false) { }
filepath(const char* s, inodeno_t i) : ino(i), path(s), encoded(false) { }
filepath(const filepath& o) {
}
filepath(inodeno_t i) : ino(i), encoded(false) { }
- void set_path(const char *s, inodeno_t b) {
- path = s;
- ino = b;
- }
-
/*
* if we are fed a relative path as a string, either set ino=0 (strictly
* relative) or 1 (absolute). throw out any leading '/'.
*/
- filepath(const char *s) : encoded(false) {
+ filepath(boost::string_view s) : encoded(false) {
set_path(s);
}
- void set_path(const char *s) {
+ filepath(const char *s) : encoded(false) {
+ set_path(boost::string_view(s));
+ }
+
+ void set_path(boost::string_view s, inodeno_t b) {
+ path = std::string(s);
+ ino = b;
+ }
+ void set_path(boost::string_view s) {
if (s[0] == '/') {
- path = s + 1;
+ path = std::string(s.substr(1));
ino = 1;
} else {
ino = 0;
- path = s;
+ path = std::string(s);
}
bits.clear();
}
bits.pop_back();
rebuild_path();
}
- void push_dentry(const string& s) {
+ void push_dentry(boost::string_view s) {
if (bits.empty() && path.length() > 0)
parse_bits();
if (!bits.empty())
path += "/";
- path += s;
- bits.push_back(s);
+ path += std::string(s);
+ bits.emplace_back(std::string(s));
+ }
+ void push_dentry(const string& s) {
+ push_dentry(boost::string_view(s));
}
void push_dentry(const char *cs) {
- string s = cs;
- push_dentry(s);
+ push_dentry(boost::string_view(cs, strlen(cs)));
}
void push_front_dentry(const string& s) {
bits.insert(bits.begin(), s);
return out << '*';
}
-inline void encode(frag_t f, bufferlist& bl) { encode_raw(f._enc, bl); }
-inline void decode(frag_t &f, bufferlist::iterator& p) {
+inline void encode(const frag_t &f, bufferlist& bl) { encode_raw(f._enc, bl); }
+inline void decode(frag_t &f, bufferlist::iterator& p) {
__u32 v;
- decode_raw(v, p);
+ decode_raw(v, p);
f._enc = v;
}
#include "encoding.h"
+/*
+ * *** NOTE ***
+ *
+ * This class is written to work with a variety of map-like containers,
+ * *including* ones that invalidate iterators when they are modified (e.g.,
+ * flat_map and btree_map).
+ */
+
#ifndef MIN
# define MIN(a,b) ((a)<=(b) ? (a):(b))
#endif
#endif
-template<typename T>
+template<typename T, typename Map = std::map<T,T>>
class interval_set {
public:
class iterator : public std::iterator <std::forward_iterator_tag, T>
{
public:
- explicit iterator(typename std::map<T,T>::iterator iter)
+ explicit iterator(typename Map::iterator iter)
: _iter(iter)
{ }
return prev;
}
- friend class interval_set<T>::const_iterator;
+ friend class interval_set<T,Map>::const_iterator;
protected:
- typename std::map<T,T>::iterator _iter;
- friend class interval_set<T>;
+ typename Map::iterator _iter;
+ friend class interval_set<T,Map>;
};
class const_iterator : public std::iterator <std::forward_iterator_tag, T>
{
public:
- explicit const_iterator(typename std::map<T,T>::const_iterator iter)
+ explicit const_iterator(typename Map::const_iterator iter)
: _iter(iter)
{ }
}
protected:
- typename std::map<T,T>::const_iterator _iter;
+ typename Map::const_iterator _iter;
};
interval_set() : _size(0) {}
- interval_set(std::map<T,T>& other) {
+ interval_set(Map& other) {
m.swap(other);
_size = 0;
for (auto& i : m) {
return m.size();
}
- typename interval_set<T>::iterator begin() {
- return typename interval_set<T>::iterator(m.begin());
+ typename interval_set<T,Map>::iterator begin() {
+ return typename interval_set<T,Map>::iterator(m.begin());
}
- typename interval_set<T>::iterator lower_bound(T start) {
- return typename interval_set<T>::iterator(find_inc_m(start));
+ typename interval_set<T,Map>::iterator lower_bound(T start) {
+ return typename interval_set<T,Map>::iterator(find_inc_m(start));
}
- typename interval_set<T>::iterator end() {
- return typename interval_set<T>::iterator(m.end());
+ typename interval_set<T,Map>::iterator end() {
+ return typename interval_set<T,Map>::iterator(m.end());
}
- typename interval_set<T>::const_iterator begin() const {
- return typename interval_set<T>::const_iterator(m.begin());
+ typename interval_set<T,Map>::const_iterator begin() const {
+ return typename interval_set<T,Map>::const_iterator(m.begin());
}
- typename interval_set<T>::const_iterator lower_bound(T start) const {
- return typename interval_set<T>::const_iterator(find_inc(start));
+ typename interval_set<T,Map>::const_iterator lower_bound(T start) const {
+ return typename interval_set<T,Map>::const_iterator(find_inc(start));
}
- typename interval_set<T>::const_iterator end() const {
- return typename interval_set<T>::const_iterator(m.end());
+ typename interval_set<T,Map>::const_iterator end() const {
+ return typename interval_set<T,Map>::const_iterator(m.end());
}
// helpers
private:
- typename std::map<T,T>::const_iterator find_inc(T start) const {
- typename std::map<T,T>::const_iterator p = m.lower_bound(start); // p->first >= start
+ typename Map::const_iterator find_inc(T start) const {
+ typename Map::const_iterator p = m.lower_bound(start); // p->first >= start
if (p != m.begin() &&
(p == m.end() || p->first > start)) {
p--; // might overlap?
return p;
}
- typename std::map<T,T>::iterator find_inc_m(T start) {
- typename std::map<T,T>::iterator p = m.lower_bound(start);
+ typename Map::iterator find_inc_m(T start) {
+ typename Map::iterator p = m.lower_bound(start);
if (p != m.begin() &&
(p == m.end() || p->first > start)) {
p--; // might overlap?
return p;
}
- typename std::map<T,T>::const_iterator find_adj(T start) const {
- typename std::map<T,T>::const_iterator p = m.lower_bound(start);
+ typename Map::const_iterator find_adj(T start) const {
+ typename Map::const_iterator p = m.lower_bound(start);
if (p != m.begin() &&
(p == m.end() || p->first > start)) {
p--; // might touch?
return p;
}
- typename std::map<T,T>::iterator find_adj_m(T start) {
- typename std::map<T,T>::iterator p = m.lower_bound(start);
+ typename Map::iterator find_adj_m(T start) {
+ typename Map::iterator p = m.lower_bound(start);
if (p != m.begin() &&
(p == m.end() || p->first > start)) {
p--; // might touch?
}
return p;
}
+
+ void intersection_size_asym(const interval_set &s, const interval_set &l) {
+ typename decltype(m)::const_iterator ps = s.m.begin(), pl;
+ assert(ps != s.m.end());
+ T offset = ps->first, prev_offset;
+ bool first = true;
+ typename decltype(m)::iterator mi = m.begin();
+
+ while (1) {
+ if (first)
+ first = false;
+ else
+ assert(offset > prev_offset);
+ pl = l.find_inc(offset);
+ prev_offset = offset;
+ if (pl == l.m.end())
+ break;
+ while (ps != s.m.end() && ps->first + ps->second <= pl->first)
+ ++ps;
+ if (ps == s.m.end())
+ break;
+ offset = pl->first + pl->second;
+ if (offset <= ps->first) {
+ offset = ps->first;
+ continue;
+ }
+
+ if (*ps == *pl) {
+ do {
+ mi = m.insert(mi, *ps);
+ _size += ps->second;
+ ++ps;
+ ++pl;
+ } while (ps != s.m.end() && pl != l.m.end() && *ps == *pl);
+ if (ps == s.m.end())
+ break;
+ offset = ps->first;
+ continue;
+ }
+
+ T start = std::max<T>(ps->first, pl->first);
+ T en = std::min<T>(ps->first + ps->second, offset);
+ assert(en > start);
+ typename decltype(m)::value_type i{start, en - start};
+ mi = m.insert(mi, i);
+ _size += i.second;
+ if (ps->first + ps->second <= offset) {
+ ++ps;
+ if (ps == s.m.end())
+ break;
+ offset = ps->first;
+ }
+ }
+ }
+
+ bool subset_size_sym(const interval_set &b) const {
+ auto pa = m.begin(), pb = b.m.begin();
+ const auto a_end = m.end(), b_end = b.m.end();
+
+ while (pa != a_end && pb != b_end) {
+ while (pb->first + pb->second <= pa->first) {
+ ++pb;
+ if (pb == b_end)
+ return false;
+ }
+
+ if (*pa == *pb) {
+ do {
+ ++pa;
+ ++pb;
+ } while (pa != a_end && pb != b_end && *pa == *pb);
+ continue;
+ }
+
+ // interval begins before other
+ if (pa->first < pb->first)
+ return false;
+ // interval is longer than other
+ if (pa->first + pa->second > pb->first + pb->second)
+ return false;
+
+ ++pa;
+ }
+
+ return pa == a_end;
+ }
public:
bool operator==(const interval_set& other) const {
}
void bound_encode(size_t& p) const {
- denc_traits<std::map<T,T>>::bound_encode(m, p);
+ denc_traits<Map>::bound_encode(m, p);
}
void encode(bufferlist::contiguous_appender& p) const {
denc(m, p);
}
void encode_nohead(bufferlist::contiguous_appender& p) const {
- denc_traits<std::map<T,T>>::encode_nohead(m, p);
+ denc_traits<Map>::encode_nohead(m, p);
}
void decode_nohead(int n, bufferptr::iterator& p) {
- denc_traits<std::map<T,T>>::decode_nohead(n, m, p);
+ denc_traits<Map>::decode_nohead(n, m, p);
_size = 0;
for (const auto& i : m) {
_size += i.second;
}
bool contains(T i, T *pstart=0, T *plen=0) const {
- typename std::map<T,T>::const_iterator p = find_inc(i);
+ typename Map::const_iterator p = find_inc(i);
if (p == m.end()) return false;
if (p->first > i) return false;
if (p->first+p->second <= i) return false;
return true;
}
bool contains(T start, T len) const {
- typename std::map<T,T>::const_iterator p = find_inc(start);
+ typename Map::const_iterator p = find_inc(start);
if (p == m.end()) return false;
if (p->first > start) return false;
if (p->first+p->second <= start) return false;
}
T range_start() const {
assert(!empty());
- typename std::map<T,T>::const_iterator p = m.begin();
+ typename Map::const_iterator p = m.begin();
return p->first;
}
T range_end() const {
assert(!empty());
- typename std::map<T,T>::const_iterator p = m.end();
+ typename Map::const_iterator p = m.end();
p--;
return p->first+p->second;
}
// interval start after p (where p not in set)
bool starts_after(T i) const {
assert(!contains(i));
- typename std::map<T,T>::const_iterator p = find_inc(i);
+ typename Map::const_iterator p = find_inc(i);
if (p == m.end()) return false;
return true;
}
T start_after(T i) const {
assert(!contains(i));
- typename std::map<T,T>::const_iterator p = find_inc(i);
+ typename Map::const_iterator p = find_inc(i);
return p->first;
}
// interval end that contains start
T end_after(T start) const {
assert(contains(start));
- typename std::map<T,T>::const_iterator p = find_inc(start);
+ typename Map::const_iterator p = find_inc(start);
return p->first+p->second;
}
//cout << "insert " << start << "~" << len << endl;
assert(len > 0);
_size += len;
- typename std::map<T,T>::iterator p = find_adj_m(start);
+ typename Map::iterator p = find_adj_m(start);
if (p == m.end()) {
m[start] = len; // new interval
if (pstart)
p->second += len; // append to end
- typename std::map<T,T>::iterator n = p;
+ typename Map::iterator n = p;
n++;
+ if (pstart)
+ *pstart = p->first;
if (n != m.end() &&
start+len == n->first) { // combine with next, too!
p->second += n->second;
+ if (plen)
+ *plen = p->second;
m.erase(n);
- }
- if (pstart)
- *pstart = p->first;
- if (plen)
- *plen = p->second;
+ } else {
+ if (plen)
+ *plen = p->second;
+ }
} else {
if (start+len == p->first) {
- m[start] = len + p->second; // append to front
if (pstart)
*pstart = start;
if (plen)
*plen = len + p->second;
+ T psecond = p->second;
m.erase(p);
+ m[start] = len + psecond; // append to front
} else {
assert(p->first > start+len);
- m[start] = len; // new interval
if (pstart)
*pstart = start;
if (plen)
*plen = len;
+ m[start] = len; // new interval
}
}
}
}
- void swap(interval_set<T>& other) {
+ void swap(interval_set<T,Map>& other) {
m.swap(other.m);
std::swap(_size, other._size);
}
erase(val, 1);
}
- void erase(T start, T len) {
- typename std::map<T,T>::iterator p = find_inc_m(start);
+ void erase(T start, T len,
+ std::function<bool(T, T)> post_process = {}) {
+ typename Map::iterator p = find_inc_m(start);
_size -= len;
assert(_size >= 0);
T before = start - p->first;
assert(p->second >= before+len);
T after = p->second - before - len;
-
- if (before)
+ if (before && (post_process ? post_process(p->first, before) : true)) {
p->second = before; // shorten bit before
- else
+ } else {
m.erase(p);
- if (after)
- m[start+len] = after;
+ }
+ if (after && (post_process ? post_process(start + len, after) : true)) {
+ m[start + len] = after;
+ }
}
-
void subtract(const interval_set &a) {
- for (typename std::map<T,T>::const_iterator p = a.m.begin();
+ for (typename Map::const_iterator p = a.m.begin();
p != a.m.end();
p++)
erase(p->first, p->second);
}
void insert(const interval_set &a) {
- for (typename std::map<T,T>::const_iterator p = a.m.begin();
+ for (typename Map::const_iterator p = a.m.begin();
p != a.m.end();
p++)
insert(p->first, p->second);
assert(&b != this);
clear();
- typename std::map<T,T>::const_iterator pa = a.m.begin();
- typename std::map<T,T>::const_iterator pb = b.m.begin();
+ const interval_set *s, *l;
+
+ if (a.size() < b.size()) {
+ s = &a;
+ l = &b;
+ } else {
+ s = &b;
+ l = &a;
+ }
+
+ if (!s->size())
+ return;
+
+ /*
+ * Use the lower_bound algorithm for larger size ratios
+ * where it performs better, but not for smaller size
+ * ratios where sequential search performs better.
+ */
+ if (l->size() / s->size() >= 10) {
+ intersection_size_asym(*s, *l);
+ return;
+ }
+
+ typename Map::const_iterator pa = a.m.begin();
+ typename Map::const_iterator pb = b.m.begin();
typename decltype(m)::iterator mi = m.begin();
while (pa != a.m.end() && pb != b.m.end()) {
}
bool subset_of(const interval_set &big) const {
- for (typename std::map<T,T>::const_iterator i = m.begin();
+ if (!size())
+ return true;
+ if (size() > big.size())
+ return false;
+ if (range_end() > big.range_end())
+ return false;
+
+ /*
+ * Use the lower_bound algorithm for larger size ratios
+ * where it performs better, but not for smaller size
+ * ratios where sequential search performs better.
+ */
+ if (big.size() / size() < 10)
+ return subset_size_sym(big);
+
+ for (typename Map::const_iterator i = m.begin();
i != m.end();
i++)
if (!big.contains(i->first, i->second)) return false;
return true;
}
+ /*
+ * build a subset of @other for given range [@start, @end)
+ * E.g.:
+ * subset_of([5~10,20~5], 0, 100) -> [5~10,20~5]
+ * subset_of([5~10,20~5], 5, 25) -> [5~10,20~5]
+ * subset_of([5~10,20~5], 1, 10) -> [5~5]
+ * subset_of([5~10,20~5], 8, 24) -> [8~7, 20~4]
+ */
+ void subset_of(const interval_set &other, T start, T end) {
+ assert(end >= start);
+ clear();
+ if (end == start) {
+ return;
+ }
+ typename Map::const_iterator p = other.find_inc(start);
+ if (p == other.m.end())
+ return;
+ if (p->first < start) {
+ if (p->first + p->second >= end) {
+ insert(start, end - start);
+ return;
+ } else {
+ insert(start, p->first + p->second - start);
+ ++p;
+ }
+ }
+ while (p != other.m.end()) {
+ assert(p->first >= start);
+ if (p->first >= end) {
+ return;
+ }
+ if (p->first + p->second >= end) {
+ insert(p->first, end - p->first);
+ return;
+ } else {
+ // whole
+ insert(p->first, p->second);
+ ++p;
+ }
+ }
+ }
+
/*
* build a subset of @other, starting at or after @start, and including
* @len worth of values, skipping holes. e.g.,
*/
void span_of(const interval_set &other, T start, T len) {
clear();
- typename std::map<T,T>::const_iterator p = other.find_inc(start);
+ typename Map::const_iterator p = other.find_inc(start);
if (p == other.m.end())
return;
if (p->first < start) {
}
/*
- * Move contents of m into another std::map<T,T>. Use that instead of
- * encoding interval_set into bufferlist then decoding it back into std::map.
+ * Move contents of m into another Map. Use that instead of
+ * encoding interval_set into bufferlist then decoding it back into Map.
*/
- void move_into(std::map<T,T>& other) {
+ void move_into(Map& other) {
other = std::move(m);
}
private:
// data
int64_t _size;
- std::map<T,T> m; // map start -> len
+ Map m; // map start -> len
};
// declare traits explicitly because (1) it's templatized, and (2) we
// want to include _nohead variants.
-template<typename T>
-struct denc_traits<interval_set<T>> {
+template<typename T, typename Map>
+struct denc_traits<interval_set<T,Map>> {
static constexpr bool supported = true;
static constexpr bool bounded = false;
static constexpr bool featured = false;
- static constexpr bool need_contiguous = denc_traits<T>::need_contiguous;
- static void bound_encode(const interval_set<T>& v, size_t& p) {
+ static constexpr bool need_contiguous = denc_traits<T,Map>::need_contiguous;
+ static void bound_encode(const interval_set<T,Map>& v, size_t& p) {
v.bound_encode(p);
}
- static void encode(const interval_set<T>& v,
+ static void encode(const interval_set<T,Map>& v,
bufferlist::contiguous_appender& p) {
v.encode(p);
}
- static void decode(interval_set<T>& v, bufferptr::iterator& p) {
+ static void decode(interval_set<T,Map>& v, bufferptr::iterator& p) {
v.decode(p);
}
template<typename U=T>
static typename std::enable_if<sizeof(U) && !need_contiguous>::type
- decode(interval_set<T>& v, bufferlist::iterator& p) {
+ decode(interval_set<T,Map>& v, bufferlist::iterator& p) {
v.decode(p);
}
- static void encode_nohead(const interval_set<T>& v,
+ static void encode_nohead(const interval_set<T,Map>& v,
bufferlist::contiguous_appender& p) {
v.encode_nohead(p);
}
- static void decode_nohead(size_t n, interval_set<T>& v,
+ static void decode_nohead(size_t n, interval_set<T,Map>& v,
bufferptr::iterator& p) {
v.decode_nohead(n, p);
}
};
-template<class T>
-inline std::ostream& operator<<(std::ostream& out, const interval_set<T> &s) {
+template<class T, typename Map>
+inline std::ostream& operator<<(std::ostream& out, const interval_set<T,Map> &s) {
out << "[";
const char *prequel = "";
- for (typename interval_set<T>::const_iterator i = s.begin();
+ for (typename interval_set<T,Map>::const_iterator i = s.begin();
i != s.end();
++i)
{
#include <common/Formatter.h>
#include "include/assert.h"
+#include "include/compact_map.h"
+#include "include/compact_set.h"
/*
using map = std::map<k, v, cmp, \
pool_allocator<std::pair<const k,v>>>; \
\
+ template<typename k,typename v, typename cmp = std::less<k> > \
+ using compact_map = compact_map<k, v, cmp, \
+ pool_allocator<std::pair<const k,v>>>; \
+ \
+ template<typename k, typename cmp = std::less<k> > \
+ using compact_set = compact_set<k, cmp, pool_allocator<k>>; \
+ \
template<typename k,typename v, typename cmp = std::less<k> > \
using multimap = std::multimap<k,v,cmp, \
pool_allocator<std::pair<const k, \
SHARD_MISSING = 1 << 1,
SHARD_STAT_ERR = 1 << 2,
SHARD_READ_ERR = 1 << 3,
- DATA_DIGEST_MISMATCH_OI = 1 << 9,
- OMAP_DIGEST_MISMATCH_OI = 1 << 10,
- SIZE_MISMATCH_OI = 1 << 11,
+ DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old
+ DATA_DIGEST_MISMATCH_INFO = 1 << 9,
+ OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old
+ OMAP_DIGEST_MISMATCH_INFO = 1 << 10,
+ SIZE_MISMATCH_OI = 1 << 11, // Old
+ SIZE_MISMATCH_INFO = 1 << 11,
SHARD_EC_HASH_MISMATCH = 1 << 12,
SHARD_EC_SIZE_MISMATCH = 1 << 13,
- OI_ATTR_MISSING = 1 << 14,
- OI_ATTR_CORRUPTED = 1 << 15,
- SS_ATTR_MISSING = 1 << 16,
- SS_ATTR_CORRUPTED = 1 << 17,
- OBJ_SIZE_OI_MISMATCH = 1 << 18
+ OI_ATTR_MISSING = 1 << 14, // Old
+ INFO_MISSING = 1 << 14,
+ OI_ATTR_CORRUPTED = 1 << 15, // Old
+ INFO_CORRUPTED = 1 << 15,
+ SS_ATTR_MISSING = 1 << 16, // Old
+ SNAPSET_MISSING = 1 << 16,
+ SS_ATTR_CORRUPTED = 1 << 17, // Old
+ SNAPSET_CORRUPTED = 1 << 17,
+ OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old
+ OBJ_SIZE_INFO_MISMATCH = 1 << 18,
+ HINFO_MISSING = 1 << 19,
+ HINFO_CORRUPTED = 1 << 20
// When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
};
uint64_t errors = 0;
- static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_OI|OI_ATTR_MISSING|OI_ATTR_CORRUPTED|SS_ATTR_MISSING|SS_ATTR_CORRUPTED|OBJ_SIZE_OI_MISMATCH;
- static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_OI|OMAP_DIGEST_MISMATCH_OI|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
+ static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED;
+ static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
bool has_shard_missing() const {
return errors & SHARD_MISSING;
}
bool has_read_error() const {
return errors & SHARD_READ_ERR;
}
- bool has_data_digest_mismatch_oi() const {
+ bool has_data_digest_mismatch_oi() const { // Compatibility
return errors & DATA_DIGEST_MISMATCH_OI;
}
- bool has_omap_digest_mismatch_oi() const {
+ bool has_data_digest_mismatch_info() const {
+ return errors & DATA_DIGEST_MISMATCH_INFO;
+ }
+ bool has_omap_digest_mismatch_oi() const { // Compatibility
return errors & OMAP_DIGEST_MISMATCH_OI;
}
- bool has_size_mismatch_oi() const {
+ bool has_omap_digest_mismatch_info() const {
+ return errors & OMAP_DIGEST_MISMATCH_INFO;
+ }
+ bool has_size_mismatch_oi() const { // Compatibility
return errors & SIZE_MISMATCH_OI;
}
+ bool has_size_mismatch_info() const {
+ return errors & SIZE_MISMATCH_INFO;
+ }
bool has_ec_hash_error() const {
return errors & SHARD_EC_HASH_MISMATCH;
}
bool has_ec_size_error() const {
return errors & SHARD_EC_SIZE_MISMATCH;
}
- bool has_oi_attr_missing() const {
+ bool has_oi_attr_missing() const { // Compatibility
return errors & OI_ATTR_MISSING;
}
- bool has_oi_attr_corrupted() const {
+ bool has_info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool has_oi_attr_corrupted() const { // Compatibility
return errors & OI_ATTR_CORRUPTED;
}
- bool has_ss_attr_missing() const {
+ bool has_info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool has_ss_attr_missing() const { // Compatibility
return errors & SS_ATTR_MISSING;
}
- bool has_ss_attr_corrupted() const {
+ bool has_snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool has_ss_attr_corrupted() const { // Compatibility
return errors & SS_ATTR_CORRUPTED;
}
+ bool has_snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
bool has_shallow_errors() const {
return errors & SHALLOW_ERRORS;
}
bool has_deep_errors() const {
return errors & DEEP_ERRORS;
}
- bool has_obj_size_oi_mismatch() const {
+ bool has_obj_size_oi_mismatch() const { // Compatibility
return errors & OBJ_SIZE_OI_MISMATCH;
+ }
+ bool has_obj_size_info_mismatch() const {
+ return errors & OBJ_SIZE_INFO_MISMATCH;
+ }
+ bool has_hinfo_missing() const {
+ return errors & HINFO_MISSING;
+ }
+ bool has_hinfo_corrupted() const {
+ return errors & HINFO_CORRUPTED;
}
};
ATTR_VALUE_MISMATCH = 1 << 7,
ATTR_NAME_MISMATCH = 1 << 8,
SNAPSET_INCONSISTENCY = 1 << 9,
+ HINFO_INCONSISTENCY = 1 << 10,
// When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
};
uint64_t errors = 0;
- static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH|ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY;
+ static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH|ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY;
static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH;
bool has_object_info_inconsistency() const {
return errors & OBJECT_INFO_INCONSISTENCY;
bool has_snapset_inconsistency() const {
return errors & SNAPSET_INCONSISTENCY;
}
+ bool has_hinfo_inconsistency() const {
+ return errors & HINFO_INCONSISTENCY;
+ }
};
struct inconsistent_obj_t : obj_err_t {
SNAPSET_MISSING = 1 << 0,
SNAPSET_CORRUPTED = 1 << 1,
CLONE_MISSING = 1 << 2,
- SNAP_MISMATCH = 1 << 3,
+ SNAP_ERROR = 1 << 3,
HEAD_MISMATCH = 1 << 4,
HEADLESS_CLONE = 1 << 5,
SIZE_MISMATCH = 1 << 6,
- OI_MISSING = 1 << 7,
- OI_CORRUPTED = 1 << 8,
+ OI_MISSING = 1 << 7, // Old
+ INFO_MISSING = 1 << 7,
+ OI_CORRUPTED = 1 << 8, // Old
+ INFO_CORRUPTED = 1 << 8,
EXTRA_CLONES = 1 << 9,
};
uint64_t errors = 0;
// Extra clones
std::vector<snap_t> clones;
std::vector<snap_t> missing;
+ ceph::bufferlist ss_bl;
- bool ss_attr_missing() const {
+ bool ss_attr_missing() const { // Compatibility
return errors & SNAPSET_MISSING;
}
- bool ss_attr_corrupted() const {
+ bool snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool ss_attr_corrupted() const { // Compatibility
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool snapset_corrupted() const {
return errors & SNAPSET_CORRUPTED;
}
bool clone_missing() const {
return errors & CLONE_MISSING;
}
- bool snapset_mismatch() const {
- return errors & SNAP_MISMATCH;
+ bool snapset_mismatch() const { // Compatibility
+ return errors & SNAP_ERROR;
+ }
+ bool snapset_error() const {
+ return errors & SNAP_ERROR;
}
bool head_mismatch() const {
return errors & HEAD_MISMATCH;
bool size_mismatch() const {
return errors & SIZE_MISMATCH;
}
- bool oi_attr_missing() const {
+ bool oi_attr_missing() const { // Compatibility
return errors & OI_MISSING;
}
- bool oi_attr_corrupted() const {
+ bool info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool oi_attr_corrupted() const { // Compatibility
return errors & OI_CORRUPTED;
}
+ bool info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
bool extra_clones() const {
return errors & EXTRA_CLONES;
}
add_dependencies(jni-header libcephfs)
find_jar(JUNIT_JAR
- NAMES junit junit4
+ NAMES junit4 junit
PATHS "/usr/share/java")
if(JUNIT_JAR)
set(CMAKE_JAVA_INCLUDE_PATH ${JUNIT_JAR} ${libcephfs_jar})
void JournalMetadata::flush_commit_position() {
ldout(m_cct, 20) << __func__ << dendl;
- Mutex::Locker timer_locker(*m_timer_lock);
- Mutex::Locker locker(m_lock);
- if (m_commit_position_ctx == nullptr) {
- return;
- }
-
- cancel_commit_task();
- handle_commit_position_task();
+ C_SaferCond ctx;
+ flush_commit_position(&ctx);
+ ctx.wait();
}
void JournalMetadata::flush_commit_position(Context *on_safe) {
Mutex::Locker timer_locker(*m_timer_lock);
Mutex::Locker locker(m_lock);
- if (m_commit_position_ctx == nullptr) {
+ if (m_commit_position_ctx == nullptr && m_flush_commits_in_progress == 0) {
// nothing to flush
if (on_safe != nullptr) {
m_work_queue->queue(on_safe, 0);
}
if (on_safe != nullptr) {
- m_commit_position_ctx = new C_FlushCommitPosition(
- m_commit_position_ctx, on_safe);
+ m_flush_commit_position_ctxs.push_back(on_safe);
+ }
+ if (m_commit_position_ctx == nullptr) {
+ return;
}
+
cancel_commit_task();
handle_commit_position_task();
}
void JournalMetadata::refresh(Context *on_complete) {
ldout(m_cct, 10) << "refreshing mutable metadata" << dendl;
- C_Refresh *refresh = new C_Refresh(this, on_complete);
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (on_complete != nullptr) {
+ m_refresh_ctxs.push_back(on_complete);
+ }
+ ++m_refreshes_in_progress;
+ }
+
+ auto refresh = new C_Refresh(this);
get_mutable_metadata(&refresh->minimum_set, &refresh->active_set,
&refresh->registered_clients, refresh);
}
void JournalMetadata::handle_refresh_complete(C_Refresh *refresh, int r) {
ldout(m_cct, 10) << "refreshed mutable metadata: r=" << r << dendl;
- if (r == 0) {
- Mutex::Locker locker(m_lock);
+ m_lock.Lock();
+ if (r == 0) {
Client client(m_client_id, bufferlist());
RegisteredClients::iterator it = refresh->registered_clients.find(client);
if (it != refresh->registered_clients.end()) {
}
}
- if (refresh->on_finish != NULL) {
- refresh->on_finish->complete(r);
+ Contexts refresh_ctxs;
+ assert(m_refreshes_in_progress > 0);
+ --m_refreshes_in_progress;
+ if (m_refreshes_in_progress == 0) {
+ std::swap(refresh_ctxs, m_refresh_ctxs);
+ }
+ m_lock.Unlock();
+
+ for (auto ctx : refresh_ctxs) {
+ ctx->complete(r);
}
}
assert(m_lock.is_locked());
assert(m_commit_position_ctx != nullptr);
assert(m_commit_position_task_ctx != nullptr);
-
m_timer->cancel_event(m_commit_position_task_ctx);
m_commit_position_task_ctx = NULL;
}
assert(m_timer_lock->is_locked());
assert(m_lock.is_locked());
assert(m_commit_position_ctx != nullptr);
- if (m_commit_position_task_ctx == NULL) {
+ if (m_commit_position_task_ctx == nullptr) {
m_commit_position_task_ctx =
m_timer->add_event_after(m_settings.commit_interval,
new C_CommitPositionTask(this));
<< "client_id=" << m_client_id << ", "
<< "commit_position=" << m_commit_position << dendl;
- librados::ObjectWriteOperation op;
- client::client_commit(&op, m_client_id, m_commit_position);
+ m_commit_position_task_ctx = nullptr;
+ Context* commit_position_ctx = nullptr;
+ std::swap(commit_position_ctx, m_commit_position_ctx);
- Context *ctx = new C_NotifyUpdate(this, m_commit_position_ctx);
- m_commit_position_ctx = NULL;
+ m_async_op_tracker.start_op();
+ ++m_flush_commits_in_progress;
- ctx = schedule_laggy_clients_disconnect(ctx);
+ Context* ctx = new FunctionContext([this, commit_position_ctx](int r) {
+ Contexts flush_commit_position_ctxs;
+ m_lock.Lock();
+ assert(m_flush_commits_in_progress > 0);
+ --m_flush_commits_in_progress;
+ if (m_flush_commits_in_progress == 0) {
+ std::swap(flush_commit_position_ctxs, m_flush_commit_position_ctxs);
+ }
+ m_lock.Unlock();
- librados::AioCompletion *comp =
- librados::Rados::aio_create_completion(ctx, NULL,
- utils::rados_ctx_callback);
+ commit_position_ctx->complete(0);
+ for (auto ctx : flush_commit_position_ctxs) {
+ ctx->complete(0);
+ }
+ m_async_op_tracker.finish_op();
+ });
+ ctx = new C_NotifyUpdate(this, ctx);
+ ctx = new FunctionContext([this, ctx](int r) {
+ // manually kick off a refresh in case the notification is missed
+ // and ignore the next notification that we are about to send
+ m_lock.Lock();
+ ++m_ignore_watch_notifies;
+ m_lock.Unlock();
+
+ refresh(ctx);
+ });
+ ctx = new FunctionContext([this, ctx](int r) {
+ schedule_laggy_clients_disconnect(ctx);
+ });
+
+ librados::ObjectWriteOperation op;
+ client::client_commit(&op, m_client_id, m_commit_position);
+
+ auto comp = librados::Rados::aio_create_completion(ctx, nullptr,
+ utils::rados_ctx_callback);
int r = m_ioctx.aio_operate(m_oid, comp, &op);
assert(r == 0);
comp->release();
-
- m_commit_position_task_ctx = NULL;
}
void JournalMetadata::schedule_watch_reset() {
bufferlist bl;
m_ioctx.notify_ack(m_oid, notify_id, cookie, bl);
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_ignore_watch_notifies > 0) {
+ --m_ignore_watch_notifies;
+ return;
+ }
+ }
+
refresh(NULL);
}
ldout(m_cct, 10) << "notified journal header update: r=" << r << dendl;
}
-Context *JournalMetadata::schedule_laggy_clients_disconnect(Context *on_finish) {
- assert(m_lock.is_locked());
-
+void JournalMetadata::schedule_laggy_clients_disconnect(Context *on_finish) {
ldout(m_cct, 20) << __func__ << dendl;
-
if (m_settings.max_concurrent_object_sets <= 0) {
- return on_finish;
+ on_finish->complete(0);
+ return;
}
Context *ctx = on_finish;
+ {
+ Mutex::Locker locker(m_lock);
+ for (auto &c : m_registered_clients) {
+ if (c.state == cls::journal::CLIENT_STATE_DISCONNECTED ||
+ c.id == m_client_id ||
+ m_settings.whitelisted_laggy_clients.count(c.id) > 0) {
+ continue;
+ }
+ const std::string &client_id = c.id;
+ uint64_t object_set = 0;
+ if (!c.commit_position.object_positions.empty()) {
+ auto &position = *(c.commit_position.object_positions.begin());
+ object_set = position.object_number / m_splay_width;
+ }
- for (auto &c : m_registered_clients) {
- if (c.state == cls::journal::CLIENT_STATE_DISCONNECTED ||
- c.id == m_client_id ||
- m_settings.whitelisted_laggy_clients.count(c.id) > 0) {
- continue;
- }
- const std::string &client_id = c.id;
- uint64_t object_set = 0;
- if (!c.commit_position.object_positions.empty()) {
- auto &position = *(c.commit_position.object_positions.begin());
- object_set = position.object_number / m_splay_width;
- }
-
- if (m_active_set > object_set + m_settings.max_concurrent_object_sets) {
- ldout(m_cct, 1) << __func__ << ": " << client_id
- << ": scheduling disconnect" << dendl;
+ if (m_active_set > object_set + m_settings.max_concurrent_object_sets) {
+ ldout(m_cct, 1) << __func__ << ": " << client_id
+ << ": scheduling disconnect" << dendl;
- ctx = new FunctionContext([this, client_id, ctx](int r1) {
- ldout(m_cct, 10) << __func__ << ": " << client_id
- << ": flagging disconnected" << dendl;
+ ctx = new FunctionContext([this, client_id, ctx](int r1) {
+ ldout(m_cct, 10) << __func__ << ": " << client_id
+ << ": flagging disconnected" << dendl;
- librados::ObjectWriteOperation op;
- client::client_update_state(&op, client_id,
- cls::journal::CLIENT_STATE_DISCONNECTED);
+ librados::ObjectWriteOperation op;
+ client::client_update_state(
+ &op, client_id, cls::journal::CLIENT_STATE_DISCONNECTED);
- librados::AioCompletion *comp =
- librados::Rados::aio_create_completion(ctx, nullptr,
- utils::rados_ctx_callback);
- int r = m_ioctx.aio_operate(m_oid, comp, &op);
- assert(r == 0);
- comp->release();
- });
+ auto comp = librados::Rados::aio_create_completion(
+ ctx, nullptr, utils::rados_ctx_callback);
+ int r = m_ioctx.aio_operate(m_oid, comp, &op);
+ assert(r == 0);
+ comp->release();
+ });
+ }
}
}
if (ctx == on_finish) {
ldout(m_cct, 20) << __func__ << ": no laggy clients to disconnect" << dendl;
}
-
- return ctx;
+ ctx->complete(0);
}
std::ostream &operator<<(std::ostream &os,
private:
typedef std::map<uint64_t, uint64_t> AllocatedEntryTids;
typedef std::list<JournalMetadataListener*> Listeners;
+ typedef std::list<Context*> Contexts;
struct CommitEntry {
uint64_t object_num;
uint64_t minimum_set;
uint64_t active_set;
RegisteredClients registered_clients;
- Context *on_finish;
- C_Refresh(JournalMetadata *_journal_metadata, Context *_on_finish)
- : journal_metadata(_journal_metadata), minimum_set(0), active_set(0),
- on_finish(_on_finish) {
+ C_Refresh(JournalMetadata *_journal_metadata)
+ : journal_metadata(_journal_metadata), minimum_set(0), active_set(0) {
Mutex::Locker locker(journal_metadata->m_lock);
journal_metadata->m_async_op_tracker.start_op();
}
size_t m_update_notifications;
Cond m_update_cond;
+ size_t m_ignore_watch_notifies = 0;
+ size_t m_refreshes_in_progress = 0;
+ Contexts m_refresh_ctxs;
+
uint64_t m_commit_position_tid = 0;
ObjectSetPosition m_commit_position;
Context *m_commit_position_ctx;
Context *m_commit_position_task_ctx;
+ size_t m_flush_commits_in_progress = 0;
+ Contexts m_flush_commit_position_ctxs;
+
AsyncOpTracker m_async_op_tracker;
void handle_immutable_metadata(int r, Context *on_init);
void handle_watch_error(int err);
void handle_notified(int r);
- Context *schedule_laggy_clients_disconnect(Context *on_finish);
+ void schedule_laggy_clients_disconnect(Context *on_finish);
friend std::ostream &operator<<(std::ostream &os,
const JournalMetadata &journal_metadata);
}
void ObjectRecorder::send_appends_aio() {
- AppendBuffers *append_buffers;
- uint64_t append_tid;
+ librados::ObjectWriteOperation op;
+ client::guard_append(&op, m_soft_max_size);
+ C_Gather *gather_ctx;
{
Mutex::Locker locker(*m_lock);
- append_tid = m_append_tid++;
+ uint64_t append_tid = m_append_tid++;
m_in_flight_tids.insert(append_tid);
- // safe to hold pointer outside lock until op is submitted
- append_buffers = &m_in_flight_appends[append_tid];
- append_buffers->swap(m_pending_buffers);
- }
-
- ldout(m_cct, 10) << __func__ << ": " << m_oid << " flushing journal tid="
- << append_tid << dendl;
- C_AppendFlush *append_flush = new C_AppendFlush(this, append_tid);
- C_Gather *gather_ctx = new C_Gather(m_cct, append_flush);
-
- librados::ObjectWriteOperation op;
- client::guard_append(&op, m_soft_max_size);
- for (AppendBuffers::iterator it = append_buffers->begin();
- it != append_buffers->end(); ++it) {
- ldout(m_cct, 20) << __func__ << ": flushing " << *it->first
- << dendl;
- op.append(it->second);
- op.set_op_flags2(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+ ldout(m_cct, 10) << __func__ << ": " << m_oid << " flushing journal tid="
+ << append_tid << dendl;
+
+ gather_ctx = new C_Gather(m_cct, new C_AppendFlush(this, append_tid));
+ auto append_buffers = &m_in_flight_appends[append_tid];
+
+ for (auto it = m_pending_buffers.begin(); it != m_pending_buffers.end(); ) {
+ ldout(m_cct, 20) << __func__ << ": flushing " << *it->first << dendl;
+ op.append(it->second);
+ op.set_op_flags2(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+ m_aio_sent_size += it->second.length();
+ append_buffers->push_back(*it);
+ it = m_pending_buffers.erase(it);
+ if (m_aio_sent_size >= m_soft_max_size) {
+ break;
+ }
+ }
}
librados::AioCompletion *rados_completion =
Cond m_in_flight_flushes_cond;
AppendBuffers m_pending_buffers;
+ uint64_t m_aio_sent_size = 0;
bool m_aio_scheduled;
void handle_append_task();
#include "include/memory.h"
#include <boost/scoped_ptr.hpp>
#include "include/encoding.h"
-#include "include/cpp-btree/btree.h"
-#include "include/cpp-btree/btree_map.h"
-#include "include/encoding_btree.h"
+#include "include/btree_map.h"
#include "KeyValueDB.h"
#include "osd/osd_types.h"
::ObjectOperation op;
prepare_assert_ops(&op);
op.remove();
- return operate(oid, &op, NULL);
+ return operate(oid, &op, nullptr, librados::OPERATION_FULL_FORCE);
}
int librados::IoCtxImpl::remove(const object_t& oid, int flags)
{
tracepoint(librados, rados_getxattrs_next_enter, iter);
librados::RadosXattrsIter *it = static_cast<librados::RadosXattrsIter*>(iter);
+ if (it->val) {
+ free(it->val);
+ it->val = NULL;
+ }
if (it->i == it->attrset.end()) {
*name = NULL;
*val = NULL;
tracepoint(librados, rados_getxattrs_next_exit, 0, NULL, NULL, 0);
return 0;
}
- free(it->val);
const std::string &s(it->i->first);
*name = s.c_str();
bufferlist &bl(it->i->second);
void calc_snap_set_diff(CephContext *cct, const librados::snap_set_t& snap_set,
librados::snap_t start, librados::snap_t end,
interval_set<uint64_t> *diff, uint64_t *end_size,
- bool *end_exists, librados::snap_t *clone_end_snap_id)
+ bool *end_exists, librados::snap_t *clone_end_snap_id,
+ bool *whole_object)
{
ldout(cct, 10) << "calc_snap_set_diff start " << start << " end " << end
<< ", snap_set seq " << snap_set.seq << dendl;
*end_size = 0;
*end_exists = false;
*clone_end_snap_id = 0;
+ *whole_object = false;
for (vector<librados::clone_info_t>::const_iterator r = snap_set.clones.begin();
r != snap_set.clones.end();
// head is valid starting from right after the last seen seq
a = snap_set.seq + 1;
b = librados::SNAP_HEAD;
+ } else if (r->snaps.empty()) {
+ ldout(cct, 1) << "clone " << r->cloneid
+ << ": empty snaps, return whole object" << dendl;
+ *whole_object = true;
+ return;
} else {
a = r->snaps[0];
// note: b might be < r->cloneid if a snap has been trimmed.
const librados::snap_set_t& snap_set,
librados::snap_t start, librados::snap_t end,
interval_set<uint64_t> *diff, uint64_t *end_size,
- bool *end_exists, librados::snap_t *clone_end_snap_id);
+ bool *end_exists, librados::snap_t *clone_end_snap_id,
+ bool *whole_object);
#endif
handle_flushing_replay();
}
});
+ ctx = new FunctionContext([this, ctx](int r) {
+ // ensure the commit position is flushed to disk
+ m_journaler->flush_commit_position(ctx);
+ });
ctx = new FunctionContext([this, cct, cancel_ops, ctx](int r) {
ldout(cct, 20) << this << " handle_replay_complete: "
<< "shut down replay" << dendl;
m_lock.Unlock();
// stop replay, shut down, and restart
- Context *ctx = new FunctionContext([this, cct](int r) {
+ Context* ctx = create_context_callback<
+ Journal<I>, &Journal<I>::handle_flushing_restart>(this);
+ ctx = new FunctionContext([this, ctx](int r) {
+ // ensure the commit position is flushed to disk
+ m_journaler->flush_commit_position(ctx);
+ });
+ ctx = new FunctionContext([this, cct, ctx](int r) {
ldout(cct, 20) << this << " handle_replay_process_safe: "
<< "shut down replay" << dendl;
{
assert(m_state == STATE_FLUSHING_RESTART);
}
- m_journal_replay->shut_down(true, create_context_callback<
- Journal<I>, &Journal<I>::handle_flushing_restart>(this));
+ m_journal_replay->shut_down(true, ctx);
});
m_journaler->stop_replay(ctx);
return;
r = invoke_async_request("check object map", true,
boost::bind(&Operations<I>::check_object_map, this,
boost::ref(prog_ctx), _1),
- [] (Context *c) { c->complete(-EOPNOTSUPP); });
+ [this](Context *c) {
+ m_image_ctx.op_work_queue->queue(c, -EOPNOTSUPP);
+ });
return r;
}
uint64_t end_size;
bool end_exists;
librados::snap_t clone_end_snap_id;
+ bool whole_object;
calc_snap_set_diff(cct, m_snap_set, m_diff_context.from_snap_id,
m_diff_context.end_snap_id, &diff, &end_size,
- &end_exists, &clone_end_snap_id);
+ &end_exists, &clone_end_snap_id, &whole_object);
+ if (whole_object) {
+ ldout(cct, 1) << "object " << m_oid << ": need to provide full object"
+ << dendl;
+ }
ldout(cct, 20) << " diff " << diff << " end_exists=" << end_exists
<< dendl;
- if (diff.empty()) {
+ if (diff.empty() && !whole_object) {
if (m_diff_context.from_snap_id == 0 && !end_exists) {
compute_parent_overlap(diffs);
}
return;
- } else if (m_diff_context.whole_object) {
+ } else if (m_diff_context.whole_object || whole_object) {
// provide the full object extents to the callback
for (vector<ObjectExtent>::iterator q = m_object_extents.begin();
q != m_object_extents.end(); ++q) {
case DISCARD_ACTION_REMOVE:
return "remove";
case DISCARD_ACTION_REMOVE_TRUNCATE:
- return "remove (truncate)";
+ return "remove (create+truncate)";
case DISCARD_ACTION_TRUNCATE:
return "truncate";
case DISCARD_ACTION_ZERO:
wr->remove();
break;
case DISCARD_ACTION_REMOVE_TRUNCATE:
+ wr->create(false);
+ // fall through
case DISCARD_ACTION_TRUNCATE:
wr->truncate(this->m_object_off);
break;
bool ObjectMapIterateRequest<I>::should_complete(int r) {
CephContext *cct = m_image_ctx.cct;
ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
+ if (r < 0) {
+ lderr(cct) << "object map operation encountered an error: "
+ << cpp_strerror(r) << dendl;
+ }
RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
switch (m_state) {
case STATE_VERIFY_OBJECTS:
if (m_invalidate.test_and_set()) {
send_invalidate_object_map();
+ return false;
} else if (r == 0) {
return true;
}
}
if (r < 0) {
- lderr(cct) << "object map operation encountered an error: "
- << cpp_strerror(r)
- << dendl;
return true;
}
CephContext *cct = image_ctx.cct;
ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
<< "r=" << r << dendl;
- r = filter_state_return_code(r);
+ r = filter_return_code(r);
if (r < 0) {
if (r == -EEXIST) {
ldout(cct, 1) << "image already exists" << dendl;
}
template <typename I>
-int RenameRequest<I>::filter_state_return_code(int r) {
+int RenameRequest<I>::filter_return_code(int r) const {
I &image_ctx = this->m_image_ctx;
CephContext *cct = image_ctx.cct;
protected:
void send_op() override;
bool should_complete(int r) override;
+ int filter_return_code(int r) const override;
journal::Event create_event(uint64_t op_tid) const override {
return journal::RenameEvent(op_tid, m_dest_name);
bufferlist m_header_bl;
- int filter_state_return_code(int r);
-
void send_read_source_header();
void send_write_destination_header();
void send_update_directory();
}
}
+private:
+ ~Entry() = default;
+
+public:
// function improves estimate for expected size of message
void hint_size() {
if (m_exp_len != NULL) {
int snprintf(char* dst, size_t avail) const {
return m_streambuf.snprintf(dst, avail);
}
+
+ void destroy() {
+ if (m_exp_len != NULL) {
+ this->~Entry();
+ ::operator delete(this);
+ } else {
+ delete(this);
+ }
+ }
};
}
Entry *t;
while (m_head) {
t = m_head->m_next;
- delete m_head;
+ m_head->destroy();
m_head = t;
}
}
// trim
while (m_recent.m_len > m_max_recent) {
- delete m_recent.dequeue();
+ m_recent.dequeue()->destroy();
}
m_flush_mutex_holder = 0;
#define dout_prefix *_dout << "mds.beacon." << name << ' '
-Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
+Beacon::Beacon(CephContext *cct_, MonClient *monc_, boost::string_view name_) :
Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock),
name(name_), standby_for_rank(MDS_RANK_NONE),
standby_for_fscid(FS_CLUSTER_ID_NONE), want_state(MDSMap::STATE_BOOT),
#ifndef BEACON_STATE_H
#define BEACON_STATE_H
+#include <boost/utility/string_view.hpp>
+
#include "include/types.h"
#include "include/Context.h"
#include "common/Mutex.h"
class Beacon : public Dispatcher
{
public:
- Beacon(CephContext *cct_, MonClient *monc_, std::string name);
+ Beacon(CephContext *cct_, MonClient *monc_, boost::string_view name);
~Beacon() override;
void init(MDSMap const *mdsmap);
{
assert(dir);
dir->inode->make_path(fp, projected);
- fp.push_dentry(name);
+ fp.push_dentry(get_name());
}
/*
void CDentry::set_object_info(MDSCacheObjectInfo &info)
{
info.dirfrag = dir->dirfrag();
- info.dname = name;
+ info.dname = std::string(boost::string_view(name));
info.snapid = last;
}
#define CEPH_CDENTRY_H
#include <string>
+#include <boost/utility/string_view.hpp>
#include <set>
#include "include/counter.h"
friend class CDir;
struct linkage_t {
- CInode *inode;
- inodeno_t remote_ino;
- unsigned char remote_d_type;
+ CInode *inode = nullptr;
+ inodeno_t remote_ino = 0;
+ unsigned char remote_d_type = 0;
- linkage_t() : inode(0), remote_ino(0), remote_d_type(0) {}
+ linkage_t() {}
// dentry type is primary || remote || null
// inode ptr is required for primary, optional for remote, undefined for null
static const unsigned EXPORT_NONCE = 1;
- CDentry(const std::string& n, __u32 h,
+ CDentry(boost::string_view n, __u32 h,
snapid_t f, snapid_t l) :
- name(n), hash(h),
+ hash(h),
first(f), last(l),
item_dirty(this),
lock(this, &lock_type),
versionlock(this, &versionlock_type),
- dir(0),
- version(0), projected_version(0) {
- }
- CDentry(const std::string& n, __u32 h, inodeno_t ino, unsigned char dt,
+ name(n)
+ {}
+ CDentry(boost::string_view n, __u32 h, inodeno_t ino, unsigned char dt,
snapid_t f, snapid_t l) :
- name(n), hash(h),
+ hash(h),
first(f), last(l),
item_dirty(this),
lock(this, &lock_type),
versionlock(this, &versionlock_type),
- dir(0),
- version(0), projected_version(0) {
+ name(n)
+ {
linkage.remote_ino = ino;
linkage.remote_d_type = dt;
}
const CDir *get_dir() const { return dir; }
CDir *get_dir() { return dir; }
- const std::string& get_name() const { return name; }
+ boost::string_view get_name() const { return boost::string_view(name); }
__u32 get_hash() const { return hash; }
void dump(Formatter *f) const;
- std::string name;
__u32 hash;
snapid_t first, last;
static LockType lock_type;
static LockType versionlock_type;
- SimpleLock lock;
- LocalLock versionlock;
+ SimpleLock lock; // FIXME referenced containers not in mempool
+ LocalLock versionlock; // FIXME referenced containers not in mempool
- map<client_t,ClientLease*> client_lease_map;
+ mempool::mds_co::map<client_t,ClientLease*> client_lease_map;
protected:
friend class CInode;
friend class C_MDC_XlockRequest;
- CDir *dir; // containing dirfrag
+ CDir *dir = nullptr; // containing dirfrag
linkage_t linkage;
- list<linkage_t> projected;
+ mempool::mds_co::list<linkage_t> projected;
+
+ version_t version = 0; // dir version when last touched.
+ version_t projected_version = 0; // what it will be when i unlock/commit.
- version_t version; // dir version when last touched.
- version_t projected_version; // what it will be when i unlock/commit.
+private:
+ mempool::mds_co::string name;
};
ostream& operator<<(ostream& out, const CDentry& dn);
*
*/
+#include <boost/utility/string_view.hpp>
#include "include/types.h"
num_dentries_auth_subtree_nested(0),
dir_auth(CDIR_AUTH_DEFAULT)
{
- state = STATE_INITIAL;
-
memset(&fnode, 0, sizeof(fnode));
// auth
assert(in->is_dir());
- if (auth)
- state |= STATE_AUTH;
+ if (auth) state_set(STATE_AUTH);
}
/**
frag_info_t frag_info;
nest_info_t nest_info;
- for (map_t::iterator i = items.begin(); i != items.end(); ++i) {
+ for (auto i = items.begin(); i != items.end(); ++i) {
if (i->second->last != CEPH_NOSNAP)
continue;
CDentry::linkage_t *dnl = i->second->get_linkage();
if (!good) {
if (!scrub) {
- for (map_t::iterator i = items.begin(); i != items.end(); ++i) {
+ for (auto i = items.begin(); i != items.end(); ++i) {
CDentry *dn = i->second;
if (dn->get_linkage()->is_primary()) {
CInode *in = dn->get_linkage()->inode;
return good;
}
-CDentry *CDir::lookup(const string& name, snapid_t snap)
+CDentry *CDir::lookup(boost::string_view name, snapid_t snap)
{
dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl;
- map_t::iterator iter = items.lower_bound(dentry_key_t(snap, name.c_str(),
- inode->hash_dentry_name(name)));
+ auto iter = items.lower_bound(dentry_key_t(snap, name, inode->hash_dentry_name(name)));
if (iter == items.end())
return 0;
- if (iter->second->name == name &&
+ if (iter->second->get_name() == name &&
iter->second->first <= snap &&
iter->second->last >= snap) {
dout(20) << " hit -> " << iter->first << dendl;
return 0;
}
-CDentry *CDir::lookup_exact_snap(const string& name, snapid_t last) {
- map_t::iterator p = items.find(dentry_key_t(last, name.c_str(),
- inode->hash_dentry_name(name)));
+CDentry *CDir::lookup_exact_snap(boost::string_view name, snapid_t last) {
+ auto p = items.find(dentry_key_t(last, name, inode->hash_dentry_name(name)));
if (p == items.end())
return NULL;
return p->second;
* linking fun
*/
-CDentry* CDir::add_null_dentry(const string& dname,
+CDentry* CDir::add_null_dentry(boost::string_view dname,
snapid_t first, snapid_t last)
{
// foreign
// add to dir
assert(items.count(dn->key()) == 0);
- //assert(null_items.count(dn->name) == 0);
+ //assert(null_items.count(dn->get_name()) == 0);
items[dn->key()] = dn;
if (last == CEPH_NOSNAP)
}
-CDentry* CDir::add_primary_dentry(const string& dname, CInode *in,
+CDentry* CDir::add_primary_dentry(boost::string_view dname, CInode *in,
snapid_t first, snapid_t last)
{
// primary
// add to dir
assert(items.count(dn->key()) == 0);
- //assert(null_items.count(dn->name) == 0);
+ //assert(null_items.count(dn->get_name()) == 0);
items[dn->key()] = dn;
return dn;
}
-CDentry* CDir::add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type,
+CDentry* CDir::add_remote_dentry(boost::string_view dname, inodeno_t ino, unsigned char d_type,
snapid_t first, snapid_t last)
{
// foreign
// add to dir
assert(items.count(dn->key()) == 0);
- //assert(null_items.count(dn->name) == 0);
+ //assert(null_items.count(dn->get_name()) == 0);
items[dn->key()] = dn;
if (last == CEPH_NOSNAP)
bloom.reset(new bloom_filter(size, 1.0 / size, 0));
}
/* This size and false positive probability is completely random.*/
- bloom->insert(dn->name.c_str(), dn->name.size());
+ bloom->insert(dn->get_name().data(), dn->get_name().size());
}
-bool CDir::is_in_bloom(const string& name)
+bool CDir::is_in_bloom(boost::string_view name)
{
if (!bloom)
return false;
- return bloom->contains(name.c_str(), name.size());
+ return bloom->contains(name.data(), name.size());
}
void CDir::remove_null_dentries() {
dout(12) << "remove_null_dentries " << *this << dendl;
- CDir::map_t::iterator p = items.begin();
+ auto p = items.begin();
while (p != items.end()) {
CDentry *dn = p->second;
++p;
// clear dirty only when the directory was not snapshotted
bool clear_dirty = !inode->snaprealm;
- CDir::map_t::iterator p = items.begin();
+ auto p = items.begin();
while (p != items.end()) {
CDentry *dn = p->second;
++p;
{
dout(10) << "purge_stale_snap_data " << snaps << dendl;
- CDir::map_t::iterator p = items.begin();
+ auto p = items.begin();
while (p != items.end()) {
CDentry *dn = p->second;
++p;
if (dn->get_linkage()->is_primary()) {
CInode *in = dn->get_linkage()->get_inode();
- inode_t *pi = in->get_projected_inode();
+ auto pi = in->get_projected_inode();
if (dn->get_linkage()->get_inode()->is_dir())
fnode.fragstat.nsubdirs++;
else
auth_pin(this);
if (!waiting_on_dentry.empty()) {
- for (auto p = waiting_on_dentry.begin(); p != waiting_on_dentry.end(); ++p)
- dentry_waiters[p->first].swap(p->second);
+ for (const auto &p : waiting_on_dentry) {
+ auto &e = dentry_waiters[p.first];
+ for (const auto &waiter : p.second) {
+ e.push_back(waiter);
+ }
+ }
waiting_on_dentry.clear();
put(PIN_DNWAITER);
}
// repartition dentries
while (!items.empty()) {
- CDir::map_t::iterator p = items.begin();
+ auto p = items.begin();
CDentry *dn = p->second;
- frag_t subfrag = inode->pick_dirfrag(dn->name);
+ frag_t subfrag = inode->pick_dirfrag(dn->get_name());
int n = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift();
dout(15) << " subfrag " << subfrag << " n=" << n << " for " << p->first << dendl;
CDir *f = subfrags[n];
f->steal_dentry(dn);
}
- for (auto& p : dentry_waiters) {
+ for (const auto &p : dentry_waiters) {
frag_t subfrag = inode->pick_dirfrag(p.first.name);
int n = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift();
CDir *f = subfrags[n];
if (f->waiting_on_dentry.empty())
f->get(PIN_DNWAITER);
- f->waiting_on_dentry[p.first].swap(p.second);
+ auto &e = f->waiting_on_dentry[p.first];
+ for (const auto &waiter : p.second) {
+ e.push_back(waiter);
+ }
}
// FIXME: handle dirty old rstat
if (!dentry_waiters.empty()) {
get(PIN_DNWAITER);
- for (auto& p : dentry_waiters) {
- waiting_on_dentry[p.first].swap(p.second);
+ for (const auto &p : dentry_waiters) {
+ auto &e = waiting_on_dentry[p.first];
+ for (const auto &waiter : p.second) {
+ e.push_back(waiter);
+ }
}
}
void CDir::resync_accounted_fragstat()
{
fnode_t *pf = get_projected_fnode();
- inode_t *pi = inode->get_projected_inode();
+ auto pi = inode->get_projected_inode();
if (pf->accounted_fragstat.version != pi->dirstat.version) {
pf->fragstat.version = pi->dirstat.version;
void CDir::resync_accounted_rstat()
{
fnode_t *pf = get_projected_fnode();
- inode_t *pi = inode->get_projected_inode();
+ auto pi = inode->get_projected_inode();
if (pf->accounted_rstat.version != pi->rstat.version) {
pf->rstat.version = pi->rstat.version;
if (in->is_frozen())
continue;
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
inode->mdcache->project_rstat_inode_to_frag(in, this, 0, 0, NULL);
}
* WAITING
*/
-void CDir::add_dentry_waiter(const string& dname, snapid_t snapid, MDSInternalContextBase *c)
+void CDir::add_dentry_waiter(boost::string_view dname, snapid_t snapid, MDSInternalContextBase *c)
{
if (waiting_on_dentry.empty())
get(PIN_DNWAITER);
<< " " << c << " on " << *this << dendl;
}
-void CDir::take_dentry_waiting(const string& dname, snapid_t first, snapid_t last,
+void CDir::take_dentry_waiting(boost::string_view dname, snapid_t first, snapid_t last,
list<MDSInternalContextBase*>& ls)
{
if (waiting_on_dentry.empty())
string_snap_t lb(dname, first);
string_snap_t ub(dname, last);
- compact_map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.lower_bound(lb);
- while (p != waiting_on_dentry.end() &&
- !(ub < p->first)) {
+ auto it = waiting_on_dentry.lower_bound(lb);
+ while (it != waiting_on_dentry.end() &&
+ !(ub < it->first)) {
dout(10) << "take_dentry_waiting dentry " << dname
<< " [" << first << "," << last << "] found waiter on snap "
- << p->first.snapid
+ << it->first.snapid
<< " on " << *this << dendl;
- ls.splice(ls.end(), p->second);
- waiting_on_dentry.erase(p++);
+ for (const auto &waiter : it->second) {
+ ls.push_back(waiter);
+ }
+ waiting_on_dentry.erase(it++);
}
if (waiting_on_dentry.empty())
{
dout(10) << "take_sub_waiting" << dendl;
if (!waiting_on_dentry.empty()) {
- for (compact_map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.begin();
- p != waiting_on_dentry.end();
- ++p)
- ls.splice(ls.end(), p->second);
+ for (const auto &p : waiting_on_dentry) {
+ for (const auto &waiter : p.second) {
+ ls.push_back(waiter);
+ }
+ }
waiting_on_dentry.clear();
put(PIN_DNWAITER);
}
{
if ((mask & WAIT_DENTRY) && !waiting_on_dentry.empty()) {
// take all dentry waiters
- while (!waiting_on_dentry.empty()) {
- compact_map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.begin();
- dout(10) << "take_waiting dentry " << p->first.name
- << " snap " << p->first.snapid << " on " << *this << dendl;
- ls.splice(ls.end(), p->second);
- waiting_on_dentry.erase(p);
+ for (const auto &p : waiting_on_dentry) {
+ dout(10) << "take_waiting dentry " << p.first.name
+ << " snap " << p.first.snapid << " on " << *this << dendl;
+ for (const auto &waiter : p.second) {
+ ls.push_back(waiter);
+ }
}
+ waiting_on_dentry.clear();
put(PIN_DNWAITER);
}
fnode_t *CDir::project_fnode()
{
assert(get_version() != 0);
- fnode_t *p = new fnode_t;
- *p = *get_projected_fnode();
- projected_fnode.push_back(p);
+ projected_fnode.emplace_back(*get_projected_fnode());
+ auto &p = projected_fnode.back();
if (scrub_infop && scrub_infop->last_scrub_dirty) {
- p->localized_scrub_stamp = scrub_infop->last_local.time;
- p->localized_scrub_version = scrub_infop->last_local.version;
- p->recursive_scrub_stamp = scrub_infop->last_recursive.time;
- p->recursive_scrub_version = scrub_infop->last_recursive.version;
+ p.localized_scrub_stamp = scrub_infop->last_local.time;
+ p.localized_scrub_version = scrub_infop->last_local.version;
+ p.recursive_scrub_stamp = scrub_infop->last_recursive.time;
+ p.recursive_scrub_version = scrub_infop->last_recursive.version;
scrub_infop->last_scrub_dirty = false;
scrub_maybe_delete_info();
}
- dout(10) << "project_fnode " << p << dendl;
- return p;
+ dout(10) << __func__ << " " << &p << dendl;
+ return &p;
}
void CDir::pop_and_dirty_projected_fnode(LogSegment *ls)
{
assert(!projected_fnode.empty());
- dout(15) << "pop_and_dirty_projected_fnode " << projected_fnode.front()
- << " v" << projected_fnode.front()->version << dendl;
- fnode = *projected_fnode.front();
+ auto &front = projected_fnode.front();
+ dout(15) << __func__ << " " << &front << " v" << front.version << dendl;
+ fnode = front;
_mark_dirty(ls);
- delete projected_fnode.front();
projected_fnode.pop_front();
}
return fetch(c, want, ignore_authpinnability);
}
-void CDir::fetch(MDSInternalContextBase *c, const string& want_dn, bool ignore_authpinnability)
+void CDir::fetch(MDSInternalContextBase *c, boost::string_view want_dn, bool ignore_authpinnability)
{
dout(10) << "fetch on " << *this << dendl;
}
if (c) add_waiter(WAIT_COMPLETE, c);
- if (!want_dn.empty()) wanted_items.insert(want_dn);
+ if (!want_dn.empty()) wanted_items.insert(mempool::mds_co::string(want_dn));
// already fetching?
if (state_test(CDir::STATE_FETCHING)) {
} else {
assert(c);
std::set<std::string> str_keys;
- for (auto p = keys.begin(); p != keys.end(); ++p) {
+ for (auto p : keys) {
string str;
- p->encode(str);
+ p.encode(str);
str_keys.insert(str);
}
rd.omap_get_vals_by_keys(str_keys, &fin->omap, &fin->ret2);
}
CDentry *CDir::_load_dentry(
- const std::string &key,
- const std::string &dname,
+ boost::string_view key,
+ boost::string_view dname,
const snapid_t last,
bufferlist &bl,
const int pos,
if (stale) {
if (!dn) {
- stale_items.insert(key);
+ stale_items.insert(mempool::mds_co::string(key));
*force_dirty = true;
}
return dn;
if (stale) {
if (!dn) {
- stale_items.insert(key);
+ stale_items.insert(mempool::mds_co::string(key));
*force_dirty = true;
}
return dn;
//in->hack_accessed = false;
//in->hack_load_stamp = ceph_clock_now();
//num_new_inodes_loaded++;
+ } else if (g_conf->get_val<bool>("mds_hack_allow_loading_invalid_metadata")) {
+ dout(20) << "hack: adding duplicate dentry for " << *in << dendl;
+ dn = add_primary_dentry(dname, in, first, last);
} else {
dout(0) << "_fetched badness: got (but i already had) " << *in
<< " mode " << in->inode.mode
continue;
}
- if (dn && (wanted_items.count(dname) > 0 || !complete)) {
+ if (dn && (wanted_items.count(mempool::mds_co::string(boost::string_view(dname))) > 0 || !complete)) {
dout(10) << " touching wanted dn " << *dn << dendl;
inode->mdcache->touch_dentry(dn);
}
finish_waiting(WAIT_COMPLETE, -EIO);
}
-void CDir::go_bad_dentry(snapid_t last, const std::string &dname)
+void CDir::go_bad_dentry(snapid_t last, boost::string_view dname)
{
- dout(10) << "go_bad_dentry " << dname << dendl;
+ dout(10) << __func__ << " " << dname << dendl;
+ std::string path(get_path());
+ path += "/";
+ path += std::string(dname);
const bool fatal = cache->mds->damage_table.notify_dentry(
- inode->ino(), frag, last, dname, get_path() + "/" + dname);
+ inode->ino(), frag, last, dname, path);
if (fatal) {
cache->mds->damaged();
ceph_abort(); // unreachable, damaged() respawns us
object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool());
if (!stale_items.empty()) {
- for (compact_set<string>::iterator p = stale_items.begin();
- p != stale_items.end();
- ++p) {
- to_remove.insert(*p);
- write_size += (*p).length();
+ for (const auto &p : stale_items) {
+ to_remove.insert(std::string(boost::string_view(p)));
+ write_size += p.length();
}
stale_items.clear();
}
}
if (dn->get_linkage()->is_null()) {
- dout(10) << " rm " << dn->name << " " << *dn << dendl;
+ dout(10) << " rm " << dn->get_name() << " " << *dn << dendl;
write_size += key.length();
to_remove.insert(key);
} else {
- dout(10) << " set " << dn->name << " " << *dn << dendl;
+ dout(10) << " set " << dn->get_name() << " " << *dn << dendl;
bufferlist dnbl;
_encode_dentry(dn, dnbl, snaps);
write_size += key.length() + dnbl.length();
if (dn->linkage.is_remote()) {
inodeno_t ino = dn->linkage.get_remote_ino();
unsigned char d_type = dn->linkage.get_remote_d_type();
- dout(14) << " pos " << bl.length() << " dn '" << dn->name << "' remote ino " << ino << dendl;
+ dout(14) << " pos " << bl.length() << " dn '" << dn->get_name() << "' remote ino " << ino << dendl;
// marker, name, ino
bl.append('L'); // remote link
CInode *in = dn->linkage.get_inode();
assert(in);
- dout(14) << " pos " << bl.length() << " dn '" << dn->name << "' inode " << *in << dendl;
+ dout(14) << " pos " << bl.length() << " dn '" << dn->get_name() << "' inode " << *in << dendl;
// marker, name, inode, [symlink string]
bl.append('I'); // inode
// finishers?
bool were_waiters = !waiting_for_commit.empty();
- compact_map<version_t, list<MDSInternalContextBase*> >::iterator p = waiting_for_commit.begin();
- while (p != waiting_for_commit.end()) {
- compact_map<version_t, list<MDSInternalContextBase*> >::iterator n = p;
- ++n;
- if (p->first > committed_version) {
- dout(10) << " there are waiters for " << p->first << ", committing again" << dendl;
- _commit(p->first, -1);
+ auto it = waiting_for_commit.begin();
+ while (it != waiting_for_commit.end()) {
+ auto _it = it;
+ ++_it;
+ if (it->first > committed_version) {
+ dout(10) << " there are waiters for " << it->first << ", committing again" << dendl;
+ _commit(it->first, -1);
break;
}
- cache->mds->queue_waiters(p->second);
- waiting_for_commit.erase(p);
- p = n;
+ std::list<MDSInternalContextBase*> t;
+ for (const auto &waiter : it->second)
+ t.push_back(waiter);
+ cache->mds->queue_waiters(t);
+ waiting_for_commit.erase(it);
+ it = _it;
}
// try drop dentries in this dirfrag if it's about to be purged
frag_info_t c;
memset(&c, 0, sizeof(c));
- for (map_t::iterator it = items.begin();
+ for (auto it = items.begin();
it != items.end();
++it) {
CDentry *dn = it->second;
scrub_infop->others_scrubbing.clear();
scrub_infop->others_scrubbed.clear();
- for (map_t::iterator i = items.begin();
+ for (auto i = items.begin();
i != items.end();
++i) {
// TODO: handle snapshot scrubbing
scrub_infop->last_scrub_dirty = true;
}
-int CDir::_next_dentry_on_set(set<dentry_key_t>& dns, bool missing_okay,
+int CDir::_next_dentry_on_set(dentry_key_set &dns, bool missing_okay,
MDSInternalContext *cb, CDentry **dnout)
{
dentry_key_t dnkey;
continue;
}
+ if (!dn->get_linkage()->is_primary()) {
+ dout(15) << " skip dentry " << dnkey.name
+ << ", no longer primary" << dendl;
+ continue;
+ }
+
*dnout = dn;
return 0;
}
#ifndef CEPH_CDIR_H
#define CEPH_CDIR_H
-#include "include/counter.h"
-#include "include/types.h"
-#include "include/buffer_fwd.h"
-#include "common/bloom_filter.hpp"
-#include "common/config.h"
-#include "common/DecayCounter.h"
-
-#include "MDSCacheObject.h"
-
#include <iosfwd>
-
#include <list>
-#include <set>
#include <map>
+#include <set>
#include <string>
+#include <boost/utility/string_view.hpp>
+#include "common/DecayCounter.h"
+#include "common/bloom_filter.hpp"
+#include "common/config.h"
+#include "include/buffer_fwd.h"
+#include "include/counter.h"
+#include "include/types.h"
#include "CInode.h"
+#include "MDSCacheObject.h"
class CDentry;
class MDCache;
// common states
static const unsigned STATE_CLEAN = 0;
- static const unsigned STATE_INITIAL = 0;
// these state bits are preserved by an import/export
// ...except if the directory is hashed, in which case none of them are!
fnode_t fnode;
snapid_t first;
- compact_map<snapid_t,old_rstat_t> dirty_old_rstat; // [value.first,key]
+ mempool::mds_co::compact_map<snapid_t,old_rstat_t> dirty_old_rstat; // [value.first,key]
// my inodes with dirty rstat data
elist<CInode*> dirty_rstat_inodes;
protected:
version_t projected_version;
- std::list<fnode_t*> projected_fnode;
+ mempool::mds_co::list<fnode_t> projected_fnode;
public:
elist<CDentry*> dirty_dentries;
if (projected_fnode.empty())
return &fnode;
else
- return projected_fnode.back();
+ return &projected_fnode.back();
}
fnode_t *get_projected_fnode() {
if (projected_fnode.empty())
return &fnode;
else
- return projected_fnode.back();
+ return &projected_fnode.back();
}
fnode_t *project_fnode();
void log_mark_dirty();
public:
- typedef std::map<dentry_key_t, CDentry*> map_t;
+ typedef mempool::mds_co::map<dentry_key_t, CDentry*> dentry_key_map;
+ typedef mempool::mds_co::set<dentry_key_t> dentry_key_set;
class scrub_info_t {
public:
/// inodes we contain with dirty scrub stamps
- map<dentry_key_t,CInode*> dirty_scrub_stamps; // TODO: make use of this!
+ dentry_key_map dirty_scrub_stamps; // TODO: make use of this!
struct scrub_stamps {
version_t version;
utime_t time;
bool pending_scrub_error;
/// these are lists of children in each stage of scrubbing
- set<dentry_key_t> directories_to_scrub;
- set<dentry_key_t> directories_scrubbing;
- set<dentry_key_t> directories_scrubbed;
- set<dentry_key_t> others_to_scrub;
- set<dentry_key_t> others_scrubbing;
- set<dentry_key_t> others_scrubbed;
+ dentry_key_set directories_to_scrub;
+ dentry_key_set directories_scrubbing;
+ dentry_key_set directories_scrubbed;
+ dentry_key_set others_to_scrub;
+ dentry_key_set others_scrubbing;
+ dentry_key_set others_scrubbed;
ScrubHeaderRefConst header;
* list will be filled with all CDentry * which have been returned
* from scrub_dentry_next() but not sent back via scrub_dentry_finished().
*/
- void scrub_dentries_scrubbing(list<CDentry*> *out_dentries);
+ void scrub_dentries_scrubbing(std::list<CDentry*> *out_dentries);
/**
* Report to the CDir that a CDentry has been scrubbed. Call this
* for every CDentry returned from scrub_dentry_next().
* Check the given set (presumably one of those in scrub_info_t) for the
* next key to scrub and look it up (or fail!).
*/
- int _next_dentry_on_set(set<dentry_key_t>& dns, bool missing_okay,
+ int _next_dentry_on_set(dentry_key_set &dns, bool missing_okay,
MDSInternalContext *cb, CDentry **dnout);
protected:
- std::unique_ptr<scrub_info_t> scrub_infop;
+ std::unique_ptr<scrub_info_t> scrub_infop; // FIXME not in mempool
// contents of this directory
- map_t items; // non-null AND null
+ dentry_key_map items; // non-null AND null
unsigned num_head_items;
unsigned num_head_null;
unsigned num_snap_items;
version_t committing_version;
version_t committed_version;
- compact_set<string> stale_items;
+ mempool::mds_co::compact_set<mempool::mds_co::string> stale_items;
// lock nesting, freeze
static int num_frozen_trees;
// cache control (defined for authority; hints for replicas)
__s32 dir_rep;
- compact_set<__s32> dir_rep_by; // if dir_rep == REP_LIST
+ mempool::mds_co::compact_set<__s32> dir_rep_by; // if dir_rep == REP_LIST
// popularity
dirfrag_load_vec_t pop_me;
friend class C_IO_Dir_OMAP_FetchedMore;
friend class C_IO_Dir_Committed;
- std::unique_ptr<bloom_filter> bloom;
+ std::unique_ptr<bloom_filter> bloom; // XXX not part of mempool::mds_co
/* If you set up the bloom filter, you must keep it accurate!
* It's deleted when you mark_complete() and is deliberately not serialized.*/
const CInode *get_inode() const { return inode; }
CDir *get_parent_dir() { return inode->get_parent_dir(); }
- map_t::iterator begin() { return items.begin(); }
- map_t::iterator end() { return items.end(); }
- map_t::iterator lower_bound(dentry_key_t key) { return items.lower_bound(key); }
+ dentry_key_map::iterator begin() { return items.begin(); }
+ dentry_key_map::iterator end() { return items.end(); }
+ dentry_key_map::iterator lower_bound(dentry_key_t key) { return items.lower_bound(key); }
unsigned get_num_head_items() const { return num_head_items; }
unsigned get_num_head_null() const { return num_head_null; }
// -- dentries and inodes --
public:
- CDentry* lookup_exact_snap(const std::string& dname, snapid_t last);
- CDentry* lookup(const std::string& n, snapid_t snap=CEPH_NOSNAP);
+ CDentry* lookup_exact_snap(boost::string_view dname, snapid_t last);
+ CDentry* lookup(boost::string_view n, snapid_t snap=CEPH_NOSNAP);
CDentry* lookup(const char *n, snapid_t snap=CEPH_NOSNAP) {
- return lookup(std::string(n), snap);
+ return lookup(boost::string_view(n), snap);
}
- CDentry* add_null_dentry(const std::string& dname,
+ CDentry* add_null_dentry(boost::string_view dname,
snapid_t first=2, snapid_t last=CEPH_NOSNAP);
- CDentry* add_primary_dentry(const std::string& dname, CInode *in,
+ CDentry* add_primary_dentry(boost::string_view dname, CInode *in,
snapid_t first=2, snapid_t last=CEPH_NOSNAP);
- CDentry* add_remote_dentry(const std::string& dname, inodeno_t ino, unsigned char d_type,
+ CDentry* add_remote_dentry(boost::string_view dname, inodeno_t ino, unsigned char d_type,
snapid_t first=2, snapid_t last=CEPH_NOSNAP);
void remove_dentry( CDentry *dn ); // delete dentry
void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type);
void try_remove_unlinked_dn(CDentry *dn);
void add_to_bloom(CDentry *dn);
- bool is_in_bloom(const std::string& name);
+ bool is_in_bloom(boost::string_view name);
bool has_bloom() { return (bloom ? true : false); }
void remove_bloom() {
bloom.reset();
public:
- void split(int bits, list<CDir*>& subs, list<MDSInternalContextBase*>& waiters, bool replay);
- void merge(list<CDir*>& subs, list<MDSInternalContextBase*>& waiters, bool replay);
+ void split(int bits, std::list<CDir*>& subs, list<MDSInternalContextBase*>& waiters, bool replay);
+ void merge(std::list<CDir*>& subs, std::list<MDSInternalContextBase*>& waiters, bool replay);
bool should_split() const {
return (int)get_frag_size() > g_conf->mds_bal_split_size;
void prepare_new_fragment(bool replay);
void prepare_old_fragment(map<string_snap_t, std::list<MDSInternalContextBase*> >& dentry_waiters, bool replay);
void steal_dentry(CDentry *dn); // from another dir. used by merge/split.
- void finish_old_fragment(list<MDSInternalContextBase*>& waiters, bool replay);
+ void finish_old_fragment(std::list<MDSInternalContextBase*>& waiters, bool replay);
void init_fragment_pins();
return file_object_t(ino(), frag);
}
void fetch(MDSInternalContextBase *c, bool ignore_authpinnability=false);
- void fetch(MDSInternalContextBase *c, const std::string& want_dn, bool ignore_authpinnability=false);
+ void fetch(MDSInternalContextBase *c, boost::string_view want_dn, bool ignore_authpinnability=false);
void fetch(MDSInternalContextBase *c, const std::set<dentry_key_t>& keys);
protected:
- compact_set<string> wanted_items;
+ mempool::mds_co::compact_set<mempool::mds_co::string> wanted_items;
void _omap_fetch(MDSInternalContextBase *fin, const std::set<dentry_key_t>& keys);
void _omap_fetch_more(
bufferlist& hdrbl, std::map<std::string, bufferlist>& omap,
MDSInternalContextBase *fin);
CDentry *_load_dentry(
- const std::string &key,
- const std::string &dname,
+ boost::string_view key,
+ boost::string_view dname,
snapid_t last,
bufferlist &bl,
int pos,
const std::set<snapid_t> *snaps,
bool *force_dirty,
- list<CInode*> *undef_inodes);
+ std::list<CInode*> *undef_inodes);
/**
* Mark this fragment as BADFRAG (common part of go_bad and go_bad_dentry)
/**
* Go bad due to a damaged dentry (register with damagetable and go BADFRAG)
*/
- void go_bad_dentry(snapid_t last, const std::string &dname);
+ void go_bad_dentry(snapid_t last, boost::string_view dname);
/**
* Go bad due to a damaged header (register with damagetable and go BADFRAG)
bool complete, int r);
// -- commit --
- compact_map<version_t, std::list<MDSInternalContextBase*> > waiting_for_commit;
+ mempool::mds_co::compact_map<version_t, mempool::mds_co::list<MDSInternalContextBase*> > waiting_for_commit;
void _commit(version_t want, int op_prio);
void _omap_commit(int op_prio);
void _encode_dentry(CDentry *dn, bufferlist& bl, const std::set<snapid_t> *snaps);
// -- waiters --
protected:
- compact_map< string_snap_t, std::list<MDSInternalContextBase*> > waiting_on_dentry;
+ mempool::mds_co::compact_map< string_snap_t, mempool::mds_co::list<MDSInternalContextBase*> > waiting_on_dentry; // FIXME string_snap_t not in mempool
public:
- bool is_waiting_for_dentry(const std::string& dname, snapid_t snap) {
+ bool is_waiting_for_dentry(boost::string_view dname, snapid_t snap) {
return waiting_on_dentry.count(string_snap_t(dname, snap));
}
- void add_dentry_waiter(const std::string& dentry, snapid_t snap, MDSInternalContextBase *c);
- void take_dentry_waiting(const std::string& dentry, snapid_t first, snapid_t last, std::list<MDSInternalContextBase*>& ls);
+ void add_dentry_waiter(boost::string_view dentry, snapid_t snap, MDSInternalContextBase *c);
+ void take_dentry_waiting(boost::string_view dentry, snapid_t first, snapid_t last, std::list<MDSInternalContextBase*>& ls);
void take_sub_waiting(std::list<MDSInternalContextBase*>& ls); // dentry or ino
void add_waiter(uint64_t mask, MDSInternalContextBase *c) override;
if (in.is_frozen_inode()) out << " FROZEN";
if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
- const inode_t *pi = in.get_projected_inode();
+ const CInode::mempool_inode *pi = in.get_projected_inode();
if (pi->is_truncating())
out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
if (in.inode.is_dir()) {
out << " " << in.inode.dirstat;
if (g_conf->mds_debug_scatterstat && in.is_projected()) {
- const inode_t *pi = in.get_projected_inode();
+ const CInode::mempool_inode *pi = in.get_projected_inode();
out << "->" << pi->dirstat;
}
} else {
if (!(in.inode.rstat == in.inode.accounted_rstat))
out << "/" << in.inode.accounted_rstat;
if (g_conf->mds_debug_scatterstat && in.is_projected()) {
- const inode_t *pi = in.get_projected_inode();
+ const CInode::mempool_inode *pi = in.get_projected_inode();
out << "->" << pi->rstat;
if (!(pi->rstat == pi->accounted_rstat))
out << "/" << pi->accounted_rstat;
}
if (!in.get_mds_caps_wanted().empty()) {
out << " mcw={";
- for (compact_map<int,int>::const_iterator p = in.get_mds_caps_wanted().begin();
- p != in.get_mds_caps_wanted().end();
- ++p) {
- if (p != in.get_mds_caps_wanted().begin())
+ bool first = true;
+ for (const auto &p : in.get_mds_caps_wanted()) {
+ if (!first)
out << ',';
- out << p->first << '=' << ccap_string(p->second);
+ out << p.first << '=' << ccap_string(p.second);
+ first = false;
}
out << '}';
}
auth_pin(this); // pin head inode...
}
- set<client_t>& clients = client_need_snapflush[snapid];
+ auto &clients = client_need_snapflush[snapid];
if (clients.empty())
snapin->auth_pin(this); // ...and pin snapped/old inode!
void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
{
- dout(10) << "remove_need_snapflush client." << client << " snapid " << snapid << " on " << snapin << dendl;
- compact_map<snapid_t, std::set<client_t> >::iterator p = client_need_snapflush.find(snapid);
- if (p == client_need_snapflush.end()) {
+ dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
+ auto it = client_need_snapflush.find(snapid);
+ if (it == client_need_snapflush.end()) {
dout(10) << " snapid not found" << dendl;
return;
}
- if (!p->second.count(client)) {
+ size_t n = it->second.erase(client);
+ if (n == 0) {
dout(10) << " client not found" << dendl;
return;
}
- p->second.erase(client);
- if (p->second.empty()) {
- client_need_snapflush.erase(p);
+ if (it->second.empty()) {
+ client_need_snapflush.erase(it);
snapin->auth_unpin(this);
if (client_need_snapflush.empty()) {
{
dout(10) << "split_need_snapflush [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
bool need_flush = false;
- for (compact_map<snapid_t, set<client_t> >::iterator p = client_need_snapflush.lower_bound(cowin->first);
- p != client_need_snapflush.end() && p->first < in->first; ) {
- compact_map<snapid_t, set<client_t> >::iterator q = p;
- ++p;
- assert(!q->second.empty());
- if (cowin->last >= q->first) {
+ for (auto it = client_need_snapflush.lower_bound(cowin->first);
+ it != client_need_snapflush.end() && it->first < in->first; ) {
+ assert(!it->second.empty());
+ if (cowin->last >= it->first) {
cowin->auth_pin(this);
need_flush = true;
- } else
- client_need_snapflush.erase(q);
+ ++it;
+ } else {
+ it = client_need_snapflush.erase(it);
+ }
in->auth_unpin(this);
}
return need_flush;
}
}
-inode_t *CInode::project_inode(map<string,bufferptr> *px)
+/* Ideally this function would be subsumed by project_inode but it is also
+ * needed by CInode::project_past_snaprealm_parent so we keep it.
+ */
+sr_t &CInode::project_snaprealm(projected_inode &pi)
{
- if (projected_nodes.empty()) {
- projected_nodes.push_back(new projected_inode_t(new inode_t(inode)));
- if (px)
- *px = xattrs;
+ const sr_t *cur_srnode = get_projected_srnode();
+
+ assert(!pi.snapnode);
+ if (cur_srnode) {
+ pi.snapnode.reset(new sr_t(*cur_srnode));
} else {
- projected_nodes.push_back(new projected_inode_t(
- new inode_t(*projected_nodes.back()->inode)));
- if (px)
- *px = *get_projected_xattrs();
+ pi.snapnode.reset(new sr_t());
+ pi.snapnode->created = 0;
+ pi.snapnode->current_parent_since = get_oldest_snap();
}
+ ++num_projected_srnodes;
- projected_inode_t &pi = *projected_nodes.back();
+ dout(10) << __func__ << " " << pi.snapnode.get() << dendl;
+ return *pi.snapnode.get();
+}
- if (px) {
- pi.xattrs = px;
- ++num_projected_xattrs;
+CInode::projected_inode &CInode::project_inode(bool xattr, bool snap)
+{
+ if (projected_nodes.empty()) {
+ projected_nodes.emplace_back(inode);
+ } else {
+ projected_nodes.emplace_back(projected_nodes.back().inode);
}
+ auto &pi = projected_nodes.back();
if (scrub_infop && scrub_infop->last_scrub_dirty) {
- pi.inode->last_scrub_stamp = scrub_infop->last_scrub_stamp;
- pi.inode->last_scrub_version = scrub_infop->last_scrub_version;
+ pi.inode.last_scrub_stamp = scrub_infop->last_scrub_stamp;
+ pi.inode.last_scrub_version = scrub_infop->last_scrub_version;
scrub_infop->last_scrub_dirty = false;
scrub_maybe_delete_info();
}
- dout(15) << "project_inode " << pi.inode << dendl;
- return pi.inode;
+
+ if (xattr) {
+ pi.xattrs.reset(new mempool_xattr_map(*get_projected_xattrs()));
+ ++num_projected_xattrs;
+ }
+
+ if (snap) {
+ project_snaprealm(pi);
+ }
+
+ dout(15) << __func__ << " " << pi.inode.ino << dendl;
+ return pi;
}
void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
{
assert(!projected_nodes.empty());
- dout(15) << "pop_and_dirty_projected_inode " << projected_nodes.front()->inode
- << " v" << projected_nodes.front()->inode->version << dendl;
+ auto &front = projected_nodes.front();
+ dout(15) << __func__ << " " << front.inode.ino
+ << " v" << front.inode.version << dendl;
int64_t old_pool = inode.layout.pool_id;
- mark_dirty(projected_nodes.front()->inode->version, ls);
- inode = *projected_nodes.front()->inode;
+ mark_dirty(front.inode.version, ls);
+ inode = front.inode;
if (inode.is_backtrace_updated())
_mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
- map<string,bufferptr> *px = projected_nodes.front()->xattrs;
- if (px) {
+ if (front.xattrs) {
--num_projected_xattrs;
- xattrs = *px;
- delete px;
+ xattrs = *front.xattrs;
}
- if (projected_nodes.front()->snapnode) {
- pop_projected_snaprealm(projected_nodes.front()->snapnode);
+ auto &snapnode = front.snapnode;
+ if (snapnode) {
+ pop_projected_snaprealm(snapnode.get());
--num_projected_srnodes;
}
- delete projected_nodes.front()->inode;
- delete projected_nodes.front();
-
projected_nodes.pop_front();
}
-sr_t *CInode::project_snaprealm(snapid_t snapid)
-{
- sr_t *cur_srnode = get_projected_srnode();
- sr_t *new_srnode;
-
- if (cur_srnode) {
- new_srnode = new sr_t(*cur_srnode);
- } else {
- new_srnode = new sr_t();
- new_srnode->created = snapid;
- new_srnode->current_parent_since = get_oldest_snap();
- }
- dout(10) << "project_snaprealm " << new_srnode << dendl;
- projected_nodes.back()->snapnode = new_srnode;
- ++num_projected_srnodes;
- return new_srnode;
-}
-
/* if newparent != parent, add parent to past_parents
if parent DNE, we need to find what the parent actually is and fill that in */
void CInode::project_past_snaprealm_parent(SnapRealm *newparent)
{
- sr_t *new_snap = project_snaprealm();
+ assert(!projected_nodes.empty());
+ sr_t &new_snap = project_snaprealm(projected_nodes.back());
SnapRealm *oldparent;
if (!snaprealm) {
oldparent = find_snaprealm();
- new_snap->seq = oldparent->get_newest_seq();
+ new_snap.seq = oldparent->get_newest_seq();
}
else
oldparent = snaprealm->parent;
if (newparent != oldparent) {
snapid_t oldparentseq = oldparent->get_newest_seq();
- if (oldparentseq + 1 > new_snap->current_parent_since) {
- new_snap->past_parents[oldparentseq].ino = oldparent->inode->ino();
- new_snap->past_parents[oldparentseq].first = new_snap->current_parent_since;
+ if (oldparentseq + 1 > new_snap.current_parent_since) {
+ new_snap.past_parents[oldparentseq].ino = oldparent->inode->ino();
+ new_snap.past_parents[oldparentseq].first = new_snap.current_parent_since;
}
- new_snap->current_parent_since = MAX(oldparentseq, newparent->get_last_created()) + 1;
+ new_snap.current_parent_since = std::max(oldparentseq, newparent->get_last_created()) + 1;
}
}
<< " -> " << next_snaprealm->past_parents << dendl;
}
snaprealm->srnode = *next_snaprealm;
- delete next_snaprealm;
// we should be able to open these up (or have them already be open).
bool ok = snaprealm->_open_parents(NULL);
// dirfrags
-__u32 InodeStoreBase::hash_dentry_name(const string &dn)
+__u32 InodeStoreBase::hash_dentry_name(boost::string_view dn)
{
int which = inode.dir_layout.dl_dir_hash;
if (!which)
return ceph_str_hash(which, dn.data(), dn.length());
}
-frag_t InodeStoreBase::pick_dirfrag(const string& dn)
+frag_t InodeStoreBase::pick_dirfrag(boost::string_view dn)
{
if (dirfragtree.empty())
return frag_t(); // avoid the string hash if we can.
bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls)
{
bool all = true;
- list<frag_t> fglist;
+ std::list<frag_t> fglist;
dirfragtree.get_leaves_under(fg, fglist);
for (list<frag_t>::iterator p = fglist.begin(); p != fglist.end(); ++p)
if (dirfrags.count(*p))
fragtree_t tmpdft;
tmpdft.force_to_leaf(g_ceph_context, fg);
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
- tmpdft.force_to_leaf(g_ceph_context, p->first);
- if (fg.contains(p->first) && !dirfragtree.is_leaf(p->first))
- ls.push_back(p->second);
+ for (auto &p : dirfrags) {
+ tmpdft.force_to_leaf(g_ceph_context, p.first);
+ if (fg.contains(p.first) && !dirfragtree.is_leaf(p.first))
+ ls.push_back(p.second);
}
all = true;
tmpdft.get_leaves_under(fg, fglist);
- for (list<frag_t>::iterator p = fglist.begin(); p != fglist.end(); ++p)
- if (!dirfrags.count(*p)) {
+ for (const auto &p : fglist) {
+ if (!dirfrags.count(p)) {
all = false;
break;
}
+ }
return all;
}
void CInode::verify_dirfrags()
{
bool bad = false;
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
- if (!dirfragtree.is_leaf(p->first)) {
- dout(0) << "have open dirfrag " << p->first << " but not leaf in " << dirfragtree
- << ": " << *p->second << dendl;
+ for (const auto &p : dirfrags) {
+ if (!dirfragtree.is_leaf(p.first)) {
+ dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
+ << ": " << *p.second << dendl;
bad = true;
}
}
void CInode::force_dirfrags()
{
bool bad = false;
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
- if (!dirfragtree.is_leaf(p->first)) {
- dout(0) << "have open dirfrag " << p->first << " but not leaf in " << dirfragtree
- << ": " << *p->second << dendl;
+ for (auto &p : dirfrags) {
+ if (!dirfragtree.is_leaf(p.first)) {
+ dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
+ << ": " << *p.second << dendl;
bad = true;
}
}
return NULL;
}
-void CInode::get_dirfrags(list<CDir*>& ls)
+void CInode::get_dirfrags(std::list<CDir*>& ls)
{
// all dirfrags
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p)
- ls.push_back(p->second);
+ for (const auto &p : dirfrags) {
+ ls.push_back(p.second);
+ }
}
void CInode::get_nested_dirfrags(list<CDir*>& ls)
{
// dirfrags in same subtree
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p)
- if (!p->second->is_subtree_root())
- ls.push_back(p->second);
+ for (const auto &p : dirfrags) {
+ if (!p.second->is_subtree_root())
+ ls.push_back(p.second);
+ }
}
void CInode::get_subtree_dirfrags(list<CDir*>& ls)
{
// dirfrags that are roots of new subtrees
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p)
- if (p->second->is_subtree_root())
- ls.push_back(p->second);
+ for (const auto &p : dirfrags) {
+ if (p.second->is_subtree_root())
+ ls.push_back(p.second);
+ }
}
}
// dump any remaining dentries, for debugging purposes
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p)
- dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << dendl;
+ for (const auto &p : dir->items)
+ dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
assert(dir->get_num_ref() == 0);
delete dir;
bool CInode::has_subtree_root_dirfrag(int auth)
{
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p)
- if (p->second->is_subtree_root() &&
- (auth == -1 || p->second->dir_auth.first == auth))
+ for (const auto &p : dirfrags) {
+ if (p.second->is_subtree_root() &&
+ (auth == -1 || p.second->dir_auth.first == auth))
return true;
+ }
return false;
}
bool CInode::has_subtree_or_exporting_dirfrag()
{
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p)
- if (p->second->is_subtree_root() ||
- p->second->state_test(CDir::STATE_EXPORTING))
+ for (const auto &p : dirfrags) {
+ if (p.second->is_subtree_root() ||
+ p.second->state_test(CDir::STATE_EXPORTING))
return true;
+ }
return false;
}
{
if (stickydir_ref == 0) {
get(PIN_STICKYDIRS);
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- p->second->state_set(CDir::STATE_STICKY);
- p->second->get(CDir::PIN_STICKY);
+ for (const auto &p : dirfrags) {
+ p.second->state_set(CDir::STATE_STICKY);
+ p.second->get(CDir::PIN_STICKY);
}
}
stickydir_ref++;
stickydir_ref--;
if (stickydir_ref == 0) {
put(PIN_STICKYDIRS);
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- p->second->state_clear(CDir::STATE_STICKY);
- p->second->put(CDir::PIN_STICKY);
+ for (const auto &p : dirfrags) {
+ p.second->state_clear(CDir::STATE_STICKY);
+ p.second->put(CDir::PIN_STICKY);
}
}
}
assert(is_base());
pv = get_projected_version() + 1;
}
- // force update backtrace for old format inode (see inode_t::decode)
+ // force update backtrace for old format inode (see mempool_inode::decode)
if (inode.backtrace_version == 0 && !projected_nodes.empty()) {
- inode_t *pi = projected_nodes.back()->inode;
- if (pi->backtrace_version == 0)
- pi->update_backtrace(pv);
+ mempool_inode &pi = projected_nodes.back().inode;
+ if (pi.backtrace_version == 0)
+ pi.update_backtrace(pv);
}
return pv;
}
CDentry *pdn = get_parent_dn();
while (pdn) {
CInode *diri = pdn->get_dir()->get_inode();
- bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version));
+ bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->inode.version));
in = diri;
pdn = in->get_parent_dn();
}
- for (compact_set<int64_t>::iterator i = inode.old_pools.begin();
- i != inode.old_pools.end();
- ++i) {
+ for (auto &p : inode.old_pools) {
// don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
- if (*i != pool)
- bt.old_pools.insert(*i);
+ if (p != pool)
+ bt.old_pools.insert(p);
}
}
// In the case where DIRTYPOOL is set, we update all old pools backtraces
// such that anyone reading them will see the new pool ID in
// inode_backtrace_t::pool and go read everything else from there.
- for (compact_set<int64_t>::iterator p = inode.old_pools.begin();
- p != inode.old_pools.end();
- ++p) {
- if (*p == pool)
+ for (const auto &p : inode.old_pools) {
+ if (p == pool)
continue;
- dout(20) << __func__ << ": updating old pool " << *p << dendl;
+ dout(20) << __func__ << ": updating old pool " << p << dendl;
ObjectOperation op;
op.priority = op_prio;
op.create(false);
op.setxattr("parent", parent_bl);
- object_locator_t oloc(*p);
+ object_locator_t oloc(p);
mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
ceph::real_clock::now(),
0, gather.new_sub());
::decode(backtrace, bl);
CDentry *pdn = get_parent_dn();
if (backtrace.ancestors.empty() ||
- backtrace.ancestors[0].dname != pdn->name ||
+ backtrace.ancestors[0].dname != pdn->get_name() ||
backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
err = -EINVAL;
}
bufferlist& snap_blob, __u8 struct_v)
{
::decode(inode, bl);
- if (is_symlink())
- ::decode(symlink, bl);
+ if (is_symlink()) {
+ std::string tmp;
+ ::decode(tmp, bl);
+ symlink = mempool::mds_co::string(boost::string_view(tmp));
+ }
::decode(dirfragtree, bl);
::decode(xattrs, bl);
::decode(snap_blob, bl);
::encode(inode.dirstat, bl); // only meaningful if i am auth.
bufferlist tmp;
__u32 n = 0;
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- frag_t fg = p->first;
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ frag_t fg = p.first;
+ CDir *dir = p.second;
if (is_auth() || dir->is_auth()) {
fnode_t *pf = dir->get_projected_fnode();
dout(15) << fg << " " << *dir << dendl;
::encode(inode.rstat, bl); // only meaningful if i am auth.
bufferlist tmp;
__u32 n = 0;
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- frag_t fg = p->first;
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ frag_t fg = p.first;
+ CDir *dir = p.second;
if (is_auth() || dir->is_auth()) {
fnode_t *pf = dir->get_projected_fnode();
dout(10) << fg << " " << *dir << dendl;
// dft was scattered, or we may still be be waiting on the
// notify from the auth)
dirfragtree.swap(temp);
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- if (!dirfragtree.is_leaf(p->first)) {
- dout(10) << " forcing open dirfrag " << p->first << " to leaf (racing with split|merge)" << dendl;
- dirfragtree.force_to_leaf(g_ceph_context, p->first);
+ for (const auto &p : dirfrags) {
+ if (!dirfragtree.is_leaf(p.first)) {
+ dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
+ dirfragtree.force_to_leaf(g_ceph_context, p.first);
}
- if (p->second->is_auth())
- p->second->state_clear(CDir::STATE_DIRTYDFT);
+ if (p.second->is_auth())
+ p.second->state_clear(CDir::STATE_DIRTYDFT);
}
}
if (g_conf->mds_debug_frag)
snapid_t fgfirst;
nest_info_t rstat;
nest_info_t accounted_rstat;
- compact_map<snapid_t,old_rstat_t> dirty_old_rstat;
+ decltype(CDir::dirty_old_rstat) dirty_old_rstat;
::decode(fg, p);
::decode(fgfirst, p);
::decode(rstat, p);
{
dout(10) << "start_scatter " << *lock << " on " << *this << dendl;
assert(is_auth());
- inode_t *pi = get_projected_inode();
+ mempool_inode *pi = get_projected_inode();
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- frag_t fg = p->first;
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ frag_t fg = p.first;
+ CDir *dir = p.second;
fnode_t *pf = dir->get_projected_fnode();
dout(20) << fg << " " << *dir << dendl;
MutationRef mut(new MutationImpl());
mut->ls = mdlog->get_current_segment();
- inode_t *pi = get_projected_inode();
+ mempool_inode *pi = get_projected_inode();
fnode_t *pf = dir->project_fnode();
const char *ename = 0;
// adjust summation
assert(is_auth());
- inode_t *pi = get_projected_inode();
+ mempool_inode *pi = get_projected_inode();
bool touched_mtime = false, touched_chattr = false;
dout(20) << " orig dirstat " << pi->dirstat << dendl;
pi->dirstat.version++;
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- frag_t fg = p->first;
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ frag_t fg = p.first;
+ CDir *dir = p.second;
dout(20) << fg << " " << *dir << dendl;
bool update;
// adjust summation
assert(is_auth());
- inode_t *pi = get_projected_inode();
+ mempool_inode *pi = get_projected_inode();
dout(20) << " orig rstat " << pi->rstat << dendl;
pi->rstat.version++;
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- frag_t fg = p->first;
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ frag_t fg = p.first;
+ CDir *dir = p.second;
dout(20) << fg << " " << *dir << dendl;
bool update;
dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
dir->first, CEPH_NOSNAP, this, true);
- for (compact_map<snapid_t,old_rstat_t>::iterator q = dir->dirty_old_rstat.begin();
- q != dir->dirty_old_rstat.end();
- ++q)
- mdcache->project_rstat_frag_to_inode(q->second.rstat, q->second.accounted_rstat,
- q->second.first, q->first, this, true);
+ for (auto &p : dir->dirty_old_rstat) {
+ mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
+ p.second.first, p.first, this, true);
+ }
if (update) // dir contents not valid if frozen or non-auth
dir->check_rstats();
} else {
dout(10) << "finish_scatter_gather_update_accounted " << type << " on " << *this << dendl;
assert(is_auth());
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ CDir *dir = p.second;
if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
continue;
if (waiting_on_dir.empty())
return;
- compact_map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.find(fg);
- if (p != waiting_on_dir.end()) {
- dout(10) << "take_dir_waiting frag " << fg << " on " << *this << dendl;
- ls.splice(ls.end(), p->second);
- waiting_on_dir.erase(p);
+ auto it = waiting_on_dir.find(fg);
+ if (it != waiting_on_dir.end()) {
+ dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
+ ls.splice(ls.end(), it->second);
+ waiting_on_dir.erase(it);
if (waiting_on_dir.empty())
put(PIN_DIRWAITER);
if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
// take all dentry waiters
while (!waiting_on_dir.empty()) {
- compact_map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.begin();
- dout(10) << "take_waiting dirfrag " << p->first << " on " << *this << dendl;
- ls.splice(ls.end(), p->second);
- waiting_on_dir.erase(p);
+ auto it = waiting_on_dir.begin();
+ dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
+ ls.splice(ls.end(), it->second);
+ waiting_on_dir.erase(it);
}
put(PIN_DIRWAITER);
}
if (g_conf->mds_debug_auth_pins) {
// audit
int s = 0;
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ CDir *dir = p.second;
if (!dir->is_subtree_root() && dir->get_cum_auth_pins())
s++;
}
return MIN(t, oldest_snap);
}
-old_inode_t& CInode::cow_old_inode(snapid_t follows, bool cow_head)
+CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
{
assert(follows >= first);
- inode_t *pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
- map<string,bufferptr> *px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
+ mempool_inode *pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
+ mempool_xattr_map *px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
- old_inode_t &old = old_inodes[follows];
+ mempool_old_inode &old = old_inodes[follows];
old.first = first;
old.inode = *pi;
old.xattrs = *px;
void CInode::split_old_inode(snapid_t snap)
{
- compact_map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap);
- assert(p != old_inodes.end() && p->second.first < snap);
+ auto it = old_inodes.lower_bound(snap);
+ assert(it != old_inodes.end() && it->second.first < snap);
- old_inode_t &old = old_inodes[snap - 1];
- old = p->second;
+ mempool_old_inode &old = old_inodes[snap - 1];
+ old = it->second;
- p->second.first = snap;
- dout(10) << "split_old_inode " << "[" << old.first << "," << p->first
- << "] to [" << snap << "," << p->first << "] on " << *this << dendl;
+ it->second.first = snap;
+ dout(10) << __func__ << " " << "[" << old.first << "," << it->first
+ << "] to [" << snap << "," << it->first << "] on " << *this << dendl;
}
void CInode::pre_cow_old_inode()
{
dout(10) << "purge_stale_snap_data " << snaps << dendl;
- if (old_inodes.empty())
- return;
-
- compact_map<snapid_t,old_inode_t>::iterator p = old_inodes.begin();
- while (p != old_inodes.end()) {
- set<snapid_t>::const_iterator q = snaps.lower_bound(p->second.first);
- if (q == snaps.end() || *q > p->first) {
- dout(10) << " purging old_inode [" << p->second.first << "," << p->first << "]" << dendl;
- old_inodes.erase(p++);
- } else
- ++p;
+ for (auto it = old_inodes.begin(); it != old_inodes.end(); ) {
+ const snapid_t &id = it->first;
+ const auto &s = snaps.lower_bound(it->second.first);
+ if (s == snaps.end() || *s > id) {
+ dout(10) << " purging old_inode [" << it->second.first << "," << id << "]" << dendl;
+ it = old_inodes.erase(it);
+ } else {
+ ++it;
+ }
}
}
/*
* pick/create an old_inode
*/
-old_inode_t * CInode::pick_old_inode(snapid_t snap)
+CInode::mempool_old_inode * CInode::pick_old_inode(snapid_t snap)
{
- compact_map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap); // p is first key >= to snap
- if (p != old_inodes.end() && p->second.first <= snap) {
- dout(10) << "pick_old_inode snap " << snap << " -> [" << p->second.first << "," << p->first << "]" << dendl;
- return &p->second;
+ auto it = old_inodes.lower_bound(snap); // it is the first key >= snap
+ if (it != old_inodes.end() && it->second.first <= snap) {
+ dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
+ return &it->second;
}
dout(10) << "pick_old_inode snap " << snap << " -> nothing" << dendl;
return NULL;
(linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
}
-int CInode::get_caps_allowed_for_client(Session *session, inode_t *file_i) const
+int CInode::get_caps_allowed_for_client(Session *session, mempool_inode *file_i) const
{
client_t client = session->info.inst.name.num();
int allowed;
//cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
}
if (is_auth())
- for (compact_map<int,int>::const_iterator it = mds_caps_wanted.begin();
- it != mds_caps_wanted.end();
- ++it) {
- w |= it->second;
- other |= it->second;
+ for (const auto &p : mds_caps_wanted) {
+ w |= p.second;
+ other |= p.second;
//cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
}
if (ploner) *ploner = (loner >> shift) & mask;
bool valid = true;
// pick a version!
- inode_t *oi = &inode;
- inode_t *pi = get_projected_inode();
+ mempool_inode *oi = &inode;
+ mempool_inode *pi = get_projected_inode();
- map<string, bufferptr> *pxattrs = 0;
+ CInode::mempool_xattr_map *pxattrs = nullptr;
if (snapid != CEPH_NOSNAP) {
valid = false;
if (is_multiversion()) {
- compact_map<snapid_t,old_inode_t>::iterator p = old_inodes.lower_bound(snapid);
- if (p != old_inodes.end()) {
- if (p->second.first > snapid) {
- if (p != old_inodes.begin())
- --p;
+ auto it = old_inodes.lower_bound(snapid);
+ if (it != old_inodes.end()) {
+ if (it->second.first > snapid) {
+ if (it != old_inodes.begin())
+ --it;
}
- if (p->second.first <= snapid && snapid <= p->first) {
- dout(15) << "encode_inodestat snapid " << snapid
- << " to old_inode [" << p->second.first << "," << p->first << "]"
- << " " << p->second.inode.rstat
+ if (it->second.first <= snapid && snapid <= it->first) {
+ dout(15) << __func__ << " snapid " << snapid
+ << " to old_inode [" << it->second.first << "," << it->first << "]"
+ << " " << it->second.inode.rstat
<< dendl;
- pi = oi = &p->second.inode;
- pxattrs = &p->second.xattrs;
+ auto &p = it->second;
+ pi = oi = &p.inode;
+ pxattrs = &p.xattrs;
} else {
// snapshoted remote dentry can result this
dout(0) << "encode_inodestat old_inode for snapid " << snapid
bool plocal = versionlock.get_last_wrlock_client() == client;
bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
- inode_t *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
+ mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
dout(20) << " pfile " << pfile << " pauth " << pauth
<< " plink " << plink << " pxattr " << pxattr
<< " valid=" << valid << dendl;
// file
- inode_t *file_i = pfile ? pi:oi;
+ mempool_inode *file_i = pfile ? pi:oi;
file_layout_t layout;
if (is_dir()) {
layout = (ppolicy ? pi : oi)->layout;
}
// auth
- inode_t *auth_i = pauth ? pi:oi;
+ mempool_inode *auth_i = pauth ? pi:oi;
// link
- inode_t *link_i = plink ? pi:oi;
+ mempool_inode *link_i = plink ? pi:oi;
// xattr
- inode_t *xattr_i = pxattr ? pi:oi;
+ mempool_inode *xattr_i = pxattr ? pi:oi;
// xattr
bufferlist xbl;
::encode(inline_data, bl);
}
if (session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
- inode_t *policy_i = ppolicy ? pi : oi;
+ mempool_inode *policy_i = ppolicy ? pi : oi;
::encode(policy_i->quota, bl);
}
if (session->connection->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
bool plink = linklock.is_xlocked_by_client(client);
bool pxattr = xattrlock.is_xlocked_by_client(client);
- inode_t *oi = &inode;
- inode_t *pi = get_projected_inode();
- inode_t *i = (pfile|pauth|plink|pxattr) ? pi : oi;
+ mempool_inode *oi = &inode;
+ mempool_inode *pi = get_projected_inode();
+ mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;
dout(20) << "encode_cap_message pfile " << pfile
<< " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
m->head.nlink = i->nlink;
i = pxattr ? pi:oi;
- map<string,bufferptr> *ix = pxattr ? get_projected_xattrs() : &xattrs;
+ auto ix = pxattr ? get_projected_xattrs() : &xattrs;
if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
i->xattr_version > cap->client_xattr_version) {
dout(10) << " including xattrs v " << i->xattr_version << dendl;
{
::decode(first, p);
::decode(inode, p);
- ::decode(symlink, p);
+ {
+ std::string tmp;
+ ::decode(tmp, p);
+ symlink = mempool::mds_co::string(boost::string_view(tmp));
+ }
::decode(dirfragtree, p);
::decode(xattrs, p);
::decode(old_inodes, p);
// include scatterlock info for any bounding CDirs
bufferlist bounding;
if (inode.is_dir())
- for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
- p != dirfrags.end();
- ++p) {
- CDir *dir = p->second;
+ for (const auto &p : dirfrags) {
+ CDir *dir = p.second;
if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
- ::encode(p->first, bounding);
+ ::encode(p.first, bounding);
::encode(dir->fnode.fragstat, bounding);
::encode(dir->fnode.accounted_fragstat, bounding);
::encode(dir->fnode.rstat, bounding);
inode.dump(f);
f->dump_string("symlink", symlink);
f->open_array_section("old_inodes");
- for (compact_map<snapid_t, old_inode_t>::const_iterator i = old_inodes.begin();
- i != old_inodes.end(); ++i) {
+ for (const auto &p : old_inodes) {
f->open_object_section("old_inode");
- {
- // The key is the last snapid, the first is in the old_inode_t
- f->dump_int("last", i->first);
- i->second.dump(f);
- }
 + // The map key is the last snapid; the first snapid is stored in the mempool_old_inode
+ f->dump_int("last", p.first);
+ p.second.dump(f);
f->close_section(); // old_inode
}
f->close_section(); // old_inodes
/**
* Fetch backtrace and set tag if tag is non-empty
*/
- void fetch_backtrace_and_tag(CInode *in, std::string tag,
+ void fetch_backtrace_and_tag(CInode *in, boost::string_view tag,
Context *fin, int *bt_r, bufferlist *bt)
{
const int64_t pool = in->get_backtrace_pool();
bool _start(int rval) {
if (in->is_dirty()) {
MDCache *mdcache = in->mdcache;
- inode_t& inode = in->inode;
+ mempool_inode& inode = in->inode;
dout(20) << "validating a dirty CInode; results will be inconclusive"
<< dendl;
}
// present)
if (in->scrub_infop) {
// I'm a non-orphan, so look up my ScrubHeader via my linkage
- const std::string &tag = in->scrub_infop->header->get_tag();
+ boost::string_view tag = in->scrub_infop->header->get_tag();
// Rather than using the usual CInode::fetch_backtrace,
// use a special variant that optionally writes a tag in the same
// operation.
int memory_newer;
MDCache *mdcache = in->mdcache; // For the benefit of dout
- const inode_t& inode = in->inode; // For the benefit of dout
+ const mempool_inode& inode = in->inode; // For the benefit of dout
// Ignore rval because it's the result of a FAILOK operation
// from fetch_backtrace_and_tag: the real result is in
results->inode.ondisk_value = shadow_in->inode;
results->inode.memory_value = in->inode;
- inode_t& si = shadow_in->inode;
- inode_t& i = in->inode;
+ mempool_inode& si = shadow_in->inode;
+ mempool_inode& i = in->inode;
if (si.version > i.version) {
// uh, what?
results->inode.error_str << "On-disk inode is newer than in-memory one!";
}
// check each dirfrag...
- for (compact_map<frag_t,CDir*>::iterator p = in->dirfrags.begin();
- p != in->dirfrags.end();
- ++p) {
- CDir *dir = p->second;
+ for (const auto &p : in->dirfrags) {
+ CDir *dir = p.second;
assert(dir->get_version() > 0);
nest_info.add(dir->fnode.accounted_rstat);
dir_info.add(dir->fnode.accounted_fragstat);
if (dir->scrub_infop->header->get_repair()) {
results->raw_stats.repaired = true;
results->raw_stats.error_str
- << "dirfrag(" << p->first << ") has bad stats (will be fixed); ";
+ << "dirfrag(" << p.first << ") has bad stats (will be fixed); ";
} else {
results->raw_stats.error_str
- << "dirfrag(" << p->first << ") has bad stats; ";
+ << "dirfrag(" << p.first << ") has bad stats; ";
}
frags_errors++;
}
f->dump_int("want_loner", want_loner_cap.v);
f->open_array_section("mds_caps_wanted");
- for (compact_map<int,int>::const_iterator p = mds_caps_wanted.begin();
- p != mds_caps_wanted.end(); ++p) {
+ for (const auto &p : mds_caps_wanted) {
f->open_object_section("mds_cap_wanted");
- f->dump_int("rank", p->first);
- f->dump_string("cap", ccap_string(p->second));
+ f->dump_int("rank", p.first);
+ f->dump_string("cap", ccap_string(p.second));
f->close_section();
}
f->close_section();
// break out of const-land to set up implicit initial state
CInode *me = const_cast<CInode*>(this);
- inode_t *in = me->get_projected_inode();
+ mempool_inode *in = me->get_projected_inode();
scrub_info_t *si = new scrub_info_t();
si->scrub_start_stamp = si->last_scrub_stamp = in->last_scrub_stamp;
MDSInternalContextBase *f)
{
dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
- assert(!scrub_is_in_progress());
+ if (scrub_is_in_progress()) {
+ dout(20) << __func__ << " inode moved during scrub, reinitializing "
+ << dendl;
+ assert(scrub_infop->scrub_parent);
+ CDentry *dn = scrub_infop->scrub_parent;
+ CDir *dir = dn->dir;
+ dn->put(CDentry::PIN_SCRUBPARENT);
+ assert(dir->scrub_infop && dir->scrub_infop->directory_scrubbing);
+ dir->scrub_infop->directories_scrubbing.erase(dn->key());
+ dir->scrub_infop->others_scrubbing.erase(dn->key());
+ }
scrub_info();
if (!scrub_infop)
scrub_infop = new scrub_info_t();
const CDentry *pdn = in->get_projected_parent_dn();
if (!pdn)
break;
- const inode_t *pi = in->get_projected_inode();
+ const mempool_inode *pi = in->get_projected_inode();
// ignore export pin for unlinked directory
if (pi->nlink == 0)
break;
#ifndef CEPH_CINODE_H
#define CEPH_CINODE_H
+#include <list>
+#include <map>
+#include <set>
+#include <boost/utility/string_view.hpp>
+
#include "common/config.h"
#include "include/counter.h"
#include "include/elist.h"
#include "SnapRealm.h"
#include "Mutation.h"
-#include <list>
-#include <set>
-#include <map>
-
#define dout_context g_ceph_context
class Context;
*/
class InodeStoreBase {
public:
- inode_t inode; // the inode itself
- std::string symlink; // symlink dest, if symlink
- std::map<std::string, bufferptr> xattrs;
+ typedef inode_t<mempool::mds_co::pool_allocator> mempool_inode;
+ typedef old_inode_t<mempool::mds_co::pool_allocator> mempool_old_inode;
+ typedef mempool::mds_co::compact_map<snapid_t, mempool_old_inode> mempool_old_inode_map;
+ typedef xattr_map<mempool::mds_co::pool_allocator> mempool_xattr_map; // FIXME bufferptr not in mempool
+
+ mempool_inode inode; // the inode itself
+ mempool::mds_co::string symlink; // symlink dest, if symlink
+ mempool_xattr_map xattrs;
fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map.
- compact_map<snapid_t, old_inode_t> old_inodes; // key = last, value.first = first
- snapid_t oldest_snap;
- damage_flags_t damage_flags;
+ mempool_old_inode_map old_inodes; // key = last, value.first = first
+ snapid_t oldest_snap = CEPH_NOSNAP;
+ damage_flags_t damage_flags = 0;
- InodeStoreBase() : oldest_snap(CEPH_NOSNAP), damage_flags(0) { }
+ InodeStoreBase() {}
/* Helpers */
bool is_file() const { return inode.is_file(); }
void dump(Formatter *f) const;
/* For use by offline tools */
- __u32 hash_dentry_name(const std::string &dn);
- frag_t pick_dirfrag(const std::string &dn);
+ __u32 hash_dentry_name(boost::string_view dn);
+ frag_t pick_dirfrag(boost::string_view dn);
};
class InodeStore : public InodeStoreBase {
public:
+ // FIXME bufferlist not part of mempool
bufferlist snap_blob; // Encoded copy of SnapRealm, because we can't
// rehydrate it without full MDCache
void encode(bufferlist &bl, uint64_t features) const {
public:
MDCache *mdcache;
- SnapRealm *snaprealm;
- SnapRealm *containing_realm;
+ SnapRealm *snaprealm = nullptr;
+ SnapRealm *containing_realm = nullptr;
snapid_t first, last;
- compact_set<snapid_t> dirty_old_rstats;
+ mempool::mds_co::compact_set<snapid_t> dirty_old_rstats;
class scrub_stamp_info_t {
public:
/// version we started our latest scrub (whether in-progress or finished)
- version_t scrub_start_version;
+ version_t scrub_start_version = 0;
/// time we started our latest scrub (whether in-progress or finished)
utime_t scrub_start_stamp;
/// version we started our most recent finished scrub
- version_t last_scrub_version;
+ version_t last_scrub_version = 0;
/// time we started our most recent finished scrub
utime_t last_scrub_stamp;
- scrub_stamp_info_t() : scrub_start_version(0), last_scrub_version(0) {}
+ scrub_stamp_info_t() {}
void reset() {
scrub_start_version = last_scrub_version = 0;
scrub_start_stamp = last_scrub_stamp = utime_t();
class scrub_info_t : public scrub_stamp_info_t {
public:
- CDentry *scrub_parent;
- MDSInternalContextBase *on_finish;
+ CDentry *scrub_parent = nullptr;
+ MDSInternalContextBase *on_finish = nullptr;
- bool last_scrub_dirty; /// are our stamps dirty with respect to disk state?
- bool scrub_in_progress; /// are we currently scrubbing?
- bool children_scrubbed;
+ bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state?
+ bool scrub_in_progress = false; /// are we currently scrubbing?
+ bool children_scrubbed = false;
/// my own (temporary) stamps and versions for each dirfrag we have
- std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps;
+ std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps; // XXX not part of mempool
ScrubHeaderRef header;
- scrub_info_t() : scrub_stamp_info_t(),
- scrub_parent(NULL), on_finish(NULL),
- last_scrub_dirty(false), scrub_in_progress(false),
- children_scrubbed(false) {}
+ scrub_info_t() {}
};
const scrub_info_t *scrub_info() const{
}
snapid_t get_oldest_snap();
- uint64_t last_journaled; // log offset for the last time i was journaled
+ uint64_t last_journaled = 0; // log offset for the last time i was journaled
//loff_t last_open_journaled; // log offset for the last journaled EOpen
utime_t last_dirstat_prop;
void mark_dirty_rstat();
void clear_dirty_rstat();
- //bool hack_accessed;
+ //bool hack_accessed = false;
//utime_t hack_load_stamp;
/**
* Projection methods, used to store inode changes until they have been journaled,
* at which point they are popped.
* Usage:
- * project_inode as needed. If you're also projecting xattrs, pass
- * in an xattr map (by pointer), then edit the map.
- * If you're also projecting the snaprealm, call project_snaprealm after
- * calling project_inode, and modify the snaprealm as necessary.
 + * project_inode as needed. If you're changing xattrs or sr_t, pass true for
 + * the corresponding flag, then modify the xattrs/snapnode member as needed.
 + * (Dirty exception: project_past_snaprealm_parent lets you project the
 + * snapnode after calling project_inode, i.e. without passing snap=true.)
*
* Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
* This function will take care of the inode itself, the xattrs, and the snaprealm.
*/
- struct projected_inode_t {
- inode_t *inode;
- std::map<std::string,bufferptr> *xattrs;
- sr_t *snapnode;
-
- projected_inode_t()
- : inode(NULL), xattrs(NULL), snapnode(NULL) {}
- projected_inode_t(inode_t *in, sr_t *sn)
- : inode(in), xattrs(NULL), snapnode(sn) {}
- projected_inode_t(inode_t *in, std::map<std::string, bufferptr> *xp = NULL, sr_t *sn = NULL)
- : inode(in), xattrs(xp), snapnode(sn) {}
+ class projected_inode {
+ public:
+ mempool_inode inode;
+ std::unique_ptr<mempool_xattr_map> xattrs;
+ std::unique_ptr<sr_t> snapnode;
+
+ projected_inode() = delete;
+ projected_inode(const mempool_inode &in) : inode(in) {}
};
- std::list<projected_inode_t*> projected_nodes; // projected values (only defined while dirty)
- int num_projected_xattrs;
- int num_projected_srnodes;
-
- inode_t *project_inode(std::map<std::string,bufferptr> *px=0);
+
+private:
+ mempool::mds_co::list<projected_inode> projected_nodes; // projected values (only defined while dirty)
+ size_t num_projected_xattrs = 0;
+ size_t num_projected_srnodes = 0;
+
+ sr_t &project_snaprealm(projected_inode &pi);
+public:
+ CInode::projected_inode &project_inode(bool xattr = false, bool snap = false);
void pop_and_dirty_projected_inode(LogSegment *ls);
- projected_inode_t *get_projected_node() {
+ projected_inode *get_projected_node() {
if (projected_nodes.empty())
return NULL;
else
- return projected_nodes.back();
+ return &projected_nodes.back();
}
version_t get_projected_version() const {
if (projected_nodes.empty())
return inode.version;
else
- return projected_nodes.back()->inode->version;
+ return projected_nodes.back().inode.version;
}
bool is_projected() const {
return !projected_nodes.empty();
}
- const inode_t *get_projected_inode() const {
+ const mempool_inode *get_projected_inode() const {
if (projected_nodes.empty())
return &inode;
else
- return projected_nodes.back()->inode;
+ return &projected_nodes.back().inode;
}
- inode_t *get_projected_inode() {
+ mempool_inode *get_projected_inode() {
if (projected_nodes.empty())
return &inode;
else
- return projected_nodes.back()->inode;
+ return &projected_nodes.back().inode;
}
- inode_t *get_previous_projected_inode() {
+ mempool_inode *get_previous_projected_inode() {
assert(!projected_nodes.empty());
- std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
- ++p;
- if (p != projected_nodes.rend())
- return (*p)->inode;
+ auto it = projected_nodes.rbegin();
+ ++it;
+ if (it != projected_nodes.rend())
+ return &it->inode;
else
return &inode;
}
- std::map<std::string,bufferptr> *get_projected_xattrs() {
+ mempool_xattr_map *get_projected_xattrs() {
if (num_projected_xattrs > 0) {
- for (std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
- p != projected_nodes.rend();
- ++p)
- if ((*p)->xattrs)
- return (*p)->xattrs;
+ for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
+ if (it->xattrs)
+ return it->xattrs.get();
}
return &xattrs;
}
- std::map<std::string,bufferptr> *get_previous_projected_xattrs() {
- std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
- for (++p; // skip the most recent projected value
- p != projected_nodes.rend();
- ++p)
- if ((*p)->xattrs)
- return (*p)->xattrs;
+ mempool_xattr_map *get_previous_projected_xattrs() {
+ if (num_projected_xattrs > 0) {
+ for (auto it = ++projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
+ if (it->xattrs)
+ return it->xattrs.get();
+ }
return &xattrs;
}
- sr_t *project_snaprealm(snapid_t snapid=0);
const sr_t *get_projected_srnode() const {
if (num_projected_srnodes > 0) {
- for (std::list<projected_inode_t*>::const_reverse_iterator p = projected_nodes.rbegin();
- p != projected_nodes.rend();
- ++p)
- if ((*p)->snapnode)
- return (*p)->snapnode;
+ for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
+ if (it->snapnode)
+ return it->snapnode.get();
}
if (snaprealm)
return &snaprealm->srnode;
}
sr_t *get_projected_srnode() {
if (num_projected_srnodes > 0) {
- for (std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
- p != projected_nodes.rend();
- ++p)
- if ((*p)->snapnode)
- return (*p)->snapnode;
+ for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
+ if (it->snapnode)
+ return it->snapnode.get();
}
if (snaprealm)
return &snaprealm->srnode;
void pop_projected_snaprealm(sr_t *next_snaprealm);
public:
- old_inode_t& cow_old_inode(snapid_t follows, bool cow_head);
+ mempool_old_inode& cow_old_inode(snapid_t follows, bool cow_head);
void split_old_inode(snapid_t snap);
- old_inode_t *pick_old_inode(snapid_t last);
+ mempool_old_inode *pick_old_inode(snapid_t last);
void pre_cow_old_inode();
void purge_stale_snap_data(const std::set<snapid_t>& snaps);
// -- cache infrastructure --
private:
- compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
- int stickydir_ref;
- scrub_info_t *scrub_infop;
+ mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
+ int stickydir_ref = 0;
+ scrub_info_t *scrub_infop = nullptr;
public:
bool has_dirfrags() { return !dirfrags.empty(); }
protected:
// parent dentries in cache
- CDentry *parent; // primary link
- compact_set<CDentry*> remote_parents; // if hard linked
+ CDentry *parent = nullptr; // primary link
+ mempool::mds_co::compact_set<CDentry*> remote_parents; // if hard linked
- std::list<CDentry*> projected_parent; // for in-progress rename, (un)link, etc.
+ mempool::mds_co::list<CDentry*> projected_parent; // for in-progress rename, (un)link, etc.
- mds_authority_t inode_auth;
+ mds_authority_t inode_auth = CDIR_AUTH_DEFAULT;
// -- distributed state --
protected:
// file capabilities
- std::map<client_t, Capability*> client_caps; // client -> caps
- compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted
- int replica_caps_wanted; // [replica] what i've requested from auth
+ using cap_map = mempool::mds_co::map<client_t, Capability*>;
+ cap_map client_caps; // client -> caps
+ mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted
+ int replica_caps_wanted = 0; // [replica] what i've requested from auth
public:
- compact_map<int, std::set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head
- compact_map<snapid_t, std::set<client_t> > client_need_snapflush;
+ mempool::mds_co::compact_map<int, mempool::mds_co::set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head
+ mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush;
void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
protected:
- ceph_lock_state_t *fcntl_locks;
- ceph_lock_state_t *flock_locks;
+ ceph_lock_state_t *fcntl_locks = nullptr;
+ ceph_lock_state_t *flock_locks = nullptr;
ceph_lock_state_t *get_fcntl_lock_state() {
if (!fcntl_locks)
elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest;
public:
- int auth_pin_freeze_allowance;
+ int auth_pin_freeze_allowance = 0;
inode_load_vec_t pop;
friend class CInodeExport;
// ---------------------------
+ CInode() = delete;
CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP) :
mdcache(c),
- snaprealm(0), containing_realm(0),
first(f), last(l),
- last_journaled(0), //last_open_journaled(0),
- //hack_accessed(true),
- num_projected_xattrs(0),
- num_projected_srnodes(0),
- stickydir_ref(0),
- scrub_infop(NULL),
- parent(0),
- inode_auth(CDIR_AUTH_DEFAULT),
- replica_caps_wanted(0),
- fcntl_locks(0), flock_locks(0),
- item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this),
+ item_dirty(this),
+ item_caps(this),
+ item_open_file(this),
+ item_dirty_parent(this),
item_dirty_dirfrag_dir(this),
item_dirty_dirfrag_nest(this),
item_dirty_dirfrag_dirfragtree(this),
- auth_pin_freeze_allowance(0),
pop(ceph_clock_now()),
versionlock(this, &versionlock_type),
authlock(this, &authlock_type),
snaplock(this, &snaplock_type),
nestlock(this, &nestlock_type),
flocklock(this, &flocklock_type),
- policylock(this, &policylock_type),
- loner_cap(-1), want_loner_cap(-1)
+ policylock(this, &policylock_type)
{
- state = 0;
if (auth) state_set(STATE_AUTH);
}
~CInode() override {
vinodeno_t vino() const { return vinodeno_t(inode.ino, last); }
int d_type() const { return IFTODT(inode.mode); }
- inode_t& get_inode() { return inode; }
+ mempool_inode& get_inode() { return inode; }
CDentry* get_parent_dn() { return parent; }
const CDentry* get_parent_dn() const { return parent; }
const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
// -- waiting --
protected:
- compact_map<frag_t, std::list<MDSInternalContextBase*> > waiting_on_dir;
+ mempool::mds_co::compact_map<frag_t, std::list<MDSInternalContextBase*> > waiting_on_dir;
public:
void add_dir_waiter(frag_t fg, MDSInternalContextBase *c);
void take_dir_waiting(frag_t fg, std::list<MDSInternalContextBase*>& ls);
static LockType flocklock_type;
static LockType policylock_type;
+ // FIXME not part of mempool
LocalLock versionlock;
SimpleLock authlock;
SimpleLock linklock;
// -- caps -- (new)
// client caps
- client_t loner_cap, want_loner_cap;
+ client_t loner_cap = -1, want_loner_cap = -1;
client_t get_loner() const { return loner_cap; }
client_t get_wanted_loner() const { return want_loner_cap; }
int count_nonstale_caps() {
int n = 0;
- for (std::map<client_t,Capability*>::iterator it = client_caps.begin();
- it != client_caps.end();
- ++it)
- if (!it->second->is_stale())
+ for (const auto &p : client_caps) {
+ if (!p.second->is_stale())
n++;
+ }
return n;
}
bool multiple_nonstale_caps() {
int n = 0;
- for (std::map<client_t,Capability*>::iterator it = client_caps.begin();
- it != client_caps.end();
- ++it)
- if (!it->second->is_stale()) {
+ for (const auto &p : client_caps) {
+ if (!p.second->is_stale()) {
if (n)
return true;
n++;
}
+ }
return false;
}
bool is_any_caps() { return !client_caps.empty(); }
bool is_any_nonstale_caps() { return count_nonstale_caps(); }
- const compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
- compact_map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
+ const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
+ mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
- const std::map<client_t,Capability*>& get_client_caps() const { return client_caps; }
+ const cap_map& get_client_caps() const { return client_caps; }
Capability *get_client_cap(client_t client) {
auto client_caps_entry = client_caps.find(client);
if (client_caps_entry != client_caps.end())
int get_caps_allowed_by_type(int type) const;
int get_caps_careful() const;
int get_xlocker_mask(client_t client) const;
- int get_caps_allowed_for_client(Session *s, inode_t *file_i) const;
+ int get_caps_allowed_for_client(Session *s, mempool_inode *file_i) const;
// caps issued, wanted
int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0,
// -- hierarchy stuff --
public:
void set_primary_parent(CDentry *p) {
- assert(parent == 0);
+ assert(parent == 0 ||
+ g_conf->get_val<bool>("mds_hack_allow_loading_invalid_metadata"));
parent = p;
}
void remove_primary_parent(CDentry *dn) {
std::stringstream error_str;
};
- bool performed_validation;
- bool passed_validation;
+ bool performed_validation = false;
+ bool passed_validation = false;
struct raw_stats_t {
frag_info_t dirstat;
};
member_status<inode_backtrace_t> backtrace;
- member_status<inode_t> inode;
+ member_status<mempool_inode> inode; // XXX should not be in mempool; wait for pmr
member_status<raw_stats_t> raw_stats;
- validated_data() : performed_validation(false),
- passed_validation(false) {}
+ validated_data() {}
void dump(Formatter *f) const;
f->dump_unsigned("pending", _pending);
f->open_array_section("revokes");
- for (list<revoke_info>::const_iterator p = _revokes.begin(); p != _revokes.end(); ++p) {
+ for (const auto &r : _revokes) {
f->open_object_section("revoke");
- p->dump(f);
+ r.dump(f);
f->close_section();
}
f->close_section();
ls.back()->last_issue_stamp = utime_t(12, 13);
ls.back()->_wanted = 14;
ls.back()->_pending = 15;
- ls.back()->_revokes.push_back(revoke_info());
- ls.back()->_revokes.back().before = 16;
- ls.back()->_revokes.back().seq = 17;
- ls.back()->_revokes.back().last_issue = 18;
- ls.back()->_revokes.push_back(revoke_info());
- ls.back()->_revokes.back().before = 19;
- ls.back()->_revokes.back().seq = 20;
- ls.back()->_revokes.back().last_issue = 21;
+ {
+ ls.back()->_revokes.emplace_back();
+ auto &r = ls.back()->_revokes.back();
+ r.before = 16;
+ r.seq = 17;
+ r.last_issue = 18;
+ }
+ {
+ ls.back()->_revokes.emplace_back();
+ auto &r = ls.back()->_revokes.back();
+ r.before = 19;
+ r.seq = 20;
+ r.last_issue = 21;
+ }
}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(Capability, co_cap, mds_co);
#ifndef CEPH_CAPABILITY_H
#define CEPH_CAPABILITY_H
-#include "include/counter.h"
#include "include/buffer_fwd.h"
+#include "include/counter.h"
+#include "include/mempool.h"
#include "include/xlist.h"
#include "common/config.h"
#include "mdstypes.h"
+
/*
Capability protocol notes.
class Capability : public Counter<Capability> {
public:
+ MEMPOOL_CLASS_HELPERS();
+
struct Export {
int64_t cap_id;
int32_t wanted;
ceph_seq_t issue(unsigned c) {
if (_pending & ~c) {
// revoking (and maybe adding) bits. note caps prior to this revocation
- _revokes.push_back(revoke_info(_pending, last_sent, last_issue));
+ _revokes.emplace_back(_pending, last_sent, last_issue);
_pending = c;
_issued |= c;
} else if (~_pending & c) {
}
void _calc_issued() {
_issued = _pending;
- for (list<revoke_info>::iterator p = _revokes.begin(); p != _revokes.end(); ++p)
- _issued |= p->before;
+ for (const auto &r : _revokes) {
+ _issued |= r.before;
+ }
}
void confirm_receipt(ceph_seq_t seq, unsigned caps) {
if (seq == last_sent) {
// - add new caps to _pending
// - track revocations in _revokes list
__u32 _pending, _issued;
- list<revoke_info> _revokes;
+ mempool::mds_co::list<revoke_info> _revokes;
ceph_seq_t last_sent;
ceph_seq_t last_issue;
DentryDamage(
inodeno_t ino_,
frag_t frag_,
- std::string dname_,
+ boost::string_view dname_,
snapid_t snap_id_)
: ino(ino_), frag(frag_), dname(dname_), snap_id(snap_id_)
{}
bool DamageTable::notify_dentry(
inodeno_t ino, frag_t frag,
- snapid_t snap_id, const std::string &dname, const std::string &path)
+ snapid_t snap_id, boost::string_view dname, boost::string_view path)
{
if (oversized()) {
return true;
if (dentries.count(key) == 0) {
DamageEntryRef entry = std::make_shared<DentryDamage>(
ino, frag, dname, snap_id);
- entry->path = path;
+ entry->path = std::string(path);
dentries[key][DentryIdent(dname, snap_id)] = entry;
by_id[entry->id] = std::move(entry);
}
}
bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag,
- const std::string &path)
+ boost::string_view path)
{
// Special cases: damage to these dirfrags is considered fatal to
// the MDS rank that owns them.
auto key = DirFragIdent(ino, frag);
if (dirfrags.count(key) == 0) {
DamageEntryRef entry = std::make_shared<DirFragDamage>(ino, frag);
- entry->path = path;
+ entry->path = std::string(path);
dirfrags[key] = entry;
by_id[entry->id] = std::move(entry);
}
return false;
}
-bool DamageTable::notify_remote_damaged(inodeno_t ino, const std::string &path)
+bool DamageTable::notify_remote_damaged(inodeno_t ino, boost::string_view path)
{
if (oversized()) {
return true;
if (remotes.count(ino) == 0) {
auto entry = std::make_shared<BacktraceDamage>(ino);
- entry->path = path;
+ entry->path = std::string(path);
remotes[ino] = entry;
by_id[entry->id] = std::move(entry);
}
bool DamageTable::is_dentry_damaged(
const CDir *dir_frag,
- const std::string &dname,
+ boost::string_view dname,
const snapid_t snap_id) const
{
if (dentries.count(
#ifndef DAMAGE_TABLE_H_
#define DAMAGE_TABLE_H_
+#include <boost/utility/string_view.hpp>
+
#include "mdstypes.h"
#include "auth/Crypto.h"
}
}
- DentryIdent(const std::string &dname_, snapid_t snap_id_)
+ DentryIdent(boost::string_view dname_, snapid_t snap_id_)
: dname(dname_), snap_id(snap_id_)
{}
};
*
* @return true if fatal
*/
- bool notify_dirfrag(inodeno_t ino, frag_t frag, const std::string &path);
+ bool notify_dirfrag(inodeno_t ino, frag_t frag, boost::string_view path);
/**
* Indicate that a particular dentry cannot be loaded.
*/
bool notify_dentry(
inodeno_t ino, frag_t frag,
- snapid_t snap_id, const std::string &dname, const std::string &path);
+ snapid_t snap_id, boost::string_view dname, boost::string_view path);
/**
* Indicate that a particular Inode could not be loaded by number
*/
bool notify_remote_damaged(
- inodeno_t ino, const std::string &path);
+ inodeno_t ino, boost::string_view path);
bool is_dentry_damaged(
const CDir *dir_frag,
- const std::string &dname,
+ boost::string_view dname,
const snapid_t snap_id) const;
bool is_dirfrag_damaged(
}
-void FSMap::create_filesystem(const std::string &name,
+void FSMap::create_filesystem(boost::string_view name,
int64_t metadata_pool, int64_t data_pool,
uint64_t features)
{
auto fs = std::make_shared<Filesystem>();
- fs->mds_map.fs_name = name;
+ fs->mds_map.fs_name = std::string(name);
fs->mds_map.max_mds = 1;
fs->mds_map.data_pools.push_back(data_pool);
fs->mds_map.metadata_pool = metadata_pool;
}
int FSMap::parse_filesystem(
- std::string const &ns_str,
+ boost::string_view ns_str,
std::shared_ptr<const Filesystem> *result
) const
{
std::string ns_err;
- fs_cluster_id_t fscid = strict_strtol(ns_str.c_str(), 10, &ns_err);
+ std::string s(ns_str);
+ fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
if (!ns_err.empty() || filesystems.count(fscid) == 0) {
for (auto &fs : filesystems) {
- if (fs.second->mds_map.fs_name == ns_str) {
+ if (fs.second->mds_map.fs_name == s) {
*result = std::const_pointer_cast<const Filesystem>(fs.second);
return 0;
}
mds_map.print(out);
}
-mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) const
+mds_gid_t FSMap::find_standby_for(mds_role_t role, boost::string_view name) const
{
mds_gid_t result = MDS_GID_NONE;
return MDS_GID_NONE;
}
-mds_gid_t FSMap::find_replacement_for(mds_role_t role, const std::string& name,
+mds_gid_t FSMap::find_replacement_for(mds_role_t role, boost::string_view name,
bool force_standby_active) const {
const mds_gid_t standby = find_standby_for(role, name);
if (standby)
* if legacy_client_ns is set.
*/
int FSMap::parse_role(
- const std::string &role_str,
+ boost::string_view role_str,
mds_role_t *role,
std::ostream &ss) const
{
mds_rank_t rank;
std::string err;
- std::string rank_str = role_str.substr(rank_pos);
+ std::string rank_str(role_str.substr(rank_pos));
long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
if (rank_i < 0 || !err.empty()) {
ss << "Invalid rank '" << rank_str << "'";
#ifndef CEPH_FSMAP_H
#define CEPH_FSMAP_H
+#include <map>
+#include <set>
+#include <string>
+#include <boost/utility/string_view.hpp>
+
#include <errno.h>
#include "include/types.h"
#include "msg/Message.h"
#include "mds/MDSMap.h"
-#include <set>
-#include <map>
-#include <string>
-
#include "common/config.h"
#include "include/CompatSet.h"
/**
* Resolve daemon name to GID
*/
- mds_gid_t find_mds_gid_by_name(const std::string& s) const
+ mds_gid_t find_mds_gid_by_name(boost::string_view s) const
{
const auto info = get_mds_info();
for (const auto &p : info) {
/**
* Resolve daemon name to status
*/
- const MDSMap::mds_info_t* find_by_name(const std::string& name) const
+ const MDSMap::mds_info_t* find_by_name(boost::string_view name) const
{
std::map<mds_gid_t, MDSMap::mds_info_t> result;
for (const auto &i : standby_daemons) {
* Caller must already have validated all arguments vs. the existing
* FSMap and OSDMap contents.
*/
- void create_filesystem(const std::string &name,
+ void create_filesystem(boost::string_view name,
int64_t metadata_pool, int64_t data_pool,
uint64_t features);
bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;}
std::shared_ptr<const Filesystem> get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));}
std::shared_ptr<const Filesystem> get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);}
- std::shared_ptr<const Filesystem> get_filesystem(const std::string &name) const
+ std::shared_ptr<const Filesystem> get_filesystem(boost::string_view name) const
{
for (const auto &i : filesystems) {
if (i.second->mds_map.fs_name == name) {
}
int parse_filesystem(
- std::string const &ns_str,
+ boost::string_view ns_str,
std::shared_ptr<const Filesystem> *result
) const;
int parse_role(
- const std::string &role_str,
+ boost::string_view role_str,
mds_role_t *role,
std::ostream &ss) const;
return false;
}
- mds_gid_t find_standby_for(mds_role_t mds, const std::string& name) const;
+ mds_gid_t find_standby_for(mds_role_t mds, boost::string_view name) const;
mds_gid_t find_unused_for(mds_role_t mds, bool force_standby_active) const;
- mds_gid_t find_replacement_for(mds_role_t mds, const std::string& name,
+ mds_gid_t find_replacement_for(mds_role_t mds, boost::string_view name,
bool force_standby_active) const;
void get_health(list<pair<health_status_t,std::string> >& summary,
#ifndef CEPH_FSMAPCOMPACT_H
#define CEPH_FSMAPCOMPACT_H
-#include "mds/mdstypes.h"
#include <map>
#include <string>
+#include <boost/utility/string_view.hpp>
+
+#include "mds/mdstypes.h"
class FSMapUser {
public:
epoch_t get_epoch() const { return epoch; }
- fs_cluster_id_t get_fs_cid(const std::string &name) const {
+ fs_cluster_id_t get_fs_cid(boost::string_view name) const {
for (auto &p : filesystems) {
if (p.second.name == name)
return p.first;
*
*/
+#include <boost/utility/string_view.hpp>
#include "MDSRank.h"
#include "MDCache.h"
dout(10) << " client_snap_caps " << in->client_snap_caps << dendl;
// check for snap writeback completion
bool gather = false;
- compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
+ auto p = in->client_snap_caps.begin();
while (p != in->client_snap_caps.end()) {
SimpleLock *lock = in->get_lock(p->first);
assert(lock);
}
};
-uint64_t Locker::calc_new_max_size(inode_t *pi, uint64_t size)
+uint64_t Locker::calc_new_max_size(CInode::mempool_inode *pi, uint64_t size)
{
uint64_t new_max = (size + 1) << 1;
uint64_t max_inc = g_conf->mds_client_writeable_range_max_inc_objs;
}
void Locker::calc_new_client_ranges(CInode *in, uint64_t size,
- map<client_t,client_writeable_range_t> *new_ranges,
+ CInode::mempool_inode::client_range_map *new_ranges,
bool *max_increased)
{
- inode_t *latest = in->get_projected_inode();
+ auto latest = in->get_projected_inode();
uint64_t ms;
if(latest->has_layout()) {
ms = calc_new_max_size(latest, size);
assert(in->is_auth());
assert(in->is_file());
- inode_t *latest = in->get_projected_inode();
- map<client_t, client_writeable_range_t> new_ranges;
+ CInode::mempool_inode *latest = in->get_projected_inode();
+ CInode::mempool_inode::client_range_map new_ranges;
uint64_t size = latest->size;
bool update_size = new_size > 0;
bool update_max = false;
MutationRef mut(new MutationImpl());
mut->ls = mds->mdlog->get_current_segment();
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
if (update_max) {
- dout(10) << "check_inode_max_size client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
- pi->client_ranges = new_ranges;
+ dout(10) << "check_inode_max_size client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
+ pi.inode.client_ranges = new_ranges;
}
if (update_size) {
- dout(10) << "check_inode_max_size size " << pi->size << " -> " << new_size << dendl;
- pi->size = new_size;
- pi->rstat.rbytes = new_size;
- dout(10) << "check_inode_max_size mtime " << pi->mtime << " -> " << new_mtime << dendl;
- pi->mtime = new_mtime;
+ dout(10) << "check_inode_max_size size " << pi.inode.size << " -> " << new_size << dendl;
+ pi.inode.size = new_size;
+ pi.inode.rstat.rbytes = new_size;
+ dout(10) << "check_inode_max_size mtime " << pi.inode.mtime << " -> " << new_mtime << dendl;
+ pi.inode.mtime = new_mtime;
}
// use EOpen if the file is still open; otherwise, use EUpdate.
for (auto p = head_in->client_need_snapflush.begin();
p != head_in->client_need_snapflush.end() && p->first < last; ) {
snapid_t snapid = p->first;
- set<client_t>& clients = p->second;
+ auto &clients = p->second;
++p; // be careful, q loop below depends on this
if (clients.count(client)) {
*/
void Locker::handle_client_caps(MClientCaps *m)
{
- Session *session = static_cast<Session *>(m->get_connection()->get_priv());
client_t client = m->get_source().num();
-
snapid_t follows = m->get_snap_follows();
dout(7) << "handle_client_caps "
<< ((m->flags & CLIENT_CAPS_SYNC) ? "sync" : "async")
<< " tid " << m->get_client_tid() << " follows " << follows
<< " op " << ceph_cap_op_name(m->get_op()) << dendl;
+ Session *session = mds->get_session(m);
if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
if (!session) {
dout(5) << " no session, dropping " << *m << dendl;
};
void Locker::process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& item,
- const string &dname)
+ boost::string_view dname)
{
inodeno_t ino = (uint64_t)item.ino;
uint64_t cap_id = item.cap_id;
// normal metadata updates that we can apply to the head as well.
// update xattrs?
- bool xattrs = false;
- map<string,bufferptr> *px = 0;
- if ((dirty & CEPH_CAP_XATTR_EXCL) &&
- m->xattrbl.length() &&
- m->head.xattr_version > in->get_projected_inode()->xattr_version)
- xattrs = true;
-
- old_inode_t *oi = 0;
+ CInode::mempool_xattr_map *px = nullptr;
+ bool xattrs = (dirty & CEPH_CAP_XATTR_EXCL) &&
+ m->xattrbl.length() &&
+ m->head.xattr_version > in->get_projected_inode()->xattr_version;
+
+ CInode::mempool_old_inode *oi = 0;
if (in->is_multiversion()) {
oi = in->pick_old_inode(snap);
}
- inode_t *pi;
+ CInode::mempool_inode *i;
if (oi) {
dout(10) << " writing into old inode" << dendl;
- pi = in->project_inode();
- pi->version = in->pre_dirty();
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
if (snap > oi->first)
in->split_old_inode(snap);
- pi = &oi->inode;
+ i = &oi->inode;
if (xattrs)
px = &oi->xattrs;
} else {
+ auto &pi = in->project_inode(xattrs);
+ pi.inode.version = in->pre_dirty();
+ i = &pi.inode;
if (xattrs)
- px = new map<string,bufferptr>;
- pi = in->project_inode(px);
- pi->version = in->pre_dirty();
+ px = pi.xattrs.get();
}
- _update_cap_fields(in, dirty, m, pi);
+ _update_cap_fields(in, dirty, m, i);
// xattr
- if (px) {
- dout(7) << " xattrs v" << pi->xattr_version << " -> " << m->head.xattr_version
+ if (xattrs) {
+ dout(7) << " xattrs v" << i->xattr_version << " -> " << m->head.xattr_version
<< " len " << m->xattrbl.length() << dendl;
- pi->xattr_version = m->head.xattr_version;
+ i->xattr_version = m->head.xattr_version;
bufferlist::iterator p = m->xattrbl.begin();
::decode(*px, p);
}
- if (pi->client_ranges.count(client)) {
- if (in->last == snap) {
- dout(10) << " removing client_range entirely" << dendl;
- pi->client_ranges.erase(client);
- } else {
- dout(10) << " client_range now follows " << snap << dendl;
- pi->client_ranges[client].follows = snap;
+ {
+ auto it = i->client_ranges.find(client);
+ if (it != i->client_ranges.end()) {
+ if (in->last == snap) {
+ dout(10) << " removing client_range entirely" << dendl;
+ i->client_ranges.erase(it);
+ } else {
+ dout(10) << " client_range now follows " << snap << dendl;
+ it->second.follows = snap;
+ }
}
}
client, ack));
}
-void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi)
+void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, CInode::mempool_inode *pi)
{
if (dirty == 0)
return;
<< " on " << *in << dendl;
assert(in->is_auth());
client_t client = m->get_source().num();
- inode_t *latest = in->get_projected_inode();
+ CInode::mempool_inode *latest = in->get_projected_inode();
// increase or zero max_size?
uint64_t size = m->get_size();
if (!dirty && !change_max)
return false;
- Session *session = static_cast<Session *>(m->get_connection()->get_priv());
+ Session *session = mds->get_session(m);
if (session->check_access(in, MAY_WRITE,
m->caller_uid, m->caller_gid, NULL, 0, 0) < 0) {
- session->put();
dout(10) << "check_access failed, dropping cap update on " << *in << dendl;
return false;
}
- session->put();
// do the update.
EUpdate *le = new EUpdate(mds->mdlog, "cap update");
mds->mdlog->start_entry(le);
- // xattrs update?
- map<string,bufferptr> *px = 0;
- if ((dirty & CEPH_CAP_XATTR_EXCL) &&
- m->xattrbl.length() &&
- m->head.xattr_version > in->get_projected_inode()->xattr_version)
- px = new map<string,bufferptr>;
+ bool xattr = (dirty & CEPH_CAP_XATTR_EXCL) &&
+ m->xattrbl.length() &&
+ m->head.xattr_version > in->get_projected_inode()->xattr_version;
- inode_t *pi = in->project_inode(px);
- pi->version = in->pre_dirty();
+ auto &pi = in->project_inode(xattr);
+ pi.inode.version = in->pre_dirty();
MutationRef mut(new MutationImpl());
mut->ls = mds->mdlog->get_current_segment();
- _update_cap_fields(in, dirty, m, pi);
+ _update_cap_fields(in, dirty, m, &pi.inode);
if (change_max) {
dout(7) << " max_size " << old_max << " -> " << new_max
<< " for " << *in << dendl;
if (new_max) {
- pi->client_ranges[client].range.first = 0;
- pi->client_ranges[client].range.last = new_max;
- pi->client_ranges[client].follows = in->first - 1;
+ auto &cr = pi.inode.client_ranges[client];
+ cr.range.first = 0;
+ cr.range.last = new_max;
+ cr.follows = in->first - 1;
} else
- pi->client_ranges.erase(client);
+ pi.inode.client_ranges.erase(client);
}
if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)))
if (dirty & CEPH_CAP_AUTH_EXCL)
wrlock_force(&in->authlock, mut);
- // xattr
- if (px) {
- dout(7) << " xattrs v" << pi->xattr_version << " -> " << m->head.xattr_version << dendl;
- pi->xattr_version = m->head.xattr_version;
+ // xattrs update?
+ if (xattr) {
+ dout(7) << " xattrs v" << pi.inode.xattr_version << " -> " << m->head.xattr_version << dendl;
+ pi.inode.xattr_version = m->head.xattr_version;
bufferlist::iterator p = m->xattrbl.begin();
- ::decode(*px, p);
-
+ ::decode(*pi.xattrs, p);
wrlock_force(&in->xattrlock, mut);
}
mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
}
- Session *session = static_cast<Session *>(m->get_connection()->get_priv());
+ Session *session = mds->get_session(m);
for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) {
_do_cap_release(client, inodeno_t((uint64_t)p->ino) , p->cap_id, p->migrate_seq, p->seq);
in->pre_cow_old_inode(); // avoid cow mayhem
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
in->finish_scatter_gather_update(lock->get_type());
lock->start_flush();
#ifndef CEPH_MDS_LOCKER_H
#define CEPH_MDS_LOCKER_H
+#include <boost/utility/string_view.hpp>
+
#include "include/types.h"
#include <map>
class MDSRank;
class Session;
-class CInode;
class CDentry;
struct SnapRealm;
class ScatterLock;
class LocalLock;
+#include "CInode.h"
#include "SimpleLock.h"
#include "Mutation.h"
bool should_defer_client_cap_frozen(CInode *in);
void process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& r,
- const string &dname);
+ boost::string_view dname);
void kick_cap_releases(MDRequestRef& mdr);
void kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq);
bool _need_flush_mdlog(CInode *in, int wanted_caps);
void adjust_cap_wanted(Capability *cap, int wanted, int issue_seq);
void handle_client_caps(class MClientCaps *m);
- void _update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi);
+ void _update_cap_fields(CInode *in, int dirty, MClientCaps *m, CInode::mempool_inode *pi);
void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, MClientCaps *m, MClientCaps *ack);
void _do_null_snapflush(CInode *head_in, client_t client, snapid_t last=CEPH_NOSNAP);
bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, MClientCaps *m,
void file_update_finish(CInode *in, MutationRef& mut, bool share_max, bool issue_client_cap,
client_t client, MClientCaps *ack);
private:
- uint64_t calc_new_max_size(inode_t *pi, uint64_t size);
+ uint64_t calc_new_max_size(CInode::mempool_inode *pi, uint64_t size);
public:
void calc_new_client_ranges(CInode *in, uint64_t size,
- map<client_t, client_writeable_range_t>* new_ranges,
+ CInode::mempool_inode::client_range_map* new_ranges,
bool *max_increased);
bool check_inode_max_size(CInode *in, bool force_wrlock=false,
uint64_t newmax=0, uint64_t newsize=0,
}
}
+const std::map<std::string, LogEvent::EventType> LogEvent::types = {
+ {"SUBTREEMAP", EVENT_SUBTREEMAP},
+ {"SUBTREEMAP_TEST", EVENT_SUBTREEMAP_TEST},
+ {"EXPORT", EVENT_EXPORT},
+ {"IMPORTSTART", EVENT_IMPORTSTART},
+ {"IMPORTFINISH", EVENT_IMPORTFINISH},
+ {"FRAGMENT", EVENT_FRAGMENT},
+ {"RESETJOURNAL", EVENT_RESETJOURNAL},
+ {"SESSION", EVENT_SESSION},
+ {"SESSIONS_OLD", EVENT_SESSIONS_OLD},
+ {"SESSIONS", EVENT_SESSIONS},
+ {"UPDATE", EVENT_UPDATE},
+ {"SLAVEUPDATE", EVENT_SLAVEUPDATE},
+ {"OPEN", EVENT_OPEN},
+ {"COMMITTED", EVENT_COMMITTED},
+ {"TABLECLIENT", EVENT_TABLECLIENT},
+ {"TABLESERVER", EVENT_TABLESERVER},
+ {"NOOP", EVENT_NOOP}
+};
/*
* Resolve type string to type enum
*
* Return -1 if not found
*/
-LogEvent::EventType LogEvent::str_to_type(std::string const &str)
+LogEvent::EventType LogEvent::str_to_type(boost::string_view str)
{
- std::map<std::string, EventType> types;
- types["SUBTREEMAP"] = EVENT_SUBTREEMAP;
- types["SUBTREEMAP_TEST"] = EVENT_SUBTREEMAP_TEST;
- types["EXPORT"] = EVENT_EXPORT;
- types["IMPORTSTART"] = EVENT_IMPORTSTART;
- types["IMPORTFINISH"] = EVENT_IMPORTFINISH;
- types["FRAGMENT"] = EVENT_FRAGMENT;
- types["RESETJOURNAL"] = EVENT_RESETJOURNAL;
- types["SESSION"] = EVENT_SESSION;
- types["SESSIONS_OLD"] = EVENT_SESSIONS_OLD;
- types["SESSIONS"] = EVENT_SESSIONS;
- types["UPDATE"] = EVENT_UPDATE;
- types["SLAVEUPDATE"] = EVENT_SLAVEUPDATE;
- types["OPEN"] = EVENT_OPEN;
- types["COMMITTED"] = EVENT_COMMITTED;
- types["TABLECLIENT"] = EVENT_TABLECLIENT;
- types["TABLESERVER"] = EVENT_TABLESERVER;
- types["NOOP"] = EVENT_NOOP;
-
- return types[str];
+ return LogEvent::types.at(std::string(str));
}
virtual ~LogEvent() { }
string get_type_str() const;
- static EventType str_to_type(std::string const &str);
+ static EventType str_to_type(boost::string_view str);
EventType get_type() const { return _type; }
void set_type(EventType t) { _type = t; }
* tools can examine metablobs while traversing lists of LogEvent.
*/
virtual EMetaBlob *get_metablob() { return NULL; }
+
+private:
+ static const std::map<std::string, LogEvent::EventType> types;
};
inline ostream& operator<<(ostream& out, const LogEvent &le) {
return;
}
- mds_load.clear();
- if (mds->get_nodeid() == 0)
+ if (mds->get_nodeid() == 0) {
beat_epoch++;
+
+ mds_load.clear();
+ }
// my load
mds_load_t load = get_load(now);
goto out;
}
+ if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
+ dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;
+
+ beat_epoch = m->get_beat();
+ // clear the mds load info whose epoch is less than beat_epoch
+ mds_load.clear();
+ }
+
if (who == 0) {
- dout(20) << " from mds0, new epoch" << dendl;
+ dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
+ if (beat_epoch != m->get_beat()) {
+ mds_load.clear();
+ }
beat_epoch = m->get_beat();
send_heartbeat();
dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
double subdir_sum = 0;
- for (CDir::map_t::iterator it = dir->begin();
- it != dir->end();
- ++it) {
+ for (auto it = dir->begin(); it != dir->end(); ++it) {
CInode *in = it->second->get_linkage()->get_inode();
if (!in) continue;
if (!in->is_dir()) continue;
void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amount)
{
// hit me
- double v = dir->pop_me.get(type).hit(now, amount);
+ double v = dir->pop_me.get(type).hit(now, mds->mdcache->decayrate, amount);
const bool hot = (v > g_conf->mds_bal_split_rd && type == META_POP_IRD) ||
(v > g_conf->mds_bal_split_wr && type == META_POP_IWR);
bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
while (true) {
- dir->pop_nested.get(type).hit(now, amount);
+ dir->pop_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
if (rd_adj != 0.0)
dir->pop_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
if (hit_subtree) {
- dir->pop_auth_subtree.get(type).hit(now, amount);
+ dir->pop_auth_subtree.get(type).hit(now, mds->mdcache->decayrate, amount);
if (rd_adj != 0.0)
dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
}
#include <iostream>
#include <sstream>
#include <string>
+#include <boost/utility/string_view.hpp>
#include <map>
#include "MDCache.h"
assert(head_in);
if (head_in->split_need_snapflush(oldin, in)) {
oldin->client_snap_caps = in->client_snap_caps;
- for (compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
- p != in->client_snap_caps.end();
- ++p) {
- SimpleLock *lock = oldin->get_lock(p->first);
+ for (const auto &p : in->client_snap_caps) {
+ SimpleLock *lock = oldin->get_lock(p.first);
assert(lock);
- for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+ for (const auto &q : p.second) {
oldin->auth_pin(lock);
lock->set_state(LOCK_SNAP_SYNC); // gathering
lock->get_wrlock(true);
+ (void)q; /* unused */
}
}
}
if (!in->client_caps.empty()) {
const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
// clone caps?
- for (auto p : in->client_caps) {
+ for (auto &p : in->client_caps) {
client_t client = p.first;
Capability *cap = p.second;
int issued = cap->issued();
snapid_t oldfirst = dn->first;
dn->first = dir_follows+1;
if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
- CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
+ CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
oldfirst, dir_follows);
olddn->pre_dirty();
dout(10) << " olddn " << *olddn << dendl;
mut->add_cow_inode(oldin);
if (pcow_inode)
*pcow_inode = oldin;
- CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, oldin->last);
+ CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, oldin->last);
oldin->inode.version = olddn->pre_dirty();
dout(10) << " olddn " << *olddn << dendl;
bool need_snapflush = !oldin->client_snap_caps.empty();
mut->add_cow_dentry(olddn);
} else {
assert(dnl->is_remote());
- CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
+ CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
oldfirst, follows);
olddn->pre_dirty();
dout(10) << " olddn " << *olddn << dendl;
int linkunlink, SnapRealm *prealm)
{
CDentry *parentdn = cur->get_projected_parent_dn();
- inode_t *curi = cur->get_projected_inode();
+ CInode::mempool_inode *curi = cur->get_projected_inode();
if (cur->first > first)
first = cur->first;
}
if (g_conf->mds_snap_rstat) {
- for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
- p != cur->dirty_old_rstats.end();
- ++p) {
- old_inode_t& old = cur->old_inodes[*p];
- snapid_t ofirst = MAX(old.first, floor);
- set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
- if (q == snaps.end() || *q > *p)
+ for (const auto &p : cur->dirty_old_rstats) {
+ auto &old = cur->old_inodes[p];
+ snapid_t ofirst = std::max(old.first, floor);
+ auto it = snaps.lower_bound(ofirst);
+ if (it == snaps.end() || *it > p)
continue;
- if (*p >= floor)
- _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
+ if (p >= floor)
+ _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
}
}
cur->dirty_old_rstats.clear();
}
-void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
+void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
CDir *parent, int linkunlink, bool update_inode)
{
dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
first = ofirst;
// find any intersection with last
- compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
- if (p == parent->dirty_old_rstat.end()) {
+ auto it = parent->dirty_old_rstat.lower_bound(last);
+ if (it == parent->dirty_old_rstat.end()) {
dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
first = parent->dirty_old_rstat.rbegin()->first+1;
}
} else {
- // *p last is >= last
- if (p->second.first <= last) {
- // *p intersects [first,last]
- if (p->second.first < first) {
- dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
- parent->dirty_old_rstat[first-1] = p->second;
- p->second.first = first;
+ // *it last is >= last
+ if (it->second.first <= last) {
+ // *it intersects [first,last]
+ if (it->second.first < first) {
+ dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
+ parent->dirty_old_rstat[first-1] = it->second;
+ it->second.first = first;
}
- if (p->second.first > first)
- first = p->second.first;
- if (last < p->first) {
- dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
- parent->dirty_old_rstat[last] = p->second;
- p->second.first = last+1;
+ if (it->second.first > first)
+ first = it->second.first;
+ if (last < it->first) {
+ dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
+ parent->dirty_old_rstat[last] = it->second;
+ it->second.first = last+1;
}
} else {
- // *p is to the _right_ of [first,last]
- p = parent->dirty_old_rstat.lower_bound(first);
- // new *p last is >= first
- if (p->second.first <= last && // new *p isn't also to the right, and
- p->first >= first) { // it intersects our first bit,
- dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
- first = p->first+1;
+ // *it is to the _right_ of [first,last]
+ it = parent->dirty_old_rstat.lower_bound(first);
+ // new *it last is >= first
+ if (it->second.first <= last && // new *it isn't also to the right, and
+ it->first >= first) { // it intersects our first bit,
+ dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
+ first = it->first+1;
}
dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
}
dout(20) << " delta " << delta << dendl;
while (last >= ofirst) {
- inode_t *pi;
+ CInode::mempool_inode *pi;
snapid_t first;
if (last == pin->last) {
pi = pin->get_projected_inode();
first = MAX(ofirst, pin->first);
if (first > pin->first) {
- old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
+ auto &old = pin->cow_old_inode(first-1, cow_head);
dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
}
} else {
} else {
// our life is easier here because old_inodes is not sparse
// (although it may not begin at snapid 1)
- compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
- if (p == pin->old_inodes.end()) {
+ auto it = pin->old_inodes.lower_bound(last);
+ if (it == pin->old_inodes.end()) {
dout(10) << " no old_inode <= " << last << ", done." << dendl;
break;
}
- first = p->second.first;
+ first = it->second.first;
if (first > last) {
- dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
+ dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
//assert(p == pin->old_inodes.begin());
break;
}
- if (p->first > last) {
- dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
- << (last+1) << "," << p->first << "]" << dendl;
- pin->old_inodes[last] = p->second;
- p->second.first = last+1;
- pin->dirty_old_rstats.insert(p->first);
+ if (it->first > last) {
+ dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
+ << (last+1) << "," << it->first << "]" << dendl;
+ pin->old_inodes[last] = it->second;
+ it->second.first = last+1;
+ pin->dirty_old_rstats.insert(it->first);
}
}
if (first < ofirst) {
if (!in->is_auth() || in->is_frozen())
return;
- inode_t *i = in->get_projected_inode();
+ auto i = in->get_projected_inode();
if (!i->quota.is_enable())
return;
pin->pre_cow_old_inode(); // avoid cow mayhem!
- inode_t *pi = pin->project_inode();
- pi->version = pin->pre_dirty();
+ auto &pi = pin->project_inode();
+ pi.inode.version = pin->pre_dirty();
// dirstat
if (do_parent_mtime || linkunlink) {
dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
bool touched_mtime = false, touched_chattr = false;
- pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
+ pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
pf->accounted_fragstat = pf->fragstat;
if (touched_mtime)
- pi->mtime = pi->ctime = pi->dirstat.mtime;
+ pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
if (touched_chattr)
- pi->change_attr = pi->dirstat.change_attr;
- dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl;
+ pi.inode.change_attr = pi.inode.dirstat.change_attr;
+ dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
- if (pi->dirstat.size() < 0)
+ if (pi.inode.dirstat.size() < 0)
assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
- if (pi->dirstat.size() != pf->fragstat.size()) {
+ if (pi.inode.dirstat.size() != pf->fragstat.size()) {
mds->clog->error() << "unmatched fragstat size on single dirfrag "
- << parent->dirfrag() << ", inode has " << pi->dirstat
+ << parent->dirfrag() << ", inode has " << pi.inode.dirstat
<< ", dirfrag has " << pf->fragstat;
// trust the dirfrag for now
- pi->dirstat = pf->fragstat;
+ pi.inode.dirstat = pf->fragstat;
assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
}
parent->resync_accounted_rstat();
if (g_conf->mds_snap_rstat) {
- for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
- p != parent->dirty_old_rstat.end();
- ++p)
- project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
- p->first, pin, true);//false);
+ for (auto &p : parent->dirty_old_rstat) {
+ project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
+ p.first, pin, true);
+ }
}
parent->dirty_old_rstat.clear();
project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
pf->accounted_rstat = pf->rstat;
if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
- if (pi->rstat.rbytes != pf->rstat.rbytes) {
+ if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
- << parent->dirfrag() << ", inode has " << pi->rstat
+ << parent->dirfrag() << ", inode has " << pi.inode.rstat
<< ", dirfrag has " << pf->rstat;
// trust the dirfrag for now
- pi->rstat = pf->rstat;
+ pi.inode.rstat = pf->rstat;
assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
}
}
}
- for (auto p : dirs_to_add) {
+ for (auto &p : dirs_to_add) {
CDir *dir = p.second;
le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
le->metablob.add_dir(dir, false);
d->take_waiting(d_mask, waiters);
// inode waiters too
- for (CDir::map_t::iterator p = d->items.begin();
- p != d->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : d->items) {
+ CDentry *dn = p.second;
CDentry::linkage_t *dnl = dn->get_linkage();
if (dnl->is_primary()) {
dnl->get_inode()->take_waiting(i_mask, waiters);
CDir *subdir = *p++;
dout(10) << " removing dirfrag " << subdir << dendl;
- CDir::map_t::iterator q = subdir->items.begin();
- while (q != subdir->items.end()) {
- CDentry *dn = q->second;
- ++q;
+ auto it = subdir->items.begin();
+ while (it != subdir->items.end()) {
+ CDentry *dn = it->second;
+ ++it;
CDentry::linkage_t *dnl = dn->get_linkage();
if (dnl->is_primary()) {
CInode *tin = dnl->get_inode();
{
dout(7) << "trim_unlinked_inodes" << dendl;
list<CInode*> q;
- for (auto p : inode_map) {
+ for (auto &p : inode_map) {
CInode *in = p.second;
if (in->get_parent_dn() == NULL && !in->is_base()) {
dout(7) << " will trim from " << *in << dendl;
}
// dentries in this dir
- for (CDir::map_t::iterator q = dir->items.begin();
- q != dir->items.end();
- ++q) {
+ for (auto &p : dir->items) {
// dn
- CDentry *dn = q->second;
+ CDentry *dn = p.second;
CDentry::linkage_t *dnl = dn->get_linkage();
if (auth) {
dn->state_set(CDentry::STATE_AUTH);
if (mds->is_rejoin()) {
// WEAK
rejoin->add_weak_dirfrag(dir->dirfrag());
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
+ assert(dn->last == CEPH_NOSNAP);
CDentry::linkage_t *dnl = dn->get_linkage();
dout(15) << " add_weak_primary_dentry " << *dn << dendl;
assert(dnl->is_primary());
CInode *in = dnl->get_inode();
assert(dnl->get_inode()->is_dir());
- rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino());
+ rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
in->get_nested_dirfrags(nested);
if (in->is_dirty_scattered()) {
dout(10) << " sending scatterlock state on " << *in << dendl;
rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
dir->state_set(CDir::STATE_REJOINING);
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto it = dir->items.begin(); it != dir->items.end(); ++it) {
+ CDentry *dn = it->second;
CDentry::linkage_t *dnl = dn->get_linkage();
dout(15) << " add_strong_dentry " << *dn << dendl;
- rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
+ rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
dnl->is_remote() ? dnl->get_remote_d_type():0,
unsigned dnonce = dn->add_replica(from);
dout(10) << " have " << *dn << dendl;
if (ack)
- ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
+ ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
dnl->get_inode()->ino(), inodeno_t(0), 0,
dnonce, dn->lock.get_replica_state());
}
// dentries
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
if (dn->is_replica(from) &&
(ack == NULL ||
ack->strong_dentries.count(dir->dirfrag()) == 0 ||
- ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
+ ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->get_name(), dn->last)) == 0)) {
dentry_remove_replica(dn, from, gather_locks);
dout(10) << " rem " << *dn << dendl;
}
}
};
- for (auto p : inode_map)
+ for (auto &p : inode_map)
scour_func(p.second);
- for (auto p : snap_inode_map)
+ for (auto &p : snap_inode_map)
scour_func(p.second);
}
CDir *dir = *p;
dir->clear_replica_map();
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
dn->clear_replica_map();
dout(10) << " trimming " << *dn << dendl;
it->second->add_dirfrag_base(dir);
}
- for (CDir::map_t::iterator q = dir->items.begin();
- q != dir->items.end();
- ++q) {
- CDentry *dn = q->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
CDentry::linkage_t *dnl = dn->get_linkage();
// inode
auto it = acks.find(r.first);
if (it == acks.end())
continue;
- it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
+ it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
dnl->is_remote() ? dnl->get_remote_d_type():0,
{
dout(10) << "reissue_all_caps" << dendl;
- for (auto p : inode_map) {
+ for (auto &p : inode_map) {
CInode *in = p.second;
if (in->is_head() && in->is_any_caps()) {
// called by MDSRank::active_start(). There shouldn't be any frozen subtree.
s.erase(*s.rbegin());
dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
if (s.size() > 1) {
- inode_t *pi = in->project_inode();
+ CInode::mempool_inode pi = in->project_inode();
pi->version = in->pre_dirty();
auto mut(std::make_shared<MutationImpl>());
void MDCache::identify_files_to_recover()
{
dout(10) << "identify_files_to_recover" << dendl;
- for (auto p : inode_map) {
+ for (auto &p : inode_map) {
CInode *in = p.second;
if (!in->is_auth())
continue;
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
- inode_t *pi = in->get_projected_inode();
+ auto pi = in->get_projected_inode();
dout(10) << "truncate_inode "
<< pi->truncate_from << " -> " << pi->truncate_size
<< " on " << *in
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
- inode_t *pi = &in->inode;
+ auto pi = &in->inode;
dout(10) << "_truncate_inode "
<< pi->truncate_from << " -> " << pi->truncate_size
<< " on " << *in << dendl;
ls->truncating_inodes.erase(p);
// update
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
- pi->truncate_from = 0;
- pi->truncate_pending--;
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
+ pi.inode.truncate_from = 0;
+ pi.inode.truncate_pending--;
MutationRef mut(new MutationImpl());
mut->ls = mds->mdlog->get_current_segment();
assert(a != mds->get_nodeid());
if (expiremap.count(a) == 0)
expiremap[a] = new MCacheExpire(mds->get_nodeid());
- expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->name, dn->last, dn->get_replica_nonce());
+ expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
}
}
bool keep_dir = !can_trim_non_auth_dirfrag(dir);
- CDir::map_t::iterator j = dir->begin();
- CDir::map_t::iterator i = j;
+ auto j = dir->begin();
+ auto i = j;
while (j != dir->end()) {
i = j++;
CDentry *dn = i->second;
break;
}
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
CDentry::linkage_t *dnl = dn->get_linkage();
if (dnl->is_null())
continue;
return NULL;
for (unsigned i = 0; i < fp.depth(); i++) {
- const string& dname = fp[i];
+ boost::string_view dname = fp[i];
frag_t fg = in->pick_dirfrag(dname);
dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
CDir *curdir = in->get_dirfrag(fg);
CDir *dir = dn->get_dir();
if (dir) {
dir->get_inode()->make_path_string(path);
- path = path + "/" + dn->get_name();
+ path += "/";
+ path += std::string(dn->get_name());
}
bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
if (!pdn)
break;
CInode *diri = pdn->get_dir()->get_inode();
- reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
+ reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
in->inode.version));
in = diri;
}
void MDCache::request_kill(MDRequestRef& mdr)
{
// rollback slave requests is tricky. just let the request proceed.
- if (mdr->done_locking && mdr->has_more() &&
+ if (mdr->has_more() &&
(!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
- dout(10) << "request_kill " << *mdr << " -- already started slave requests, no-op" << dendl;
+ if (!mdr->done_locking) {
+ assert(mdr->more()->witnessed.empty());
+ mdr->aborted = true;
+ dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
+ } else {
+ dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
+ }
assert(mdr->used_prealloc_ino == 0);
assert(mdr->prealloc_inos.empty());
le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
- pi->rstat.rsnaprealms++;
+ auto &pi = in->project_inode(false, true);
+ pi.inode.version = in->pre_dirty();
+ pi.inode.rstat.rsnaprealms++;
bufferlist::iterator p = mdr->more()->snapidbl.begin();
snapid_t seq;
::decode(seq, p);
- sr_t *newsnap = in->project_snaprealm(seq);
- newsnap->seq = seq;
- newsnap->last_created = seq;
+ auto &newsnap = *pi.snapnode;
+ newsnap.created = seq;
+ newsnap.seq = seq;
+ newsnap.last_created = seq;
predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
journal_cow_inode(mut, &le->metablob, in);
dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
return;
}
- for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
- CDentry *dn = q->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
dn->state_set(CDentry::STATE_STRAY);
CDentry::linkage_t *dnl = dn->get_projected_linkage();
if (dnl->is_primary()) {
void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
- ::encode(dn->name, bl);
+ ::encode(dn->get_name(), bl);
::encode(dn->last, bl);
dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
}
//if (*it == except) continue;
dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
+ std::set<int32_t> s;
+ for (const auto &r : dir->dir_rep_by) {
+ s.insert(r);
+ }
mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
dir->dirfrag(),
dir->dir_rep,
- dir->dir_rep_by,
+ s,
path,
bcast),
*it);
// Update if it already exists. Othwerwise it got updated by discover reply.
dout(5) << "dir_update on " << *dir << dendl;
dir->dir_rep = m->get_dir_rep();
- dir->dir_rep_by = m->get_dir_rep_by();
+ dir->dir_rep_by.clear();
+ for (const auto &e : m->get_dir_rep_by()) {
+ dir->dir_rep_by.insert(e);
+ }
}
// done
continue;
CDentry::linkage_t *dnl = dn->get_linkage();
MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
- dn->name, dnl->is_primary());
+ dn->get_name(), dnl->is_primary());
if (dnl->is_primary()) {
dout(10) << " primary " << *dnl->get_inode() << dendl;
replicate_inode(dnl->get_inode(), p.first, m->bl,
rejoin_gather.count(*it)))
continue;
- MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
+ MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->get_name());
if (straydn)
replicate_stray(straydn, *it, unlink->straybl);
mds->send_message_mds(unlink, *it);
if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
dout(15) << " marking " << *dir << dendl;
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
dn->get(CDentry::PIN_FRAGMENTING);
assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
dn->state_set(CDentry::STATE_FRAGMENTING);
if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
dir->state_clear(CDir::STATE_DNPINNEDFRAG);
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
assert(dn->state_test(CDentry::STATE_FRAGMENTING));
dn->state_clear(CDentry::STATE_FRAGMENTING);
dn->put(CDentry::PIN_FRAGMENTING);
// dft lock
if (diri->is_auth()) {
// journal dirfragtree
- inode_t *pi = diri->project_inode();
- pi->version = diri->pre_dirty();
+ auto &pi = diri->project_inode();
+ pi.inode.version = diri->pre_dirty();
journal_dirty_inode(mdr.get(), &le->metablob, diri);
} else {
mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
CDir *dir = *p;
dout(10) << " result frag " << *dir << dendl;
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
assert(dn->state_test(CDentry::STATE_FRAGMENTING));
dn->state_clear(CDentry::STATE_FRAGMENTING);
dn->put(CDentry::PIN_FRAGMENTING);
}
if (diri_auth) {
- diri->project_inode()->version = diri->pre_dirty();
+ auto &pi = diri->project_inode();
+ pi.inode.version = diri->pre_dirty();
diri->pop_and_dirty_projected_inode(ls); // hacky
le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
} else {
mds->server->force_clients_readonly();
// revoke write caps
- for (auto p : inode_map) {
+ for (auto &p : inode_map) {
CInode *in = p.second;
if (in->is_head())
mds->locker->eval(in, CEPH_CAP_LOCKS);
CDir *dir = *p;
dout(7) << " dirfrag " << *dir << dendl;
- for (CDir::map_t::iterator p = dir->items.begin();
- p != dir->items.end();
- ++p) {
- CDentry *dn = p->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
dout(7) << " dentry " << *dn << dendl;
CDentry::linkage_t *dnl = dn->get_linkage();
if (dnl->is_primary() && dnl->get_inode())
}
};
- for (auto p : inode_map)
+ for (auto &p : inode_map)
show_func(p.second);
- for (auto p : snap_inode_map)
+ for (auto &p : snap_inode_map)
show_func(p.second);
}
return 0;
}
-int MDCache::dump_cache(std::string const &file_name)
+int MDCache::dump_cache(boost::string_view file_name)
{
- return dump_cache(file_name.c_str(), NULL);
+ return dump_cache(file_name, NULL);
}
int MDCache::dump_cache(Formatter *f)
{
- return dump_cache(NULL, f);
+ return dump_cache(boost::string_view(""), f);
}
-int MDCache::dump_cache(const string& dump_root, int depth, Formatter *f)
+int MDCache::dump_cache(boost::string_view dump_root, int depth, Formatter *f)
{
- return dump_cache(NULL, f, dump_root, depth);
+ return dump_cache(boost::string_view(""), f, dump_root, depth);
}
/**
* Dump the metadata cache, either to a Formatter, if
* provided, else to a plain text file.
*/
-int MDCache::dump_cache(const char *fn, Formatter *f,
- const string& dump_root, int depth)
+int MDCache::dump_cache(boost::string_view fn, Formatter *f,
+ boost::string_view dump_root, int depth)
{
int r = 0;
int fd = -1;
if (f) {
f->open_array_section("inodes");
} else {
- char deffn[200];
- if (!fn) {
- snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
- fn = deffn;
+ char path[PATH_MAX] = "";
+ if (fn.length()) {
+ snprintf(path, sizeof path, "%s", fn.data());
+ } else {
+ snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
}
- dout(1) << "dump_cache to " << fn << dendl;
+ dout(1) << "dump_cache to " << path << dendl;
- fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
+ fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL, 0600);
if (fd < 0) {
- derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
+ derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
return errno;
}
}
if (f) {
f->open_array_section("dentries");
}
- for (CDir::map_t::iterator q = dir->items.begin();
- q != dir->items.end();
- ++q) {
- CDentry *dn = q->second;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
if (f) {
f->open_object_section("dentry");
dn->dump(f);
return 1;
};
- for (auto p : inode_map) {
+ for (auto &p : inode_map) {
r = dump_func(p.second);
if (r < 0)
goto out;
}
- for (auto p : snap_inode_map) {
+ for (auto &p : snap_inode_map) {
r = dump_func(p.second);
if (r < 0)
goto out;
};
void MDCache::enqueue_scrub(
- const string& path,
- const std::string &tag,
+ boost::string_view path,
+ boost::string_view tag,
bool force, bool recursive, bool repair,
Formatter *f, Context *fin)
{
dout(10) << __func__ << path << dendl;
MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
- filepath fp(path.c_str());
+ filepath fp(path);
mdr->set_filepath(fp);
C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
frag_info_t frag_info;
nest_info_t nest_info;
- for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
+ for (auto it = dir->begin(); it != dir->end(); ++it) {
CDentry *dn = it->second;
if (dn->last != CEPH_NOSNAP)
continue;
mds->server->respond_to_request(mdr, 0);
}
-void MDCache::flush_dentry(const string& path, Context *fin)
+void MDCache::flush_dentry(boost::string_view path, Context *fin)
{
if (is_readonly()) {
dout(10) << __func__ << ": read-only FS" << dendl;
}
dout(10) << "flush_dentry " << path << dendl;
MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
- filepath fp(path.c_str());
+ filepath fp(path);
mdr->set_filepath(fp);
mdr->internal_op_finish = fin;
flush_dentry_work(mdr);
assert(diri->get_projected_parent_dir()->inode->is_stray());
list<CDir*> ls;
diri->get_dirfrags(ls);
- for (auto p : ls) {
+ for (auto &p : ls) {
if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
p->try_remove_dentries_for_stray();
}
#ifndef CEPH_MDCACHE_H
#define CEPH_MDCACHE_H
+#include <boost/utility/string_view.hpp>
+
#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"
void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
int linkunlink, SnapRealm *prealm);
- void _project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
+ void _project_rstat_inode_to_frag(CInode::mempool_inode & inode, snapid_t ofirst, snapid_t last,
CDir *parent, int linkunlink, bool update_inode);
void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
snapid_t ofirst, snapid_t last,
return NULL;
return in->get_dirfrag(df.frag);
}
- CDir* get_dirfrag(inodeno_t ino, const string& dn) {
+ CDir* get_dirfrag(inodeno_t ino, boost::string_view dn) {
CInode *in = get_inode(ino);
if (!in)
return NULL;
void discard_delayed_expire(CDir *dir);
protected:
- int dump_cache(const char *fn, Formatter *f,
- const std::string& dump_root = "",
+ int dump_cache(boost::string_view fn, Formatter *f,
+ boost::string_view dump_root = "",
int depth = -1);
public:
int dump_cache() { return dump_cache(NULL, NULL); }
- int dump_cache(const std::string &filename);
+ int dump_cache(boost::string_view filename);
int dump_cache(Formatter *f);
- int dump_cache(const std::string& dump_root, int depth, Formatter *f);
+ int dump_cache(boost::string_view dump_root, int depth, Formatter *f);
int cache_status(Formatter *f);
void repair_dirfrag_stats_work(MDRequestRef& mdr);
friend class C_MDC_RepairDirfragStats;
public:
- void flush_dentry(const string& path, Context *fin);
+ void flush_dentry(boost::string_view path, Context *fin);
/**
* Create and start an OP_ENQUEUE_SCRUB
*/
- void enqueue_scrub(const string& path, const std::string &tag,
+ void enqueue_scrub(boost::string_view path, boost::string_view tag,
bool force, bool recursive, bool repair,
Formatter *f, Context *fin);
void repair_inode_stats(CInode *diri);
*
*/
+#include <boost/utility/string_view.hpp>
#include <errno.h>
#include <fcntl.h>
// drop ..
}
-bool MDSCapMatch::match(const std::string &target_path,
+bool MDSCapMatch::match(boost::string_view target_path,
const int caller_uid,
const int caller_gid,
const vector<uint64_t> *caller_gid_list) const
return true;
}
-bool MDSCapMatch::match_path(const std::string &target_path) const
+bool MDSCapMatch::match_path(boost::string_view target_path) const
{
if (path.length()) {
if (target_path.find(path) != 0)
* Is the client *potentially* able to access this path? Actual
* permission will depend on uids/modes in the full is_capable.
*/
-bool MDSAuthCaps::path_capable(const std::string &inode_path) const
+bool MDSAuthCaps::path_capable(boost::string_view inode_path) const
{
for (const auto &i : grants) {
if (i.match.match_path(inode_path)) {
* This is true if any of the 'grant' clauses in the capability match the
* requested path + op.
*/
-bool MDSAuthCaps::is_capable(const std::string &inode_path,
+bool MDSAuthCaps::is_capable(boost::string_view inode_path,
uid_t inode_uid, gid_t inode_gid,
unsigned inode_mode,
uid_t caller_uid, gid_t caller_gid,
MDSCapMatch()));
}
-bool MDSAuthCaps::parse(CephContext *c, const std::string& str, ostream *err)
+bool MDSAuthCaps::parse(CephContext *c, boost::string_view str, ostream *err)
{
// Special case for legacy caps
if (str == "allow") {
return true;
}
- MDSCapParser<std::string::const_iterator> g;
- std::string::const_iterator iter = str.begin();
- std::string::const_iterator end = str.end();
+ auto iter = str.begin();
+ auto end = str.end();
+ MDSCapParser<decltype(iter)> g;
bool r = qi::phrase_parse(iter, end, g, ascii::space, *this);
cct = c; // set after parser self-assignment
#ifndef MDS_AUTH_CAPS_H
#define MDS_AUTH_CAPS_H
-#include <vector>
-#include <string>
#include <sstream>
+#include <string>
+#include <boost/utility/string_view.hpp>
+#include <vector>
+
#include "include/types.h"
#include "common/debug.h"
}
// check whether this grant matches against a given file and caller uid:gid
- bool match(const std::string &target_path,
+ bool match(boost::string_view target_path,
const int caller_uid,
const int caller_gid,
const vector<uint64_t> *caller_gid_list) const;
*
* @param target_path filesystem path without leading '/'
*/
- bool match_path(const std::string &target_path) const;
+ bool match_path(boost::string_view target_path) const;
};
struct MDSCapGrant {
: cct(NULL), grants(grants_) { }
void set_allow_all();
- bool parse(CephContext *cct, const std::string &str, std::ostream *err);
+ bool parse(CephContext *cct, boost::string_view str, std::ostream *err);
bool allow_all() const;
- bool is_capable(const std::string &inode_path,
+ bool is_capable(boost::string_view inode_path,
uid_t inode_uid, gid_t inode_gid, unsigned inode_mode,
uid_t uid, gid_t gid, const vector<uint64_t> *caller_gid_list,
unsigned mask, uid_t new_uid, gid_t new_gid) const;
- bool path_capable(const std::string &inode_path) const;
+ bool path_capable(boost::string_view inode_path) const;
friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
};
client_t client;
MDSCacheObject *parent;
- ceph_seq_t seq;
+ ceph_seq_t seq = 0;
utime_t ttl;
xlist<ClientLease*>::item item_session_lease; // per-session list
xlist<ClientLease*>::item item_lease; // global list
ClientLease(client_t c, MDSCacheObject *p) :
- client(c), parent(p), seq(0),
+ client(c), parent(p),
item_session_lease(this),
item_lease(this) { }
+ ClientLease() = delete;
};
// ============================================
// cons
public:
- MDSCacheObject() :
- state(0),
- ref(0),
- auth_pins(0), nested_auth_pins(0),
- replica_nonce(0)
- {}
+ MDSCacheObject() {}
virtual ~MDSCacheObject() {}
// printing
// --------------------------------------------
// state
protected:
- __u32 state; // state bits
+ __u32 state = 0; // state bits
public:
unsigned get_state() const { return state; }
// --------------------------------------------
// pins
protected:
- __s32 ref; // reference count
+ __s32 ref = 0; // reference count
#ifdef MDS_REF_SET
mempool::mds_co::map<int,int> ref_map;
#endif
}
protected:
- int auth_pins;
- int nested_auth_pins;
+ int auth_pins = 0;
+ int nested_auth_pins = 0;
#ifdef MDS_AUTHPIN_SET
mempool::mds_co::multiset<void*> auth_pin_set;
#endif
// --------------------------------------------
// replication (across mds cluster)
protected:
- unsigned replica_nonce; // [replica] defined on replica
- typedef compact_map<mds_rank_t,unsigned> replica_map_type;
+ unsigned replica_nonce = 0; // [replica] defined on replica
+ typedef mempool::mds_co::compact_map<mds_rank_t,unsigned> replica_map_type;
replica_map_type replica_map; // [auth] mds -> nonce
public:
#define dout_prefix *_dout << "mds." << name << ' '
// cons/des
-MDSDaemon::MDSDaemon(const std::string &n, Messenger *m, MonClient *mc) :
+MDSDaemon::MDSDaemon(boost::string_view n, Messenger *m, MonClient *mc) :
Dispatcher(m->cct),
mds_lock("MDSDaemon::mds_lock"),
stopping(false),
mgrc(m->cct, m),
log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
mds_rank(NULL),
- asok_hook(NULL)
+ asok_hook(NULL),
+ starttime(mono_clock::now())
{
orig_argc = 0;
orig_argv = NULL;
f->dump_unsigned("osdmap_epoch", 0);
f->dump_unsigned("osdmap_epoch_barrier", 0);
}
+
+ f->dump_float("uptime", get_uptime().count());
+
f->close_section(); // status
}
dout(10) << sizeof(MDSCacheObject) << "\tMDSCacheObject" << dendl;
dout(10) << sizeof(CInode) << "\tCInode" << dendl;
dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item *7=" << 7*sizeof(elist<void*>::item) << dendl;
- dout(10) << sizeof(inode_t) << "\t inode_t " << dendl;
+ dout(10) << sizeof(CInode::mempool_inode) << "\t inode " << dendl;
+ dout(10) << sizeof(CInode::mempool_old_inode) << "\t old_inode " << dendl;
dout(10) << sizeof(nest_info_t) << "\t nest_info_t " << dendl;
dout(10) << sizeof(frag_info_t) << "\t frag_info_t " << dendl;
dout(10) << sizeof(SimpleLock) << "\t SimpleLock *5=" << 5*sizeof(SimpleLock) << dendl;
void MDSDaemon::send_command_reply(MCommand *m, MDSRank *mds_rank,
int r, bufferlist outbl,
- const std::string& outs)
+ boost::string_view outs)
{
Session *session = static_cast<Session *>(m->get_connection()->get_priv());
assert(session != NULL);
// If someone is using a closed session for sending commands (e.g.
// the ceph CLI) then we should feel free to clean up this connection
// as soon as we've sent them a response.
- const bool live_session = mds_rank &&
- mds_rank->sessionmap.get_session(session->info.inst.name) != nullptr
- && session->get_state_seq() > 0;
+ const bool live_session =
+ session->get_state_seq() > 0 &&
+ mds_rank &&
+ mds_rank->sessionmap.get_session(session->info.inst.name);
if (!live_session) {
// This session only existed to issue commands, so terminate it
// as soon as we can.
assert(session->is_closed());
session->connection->mark_disposable();
- session->put();
}
+ session->put();
MCommandReply *reply = new MCommandReply(r, outs);
reply->set_tid(m->get_tid());
} else {
r = _handle_command(cmdmap, m, &outbl, &outs, &run_after, &need_reply);
}
+ session->put();
if (need_reply) {
send_command_reply(m, mds_rank, r, outbl, outs);
#ifndef CEPH_MDS_H
#define CEPH_MDS_H
+#include <boost/utility/string_view.hpp>
+
#include "common/LogClient.h"
#include "common/Mutex.h"
#include "common/Timer.h"
SafeTimer timer;
+
+ mono_time get_starttime() const {
+ return starttime;
+ }
+ chrono::duration<double> get_uptime() const {
+ mono_time now = mono_clock::now();
+ return chrono::duration<double>(now-starttime);
+ }
+
protected:
Beacon beacon;
MDSRankDispatcher *mds_rank;
public:
- MDSDaemon(const std::string &n, Messenger *m, MonClient *mc);
+ MDSDaemon(boost::string_view n, Messenger *m, MonClient *mc);
~MDSDaemon() override;
int orig_argc;
const char **orig_argv;
// special message types
friend class C_MDS_Send_Command_Reply;
static void send_command_reply(MCommand *m, MDSRank* mds_rank, int r,
- bufferlist outbl, const std::string& outs);
+ bufferlist outbl, boost::string_view outs);
int _handle_command(
const cmdmap_t &cmdmap,
MCommand *m,
void handle_command(class MCommand *m);
void handle_mds_map(class MMDSMap *m);
void _handle_mds_map(MDSMap *oldmap);
+
+private:
+ mono_time starttime = mono_clock::zero();
};
#ifndef CEPH_MDSMAP_H
#define CEPH_MDSMAP_H
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+#include <boost/utility/string_view.hpp>
+
#include <errno.h>
#include "include/types.h"
#include "msg/Message.h"
#include "include/health.h"
-#include <set>
-#include <map>
-#include <string>
-#include <algorithm>
-
#include "common/config.h"
#include "include/CompatSet.h"
void set_flag(int f) { flags |= f; }
void clear_flag(int f) { flags &= ~f; }
- const std::string &get_fs_name() const {return fs_name;}
+ boost::string_view get_fs_name() const {return fs_name;}
void set_snaps_allowed() {
set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
assert(up.count(m) && mds_info.count(up.at(m)));
return mds_info.at(up.at(m));
}
- mds_gid_t find_mds_gid_by_name(const std::string& s) const {
+ mds_gid_t find_mds_gid_by_name(boost::string_view s) const {
for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
++p) {
*
*/
+#include <boost/utility/string_view.hpp>
+
#include "common/debug.h"
#include "common/errno.h"
messenger(msgr), monc(monc_),
respawn_hook(respawn_hook_),
suicide_hook(suicide_hook_),
- standby_replaying(false)
+ standby_replaying(false),
+ starttime(mono_clock::now())
{
hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self());
return false;
}
+Session *MDSRank::get_session(Message *m)
+{
+ Session *session = static_cast<Session *>(m->get_connection()->get_priv());
+ if (session) {
+ dout(20) << "get_session have " << session << " " << session->info.inst
+ << " state " << session->get_state_name() << dendl;
+ session->put(); // not carry ref
+ } else {
+ dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
+ }
+ return session;
+}
void MDSRank::send_message(Message *m, Connection *c)
{
public:
C_MDS_Send_Command_Reply(MDSRank *_mds, MCommand *_m) :
MDSInternalContext(_mds), m(_m) { m->get(); }
- void send (int r, const std::string& out_str) {
+ void send (int r, boost::string_view out_str) {
bufferlist bl;
MDSDaemon::send_command_reply(m, mds, r, bl, out_str);
m->put();
f->close_section(); //sessions
}
-void MDSRank::command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec)
+void MDSRank::command_scrub_path(Formatter *f, boost::string_view path, vector<string>& scrubop_vec)
{
bool force = false;
bool recursive = false;
}
void MDSRank::command_tag_path(Formatter *f,
- const string& path, const std::string &tag)
+ boost::string_view path, boost::string_view tag)
{
C_SaferCond scond;
{
scond.wait();
}
-void MDSRank::command_flush_path(Formatter *f, const string& path)
+void MDSRank::command_flush_path(Formatter *f, boost::string_view path)
{
C_SaferCond scond;
{
void MDSRank::command_export_dir(Formatter *f,
- const std::string &path,
+ boost::string_view path,
mds_rank_t target)
{
int r = _command_export_dir(path, target);
}
int MDSRank::_command_export_dir(
- const std::string &path,
+ boost::string_view path,
mds_rank_t target)
{
Mutex::Locker l(mds_lock);
- filepath fp(path.c_str());
+ filepath fp(path);
if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
derr << "bad MDS target " << target << dendl;
} else if (state == MDSMap::STATE_CLIENTREPLAY) {
dump_clientreplay_status(f);
}
+ f->dump_float("rank_uptime", get_uptime().count());
}
void MDSRank::dump_clientreplay_status(Formatter *f) const
#ifndef MDS_RANK_H_
#define MDS_RANK_H_
+#include <boost/utility/string_view.hpp>
+
#include "common/DecayCounter.h"
#include "common/LogClient.h"
#include "common/Timer.h"
// a separate lock here in future potentially.
Mutex &mds_lock;
+ mono_time get_starttime() const {
+ return starttime;
+ }
+ chrono::duration<double> get_uptime() const {
+ mono_time now = mono_clock::now();
+ return chrono::duration<double>(now-starttime);
+ }
+
class CephContext *cct;
bool is_daemon_stopping() const;
Session *get_session(client_t client) {
return sessionmap.get_session(entity_name_t::CLIENT(client.v));
}
+ Session *get_session(Message *m);
PerfCounters *logger, *mlogger;
OpTracker op_tracker;
finished_queue.push_back(c);
progress_thread.signal();
}
- void queue_waiters(list<MDSInternalContextBase*>& ls) {
+ void queue_waiters(std::list<MDSInternalContextBase*>& ls) {
finished_queue.splice( finished_queue.end(), ls );
progress_thread.signal();
}
protected:
void dump_clientreplay_status(Formatter *f) const;
- void command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec);
- void command_tag_path(Formatter *f, const string& path,
- const string &tag);
- void command_flush_path(Formatter *f, const string& path);
+ void command_scrub_path(Formatter *f, boost::string_view path, vector<string>& scrubop_vec);
+ void command_tag_path(Formatter *f, boost::string_view path,
+ boost::string_view tag);
+ void command_flush_path(Formatter *f, boost::string_view path);
void command_flush_journal(Formatter *f);
void command_get_subtrees(Formatter *f);
void command_export_dir(Formatter *f,
- const std::string &path, mds_rank_t dest);
+ boost::string_view path, mds_rank_t dest);
bool command_dirfrag_split(
cmdmap_t cmdmap,
std::ostream &ss);
cmdmap_t cmdmap,
std::ostream &ss,
Formatter *f);
- int _command_export_dir(const std::string &path, mds_rank_t dest);
+ int _command_export_dir(boost::string_view path, mds_rank_t dest);
int _command_flush_journal(std::stringstream *ss);
CDir *_command_dirfrag_get(
const cmdmap_t &cmdmap,
/* Update MDSMap export_targets for this rank. Called on ::tick(). */
void update_targets(utime_t now);
+
+private:
+ mono_time starttime = mono_clock::zero();
};
/* This expects to be given a reference which it is responsible for.
return 0;
}
-int Mantle::balance(const std::string &script,
+int Mantle::balance(boost::string_view script,
mds_rank_t whoami,
const std::vector<std::map<std::string, double>> &metrics,
std::map<mds_rank_t, double> &my_targets)
lua_settop(L, 0); /* clear the stack */
/* load the balancer */
- if (luaL_loadstring(L, script.c_str())) {
+ if (luaL_loadstring(L, script.data())) {
mantle_dout(0) << "WARNING: mantle could not load balancer: "
<< lua_tostring(L, -1) << mantle_dendl;
return -EINVAL;
#ifndef CEPH_MANTLE_H
#define CEPH_MANTLE_H
+#include <boost/utility/string_view.hpp>
+
#include <lua.hpp>
#include <vector>
#include <map>
public:
Mantle();
~Mantle() { if (L) lua_close(L); }
- int balance(const std::string &script,
+ int balance(boost::string_view script,
mds_rank_t whoami,
const std::vector <std::map<std::string, double>> &metrics,
std::map<mds_rank_t,double> &my_targets);
dfs.pop_front();
approx_size += frag_size;
- for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p) {
- CDentry *dn = p->second;
+ for (auto &p : *dir) {
+ CDentry *dn = p.second;
if (dn->get_linkage()->is_null()) {
approx_size += null_size;
continue;
// dentries
list<CDir*> subdirs;
- CDir::map_t::iterator it;
- for (it = dir->begin(); it != dir->end(); ++it) {
- CDentry *dn = it->second;
+ for (auto &p : *dir) {
+ CDentry *dn = p.second;
CInode *in = dn->get_linkage()->get_inode();
if (!dn->is_replicated())
dout(7) << "encode_export_dir exporting " << *dn << dendl;
// dn name
- ::encode(dn->name, exportbl);
+ ::encode(dn->get_name(), exportbl);
::encode(dn->last, exportbl);
// state
}
// subdirs
- for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
- num_exported += encode_export_dir(exportbl, *it, exported_client_map, now);
+ for (auto &dir : subdirs)
+ num_exported += encode_export_dir(exportbl, dir, exported_client_map, now);
return num_exported;
}
// dentries
list<CDir*> subdirs;
- CDir::map_t::iterator it;
- for (it = dir->begin(); it != dir->end(); ++it) {
- CDentry *dn = it->second;
+ for (auto &p : *dir) {
+ CDentry *dn = p.second;
CInode *in = dn->get_linkage()->get_inode();
// dentry
CDir *t = rq.front();
rq.pop_front();
t->abort_export();
- for (CDir::map_t::iterator p = t->items.begin(); p != t->items.end(); ++p) {
- p->second->abort_export();
- if (!p->second->get_linkage()->is_primary())
+ for (auto &p : *t) {
+ CDentry *dn = p.second;
+ dn->abort_export();
+ if (!dn->get_linkage()->is_primary())
continue;
- CInode *in = p->second->get_linkage()->get_inode();
+ CInode *in = dn->get_linkage()->get_inode();
in->abort_export();
if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
in->state_clear(CInode::STATE_EVALSTALECAPS);
if (cur->is_dirty())
cur->mark_clean();
- CDir::map_t::iterator it;
- for (it = cur->begin(); it != cur->end(); ++it) {
- CDentry *dn = it->second;
+ for (auto &p : *cur) {
+ CDentry *dn = p.second;
// dentry
dn->state_clear(CDentry::STATE_AUTH);
void RecoveryQueue::_start(CInode *in)
{
- inode_t *pi = in->get_projected_inode();
+ auto pi = in->get_projected_inode();
// blech
if (pi->client_ranges.size() && !pi->get_max_size()) {
#ifndef SCRUB_HEADER_H_
#define SCRUB_HEADER_H_
+#include <boost/utility/string_view.hpp>
+
class CInode;
/**
*/
class ScrubHeader {
public:
- ScrubHeader(const std::string &tag_, bool force_, bool recursive_,
+ ScrubHeader(boost::string_view tag_, bool force_, bool recursive_,
bool repair_, Formatter *f_)
: tag(tag_), force(force_), recursive(recursive_), repair(repair_),
formatter(f_), origin(nullptr)
bool get_repair() const { return repair; }
bool get_force() const { return force; }
const CInode *get_origin() const { return origin; }
- const std::string &get_tag() const { return tag; }
+ boost::string_view get_tag() const { return tag; }
Formatter &get_formatter() const { return *formatter; }
bool get_repaired() const { return repaired; }
if (parent) {
auto dir = parent->get_dir();
mdcache->mds->damage_table.notify_dentry(
- dir->inode->ino(), dir->frag, parent->last, parent->name, path);
+ dir->inode->ino(), dir->frag, parent->last, parent->get_name(), path);
}
}
#include <list>
#include <iostream>
+#include <boost/utility/string_view.hpp>
using namespace std;
#include "common/config.h"
}
// active?
- if (!mds->is_active()) {
- if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
- (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
- MClientRequest *req = static_cast<MClientRequest*>(m);
- Session *session = get_session(req);
+ // handle_slave_request()/handle_client_session() will wait if necessary
+ if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
+ MClientRequest *req = static_cast<MClientRequest*>(m);
+ if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
+ Session *session = mds->get_session(req);
if (!session || session->is_closed()) {
dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
req->put();
}
bool wait_for_active = true;
- if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
- // handle_slave_request() will wait if necessary
- wait_for_active = false;
- } else if (mds->is_stopping()) {
- if (m->get_source().is_mds() ||
- m->get_type() == CEPH_MSG_CLIENT_SESSION)
+ if (mds->is_stopping()) {
+ if (m->get_source().is_mds())
wait_for_active = false;
} else if (mds->is_clientreplay()) {
- // session open requests need to be handled during replay,
- // close requests need to be delayed
- if ((m->get_type() == CEPH_MSG_CLIENT_SESSION &&
- (static_cast<MClientSession*>(m))->get_op() != CEPH_SESSION_REQUEST_CLOSE)) {
+ if (req->is_queued_for_replay()) {
wait_for_active = false;
- } else if (m->get_type() == CEPH_MSG_CLIENT_REQUEST) {
- MClientRequest *req = static_cast<MClientRequest*>(m);
- if (req->is_queued_for_replay()) {
- wait_for_active = false;
- }
}
}
if (wait_for_active) {
}
};
-Session *Server::get_session(Message *m)
-{
- Session *session = static_cast<Session *>(m->get_connection()->get_priv());
- if (session) {
- dout(20) << "get_session have " << session << " " << session->info.inst
- << " state " << session->get_state_name() << dendl;
- session->put(); // not carry ref
- } else {
- dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
- }
- return session;
-}
-
/* This function DOES put the passed message before returning*/
void Server::handle_client_session(MClientSession *m)
{
version_t pv;
bool blacklisted = false;
- Session *session = get_session(m);
+ Session *session = mds->get_session(m);
dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
assert(m->get_source().is_client()); // should _not_ come from an mds!
return;
}
+ if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
+ // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
+ } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
+ // close requests need to be handled when mds is active
+ if (mds->get_state() < MDSMap::STATE_ACTIVE) {
+ mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+ } else {
+ if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+ mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+ }
+
if (logger)
logger->inc(l_mdss_handle_client_session);
} else if (session->is_killing()) {
// destroy session, close connection
if (session->connection != NULL) {
- session->connection->mark_down();
+ session->connection->mark_down();
+ session->connection->set_priv(NULL);
}
mds->sessionmap.remove_session(session);
} else {
{
dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
client_t from = m->get_source().num();
- Session *session = get_session(m);
+ Session *session = mds->get_session(m);
assert(session);
if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
// active session?
Session *session = 0;
if (req->get_source().is_client()) {
- session = get_session(req);
+ session = mds->get_session(req);
if (!session) {
dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
} else if (session->is_closed() ||
if (mdr->killed) {
dout(10) << "request " << *mdr << " was killed" << dendl;
return;
+ } else if (mdr->aborted) {
+ mdr->aborted = false;
+ mdcache->request_kill(mdr);
+ return;
}
MClientRequest *req = mdr->client_request;
* verify that the dir exists and would own the dname.
* do not check if the dentry exists.
*/
-CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
+CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, boost::string_view dname)
{
// make sure parent is a dir?
if (!diri->is_dir()) {
* prepare a null (or existing) dentry in given dir.
* wait for any dn lock.
*/
-CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
+CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, boost::string_view dname, bool okexist)
{
dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
assert(dir->is_auth());
bufferlist::iterator p = req->get_data().begin();
// xattrs on new inode?
- map<string,bufferptr> xattrs;
+ CInode::mempool_xattr_map xattrs;
::decode(xattrs, p);
- for (map<string,bufferptr>::iterator p = xattrs.begin(); p != xattrs.end(); ++p) {
- dout(10) << "prepare_new_inode setting xattr " << p->first << dendl;
- in->xattrs[p->first] = p->second;
+ for (const auto &p : xattrs) {
+ dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
+ auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
+ if (!em.second)
+ em.first->second = p.second;
}
}
}
// make a null dentry?
- const string &dname = refpath.last_dentry();
+ boost::string_view dname = refpath.last_dentry();
CDentry *dn;
if (mustexist) {
dn = dir->lookup(dname);
issued = cap->issued();
int mask = req->head.args.getattr.mask;
- if ((mask & CEPH_CAP_LINK_SHARED) && (issued & CEPH_CAP_LINK_EXCL) == 0) rdlocks.insert(&ref->linklock);
- if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0) rdlocks.insert(&ref->authlock);
- if ((mask & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0) rdlocks.insert(&ref->filelock);
- if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0) rdlocks.insert(&ref->xattrlock);
+ if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
+ rdlocks.insert(&ref->linklock);
+ if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
+ rdlocks.insert(&ref->authlock);
+ if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
+ rdlocks.insert(&ref->xattrlock);
+ if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
+ // Don't wait on unstable filelock if client is allowed to read file size.
+ // This can reduce the response time of getattr in the case that multiple
+ // clients do stat(2) and there are writers.
+ // The downside of this optimization is that mds may not issue Fs caps along
+ // with getattr reply. Client may need to send more getattr requests.
+ if (mdr->rdlocks.count(&ref->filelock)) {
+ rdlocks.insert(&ref->filelock);
+ } else if (ref->filelock.is_stable() ||
+ ref->filelock.get_num_wrlocks() > 0 ||
+ !ref->filelock.can_read(mdr->get_client())) {
+ rdlocks.insert(&ref->filelock);
+ mdr->done_locking = false;
+ }
+ }
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
return;
// wait for pending truncate?
- const inode_t *pi = cur->get_projected_inode();
+ const auto pi = cur->get_projected_inode();
if (pi->is_truncating()) {
dout(10) << " waiting for pending truncate from " << pi->truncate_from
<< " to " << pi->truncate_size << " to complete on " << *cur << dendl;
continue;
} else {
// touch everything i _do_ have
- for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p)
- if (!p->second->get_linkage()->is_null())
- mdcache->lru.lru_touch(p->second);
+ for (auto &p : *dir) {
+ if (!p.second->get_linkage()->is_null())
+ mdcache->lru.lru_touch(p.second);
+ }
// already issued caps and leases, reply immediately.
if (dnbl.length() > 0) {
}
assert(in);
- if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
+ if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
break;
}
// dentry
dout(12) << "including dn " << *dn << dendl;
- ::encode(dn->name, dnbl);
+ ::encode(dn->get_name(), dnbl);
mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
// inode
return;
// trunc from bigger -> smaller?
- inode_t *pi = cur->get_projected_inode();
+ auto pip = cur->get_projected_inode();
- uint64_t old_size = MAX(pi->size, req->head.args.setattr.old_size);
+ uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
// ENOSPC on growing file while full, but allow shrinks
if (is_full && req->head.args.setattr.size > old_size) {
bool truncating_smaller = false;
if (mask & CEPH_SETATTR_SIZE) {
truncating_smaller = req->head.args.setattr.size < old_size;
- if (truncating_smaller && pi->is_truncating()) {
- dout(10) << " waiting for pending truncate from " << pi->truncate_from
- << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
+ if (truncating_smaller && pip->is_truncating()) {
+ dout(10) << " waiting for pending truncate from " << pip->truncate_from
+ << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
mds->locker->drop_locks(mdr.get());
mdr->drop_local_auth_pins();
cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
EUpdate *le = new EUpdate(mdlog, "setattr");
mdlog->start_entry(le);
- pi = cur->project_inode();
+ auto &pi = cur->project_inode();
if (mask & CEPH_SETATTR_UID)
- pi->uid = req->head.args.setattr.uid;
+ pi.inode.uid = req->head.args.setattr.uid;
if (mask & CEPH_SETATTR_GID)
- pi->gid = req->head.args.setattr.gid;
+ pi.inode.gid = req->head.args.setattr.gid;
if (mask & CEPH_SETATTR_MODE)
- pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
+ pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
- S_ISREG(pi->mode) &&
- (pi->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
- pi->mode &= ~(S_ISUID|S_ISGID);
+ S_ISREG(pi.inode.mode) &&
+ (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
+ pi.inode.mode &= ~(S_ISUID|S_ISGID);
}
if (mask & CEPH_SETATTR_MTIME)
- pi->mtime = req->head.args.setattr.mtime;
+ pi.inode.mtime = req->head.args.setattr.mtime;
if (mask & CEPH_SETATTR_ATIME)
- pi->atime = req->head.args.setattr.atime;
+ pi.inode.atime = req->head.args.setattr.atime;
if (mask & CEPH_SETATTR_BTIME)
- pi->btime = req->head.args.setattr.btime;
+ pi.inode.btime = req->head.args.setattr.btime;
if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
- pi->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
+ pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
if (mask & CEPH_SETATTR_SIZE) {
if (truncating_smaller) {
- pi->truncate(old_size, req->head.args.setattr.size);
+ pi.inode.truncate(old_size, req->head.args.setattr.size);
le->metablob.add_truncate_start(cur->ino());
} else {
- pi->size = req->head.args.setattr.size;
- pi->rstat.rbytes = pi->size;
+ pi.inode.size = req->head.args.setattr.size;
+ pi.inode.rstat.rbytes = pi.inode.size;
}
- pi->mtime = mdr->get_op_stamp();
+ pi.inode.mtime = mdr->get_op_stamp();
// adjust client's max_size?
- map<client_t,client_writeable_range_t> new_ranges;
+ CInode::mempool_inode::client_range_map new_ranges;
bool max_increased = false;
- mds->locker->calc_new_client_ranges(cur, pi->size, &new_ranges, &max_increased);
- if (pi->client_ranges != new_ranges) {
- dout(10) << " client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
- pi->client_ranges = new_ranges;
+ mds->locker->calc_new_client_ranges(cur, pi.inode.size, &new_ranges, &max_increased);
+ if (pi.inode.client_ranges != new_ranges) {
+ dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
+ pi.inode.client_ranges = new_ranges;
changed_ranges = true;
}
}
- pi->version = cur->pre_dirty();
- pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
+ pi.inode.version = cur->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
// log + wait
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
mdlog->start_entry(le);
// prepare
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
- pi->mtime = pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
+ pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
- uint64_t old_size = MAX(pi->size, mdr->client_request->head.args.open.old_size);
+ uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
if (old_size > 0) {
- pi->truncate(old_size, 0);
+ pi.inode.truncate(old_size, 0);
le->metablob.add_truncate_start(in->ino());
}
bool changed_ranges = false;
if (cmode & CEPH_FILE_MODE_WR) {
- pi->client_ranges[client].range.first = 0;
- pi->client_ranges[client].range.last = pi->get_layout_size_increment();
- pi->client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
+ pi.inode.client_ranges[client].range.first = 0;
+ pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
+ pi.inode.client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
changed_ranges = true;
}
return;
// project update
- inode_t *pi = cur->project_inode();
- pi->layout = layout;
+ auto &pi = cur->project_inode();
+ pi.inode.layout = layout;
// add the old pool to the inode
- pi->add_old_pool(old_layout.pool_id);
- pi->version = cur->pre_dirty();
- pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
+ pi.inode.add_old_pool(old_layout.pool_id);
+ pi.inode.version = cur->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
// log + wait
mdr->ls = mdlog->get_current_segment();
return;
// validate layout
- const inode_t *old_pi = cur->get_projected_inode();
+ const auto old_pi = cur->get_projected_inode();
file_layout_t layout;
if (old_pi->has_layout())
layout = old_pi->layout;
if (!check_access(mdr, cur, access))
return;
- inode_t *pi = cur->project_inode();
- pi->layout = layout;
- pi->version = cur->pre_dirty();
+ auto &pi = cur->project_inode();
+ pi.inode.layout = layout;
+ pi.inode.version = cur->pre_dirty();
// log + wait
mdr->ls = mdlog->get_current_segment();
<< " bytes on " << *cur
<< dendl;
- inode_t *pi = NULL;
+ CInode::mempool_inode *pip = nullptr;
string rest;
if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- pi = cur->project_inode();
- pi->layout = layout;
+ auto &pi = cur->project_inode();
+ pi.inode.layout = layout;
mdr->no_early_reply = true;
+ pip = &pi.inode;
} else if (name.compare(0, 16, "ceph.file.layout") == 0) {
if (!cur->is_file()) {
respond_to_request(mdr, -EINVAL);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- pi = cur->project_inode();
- int64_t old_pool = pi->layout.pool_id;
- pi->add_old_pool(old_pool);
- pi->layout = layout;
- pi->ctime = mdr->get_op_stamp();
+ auto &pi = cur->project_inode();
+ int64_t old_pool = pi.inode.layout.pool_id;
+ pi.inode.add_old_pool(old_pool);
+ pi.inode.layout = layout;
+ pi.inode.ctime = mdr->get_op_stamp();
+ pip = &pi.inode;
} else if (name.compare(0, 10, "ceph.quota") == 0) {
if (!cur->is_dir() || cur->is_root()) {
respond_to_request(mdr, -EINVAL);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- pi = cur->project_inode();
- pi->quota = quota;
+ auto &pi = cur->project_inode();
+ pi.inode.quota = quota;
+
mdr->no_early_reply = true;
+ pip = &pi.inode;
} else if (name.find("ceph.dir.pin") == 0) {
if (!cur->is_dir() || cur->is_root()) {
respond_to_request(mdr, -EINVAL);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- pi = cur->project_inode();
+ auto &pi = cur->project_inode();
cur->set_export_pin(rank);
+ pip = &pi.inode;
} else {
dout(10) << " unknown vxattr " << name << dendl;
respond_to_request(mdr, -EINVAL);
return;
}
- pi->change_attr++;
- pi->ctime = mdr->get_op_stamp();
- pi->version = cur->pre_dirty();
+ pip->change_attr++;
+ pip->ctime = mdr->get_op_stamp();
+ pip->version = cur->pre_dirty();
if (cur->is_file())
- pi->update_backtrace();
+ pip->update_backtrace();
// log + wait
mdr->ls = mdlog->get_current_segment();
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- inode_t *pi = cur->project_inode();
- pi->clear_layout();
- pi->version = cur->pre_dirty();
+ auto &pi = cur->project_inode();
+ pi.inode.clear_layout();
+ pi.inode.version = cur->pre_dirty();
// log + wait
mdr->ls = mdlog->get_current_segment();
if (!check_access(mdr, cur, MAY_WRITE))
return;
- map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
+ auto pxattrs = cur->get_projected_xattrs();
size_t len = req->get_data().length();
size_t inc = len + name.length();
// check xattrs kv pairs size
size_t cur_xattrs_size = 0;
for (const auto& p : *pxattrs) {
- if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
+ if ((flags & CEPH_XATTR_REPLACE) && (name.compare(std::string(boost::string_view(p.first))) == 0)) {
continue;
}
cur_xattrs_size += p.first.length() + p.second.length();
return;
}
- if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
+ if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
respond_to_request(mdr, -EEXIST);
return;
}
- if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(name)) {
+ if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
respond_to_request(mdr, -ENODATA);
return;
dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
// project update
- map<string,bufferptr> *px = new map<string,bufferptr>;
- inode_t *pi = cur->project_inode(px);
- pi->version = cur->pre_dirty();
- pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
- pi->xattr_version++;
- px->erase(name);
- if (!(flags & CEPH_XATTR_REMOVE)) {
- (*px)[name] = buffer::create(len);
+ auto &pi = cur->project_inode(true);
+ pi.inode.version = cur->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
+ pi.inode.xattr_version++;
+ auto &px = *pi.xattrs;
+ if ((flags & CEPH_XATTR_REMOVE)) {
+ px.erase(mempool::mds_co::string(boost::string_view(name)));
+ } else {
+ bufferptr b = buffer::create(len);
if (len)
- req->get_data().copy(0, len, (*px)[name].c_str());
+ req->get_data().copy(0, len, b.c_str());
+ auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(boost::string_view(name))), std::forward_as_tuple(b));
+ if (!em.second)
+ em.first->second = b;
}
// log + wait
void Server::handle_client_removexattr(MDRequestRef& mdr)
{
MClientRequest *req = mdr->client_request;
- string name(req->get_path2());
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ std::string name(req->get_path2());
+ std::set<SimpleLock*> rdlocks, wrlocks, xlocks;
file_layout_t *dir_layout = NULL;
CInode *cur;
if (name == "ceph.dir.layout")
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
- if (pxattrs->count(name) == 0) {
+ auto pxattrs = cur->get_projected_xattrs();
+ if (pxattrs->count(mempool::mds_co::string(boost::string_view(name))) == 0) {
dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
respond_to_request(mdr, -ENODATA);
return;
dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
// project update
- map<string,bufferptr> *px = new map<string,bufferptr>;
- inode_t *pi = cur->project_inode(px);
- pi->version = cur->pre_dirty();
- pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
- pi->xattr_version++;
- px->erase(name);
+ auto &pi = cur->project_inode(true);
+ auto &px = *pi.xattrs;
+ pi.inode.version = cur->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
+ pi.inode.xattr_version++;
+ px.erase(mempool::mds_co::string(boost::string_view(name)));
// log + wait
mdr->ls = mdlog->get_current_segment();
// it's a symlink
dn->push_projected_linkage(newi);
- newi->symlink = req->get_path2();
+ newi->symlink = mempool::mds_co::string(boost::string_view(req->get_path2()));
newi->inode.size = newi->symlink.length();
newi->inode.rstat.rbytes = newi->inode.size;
newi->inode.rstat.rfiles = 1;
version_t tipv = targeti->pre_dirty();
// project inode update
- inode_t *pi = targeti->project_inode();
- pi->nlink++;
- pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
- pi->version = tipv;
+ auto &pi = targeti->project_inode();
+ pi.inode.nlink++;
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
+ pi.inode.version = tipv;
// log + wait
EUpdate *le = new EUpdate(mdlog, "link_local");
ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
mdlog->start_entry(le);
- inode_t *pi = dnl->get_inode()->project_inode();
+ auto &pi = dnl->get_inode()->project_inode();
// update journaled target inode
bool inc;
if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
inc = true;
- pi->nlink++;
+ pi.inode.nlink++;
} else {
inc = false;
- pi->nlink--;
+ pi.inode.nlink--;
}
link_rollback rollback;
::encode(rollback, le->rollback);
mdr->more()->rollback_bl = le->rollback;
- pi->ctime = mdr->get_op_stamp();
- pi->version = targeti->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.version = targeti->pre_dirty();
- dout(10) << " projected inode " << pi << " v " << pi->version << dendl;
+ dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
// commit case
mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
dout(10) << " target is " << *in << dendl;
assert(!in->is_projected()); // live slave request hold versionlock xlock.
- inode_t *pi = in->project_inode();
- pi->version = in->pre_dirty();
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
mut->add_projected_inode(in);
// parent dir rctime
fnode_t *pf = parent->project_fnode();
mut->add_projected_fnode(parent);
pf->version = parent->pre_dirty();
- if (pf->fragstat.mtime == pi->ctime) {
+ if (pf->fragstat.mtime == pi.inode.ctime) {
pf->fragstat.mtime = rollback.old_dir_mtime;
- if (pf->rstat.rctime == pi->ctime)
+ if (pf->rstat.rctime == pi.inode.ctime)
pf->rstat.rctime = rollback.old_dir_rctime;
mut->add_updated_lock(&parent->get_inode()->filelock);
mut->add_updated_lock(&parent->get_inode()->nestlock);
}
// inode
- pi->ctime = rollback.old_ctime;
+ pi.inode.ctime = rollback.old_ctime;
if (rollback.was_inc)
- pi->nlink--;
+ pi.inode.nlink--;
else
- pi->nlink++;
+ pi.inode.nlink++;
// journal it
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
// the unlinked dentry
dn->pre_dirty();
- inode_t *pi = in->project_inode();
- dn->make_path_string(pi->stray_prior_path, true);
+ auto &pi = in->project_inode();
+ {
+ std::string t;
+ dn->make_path_string(t, true);
+ pi.inode.stray_prior_path = mempool::mds_co::string(boost::string_view(t));
+ }
mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
- pi->version = in->pre_dirty();
- pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
- pi->nlink--;
- if (pi->nlink == 0)
+ pi.inode.version = in->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
+ pi.inode.nlink--;
+ if (pi.inode.nlink == 0)
in->state_set(CInode::STATE_ORPHAN);
if (dnl->is_primary()) {
if (in->snaprealm || follows + 1 > in->get_oldest_snap())
in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
- pi->update_backtrace();
+ pi.inode.update_backtrace();
le->metablob.add_primary_dentry(straydn, in, true, true);
} else {
// remote link. update remote inode.
MMDSSlaveRequest::OP_RMDIRPREP);
req->srcdnpath = filepath(trace.front()->get_dir()->ino());
for (auto dn : trace)
- req->srcdnpath.push_dentry(dn->name);
+ req->srcdnpath.push_dentry(dn->get_name());
mdcache->replicate_stray(straydn, who, req->stray);
req->op_stamp = mdr->get_op_stamp();
rmdir_rollback rollback;
rollback.reqid = mdr->reqid;
rollback.src_dir = dn->get_dir()->dirfrag();
- rollback.src_dname = dn->name;
+ rollback.src_dname = std::string(dn->get_name());
rollback.dest_dir = straydn->get_dir()->dirfrag();
- rollback.dest_dname = straydn->name;
+ rollback.dest_dname = std::string(straydn->get_name());
::encode(rollback, mdr->more()->rollback_bl);
dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
respond_to_request(mdr, -EINVAL);
return;
}
- const string &destname = destpath.last_dentry();
+ boost::string_view destname = destpath.last_dentry();
vector<CDentry*>& srctrace = mdr->dn[1];
vector<CDentry*>& desttrace = mdr->dn[0];
}
// src == dest?
- if (srcdn->get_dir() == destdir && srcdn->name == destname) {
+ if (srcdn->get_dir() == destdir && srcdn->get_name() == destname) {
dout(7) << "rename src=dest, noop" << dendl;
respond_to_request(mdr, 0);
return;
req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
for (auto dn : srctrace)
- req->srcdnpath.push_dentry(dn->name);
+ req->srcdnpath.push_dentry(dn->get_name());
req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
for (auto dn : dsttrace)
- req->destdnpath.push_dentry(dn->name);
+ req->destdnpath.push_dentry(dn->get_name());
if (straydn)
mdcache->replicate_stray(straydn, who, req->stray);
}
// prepare
- inode_t *pi = 0; // renamed inode
- inode_t *tpi = 0; // target/overwritten inode
+ CInode::mempool_inode *spi = 0; // renamed inode
+ CInode::mempool_inode *tpi = 0; // target/overwritten inode
// target inode
if (!linkmerge) {
assert(straydn); // moving to straydn.
// link--, and move.
if (destdn->is_auth()) {
- tpi = oldin->project_inode(); //project_snaprealm
- tpi->version = straydn->pre_dirty(tpi->version);
- tpi->update_backtrace();
+ auto &pi= oldin->project_inode(); //project_snaprealm
+ pi.inode.version = straydn->pre_dirty(pi.inode.version);
+ pi.inode.update_backtrace();
+ tpi = &pi.inode;
}
straydn->push_projected_linkage(oldin);
} else if (destdnl->is_remote()) {
// nlink-- targeti
if (oldin->is_auth()) {
- tpi = oldin->project_inode();
- tpi->version = oldin->pre_dirty();
+ auto &pi = oldin->project_inode();
+ pi.inode.version = oldin->pre_dirty();
+ tpi = &pi.inode;
}
}
}
destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
// srci
if (srci->is_auth()) {
- pi = srci->project_inode();
- pi->version = srci->pre_dirty();
+ auto &pi = srci->project_inode();
+ pi.inode.version = srci->pre_dirty();
+ spi = &pi.inode;
}
} else {
dout(10) << " will merge remote onto primary link" << dendl;
if (destdn->is_auth()) {
- pi = oldin->project_inode();
- pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
+ auto &pi = oldin->project_inode();
+ pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
+ spi = &pi.inode;
}
}
} else { // primary
dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
}
}
- pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
+ auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
// & srcdnl->snaprealm
- pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
- pi->update_backtrace();
+ pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
+ pi.inode.update_backtrace();
+ spi = &pi.inode;
}
destdn->push_projected_linkage(srci);
}
srcdn->push_projected_linkage(); // push null linkage
if (!silent) {
- if (pi) {
- pi->ctime = mdr->get_op_stamp();
- pi->change_attr++;
+ if (spi) {
+ spi->ctime = mdr->get_op_stamp();
+ spi->change_attr++;
if (linkmerge)
- pi->nlink--;
+ spi->nlink--;
}
if (tpi) {
tpi->ctime = mdr->get_op_stamp();
tpi->change_attr++;
- destdn->make_path_string(tpi->stray_prior_path, true);
+ {
+ std::string t;
+ destdn->make_path_string(t, true);
+ tpi->stray_prior_path = mempool::mds_co::string(boost::string_view(t));
+ }
tpi->nlink--;
if (tpi->nlink == 0)
oldin->state_set(CInode::STATE_ORPHAN);
rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
- rollback.orig_src.dname = srcdn->name;
+ rollback.orig_src.dname = std::string(srcdn->get_name());
if (srcdnl->is_primary())
rollback.orig_src.ino = srcdnl->get_inode()->ino();
else {
rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
- rollback.orig_dest.dname = destdn->name;
+ rollback.orig_dest.dname = std::string(destdn->get_name());
if (destdnl->is_primary())
rollback.orig_dest.ino = destdnl->get_inode()->ino();
else if (destdnl->is_remote()) {
rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
- rollback.stray.dname = straydn->name;
+ rollback.stray.dname = std::string(straydn->get_name());
}
::encode(rollback, mdr->more()->rollback_bl);
dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
rollback.orig_src.remote_d_type);
}
- inode_t *pi = 0;
+ CInode::mempool_inode *pip = 0;
if (in) {
if (in->authority().first == whoami) {
- pi = in->project_inode();
+ auto &pi = in->project_inode();
mut->add_projected_inode(in);
- pi->version = in->pre_dirty();
+ pi.inode.version = in->pre_dirty();
+ pip = &pi.inode;
} else
- pi = in->get_projected_inode();
- if (pi->ctime == rollback.ctime)
- pi->ctime = rollback.orig_src.old_ctime;
+ pip = in->get_projected_inode();
+ if (pip->ctime == rollback.ctime)
+ pip->ctime = rollback.orig_src.old_ctime;
}
if (srcdn && srcdn->authority().first == whoami) {
nest_info_t blah;
_rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
- in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
+ in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
}
// repair dest
straydn->push_projected_linkage();
if (target) {
- inode_t *ti = NULL;
+ CInode::mempool_inode *ti = NULL;
if (target->authority().first == whoami) {
- ti = target->project_inode();
+ auto &pi = target->project_inode();
mut->add_projected_inode(target);
- ti->version = target->pre_dirty();
+ pi.inode.version = target->pre_dirty();
+ ti = &pi.inode;
} else
ti = target->get_projected_inode();
if (ti->ctime == rollback.ctime)
// actual
string snap_name;
if (p->second->ino == diri->ino())
- snap_name = p->second->name;
+ snap_name = std::string(p->second->name);
else
- snap_name = p->second->get_long_name();
+ snap_name = std::string(p->second->get_long_name());
unsigned start_len = dnbl.length();
if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
return;
}
- const string &snapname = req->get_filepath().last_dentry();
+ boost::string_view snapname = req->get_filepath().last_dentry();
if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
SnapInfo info;
info.ino = diri->ino();
info.snapid = snapid;
- info.name = snapname;
+ info.name = std::string(snapname);
info.stamp = mdr->get_op_stamp();
- inode_t *pi = diri->project_inode();
- pi->ctime = info.stamp;
- pi->version = diri->pre_dirty();
+ auto &pi = diri->project_inode(false, true);
+ pi.inode.ctime = info.stamp;
+ pi.inode.version = diri->pre_dirty();
// project the snaprealm
- sr_t *newsnap = diri->project_snaprealm(snapid);
- newsnap->snaps[snapid] = info;
- newsnap->seq = snapid;
- newsnap->last_created = snapid;
+ auto &newsnap = *pi.snapnode;
+ newsnap.created = snapid;
+ auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
+ if (!em.second)
+ em.first->second = info;
+ newsnap.seq = snapid;
+ newsnap.last_created = snapid;
// journal the inode changes
mdr->ls = mdlog->get_current_segment();
return;
}
- const string &snapname = req->get_filepath().last_dentry();
+ boost::string_view snapname = req->get_filepath().last_dentry();
if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
// journal
- inode_t *pi = diri->project_inode();
- pi->version = diri->pre_dirty();
- pi->ctime = mdr->get_op_stamp();
+ auto &pi = diri->project_inode(false, true);
+ pi.inode.version = diri->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "rmsnap");
mdlog->start_entry(le);
// project the snaprealm
- sr_t *newnode = diri->project_snaprealm();
- newnode->snaps.erase(snapid);
- newnode->seq = seq;
- newnode->last_destroyed = seq;
+ auto &newnode = *pi.snapnode;
+ newnode.snaps.erase(snapid);
+ newnode.seq = seq;
+ newnode.last_destroyed = seq;
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
le->metablob.add_table_transaction(TABLE_SNAP, stid);
return;
}
- const string &dstname = req->get_filepath().last_dentry();
- const string &srcname = req->get_filepath2().last_dentry();
+ boost::string_view dstname = req->get_filepath().last_dentry();
+ boost::string_view srcname = req->get_filepath2().last_dentry();
dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
if (srcname.length() == 0 || srcname[0] == '_') {
dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
// journal
- inode_t *pi = diri->project_inode();
- pi->ctime = mdr->get_op_stamp();
- pi->version = diri->pre_dirty();
+ auto &pi = diri->project_inode(false, true);
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.version = diri->pre_dirty();
// project the snaprealm
- sr_t *newsnap = diri->project_snaprealm();
- assert(newsnap->snaps.count(snapid));
- newsnap->snaps[snapid].name = dstname;
+ auto &newsnap = *pi.snapnode;
+ auto it = newsnap.snaps.find(snapid);
+ assert(it != newsnap.snaps.end());
+ it->second.name = std::string(dstname);
// journal the inode changes
mdr->ls = mdlog->get_current_segment();
#ifndef CEPH_MDS_SERVER_H
#define CEPH_MDS_SERVER_H
+#include <boost/utility/string_view.hpp>
+
#include "MDSRank.h"
#include "Mutation.h"
bool waiting_for_reconnect(client_t c) const;
void dump_reconnect_status(Formatter *f) const;
- Session *get_session(Message *m);
void handle_client_session(class MClientSession *m);
void _session_logged(Session *session, uint64_t state_seq,
bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
bool check_fragment_space(MDRequestRef& mdr, CDir *in);
bool check_access(MDRequestRef& mdr, CInode *in, unsigned mask);
bool _check_access(Session *session, CInode *in, unsigned mask, int caller_uid, int caller_gid, int setattr_uid, int setattr_gid);
- CDir *validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname);
+ CDir *validate_dentry_dir(MDRequestRef& mdr, CInode *diri, boost::string_view dname);
CDir *traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath);
- CDentry *prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist=false);
+ CDentry *prepare_null_dentry(MDRequestRef& mdr, CDir *dir, boost::string_view dname, bool okexist=false);
CDentry *prepare_stray_dentry(MDRequestRef& mdr, CInode *in);
CInode* prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
file_layout_t *layout=NULL);
if (!in->is_base())
diri = in->get_projected_parent_dn()->get_dir()->get_inode();
if (diri && diri->is_stray()){
- path = in->get_projected_inode()->stray_prior_path;
+ path = std::string(boost::string_view(in->get_projected_inode()->stray_prior_path));
dout(20) << __func__ << " stray_prior_path " << path << dendl;
} else {
in->make_path_string(path, true);
* Strict boolean parser. Allow true/false/0/1.
* Anything else is -EINVAL.
*/
- auto is_true = [](const std::string &bstr, bool *out) -> bool
+ auto is_true = [](boost::string_view bstr, bool *out) -> bool
{
assert(out != nullptr);
f->open_array_section("gather_set");
if (have_more()) {
- for(std::set<int32_t>::iterator i = more()->gather_set.begin();
- i != more()->gather_set.end(); ++i) {
- f->dump_int("rank", *i);
+ for(const auto &i : more()->gather_set) {
+ f->dump_int("rank", i);
}
}
f->close_section();
private:
int num_rdlock;
+ // XXX not in mempool
struct unstable_bits_t {
set<__s32> gather_set; // auth+rep. >= 0 is mds, < 0 is client
// local state
- int num_wrlock, num_xlock;
+ int num_wrlock = 0, num_xlock = 0;
MutationRef xlock_by;
- client_t xlock_by_client;
- client_t excl_client;
+ client_t xlock_by_client = -1;
+ client_t excl_client = -1;
bool empty() {
return
excl_client == -1;
}
- unstable_bits_t() : num_wrlock(0),
- num_xlock(0),
- xlock_by(),
- xlock_by_client(-1),
- excl_client(-1) {}
+ unstable_bits_t() {}
};
mutable std::unique_ptr<unstable_bits_t> _unstable;
#ifndef CEPH_SNAPCLIENT_H
#define CEPH_SNAPCLIENT_H
+#include <boost/utility/string_view.hpp>
+
#include "MDSTableClient.h"
#include "snap.h"
void resend_queries() override {}
void handle_query_result(MMDSTableRequest *m) override {}
- void prepare_create(inodeno_t dirino, const string& name, utime_t stamp,
+ void prepare_create(inodeno_t dirino, boost::string_view name, utime_t stamp,
version_t *pstid, bufferlist *pbl, MDSInternalContextBase *onfinish) {
bufferlist bl;
__u32 op = TABLE_OP_CREATE;
_prepare(bl, pstid, pbl, onfinish);
}
- void prepare_update(inodeno_t ino, snapid_t snapid, const string& name, utime_t stamp,
+ void prepare_update(inodeno_t ino, snapid_t snapid, boost::string_view name, utime_t stamp,
version_t *pstid, bufferlist *pbl, MDSInternalContextBase *onfinish) {
bufferlist bl;
__u32 op = TABLE_OP_UPDATE;
#include "MDCache.h"
#include "MDSRank.h"
+#include <boost/utility/string_view.hpp>
+
#include "messages/MClientSnap.h"
parent->get_snap_info(infomap, MAX(first, srnode.current_parent_since), last);
}
-const string& SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino)
+boost::string_view SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino)
{
auto srnode_snaps_entry = srnode.snaps.find(snapid);
if (srnode_snaps_entry != srnode.snaps.end()) {
return parent->get_snapname(snapid, atino);
}
-snapid_t SnapRealm::resolve_snapname(const string& n, inodeno_t atino, snapid_t first, snapid_t last)
+snapid_t SnapRealm::resolve_snapname(boost::string_view n, inodeno_t atino, snapid_t first, snapid_t last)
{
// first try me
dout(10) << "resolve_snapname '" << n << "' in [" << first << "," << last << "]" << dendl;
n[0] != '_') return 0;
int next_ = n.find('_', 1);
if (next_ < 0) return 0;
- pname = n.substr(1, next_ - 1);
- pino = atoll(n.c_str() + next_ + 1);
+ pname = std::string(n.substr(1, next_ - 1));
+ pino = atoll(n.data() + next_ + 1);
dout(10) << " " << n << " parses to name '" << pname << "' dirino " << pino << dendl;
}
#ifndef CEPH_MDS_SNAPREALM_H
#define CEPH_MDS_SNAPREALM_H
+#include <boost/utility/string_view.hpp>
+
#include "mdstypes.h"
#include "snap.h"
#include "include/xlist.h"
inodes_with_caps(0)
{ }
- bool exists(const string &name) const {
+ bool exists(boost::string_view name) const {
for (map<snapid_t,SnapInfo>::const_iterator p = srnode.snaps.begin();
p != srnode.snaps.end();
++p) {
const bufferlist& get_snap_trace();
void build_snap_trace(bufferlist& snapbl) const;
- const string& get_snapname(snapid_t snapid, inodeno_t atino);
- snapid_t resolve_snapname(const string &name, inodeno_t atino, snapid_t first=0, snapid_t last=CEPH_NOSNAP);
+ boost::string_view get_snapname(snapid_t snapid, inodeno_t atino);
+ snapid_t resolve_snapname(boost::string_view name, inodeno_t atino, snapid_t first=0, snapid_t last=CEPH_NOSNAP);
const set<snapid_t>& get_snaps() const;
const SnapContext& get_snap_context() const;
to = MAX(in->inode.max_size_ever, to);
}
- inode_t *pi = in->get_projected_inode();
+ auto pi = in->get_projected_inode();
item.size = to;
item.layout = pi->layout;
- item.old_pools = pi->old_pools;
+ item.old_pools.clear();
+ for (const auto &p : pi->old_pools)
+ item.old_pools.insert(p);
item.snapc = *snapc;
}
EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate");
mds->mdlog->start_entry(le);
- inode_t *pi = in->project_inode();
- pi->size = 0;
- pi->max_size_ever = 0;
- pi->client_ranges.clear();
- pi->truncate_size = 0;
- pi->truncate_from = 0;
- pi->version = in->pre_dirty();
+ auto &pi = in->project_inode();
+ pi.inode.size = 0;
+ pi.inode.max_size_ever = 0;
+ pi.inode.client_ranges.clear();
+ pi.inode.truncate_size = 0;
+ pi.inode.truncate_from = 0;
+ pi.inode.version = in->pre_dirty();
le->metablob.add_dir_context(dn->dir);
le->metablob.add_primary_dentry(dn, in, true);
if (!in->remote_parents.empty()) {
// unlink any stale remote snap dentry.
- for (compact_set<CDentry*>::iterator p = in->remote_parents.begin();
- p != in->remote_parents.end(); ) {
- CDentry *remote_dn = *p;
- ++p;
+ for (auto it = in->remote_parents.begin(); it != in->remote_parents.end(); ) {
+ CDentry *remote_dn = *it;
+ ++it;
assert(remote_dn->last != CEPH_NOSNAP);
remote_dn->unlink_remote(remote_dn->get_linkage());
}
/* If no remote_dn hinted, pick one arbitrarily */
if (remote_dn == NULL) {
if (!stray_in->remote_parents.empty()) {
- for (compact_set<CDentry*>::iterator p = stray_in->remote_parents.begin();
- p != stray_in->remote_parents.end();
- ++p)
- if ((*p)->last == CEPH_NOSNAP && !(*p)->is_projected()) {
- if ((*p)->is_auth()) {
- remote_dn = *p;
+ for (const auto &dn : stray_in->remote_parents) {
+ if (dn->last == CEPH_NOSNAP && !dn->is_projected()) {
+ if (dn->is_auth()) {
+ remote_dn = dn;
if (remote_dn->dir->can_auth_pin())
break;
} else if (!remote_dn) {
- remote_dn = *p;
+ remote_dn = dn;
}
}
+ }
}
if (!remote_dn) {
dout(20) << __func__ << ": not reintegrating (no remote parents in cache)" << dendl;
#ifndef CEPH_MDS_EMETABLOB_H
#define CEPH_MDS_EMETABLOB_H
+#include <boost/utility/string_view.hpp>
+
#include <stdlib.h>
#include "../CInode.h"
static const int STATE_DIRTYPARENT = (1<<1);
static const int STATE_DIRTYPOOL = (1<<2);
static const int STATE_NEED_SNAPFLUSH = (1<<3);
- typedef compact_map<snapid_t, old_inode_t> old_inodes_t;
- string dn; // dentry
+ std::string dn; // dentry
snapid_t dnfirst, dnlast;
version_t dnv{0};
- inode_t inode; // if it's not
+ CInode::mempool_inode inode; // if it's not XXX should not be part of mempool; wait for std::pmr to simplify
fragtree_t dirfragtree;
- map<string,bufferptr> xattrs;
- string symlink;
+ CInode::mempool_xattr_map xattrs;
+ std::string symlink;
snapid_t oldest_snap;
bufferlist snapbl;
__u8 state{0};
- old_inodes_t old_inodes;
+ CInode::mempool_old_inode_map old_inodes; // XXX should not be part of mempool; wait for std::pmr to simplify
fullbit(const fullbit& o);
const fullbit& operator=(const fullbit& o);
- fullbit(const string& d, snapid_t df, snapid_t dl,
- version_t v, const inode_t& i, const fragtree_t &dft,
- const map<string,bufferptr> &xa, const string& sym,
+ fullbit(boost::string_view d, snapid_t df, snapid_t dl,
+ version_t v, const CInode::mempool_inode& i, const fragtree_t &dft,
+ const CInode::mempool_xattr_map &xa, boost::string_view sym,
snapid_t os, const bufferlist &sbl, __u8 st,
- const old_inodes_t *oi = NULL) :
+ const CInode::mempool_old_inode_map *oi = NULL) :
dn(d), dnfirst(df), dnlast(dl), dnv(v), inode(i), xattrs(xa),
oldest_snap(os), state(st)
{
if (i.is_symlink())
- symlink = sym;
+ symlink = std::string(sym);
if (i.is_dir())
dirfragtree = dft;
if (oi)
/* remotebit - a dentry + remote inode link (i.e. just an ino)
*/
struct remotebit {
- string dn;
+ std::string dn;
snapid_t dnfirst, dnlast;
version_t dnv;
inodeno_t ino;
unsigned char d_type;
bool dirty;
- remotebit(const string& d, snapid_t df, snapid_t dl, version_t v, inodeno_t i, unsigned char dt, bool dr) :
+ remotebit(boost::string_view d, snapid_t df, snapid_t dl, version_t v, inodeno_t i, unsigned char dt, bool dr) :
dn(d), dnfirst(df), dnlast(dl), dnv(v), ino(i), d_type(dt), dirty(dr) { }
explicit remotebit(bufferlist::iterator &p) { decode(p); }
remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0),
* nullbit - a null dentry
*/
struct nullbit {
- string dn;
+ std::string dn;
snapid_t dnfirst, dnlast;
version_t dnv;
bool dirty;
- nullbit(const string& d, snapid_t df, snapid_t dl, version_t v, bool dr) :
+ nullbit(boost::string_view d, snapid_t df, snapid_t dl, version_t v, bool dr) :
dn(d), dnfirst(df), dnlast(dl), dnv(v), dirty(dr) { }
explicit nullbit(bufferlist::iterator &p) { decode(p); }
nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {}
in->last_journaled = event_seq;
//cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
- const inode_t *pi = in->get_projected_inode();
+ const auto pi = in->get_projected_inode();
if ((state & fullbit::STATE_DIRTY) && pi->is_backtrace_updated())
state |= fullbit::STATE_DIRTYPARENT;
add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool);
}
- void add_root(bool dirty, CInode *in, const inode_t *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0,
- map<string,bufferptr> *px=0) {
+ void add_root(bool dirty, CInode *in, const CInode::mempool_inode *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0,
+ CInode::mempool_xattr_map *px=0) {
in->last_journaled = event_seq;
//cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
#ifndef CEPH_INODE_BACKTRACE_H
#define CEPH_INODE_BACKTRACE_H
+#include <boost/utility/string_view.hpp>
+
#include "mdstypes.h"
namespace ceph {
version_t version; // child's version at time of backpointer creation
inode_backpointer_t() : version(0) {}
- inode_backpointer_t(inodeno_t i, const string &d, version_t v) : dirino(i), dname(d), version(v) {}
+ inode_backpointer_t(inodeno_t i, boost::string_view d, version_t v) : dirino(i), dname(d), version(v) {}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator &bl);
inode.dump(f);
f->close_section(); // inode
f->open_object_section("xattrs");
- for (map<string, bufferptr>::const_iterator iter = xattrs.begin();
- iter != xattrs.end(); ++iter) {
- string s(iter->second.c_str(), iter->second.length());
- f->dump_string(iter->first.c_str(), s);
+ for (const auto &p : xattrs) {
+ std::string s(p.second.c_str(), p.second.length());
+ f->dump_string(p.first.c_str(), s);
}
f->close_section(); // xattrs
if (inode.is_symlink()) {
f->dump_string("state", state_string());
if (!old_inodes.empty()) {
f->open_array_section("old inodes");
- for (old_inodes_t::const_iterator iter = old_inodes.begin();
- iter != old_inodes.end();
- ++iter) {
+ for (const auto &p : old_inodes) {
f->open_object_section("inode");
- f->dump_int("snapid", iter->first);
- iter->second.dump(f);
+ f->dump_int("snapid", p.first);
+ p.second.dump(f);
f->close_section(); // inode
}
f->close_section(); // old inodes
void EMetaBlob::fullbit::generate_test_instances(list<EMetaBlob::fullbit*>& ls)
{
- inode_t inode;
+ CInode::mempool_inode inode;
fragtree_t fragtree;
- map<string,bufferptr> empty_xattrs;
+ CInode::mempool_xattr_map empty_xattrs;
bufferlist empty_snapbl;
fullbit *sample = new fullbit("/testdn", 0, 0, 0,
inode, fragtree, empty_xattrs, "", 0, empty_snapbl,
}
}
} else if (in->inode.is_symlink()) {
- in->symlink = symlink;
+ in->symlink = mempool::mds_co::string(boost::string_view(symlink));
}
in->old_inodes = old_inodes;
if (!in->old_inodes.empty()) {
for (list<ceph::shared_ptr<fullbit> >::const_iterator
iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
- std::string const &dentry = (*iter)->dn;
- children[dir_ino].push_back(dentry);
- ino_locations[(*iter)->inode.ino] = Location(dir_ino, dentry);
+ boost::string_view dentry = (*iter)->dn;
+ children[dir_ino].emplace_back(dentry);
+ ino_locations[(*iter)->inode.ino] = Location(dir_ino, std::string(dentry));
}
for (list<nullbit>::const_iterator
iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
- std::string const &dentry = iter->dn;
- children[dir_ino].push_back(dentry);
+ boost::string_view dentry = iter->dn;
+ children[dir_ino].emplace_back(dentry);
}
for (list<remotebit>::const_iterator
iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
- std::string const &dentry = iter->dn;
- children[dir_ino].push_back(dentry);
+ boost::string_view dentry = iter->dn;
+ children[dir_ino].emplace_back(dentry);
}
}
list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
for (list<ceph::shared_ptr<fullbit> >::const_iterator
iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
- std::string const &dentry = (*iter)->dn;
+ std::string dentry((*iter)->dn);
children[dir_ino].push_back(dentry);
- ino_locations[(*iter)->inode.ino] = Location(dir_ino, dentry);
+ ino_locations[(*iter)->inode.ino] = Location(dir_ino, std::string(dentry));
if (children.find((*iter)->inode.ino) == children.end()) {
- leaf_locations.push_back(Location(dir_ino, dentry));
+ leaf_locations.push_back(Location(dir_ino, std::string(dentry)));
}
}
list<nullbit> const &nb_list = dl.get_dnull();
for (list<nullbit>::const_iterator
iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
- std::string const &dentry = iter->dn;
- leaf_locations.push_back(Location(dir_ino, dentry));
+ boost::string_view dentry = iter->dn;
+ leaf_locations.push_back(Location(dir_ino, std::string(dentry)));
}
list<remotebit> const &rb_list = dl.get_dremote();
for (list<remotebit>::const_iterator
iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
- std::string const &dentry = iter->dn;
- leaf_locations.push_back(Location(dir_ino, dentry));
+ boost::string_view dentry = iter->dn;
+ leaf_locations.push_back(Location(dir_ino, std::string(dentry)));
}
}
static const struct sm_state_t simplelock[LOCK_MAX] = {
// stable loner rep state r rp rd wr fwr l x caps,other
[LOCK_SYNC] = { 0, false, LOCK_SYNC, ANY, 0, ANY, 0, 0, ANY, 0, CEPH_CAP_GSHARED,0,0,CEPH_CAP_GSHARED },
- [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, ANY, XCL, XCL, 0, 0, XCL, 0, 0,0,0,0 },
+ [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, XCL, XCL, 0, 0, XCL, 0, 0,0,0,0 },
[LOCK_EXCL_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, 0,CEPH_CAP_GSHARED,0,0 },
[LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 },
[LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, REQ, XCL, 0, 0, 0, 0,CEPH_CAP_GEXCL|CEPH_CAP_GSHARED,0,0 },
[LOCK_SYNC_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GSHARED,0,0 },
- [LOCK_LOCK_EXCL] = { LOCK_EXCL, false, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, CEPH_CAP_GSHARED,0,0,0 },
+ [LOCK_LOCK_EXCL] = { LOCK_EXCL, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, CEPH_CAP_GSHARED,0,0,0 },
[LOCK_REMOTEXLOCK]={ LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 },
[LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, ANY, 0,0,0,0 },
- [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
- [LOCK_TSYN_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_TSYN_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_TSYN] = { 0, false, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,0,0,0 },
[LOCK_LOCK_TSYN] = { LOCK_TSYN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, 0,0,0,0 },
[LOCK_TSYN_MIX] = { LOCK_MIX, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
- [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_SYNC_MIX2,0,0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_SYNC_MIX2,ANY,0, 0, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
};
[LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 },
[LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
- [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 },
+ [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 },
[LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
- [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
- [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, 0, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_XSYN_LOCK] = { LOCK_LOCK, true, LOCK_LOCK, AUTH, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
[LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
[LOCK_XSYN_MIX] = { LOCK_MIX, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,0,0,0 },
[LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, XCL, XCL, 0, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GEXCL|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GBUFFER|CEPH_CAP_GLAZYIO,0,0 },
- [LOCK_SYNC_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
+ [LOCK_SYNC_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
[LOCK_MIX_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GLAZYIO,0,0 },
[LOCK_LOCK_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
[LOCK_XSYN_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
free_data();
}
-/*
- * inode_t
- */
-void inode_t::encode(bufferlist &bl, uint64_t features) const
-{
- ENCODE_START(15, 6, bl);
-
- ::encode(ino, bl);
- ::encode(rdev, bl);
- ::encode(ctime, bl);
-
- ::encode(mode, bl);
- ::encode(uid, bl);
- ::encode(gid, bl);
-
- ::encode(nlink, bl);
- {
- // removed field
- bool anchored = 0;
- ::encode(anchored, bl);
- }
-
- ::encode(dir_layout, bl);
- ::encode(layout, bl, features);
- ::encode(size, bl);
- ::encode(truncate_seq, bl);
- ::encode(truncate_size, bl);
- ::encode(truncate_from, bl);
- ::encode(truncate_pending, bl);
- ::encode(mtime, bl);
- ::encode(atime, bl);
- ::encode(time_warp_seq, bl);
- ::encode(client_ranges, bl);
-
- ::encode(dirstat, bl);
- ::encode(rstat, bl);
- ::encode(accounted_rstat, bl);
-
- ::encode(version, bl);
- ::encode(file_data_version, bl);
- ::encode(xattr_version, bl);
- ::encode(backtrace_version, bl);
- ::encode(old_pools, bl);
- ::encode(max_size_ever, bl);
- ::encode(inline_data, bl);
- ::encode(quota, bl);
-
- ::encode(stray_prior_path, bl);
-
- ::encode(last_scrub_version, bl);
- ::encode(last_scrub_stamp, bl);
-
- ::encode(btime, bl);
- ::encode(change_attr, bl);
-
- ::encode(export_pin, bl);
-
- ENCODE_FINISH(bl);
-}
-
-void inode_t::decode(bufferlist::iterator &p)
-{
- DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
-
- ::decode(ino, p);
- ::decode(rdev, p);
- ::decode(ctime, p);
-
- ::decode(mode, p);
- ::decode(uid, p);
- ::decode(gid, p);
-
- ::decode(nlink, p);
- {
- bool anchored;
- ::decode(anchored, p);
- }
-
- if (struct_v >= 4)
- ::decode(dir_layout, p);
- else
- memset(&dir_layout, 0, sizeof(dir_layout));
- ::decode(layout, p);
- ::decode(size, p);
- ::decode(truncate_seq, p);
- ::decode(truncate_size, p);
- ::decode(truncate_from, p);
- if (struct_v >= 5)
- ::decode(truncate_pending, p);
- else
- truncate_pending = 0;
- ::decode(mtime, p);
- ::decode(atime, p);
- ::decode(time_warp_seq, p);
- if (struct_v >= 3) {
- ::decode(client_ranges, p);
- } else {
- map<client_t, client_writeable_range_t::byte_range_t> m;
- ::decode(m, p);
- for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
- q = m.begin(); q != m.end(); ++q)
- client_ranges[q->first].range = q->second;
- }
-
- ::decode(dirstat, p);
- ::decode(rstat, p);
- ::decode(accounted_rstat, p);
-
- ::decode(version, p);
- ::decode(file_data_version, p);
- ::decode(xattr_version, p);
- if (struct_v >= 2)
- ::decode(backtrace_version, p);
- if (struct_v >= 7)
- ::decode(old_pools, p);
- if (struct_v >= 8)
- ::decode(max_size_ever, p);
- if (struct_v >= 9) {
- ::decode(inline_data, p);
- } else {
- inline_data.version = CEPH_INLINE_NONE;
- }
- if (struct_v < 10)
- backtrace_version = 0; // force update backtrace
- if (struct_v >= 11)
- ::decode(quota, p);
-
- if (struct_v >= 12) {
- ::decode(stray_prior_path, p);
- }
-
- if (struct_v >= 13) {
- ::decode(last_scrub_version, p);
- ::decode(last_scrub_stamp, p);
- }
- if (struct_v >= 14) {
- ::decode(btime, p);
- ::decode(change_attr, p);
- } else {
- btime = utime_t();
- change_attr = 0;
- }
-
- if (struct_v >= 15) {
- ::decode(export_pin, p);
- } else {
- export_pin = MDS_RANK_NONE;
- }
-
- DECODE_FINISH(p);
-}
-
-void inode_t::dump(Formatter *f) const
-{
- f->dump_unsigned("ino", ino);
- f->dump_unsigned("rdev", rdev);
- f->dump_stream("ctime") << ctime;
- f->dump_stream("btime") << btime;
- f->dump_unsigned("mode", mode);
- f->dump_unsigned("uid", uid);
- f->dump_unsigned("gid", gid);
- f->dump_unsigned("nlink", nlink);
-
- f->open_object_section("dir_layout");
- ::dump(dir_layout, f);
- f->close_section();
-
- f->dump_object("layout", layout);
-
- f->open_array_section("old_pools");
- for (compact_set<int64_t>::const_iterator i = old_pools.begin();
- i != old_pools.end();
- ++i)
- f->dump_int("pool", *i);
- f->close_section();
-
- f->dump_unsigned("size", size);
- f->dump_unsigned("truncate_seq", truncate_seq);
- f->dump_unsigned("truncate_size", truncate_size);
- f->dump_unsigned("truncate_from", truncate_from);
- f->dump_unsigned("truncate_pending", truncate_pending);
- f->dump_stream("mtime") << mtime;
- f->dump_stream("atime") << atime;
- f->dump_unsigned("time_warp_seq", time_warp_seq);
- f->dump_unsigned("change_attr", change_attr);
- f->dump_int("export_pin", export_pin);
-
- f->open_array_section("client_ranges");
- for (map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); p != client_ranges.end(); ++p) {
- f->open_object_section("client");
- f->dump_unsigned("client", p->first.v);
- p->second.dump(f);
- f->close_section();
- }
- f->close_section();
-
- f->open_object_section("dirstat");
- dirstat.dump(f);
- f->close_section();
-
- f->open_object_section("rstat");
- rstat.dump(f);
- f->close_section();
-
- f->open_object_section("accounted_rstat");
- accounted_rstat.dump(f);
- f->close_section();
-
- f->dump_unsigned("version", version);
- f->dump_unsigned("file_data_version", file_data_version);
- f->dump_unsigned("xattr_version", xattr_version);
- f->dump_unsigned("backtrace_version", backtrace_version);
-
- f->dump_string("stray_prior_path", stray_prior_path);
-}
-
-void inode_t::generate_test_instances(list<inode_t*>& ls)
-{
- ls.push_back(new inode_t);
- ls.push_back(new inode_t);
- ls.back()->ino = 1;
- // i am lazy.
-}
-
-int inode_t::compare(const inode_t &other, bool *divergent) const
-{
- assert(ino == other.ino);
- *divergent = false;
- if (version == other.version) {
- if (rdev != other.rdev ||
- ctime != other.ctime ||
- btime != other.btime ||
- mode != other.mode ||
- uid != other.uid ||
- gid != other.gid ||
- nlink != other.nlink ||
- memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
- layout != other.layout ||
- old_pools != other.old_pools ||
- size != other.size ||
- max_size_ever != other.max_size_ever ||
- truncate_seq != other.truncate_seq ||
- truncate_size != other.truncate_size ||
- truncate_from != other.truncate_from ||
- truncate_pending != other.truncate_pending ||
- change_attr != other.change_attr ||
- mtime != other.mtime ||
- atime != other.atime ||
- time_warp_seq != other.time_warp_seq ||
- inline_data != other.inline_data ||
- client_ranges != other.client_ranges ||
- !(dirstat == other.dirstat) ||
- !(rstat == other.rstat) ||
- !(accounted_rstat == other.accounted_rstat) ||
- file_data_version != other.file_data_version ||
- xattr_version != other.xattr_version ||
- backtrace_version != other.backtrace_version) {
- *divergent = true;
- }
- return 0;
- } else if (version > other.version) {
- *divergent = !older_is_consistent(other);
- return 1;
- } else {
- assert(version < other.version);
- *divergent = !other.older_is_consistent(*this);
- return -1;
- }
-}
-
-bool inode_t::older_is_consistent(const inode_t &other) const
-{
- if (max_size_ever < other.max_size_ever ||
- truncate_seq < other.truncate_seq ||
- time_warp_seq < other.time_warp_seq ||
- inline_data.version < other.inline_data.version ||
- dirstat.version < other.dirstat.version ||
- rstat.version < other.rstat.version ||
- accounted_rstat.version < other.accounted_rstat.version ||
- file_data_version < other.file_data_version ||
- xattr_version < other.xattr_version ||
- backtrace_version < other.backtrace_version) {
- return false;
- }
- return true;
-}
-
-/*
- * old_inode_t
- */
-void old_inode_t::encode(bufferlist& bl, uint64_t features) const
-{
- ENCODE_START(2, 2, bl);
- ::encode(first, bl);
- ::encode(inode, bl, features);
- ::encode(xattrs, bl);
- ENCODE_FINISH(bl);
-}
-
-void old_inode_t::decode(bufferlist::iterator& bl)
-{
- DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
- ::decode(first, bl);
- ::decode(inode, bl);
- ::decode(xattrs, bl);
- DECODE_FINISH(bl);
-}
-
-void old_inode_t::dump(Formatter *f) const
-{
- f->dump_unsigned("first", first);
- inode.dump(f);
- f->open_object_section("xattrs");
- for (map<string,bufferptr>::const_iterator p = xattrs.begin(); p != xattrs.end(); ++p) {
- string v(p->second.c_str(), p->second.length());
- f->dump_string(p->first.c_str(), v);
- }
- f->close_section();
-}
-
-void old_inode_t::generate_test_instances(list<old_inode_t*>& ls)
-{
- ls.push_back(new old_inode_t);
- ls.push_back(new old_inode_t);
- ls.back()->first = 2;
- list<inode_t*> ils;
- inode_t::generate_test_instances(ils);
- ls.back()->inode = *ils.back();
- ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
- ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
-}
-
/*
* fnode_t
void inode_load_vec_t::encode(bufferlist &bl) const
{
ENCODE_START(2, 2, bl);
- for (int i=0; i<NUM; i++)
- ::encode(vec[i], bl);
+ for (const auto &i : vec) {
+ ::encode(i, bl);
+ }
ENCODE_FINISH(bl);
}
void inode_load_vec_t::decode(const utime_t &t, bufferlist::iterator &p)
{
DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
- for (int i=0; i<NUM; i++)
- ::decode(vec[i], t, p);
+ for (auto &i : vec) {
+ ::decode(i, t, p);
+ }
DECODE_FINISH(p);
}
void inode_load_vec_t::dump(Formatter *f)
{
f->open_array_section("Decay Counters");
- for (vector<DecayCounter>::const_iterator i = vec.begin(); i != vec.end(); ++i) {
+ for (const auto &i : vec) {
f->open_object_section("Decay Counter");
- i->dump(f);
+ i.dump(f);
f->close_section();
}
f->close_section();
void dirfrag_load_vec_t::dump(Formatter *f) const
{
f->open_array_section("Decay Counters");
- for (vector<DecayCounter>::const_iterator i = vec.begin(); i != vec.end(); ++i) {
+ for (const auto &i : vec) {
f->open_object_section("Decay Counter");
- i->dump(f);
+ i.dump(f);
f->close_section();
}
f->close_section();
#include <ostream>
#include <set>
#include <map>
+#include <boost/utility/string_view.hpp>
#include "common/config.h"
#include "common/Clock.h"
struct scatter_info_t {
- version_t version;
+ version_t version = 0;
- scatter_info_t() : version(0) {}
+ scatter_info_t() {}
};
struct frag_info_t : public scatter_info_t {
// this frag
utime_t mtime;
- uint64_t change_attr;
- int64_t nfiles; // files
- int64_t nsubdirs; // subdirs
+ uint64_t change_attr = 0;
+ int64_t nfiles = 0; // files
+ int64_t nsubdirs = 0; // subdirs
- frag_info_t() : change_attr(0), nfiles(0), nsubdirs(0) {}
+ frag_info_t() {}
int64_t size() const { return nfiles + nsubdirs; }
struct nest_info_t : public scatter_info_t {
// this frag + children
utime_t rctime;
- int64_t rbytes;
- int64_t rfiles;
- int64_t rsubdirs;
+ int64_t rbytes = 0;
+ int64_t rfiles = 0;
+ int64_t rsubdirs = 0;
int64_t rsize() const { return rfiles + rsubdirs; }
- int64_t rsnaprealms;
+ int64_t rsnaprealms = 0;
- nest_info_t() : rbytes(0), rfiles(0), rsubdirs(0), rsnaprealms(0) {}
+ nest_info_t() {}
void zero() {
*this = nest_info_t();
struct quota_info_t
{
- int64_t max_bytes;
- int64_t max_files;
+ int64_t max_bytes = 0;
+ int64_t max_files = 0;
- quota_info_t() : max_bytes(0), max_files(0) {}
+ quota_info_t() {}
void encode(bufferlist& bl) const {
ENCODE_START(1, 1, bl);
*/
struct client_writeable_range_t {
struct byte_range_t {
- uint64_t first, last; // interval client can write to
- byte_range_t() : first(0), last(0) {}
+ uint64_t first = 0, last = 0; // interval client can write to
+ byte_range_t() {}
};
byte_range_t range;
- snapid_t follows; // aka "data+metadata flushed thru"
+ snapid_t follows = 0; // aka "data+metadata flushed thru"
- client_writeable_range_t() : follows(0) {}
+ client_writeable_range_t() {}
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
- static void generate_test_instances(list<client_writeable_range_t*>& ls);
+ static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
};
inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::iterator& bl) {
private:
std::unique_ptr<bufferlist> blp;
public:
- version_t version;
+ version_t version = 1;
void free_data() {
blp.reset();
}
size_t length() const { return blp ? blp->length() : 0; }
- inline_data_t() : version(1) {}
+ inline_data_t() {}
inline_data_t(const inline_data_t& o) : version(o.version) {
if (o.blp)
get_data() = *o.blp;
/*
* inode_t
*/
+template<template<typename> class Allocator = std::allocator>
struct inode_t {
/**
* ***************
* ***************
*/
// base (immutable)
- inodeno_t ino;
- uint32_t rdev; // if special file
+ inodeno_t ino = 0;
+ uint32_t rdev = 0; // if special file
// affected by any inode change...
utime_t ctime; // inode change time
utime_t btime; // birth time
// perm (namespace permissions)
- uint32_t mode;
- uid_t uid;
- gid_t gid;
+ uint32_t mode = 0;
+ uid_t uid = 0;
+ gid_t gid = 0;
// nlink
- int32_t nlink;
+ int32_t nlink = 0;
// file (data access)
ceph_dir_layout dir_layout; // [dir only]
file_layout_t layout;
- compact_set <int64_t> old_pools;
- uint64_t size; // on directory, # dentries
- uint64_t max_size_ever; // max size the file has ever been
- uint32_t truncate_seq;
- uint64_t truncate_size, truncate_from;
- uint32_t truncate_pending;
+ compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
+ uint64_t size = 0; // on directory, # dentries
+ uint64_t max_size_ever = 0; // max size the file has ever been
+ uint32_t truncate_seq = 0;
+ uint64_t truncate_size = 0, truncate_from = 0;
+ uint32_t truncate_pending = 0;
utime_t mtime; // file data modify time.
utime_t atime; // file data access time.
- uint32_t time_warp_seq; // count of (potential) mtime/atime timewarps (i.e., utimes())
- inline_data_t inline_data;
+ uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
+ inline_data_t inline_data; // FIXME check
// change attribute
- uint64_t change_attr;
+ uint64_t change_attr = 0;
- std::map<client_t,client_writeable_range_t> client_ranges; // client(s) can write to these ranges
+ using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
+ client_range_map client_ranges; // client(s) can write to these ranges
// dirfrag, recursive accountin
frag_info_t dirstat; // protected by my filelock
quota_info_t quota;
- mds_rank_t export_pin;
+ mds_rank_t export_pin = MDS_RANK_NONE;
// special stuff
- version_t version; // auth only
- version_t file_data_version; // auth only
- version_t xattr_version;
+ version_t version = 0; // auth only
+ version_t file_data_version = 0; // auth only
+ version_t xattr_version = 0;
utime_t last_scrub_stamp; // start time of last complete scrub
- version_t last_scrub_version;// (parent) start version of last complete scrub
+ version_t last_scrub_version = 0;// (parent) start version of last complete scrub
- version_t backtrace_version;
+ version_t backtrace_version = 0;
snapid_t oldest_snap;
- string stray_prior_path; //stores path before unlink
-
- inode_t() : ino(0), rdev(0),
- mode(0), uid(0), gid(0), nlink(0),
- size(0), max_size_ever(0),
- truncate_seq(0), truncate_size(0), truncate_from(0),
- truncate_pending(0),
- time_warp_seq(0), change_attr(0),
- export_pin(MDS_RANK_NONE),
- version(0), file_data_version(0), xattr_version(0),
- last_scrub_version(0), backtrace_version(0) {
+ std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
+
+ inode_t()
+ {
clear_layout();
memset(&dir_layout, 0, sizeof(dir_layout));
    memset(&quota, 0, sizeof(quota));
void encode(bufferlist &bl, uint64_t features) const;
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
- static void generate_test_instances(list<inode_t*>& ls);
+ static void generate_test_instances(std::list<inode_t*>& ls);
/**
* Compare this inode_t with another that represent *the same inode*
* at different points in time.
private:
bool older_is_consistent(const inode_t &other) const;
};
-WRITE_CLASS_ENCODER_FEATURES(inode_t)
+// These methods may be moved back to mdstypes.cc when we have pmr
+template<template<typename> class Allocator>
+void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
+{
+ ENCODE_START(15, 6, bl);
+
+ ::encode(ino, bl);
+ ::encode(rdev, bl);
+ ::encode(ctime, bl);
+
+ ::encode(mode, bl);
+ ::encode(uid, bl);
+ ::encode(gid, bl);
+
+ ::encode(nlink, bl);
+ {
+ // removed field
+ bool anchored = 0;
+ ::encode(anchored, bl);
+ }
+
+ ::encode(dir_layout, bl);
+ ::encode(layout, bl, features);
+ ::encode(size, bl);
+ ::encode(truncate_seq, bl);
+ ::encode(truncate_size, bl);
+ ::encode(truncate_from, bl);
+ ::encode(truncate_pending, bl);
+ ::encode(mtime, bl);
+ ::encode(atime, bl);
+ ::encode(time_warp_seq, bl);
+ ::encode(client_ranges, bl);
+
+ ::encode(dirstat, bl);
+ ::encode(rstat, bl);
+ ::encode(accounted_rstat, bl);
+
+ ::encode(version, bl);
+ ::encode(file_data_version, bl);
+ ::encode(xattr_version, bl);
+ ::encode(backtrace_version, bl);
+ ::encode(old_pools, bl);
+ ::encode(max_size_ever, bl);
+ ::encode(inline_data, bl);
+ ::encode(quota, bl);
+
+ ::encode(stray_prior_path, bl);
+
+ ::encode(last_scrub_version, bl);
+ ::encode(last_scrub_stamp, bl);
+
+ ::encode(btime, bl);
+ ::encode(change_attr, bl);
+
+ ::encode(export_pin, bl);
+
+ ENCODE_FINISH(bl);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::decode(bufferlist::iterator &p)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
+
+ ::decode(ino, p);
+ ::decode(rdev, p);
+ ::decode(ctime, p);
+
+ ::decode(mode, p);
+ ::decode(uid, p);
+ ::decode(gid, p);
+
+ ::decode(nlink, p);
+ {
+ bool anchored;
+ ::decode(anchored, p);
+ }
+
+ if (struct_v >= 4)
+ ::decode(dir_layout, p);
+ else
+ memset(&dir_layout, 0, sizeof(dir_layout));
+ ::decode(layout, p);
+ ::decode(size, p);
+ ::decode(truncate_seq, p);
+ ::decode(truncate_size, p);
+ ::decode(truncate_from, p);
+ if (struct_v >= 5)
+ ::decode(truncate_pending, p);
+ else
+ truncate_pending = 0;
+ ::decode(mtime, p);
+ ::decode(atime, p);
+ ::decode(time_warp_seq, p);
+ if (struct_v >= 3) {
+ ::decode(client_ranges, p);
+ } else {
+ map<client_t, client_writeable_range_t::byte_range_t> m;
+ ::decode(m, p);
+ for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
+ q = m.begin(); q != m.end(); ++q)
+ client_ranges[q->first].range = q->second;
+ }
+
+ ::decode(dirstat, p);
+ ::decode(rstat, p);
+ ::decode(accounted_rstat, p);
+
+ ::decode(version, p);
+ ::decode(file_data_version, p);
+ ::decode(xattr_version, p);
+ if (struct_v >= 2)
+ ::decode(backtrace_version, p);
+ if (struct_v >= 7)
+ ::decode(old_pools, p);
+ if (struct_v >= 8)
+ ::decode(max_size_ever, p);
+ if (struct_v >= 9) {
+ ::decode(inline_data, p);
+ } else {
+ inline_data.version = CEPH_INLINE_NONE;
+ }
+ if (struct_v < 10)
+ backtrace_version = 0; // force update backtrace
+ if (struct_v >= 11)
+ ::decode(quota, p);
+
+ if (struct_v >= 12) {
+ std::string tmp;
+ ::decode(tmp, p);
+ stray_prior_path = std::basic_string<char,std::char_traits<char>,Allocator<char>>(boost::string_view(tmp));
+ }
+
+ if (struct_v >= 13) {
+ ::decode(last_scrub_version, p);
+ ::decode(last_scrub_stamp, p);
+ }
+ if (struct_v >= 14) {
+ ::decode(btime, p);
+ ::decode(change_attr, p);
+ } else {
+ btime = utime_t();
+ change_attr = 0;
+ }
+
+ if (struct_v >= 15) {
+ ::decode(export_pin, p);
+ } else {
+ export_pin = MDS_RANK_NONE;
+ }
+
+ DECODE_FINISH(p);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::dump(Formatter *f) const
+{
+ f->dump_unsigned("ino", ino);
+ f->dump_unsigned("rdev", rdev);
+ f->dump_stream("ctime") << ctime;
+ f->dump_stream("btime") << btime;
+ f->dump_unsigned("mode", mode);
+ f->dump_unsigned("uid", uid);
+ f->dump_unsigned("gid", gid);
+ f->dump_unsigned("nlink", nlink);
+
+ f->open_object_section("dir_layout");
+ ::dump(dir_layout, f);
+ f->close_section();
+
+ f->dump_object("layout", layout);
+
+ f->open_array_section("old_pools");
+ for (const auto &p : old_pools) {
+ f->dump_int("pool", p);
+ }
+ f->close_section();
+
+ f->dump_unsigned("size", size);
+ f->dump_unsigned("truncate_seq", truncate_seq);
+ f->dump_unsigned("truncate_size", truncate_size);
+ f->dump_unsigned("truncate_from", truncate_from);
+ f->dump_unsigned("truncate_pending", truncate_pending);
+ f->dump_stream("mtime") << mtime;
+ f->dump_stream("atime") << atime;
+ f->dump_unsigned("time_warp_seq", time_warp_seq);
+ f->dump_unsigned("change_attr", change_attr);
+ f->dump_int("export_pin", export_pin);
+
+ f->open_array_section("client_ranges");
+ for (const auto &p : client_ranges) {
+ f->open_object_section("client");
+ f->dump_unsigned("client", p.first.v);
+ p.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_object_section("dirstat");
+ dirstat.dump(f);
+ f->close_section();
+
+ f->open_object_section("rstat");
+ rstat.dump(f);
+ f->close_section();
+
+ f->open_object_section("accounted_rstat");
+ accounted_rstat.dump(f);
+ f->close_section();
+
+ f->dump_unsigned("version", version);
+ f->dump_unsigned("file_data_version", file_data_version);
+ f->dump_unsigned("xattr_version", xattr_version);
+ f->dump_unsigned("backtrace_version", backtrace_version);
+
+ f->dump_string("stray_prior_path", stray_prior_path);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::generate_test_instances(list<inode_t*>& ls)
+{
+ ls.push_back(new inode_t<Allocator>);
+ ls.push_back(new inode_t<Allocator>);
+ ls.back()->ino = 1;
+ // i am lazy.
+}
+
+template<template<typename> class Allocator>
+int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
+{
+ assert(ino == other.ino);
+ *divergent = false;
+ if (version == other.version) {
+ if (rdev != other.rdev ||
+ ctime != other.ctime ||
+ btime != other.btime ||
+ mode != other.mode ||
+ uid != other.uid ||
+ gid != other.gid ||
+ nlink != other.nlink ||
+ memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
+ layout != other.layout ||
+ old_pools != other.old_pools ||
+ size != other.size ||
+ max_size_ever != other.max_size_ever ||
+ truncate_seq != other.truncate_seq ||
+ truncate_size != other.truncate_size ||
+ truncate_from != other.truncate_from ||
+ truncate_pending != other.truncate_pending ||
+ change_attr != other.change_attr ||
+ mtime != other.mtime ||
+ atime != other.atime ||
+ time_warp_seq != other.time_warp_seq ||
+ inline_data != other.inline_data ||
+ client_ranges != other.client_ranges ||
+ !(dirstat == other.dirstat) ||
+ !(rstat == other.rstat) ||
+ !(accounted_rstat == other.accounted_rstat) ||
+ file_data_version != other.file_data_version ||
+ xattr_version != other.xattr_version ||
+ backtrace_version != other.backtrace_version) {
+ *divergent = true;
+ }
+ return 0;
+ } else if (version > other.version) {
+ *divergent = !older_is_consistent(other);
+ return 1;
+ } else {
+ assert(version < other.version);
+ *divergent = !other.older_is_consistent(*this);
+ return -1;
+ }
+}
+
+template<template<typename> class Allocator>
+bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
+{
+ if (max_size_ever < other.max_size_ever ||
+ truncate_seq < other.truncate_seq ||
+ time_warp_seq < other.time_warp_seq ||
+ inline_data.version < other.inline_data.version ||
+ dirstat.version < other.dirstat.version ||
+ rstat.version < other.rstat.version ||
+ accounted_rstat.version < other.accounted_rstat.version ||
+ file_data_version < other.file_data_version ||
+ xattr_version < other.xattr_version ||
+ backtrace_version < other.backtrace_version) {
+ return false;
+ }
+ return true;
+}
+
+template<template<typename> class Allocator>
+inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
+{
+ ENCODE_DUMP_PRE();
+ c.encode(bl, features);
+ ENCODE_DUMP_POST(cl);
+}
+template<template<typename> class Allocator>
+inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::iterator &p)
+{
+ c.decode(p);
+}
+
+template<template<typename> class Allocator>
+using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
+
+template<template<typename> class Allocator>
+using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
/*
* old_inode_t
*/
+template<template<typename> class Allocator = std::allocator>
struct old_inode_t {
snapid_t first;
- inode_t inode;
- std::map<string,bufferptr> xattrs;
+ inode_t<Allocator> inode;
+ xattr_map<Allocator> xattrs;
void encode(bufferlist &bl, uint64_t features) const;
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
- static void generate_test_instances(list<old_inode_t*>& ls);
+ static void generate_test_instances(std::list<old_inode_t*>& ls);
};
-WRITE_CLASS_ENCODER_FEATURES(old_inode_t)
+
+// These methods may be moved back to mdstypes.cc when we have pmr
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
+{
+ ENCODE_START(2, 2, bl);
+ ::encode(first, bl);
+ ::encode(inode, bl, features);
+ ::encode(xattrs, bl);
+ ENCODE_FINISH(bl);
+}
+
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::decode(bufferlist::iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ ::decode(first, bl);
+ ::decode(inode, bl);
+ ::decode(xattrs, bl);
+ DECODE_FINISH(bl);
+}
+
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::dump(Formatter *f) const
+{
+ f->dump_unsigned("first", first);
+ inode.dump(f);
+ f->open_object_section("xattrs");
+ for (const auto &p : xattrs) {
+ std::string v(p.second.c_str(), p.second.length());
+ f->dump_string(p.first.c_str(), v);
+ }
+ f->close_section();
+}
+
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
+{
+ ls.push_back(new old_inode_t<Allocator>);
+ ls.push_back(new old_inode_t<Allocator>);
+ ls.back()->first = 2;
+ std::list<inode_t<Allocator>*> ils;
+ inode_t<Allocator>::generate_test_instances(ils);
+ ls.back()->inode = *ils.back();
+ ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
+ ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
+}
+
+template<template<typename> class Allocator>
+inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
+{
+ ENCODE_DUMP_PRE();
+ c.encode(bl, features);
+ ENCODE_DUMP_POST(cl);
+}
+template<template<typename> class Allocator>
+inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::iterator &p)
+{
+ c.decode(p);
+}
/*
* like an inode, but for a dir frag
*/
struct fnode_t {
- version_t version;
+ version_t version = 0;
snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
frag_info_t fragstat, accounted_fragstat;
nest_info_t rstat, accounted_rstat;
- damage_flags_t damage_flags;
+ damage_flags_t damage_flags = 0;
// we know we and all our descendants have been scrubbed since this version
- version_t recursive_scrub_version;
+ version_t recursive_scrub_version = 0;
utime_t recursive_scrub_stamp;
// version at which we last scrubbed our personal data structures
- version_t localized_scrub_version;
+ version_t localized_scrub_version = 0;
utime_t localized_scrub_stamp;
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
static void generate_test_instances(list<fnode_t*>& ls);
- fnode_t() : version(0), damage_flags(0),
- recursive_scrub_version(0), localized_scrub_version(0) {}
+ fnode_t() {}
};
WRITE_CLASS_ENCODER(fnode_t)
// dentries
struct dentry_key_t {
- snapid_t snapid;
- const char *name;
- __u32 hash;
- dentry_key_t() : snapid(0), name(0), hash(0) {}
- dentry_key_t(snapid_t s, const char *n, __u32 h=0) :
+ snapid_t snapid = 0;
+ boost::string_view name;
+ __u32 hash = 0;
+ dentry_key_t() {}
+ dentry_key_t(snapid_t s, boost::string_view n, __u32 h=0) :
snapid(s), name(n), hash(h) {}
- bool is_valid() { return name || snapid; }
+ bool is_valid() { return name.length() || snapid; }
// encode into something that can be decoded as a string.
// name_ (head) or name_%x (!head)
::decode(key, bl);
decode_helper(key, nm, sn);
}
- static void decode_helper(const string& key, string& nm, snapid_t& sn) {
+ static void decode_helper(boost::string_view key, string& nm, snapid_t& sn) {
size_t i = key.find_last_of('_');
assert(i != string::npos);
- if (key.compare(i+1, string::npos, "head") == 0) {
+ if (key.compare(i+1, boost::string_view::npos, "head") == 0) {
// name_head
sn = CEPH_NOSNAP;
} else {
// name_%x
long long unsigned x = 0;
- sscanf(key.c_str() + i + 1, "%llx", &x);
+ std::string x_str(key.substr(i+1));
+ sscanf(x_str.c_str(), "%llx", &x);
sn = x;
}
- nm = string(key.c_str(), i);
+ nm = std::string(key.substr(0, i));
}
};
int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
if (c)
return c < 0;
- c = strcmp(k1.name, k2.name);
+ c = k1.name.compare(k2.name);
if (c)
return c < 0;
return k1.snapid < k2.snapid;
string name;
snapid_t snapid;
string_snap_t() {}
- string_snap_t(const string& n, snapid_t s) : name(n), snapid(s) {}
+ string_snap_t(boost::string_view n, snapid_t s) : name(n), snapid(s) {}
string_snap_t(const char *n, snapid_t s) : name(n), snapid(s) {}
void encode(bufferlist& bl) const;
WRITE_CLASS_ENCODER(string_snap_t)
inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
- int c = strcmp(l.name.c_str(), r.name.c_str());
+ int c = l.name.compare(r.name);
return c < 0 || (c == 0 && l.snapid < r.snapid);
}
* pending mutation state in the table.
*/
struct mds_table_pending_t {
- uint64_t reqid;
- __s32 mds;
- version_t tid;
- mds_table_pending_t() : reqid(0), mds(0), tid(0) {}
+ uint64_t reqid = 0;
+ __s32 mds = 0;
+ version_t tid = 0;
+ mds_table_pending_t() {}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
struct metareqid_t {
entity_name_t name;
- uint64_t tid;
- metareqid_t() : tid(0) {}
+ uint64_t tid = 0;
+ metareqid_t() {}
metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
void encode(bufferlist& bl) const {
::encode(name, bl);
memset(&capinfo, 0, sizeof(capinfo));
snap_follows = 0;
}
- cap_reconnect_t(uint64_t cap_id, inodeno_t pino, const string& p, int w, int i,
+ cap_reconnect_t(uint64_t cap_id, inodeno_t pino, boost::string_view p, int w, int i,
inodeno_t sr, snapid_t sf, bufferlist& lb) :
path(p) {
capinfo.cap_id = cap_id;
// dir frag
struct dirfrag_t {
- inodeno_t ino;
+ inodeno_t ino = 0;
frag_t frag;
- dirfrag_t() : ino(0) { }
+ dirfrag_t() {}
dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
void encode(bufferlist& bl) const {
class inode_load_vec_t {
static const int NUM = 2;
- std::vector < DecayCounter > vec;
+ std::array<DecayCounter, NUM> vec;
public:
explicit inode_load_vec_t(const utime_t &now)
- : vec(NUM, DecayCounter(now))
+ : vec{DecayCounter(now), DecayCounter(now)}
{}
// for dencoder infrastructure
- inode_load_vec_t() :
- vec(NUM, DecayCounter())
- {}
+ inode_load_vec_t() {}
DecayCounter &get(int t) {
assert(t < NUM);
return vec[t];
class dirfrag_load_vec_t {
public:
static const int NUM = 5;
- std::vector < DecayCounter > vec;
+ std::array<DecayCounter, NUM> vec;
explicit dirfrag_load_vec_t(const utime_t &now)
- : vec(NUM, DecayCounter(now))
- { }
- // for dencoder infrastructure
- dirfrag_load_vec_t()
- : vec(NUM, DecayCounter())
+ : vec{
+ DecayCounter(now),
+ DecayCounter(now),
+ DecayCounter(now),
+ DecayCounter(now),
+ DecayCounter(now)
+ }
{}
+ // for dencoder infrastructure
+ dirfrag_load_vec_t() {}
void encode(bufferlist &bl) const {
ENCODE_START(2, 2, bl);
- for (int i=0; i<NUM; i++)
- ::encode(vec[i], bl);
+ for (const auto &i : vec) {
+ ::encode(i, bl);
+ }
ENCODE_FINISH(bl);
}
void decode(const utime_t &t, bufferlist::iterator &p) {
DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
- for (int i=0; i<NUM; i++)
- ::decode(vec[i], t, p);
+ for (auto &i : vec) {
+ ::decode(i, t, p);
+ }
DECODE_FINISH(p);
}
// for dencoder infrastructure
return vec[t];
}
void adjust(utime_t now, const DecayRate& rate, double d) {
- for (int i=0; i<NUM; i++)
- vec[i].adjust(now, rate, d);
+ for (auto &i : vec) {
+ i.adjust(now, rate, d);
+ }
}
void zero(utime_t now) {
- for (int i=0; i<NUM; i++)
- vec[i].reset(now);
+ for (auto &i : vec) {
+ i.reset(now);
+ }
}
double meta_load(utime_t now, const DecayRate& rate) {
return
dirfrag_load_vec_t auth;
dirfrag_load_vec_t all;
- double req_rate;
- double cache_hit_rate;
- double queue_len;
+ double req_rate = 0.0;
+ double cache_hit_rate = 0.0;
+ double queue_len = 0.0;
- double cpu_load_avg;
+ double cpu_load_avg = 0.0;
- explicit mds_load_t(const utime_t &t) :
- auth(t), all(t), req_rate(0), cache_hit_rate(0),
- queue_len(0), cpu_load_avg(0)
- {}
+ explicit mds_load_t(const utime_t &t) : auth(t), all(t) {}
// mostly for the dencoder infrastructure
- mds_load_t() :
- auth(), all(),
- req_rate(0), cache_hit_rate(0), queue_len(0), cpu_load_avg(0)
- {}
+ mds_load_t() : auth(), all() {}
double mds_load(); // defiend in MDBalancer.cc
void encode(bufferlist& bl) const;
public:
static const int MAX = 4;
int last[MAX];
- int p, n;
+ int p = 0, n = 0;
DecayCounter count;
public:
- load_spread_t() : p(0), n(0), count(ceph_clock_now())
+ load_spread_t() : count(ceph_clock_now())
{
for (int i=0; i<MAX; i++)
last[i] = -1;
class MDSCacheObjectInfo {
public:
- inodeno_t ino;
+ inodeno_t ino = 0;
dirfrag_t dirfrag;
string dname;
snapid_t snapid;
- MDSCacheObjectInfo() : ino(0) {}
+ MDSCacheObjectInfo() {}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
*
*/
+#include <boost/utility/string_view.hpp>
+
#include "snap.h"
#include "common/Formatter.h"
<< "' " << sn.stamp << ")";
}
-const string& SnapInfo::get_long_name()
+boost::string_view SnapInfo::get_long_name()
{
if (long_name.length() == 0) {
char nm[80];
#ifndef CEPH_MDS_SNAP_H
#define CEPH_MDS_SNAP_H
+#include <boost/utility/string_view.hpp>
+
#include "mdstypes.h"
#include "common/snap_types.h"
void dump(Formatter *f) const;
static void generate_test_instances(list<SnapInfo*>& ls);
- const string& get_long_name();
+ boost::string_view get_long_name();
};
WRITE_CLASS_ENCODER(SnapInfo)
#ifndef CEPH_MCACHEEXPIRE_H
#define CEPH_MCACHEEXPIRE_H
+#include <boost/utility/string_view.hpp>
+
#include "mds/mdstypes.h"
class MCacheExpire : public Message {
void add_dir(dirfrag_t r, dirfrag_t df, unsigned nonce) {
realms[r].dirs[df] = nonce;
}
- void add_dentry(dirfrag_t r, dirfrag_t df, const string& dn, snapid_t last, unsigned nonce) {
- realms[r].dentries[df][pair<string,snapid_t>(dn,last)] = nonce;
+ void add_dentry(dirfrag_t r, dirfrag_t df, boost::string_view dn, snapid_t last, unsigned nonce) {
+ realms[r].dentries[df][pair<string,snapid_t>(std::string(dn),last)] = nonce;
}
void add_realm(dirfrag_t df, realm& r) {
#ifndef CEPH_MCLIENTLEASE_H
#define CEPH_MCLIENTLEASE_H
+#include <boost/utility/string_view.hpp>
+
#include "msg/Message.h"
struct MClientLease : public Message {
struct ceph_mds_lease h;
- string dname;
+ std::string dname;
int get_action() const { return h.action; }
ceph_seq_t get_seq() const { return h.seq; }
h.last = sl;
h.duration_ms = 0;
}
- MClientLease(int ac, ceph_seq_t seq, int m, uint64_t i, uint64_t sf, uint64_t sl, const string& d) :
+ MClientLease(int ac, ceph_seq_t seq, int m, uint64_t i, uint64_t sf, uint64_t sl, boost::string_view d) :
Message(CEPH_MSG_CLIENT_LEASE),
dname(d) {
h.action = ac;
*
*/
+#include <boost/utility/string_view.hpp>
+
#include "msg/Message.h"
#include "include/filepath.h"
#include "mds/mdstypes.h"
void set_retry_attempt(int a) { head.num_retry = a; }
void set_filepath(const filepath& fp) { path = fp; }
void set_filepath2(const filepath& fp) { path2 = fp; }
- void set_string2(const char *s) { path2.set_path(s, 0); }
+ void set_string2(const char *s) { path2.set_path(boost::string_view(s), 0); }
void set_caller_uid(unsigned u) { head.caller_uid = u; }
void set_caller_gid(unsigned g) { head.caller_gid = g; }
void set_gid_list(int count, const gid_t *gids) {
#ifndef CEPH_MCOMMANDREPLY_H
#define CEPH_MCOMMANDREPLY_H
+#include <boost/utility/string_view.hpp>
+
#include "msg/Message.h"
#include "MCommand.h"
: Message(MSG_COMMAND_REPLY), r(_r) {
header.tid = m->get_tid();
}
- MCommandReply(int _r, string s)
+ MCommandReply(int _r, boost::string_view s)
: Message(MSG_COMMAND_REPLY),
r(_r), rs(s) { }
private:
#ifndef CEPH_MDENTRYLINK_H
#define CEPH_MDENTRYLINK_H
+#include <boost/utility/string_view.hpp>
+
class MDentryLink : public Message {
dirfrag_t subtree;
dirfrag_t dirfrag;
MDentryLink() :
Message(MSG_MDS_DENTRYLINK) { }
- MDentryLink(dirfrag_t r, dirfrag_t df, string& n, bool p) :
+ MDentryLink(dirfrag_t r, dirfrag_t df, boost::string_view n, bool p) :
Message(MSG_MDS_DENTRYLINK),
subtree(r),
dirfrag(df),
#ifndef CEPH_MDENTRYUNLINK_H
#define CEPH_MDENTRYUNLINK_H
+#include <boost/utility/string_view.hpp>
+
class MDentryUnlink : public Message {
dirfrag_t dirfrag;
string dn;
MDentryUnlink() :
Message(MSG_MDS_DENTRYUNLINK) { }
- MDentryUnlink(dirfrag_t df, string& n) :
+ MDentryUnlink(dirfrag_t df, boost::string_view n) :
Message(MSG_MDS_DENTRYUNLINK),
dirfrag(df),
dn(n) {}
#include "msg/Message.h"
class MDirUpdate : public Message {
- mds_rank_t from_mds;
- dirfrag_t dirfrag;
- int32_t dir_rep;
- int32_t discover;
- compact_set<int32_t> dir_rep_by;
- filepath path;
- int tried_discover;
+public:
+ MDirUpdate() : Message(MSG_MDS_DIRUPDATE) {}
+ MDirUpdate(mds_rank_t f,
+ dirfrag_t dirfrag,
+ int dir_rep,
+ const std::set<int32_t>& dir_rep_by,
+ filepath& path,
+ bool discover = false) :
+ Message(MSG_MDS_DIRUPDATE), from_mds(f), dirfrag(dirfrag),
+ dir_rep(dir_rep), dir_rep_by(dir_rep_by), path(path) {
+ this->discover = discover ? 5 : 0;
+ }
- public:
mds_rank_t get_source_mds() const { return from_mds; }
dirfrag_t get_dirfrag() const { return dirfrag; }
int get_dir_rep() const { return dir_rep; }
- const compact_set<int>& get_dir_rep_by() const { return dir_rep_by; }
+ const std::set<int32_t>& get_dir_rep_by() const { return dir_rep_by; }
bool should_discover() const { return discover > tried_discover; }
const filepath& get_path() const { return path; }
bool has_tried_discover() const { return tried_discover > 0; }
void inc_tried_discover() { ++tried_discover; }
- MDirUpdate() : Message(MSG_MDS_DIRUPDATE), tried_discover(0) {}
- MDirUpdate(mds_rank_t f,
- dirfrag_t dirfrag,
- int dir_rep,
- compact_set<int>& dir_rep_by,
- filepath& path,
- bool discover = false) :
- Message(MSG_MDS_DIRUPDATE), tried_discover(0) {
- this->from_mds = f;
- this->dirfrag = dirfrag;
- this->dir_rep = dir_rep;
- this->dir_rep_by = dir_rep_by;
- this->discover = discover ? 5 : 0;
- this->path = path;
- }
-private:
- ~MDirUpdate() override {}
-
-public:
const char *get_type_name() const override { return "dir_update"; }
void print(ostream& out) const override {
out << "dir_update(" << get_dirfrag() << ")";
::encode(dir_rep_by, payload);
::encode(path, payload);
}
+
+private:
+ ~MDirUpdate() override {}
+
+ mds_rank_t from_mds = -1;
+ dirfrag_t dirfrag;
+ int32_t dir_rep = 5;
+ int32_t discover = 5;
+ std::set<int32_t> dir_rep_by;
+ filepath path;
+ int tried_discover = 0;
};
#endif
bool is_flag_error_dn() { return flag_error_dn; }
bool is_flag_error_dir() { return flag_error_dir; }
- std::string& get_error_dentry() { return error_dentry; }
+ const std::string& get_error_dentry() { return error_dentry; }
int get_starts_with() { return starts_with; }
}
// void set_flag_forward() { flag_forward = true; }
- void set_flag_error_dn(const std::string& dn) {
+ void set_flag_error_dn(boost::string_view dn) {
flag_error_dn = true;
- error_dentry = dn;
+ error_dentry = std::string(dn);
}
void set_flag_error_dir() {
flag_error_dir = true;
void set_dir_auth_hint(int a) {
dir_auth_hint = a;
}
- void set_error_dentry(const std::string& dn) {
- error_dentry = dn;
+ void set_error_dentry(boost::string_view dn) {
+ error_dentry = std::string(dn);
}
#ifndef CEPH_MMDSBEACON_H
#define CEPH_MMDSBEACON_H
+#include <boost/utility/string_view.hpp>
+
#include "messages/PaxosServiceMessage.h"
#include "include/types.h"
}
MDSHealthMetric() : type(MDS_HEALTH_NULL), sev(HEALTH_OK) {}
- MDSHealthMetric(mds_metric_t type_, health_status_t sev_, std::string const &message_)
+ MDSHealthMetric(mds_metric_t type_, health_status_t sev_, boost::string_view message_)
: type(type_), sev(sev_), message(message_) {}
};
WRITE_CLASS_ENCODER(MDSHealthMetric)
#ifndef CEPH_MMDSCACHEREJOIN_H
#define CEPH_MMDSCACHEREJOIN_H
+#include <boost/utility/string_view.hpp>
+
#include "msg/Message.h"
#include "include/types.h"
void add_weak_dirfrag(dirfrag_t df) {
weak_dirfrags.insert(df);
}
- void add_weak_dentry(inodeno_t dirino, const string& dname, snapid_t last, dn_weak& dnw) {
+ void add_weak_dentry(inodeno_t dirino, boost::string_view dname, snapid_t last, dn_weak& dnw) {
weak[dirino][string_snap_t(dname, last)] = dnw;
}
- void add_weak_primary_dentry(inodeno_t dirino, const string& dname, snapid_t first, snapid_t last, inodeno_t ino) {
+ void add_weak_primary_dentry(inodeno_t dirino, boost::string_view dname, snapid_t first, snapid_t last, inodeno_t ino) {
weak[dirino][string_snap_t(dname, last)] = dn_weak(first, ino);
}
- void add_strong_dentry(dirfrag_t df, const string& dname, snapid_t first, snapid_t last, inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int ls) {
+ void add_strong_dentry(dirfrag_t df, boost::string_view dname, snapid_t first, snapid_t last, inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int ls) {
strong_dentries[df][string_snap_t(dname, last)] = dn_strong(first, pi, ri, rdt, n, ls);
}
- void add_dentry_authpin(dirfrag_t df, const string& dname, snapid_t last,
+ void add_dentry_authpin(dirfrag_t df, boost::string_view dname, snapid_t last,
const metareqid_t& ri, __u32 attempt) {
authpinned_dentries[df][string_snap_t(dname, last)].push_back(slave_reqid(ri, attempt));
}
- void add_dentry_xlock(dirfrag_t df, const string& dname, snapid_t last,
+ void add_dentry_xlock(dirfrag_t df, boost::string_view dname, snapid_t last,
const metareqid_t& ri, __u32 attempt) {
xlocked_dentries[df][string_snap_t(dname, last)] = slave_reqid(ri, attempt);
}
class MOSDPGUpdateLogMissing : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 2;
+ static const int HEAD_VERSION = 3;
static const int COMPAT_VERSION = 1;
shard_id_t from;
ceph_tid_t rep_tid;
mempool::osd_pglog::list<pg_log_entry_t> entries;
+ // piggybacked osd/pg state
+ eversion_t pg_trim_to; // primary->replica: trim to here
+ eversion_t pg_roll_forward_to; // primary->replica: trim rollback info to here
epoch_t get_epoch() const { return map_epoch; }
spg_t get_pgid() const { return pgid; }
shard_id_t from,
epoch_t epoch,
epoch_t min_epoch,
- ceph_tid_t rep_tid)
+ ceph_tid_t rep_tid,
+ eversion_t pg_trim_to,
+ eversion_t pg_roll_forward_to)
: MOSDFastDispatchOp(MSG_OSD_PG_UPDATE_LOG_MISSING, HEAD_VERSION,
COMPAT_VERSION),
map_epoch(epoch),
pgid(pgid),
from(from),
rep_tid(rep_tid),
- entries(entries) {}
+ entries(entries),
+ pg_trim_to(pg_trim_to),
+ pg_roll_forward_to(pg_roll_forward_to)
+ {}
private:
~MOSDPGUpdateLogMissing() override {}
out << "pg_update_log_missing(" << pgid << " epoch " << map_epoch
<< "/" << min_epoch
<< " rep_tid " << rep_tid
- << " entries " << entries << ")";
+ << " entries " << entries
+ << " trim_to " << pg_trim_to
+ << " roll_forward_to " << pg_roll_forward_to
+ << ")";
}
void encode_payload(uint64_t features) override {
::encode(rep_tid, payload);
::encode(entries, payload);
::encode(min_epoch, payload);
+ ::encode(pg_trim_to, payload);
+ ::encode(pg_roll_forward_to, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
} else {
min_epoch = map_epoch;
}
+ if (header.version >= 3) {
+ ::decode(pg_trim_to, p);
+ ::decode(pg_roll_forward_to, p);
+ }
}
};
class MOSDPGUpdateLogMissingReply : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 2;
+ static const int HEAD_VERSION = 3;
static const int COMPAT_VERSION = 1;
spg_t pgid;
shard_id_t from;
ceph_tid_t rep_tid;
+ // piggybacked osd state
+ eversion_t last_complete_ondisk;
epoch_t get_epoch() const { return map_epoch; }
spg_t get_pgid() const { return pgid; }
shard_id_t from,
epoch_t epoch,
epoch_t min_epoch,
- ceph_tid_t rep_tid)
+ ceph_tid_t rep_tid,
+ eversion_t last_complete_ondisk)
: MOSDFastDispatchOp(
MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY,
HEAD_VERSION,
min_epoch(min_epoch),
pgid(pgid),
from(from),
- rep_tid(rep_tid)
+ rep_tid(rep_tid),
+ last_complete_ondisk(last_complete_ondisk)
{}
private:
void print(ostream& out) const override {
out << "pg_update_log_missing_reply(" << pgid << " epoch " << map_epoch
<< "/" << min_epoch
- << " rep_tid " << rep_tid << ")";
+ << " rep_tid " << rep_tid
+ << " lcod " << last_complete_ondisk << ")";
}
void encode_payload(uint64_t features) override {
::encode(from, payload);
::encode(rep_tid, payload);
::encode(min_epoch, payload);
+ ::encode(last_complete_ondisk, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
} else {
min_epoch = map_epoch;
}
+ if (header.version >= 3) {
+ ::decode(last_complete_ondisk, p);
+ }
}
};
Py_RETURN_NONE;
}
+static PyObject*
+ceph_have_mon_connection(BaseMgrModule *self, PyObject *args)
+{
+ if (self->py_modules->get_monc().is_connected()) {
+ Py_RETURN_TRUE;
+ } else {
+ Py_RETURN_FALSE;
+ }
+}
+
PyMethodDef BaseMgrModule_methods[] = {
{"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS,
{"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS,
"Advertize a service URI served by this module"},
+ {"_ceph_have_mon_connection", (PyCFunction)ceph_have_mon_connection,
+ METH_NOARGS, "Find out whether this mgr daemon currently has "
+ "a connection to a monitor"},
+
{NULL, NULL, 0, NULL}
};
// Start communicating with daemons to learn statistics etc
int r = server.init(monc->get_global_id(), client_messenger->get_myaddr());
if (r < 0) {
- derr << "Initialize server fail"<< dendl;
- return;
+ derr << "Initialize server fail: " << cpp_strerror(r) << dendl;
+ // This is typically due to a bind() failure, so let's let
+ // systemd restart us.
+ exit(1);
}
dout(4) << "Initialized server at " << server.get_myaddr() << dendl;
bool MgrStandby::ms_dispatch(Message *m)
{
Mutex::Locker l(lock);
+ bool handled = false;
dout(4) << state_str() << " " << *m << dendl;
if (m->get_type() == MSG_MGR_MAP) {
} else if (active_mgr) {
auto am = active_mgr;
lock.Unlock();
- bool handled = am->ms_dispatch(m);
+ handled = am->ms_dispatch(m);
lock.Lock();
return handled;
} else {
return false;
}
+ if (m->get_type() == MSG_MGR_MAP) {
+ // let this pass through for mgrc
+ handled = false;
+ }
+ return handled;
}
dump_pyobject(name, PyFloat_FromDouble(d));
}
-void PyFormatter::dump_string(const char *name, const std::string& s)
+void PyFormatter::dump_string(const char *name, boost::string_view s)
{
- dump_pyobject(name, PyString_FromString(s.c_str()));
+ dump_pyobject(name, PyString_FromString(s.data()));
}
void PyFormatter::dump_bool(const char *name, bool b)
void dump_unsigned(const char *name, uint64_t u) override;
void dump_int(const char *name, int64_t u) override;
void dump_float(const char *name, double d) override;
- void dump_string(const char *name, const std::string& s) override;
+ void dump_string(const char *name, boost::string_view s) override;
std::ostream& dump_stream(const char *name) override;
void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
&max_iterations, &pool_list)) {
return nullptr;
}
+ if (!PyList_CheckExact(pool_list)) {
+ derr << __func__ << " pool_list not a list" << dendl;
+ return nullptr;
+ }
+ set<int64_t> pools;
+ for (auto i = 0; i < PyList_Size(pool_list); ++i) {
+ PyObject *pool_name = PyList_GET_ITEM(pool_list, i);
+ if (!PyString_Check(pool_name)) {
+ derr << __func__ << " " << pool_name << " not a string" << dendl;
+ return nullptr;
+ }
+ auto pool_id = self->osdmap->lookup_pg_pool_name(
+ PyString_AsString(pool_name));
+ if (pool_id < 0) {
+ derr << __func__ << " pool '" << PyString_AsString(pool_name)
+ << "' does not exist" << dendl;
+ return nullptr;
+ }
+ pools.insert(pool_id);
+ }
dout(10) << __func__ << " osdmap " << self->osdmap << " inc " << incobj->inc
<< " max_deviation " << max_deviation
<< " max_iterations " << max_iterations
+ << " pools " << pools
<< dendl;
- set<int64_t> pools;
- // FIXME: unpack pool_list and translate to pools set
int r = self->osdmap->calc_pg_upmaps(g_ceph_context,
max_deviation,
max_iterations,
return -ENOENT;
}
std::map<string, cmd_vartype> modified = cmdmap;
- modified["fs_name"] = fs->mds_map.get_fs_name();
+ modified["fs_name"] = std::string(fs->mds_map.get_fs_name());
return T::handle(mon, fsmap, op, modified, ss);
}
};
bool MgrMonitor::preprocess_beacon(MonOpRequestRef op)
{
MMgrBeacon *m = static_cast<MMgrBeacon*>(op->get_req());
+ mon->no_reply(op); // we never reply to beacons
dout(4) << "beacon from " << m->get_gid() << dendl;
if (!check_caps(op, m->get_fsid())) {
bool MgrStatMonitor::preprocess_report(MonOpRequestRef op)
{
+ mon->no_reply(op);
return false;
}
int authenticate(double timeout=0.0);
bool is_authenticated() const {return authenticated;}
+ bool is_connected() const { return active_con != nullptr; }
+
/**
* Try to flush as many log messages as we can in a single
* message. Use this before shutting down to transmit your
}
creating_pgs_t
-OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
+OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
+ const OSDMap& nextmap)
{
dout(10) << __func__ << dendl;
creating_pgs_t pending_creatings;
pending_creatings.last_scan_epoch = osdmap.get_epoch();
}
+ // filter out any pgs that shouldn't exist.
+ {
+ auto i = pending_creatings.pgs.begin();
+ while (i != pending_creatings.pgs.end()) {
+ if (!nextmap.pg_exists(i->first)) {
+ dout(10) << __func__ << " removing pg " << i->first
+ << " which should not exist" << dendl;
+ i = pending_creatings.pgs.erase(i);
+ } else {
+ ++i;
+ }
+ }
+ }
+
// process queue
unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
const auto total = pending_creatings.pgs.size();
}
}
+ // clean inappropriate pg_upmap/pg_upmap_items (if any)
+ osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);
+
// features for osdmap and its incremental
uint64_t features = mon->get_quorum_con_features();
// and pg creating, also!
if (mon->monmap->get_required_features().contains_all(
ceph::features::mon::FEATURE_LUMINOUS)) {
- auto pending_creatings = update_pending_pgs(pending_inc);
+ auto pending_creatings = update_pending_pgs(pending_inc, tmp);
if (osdmap.get_epoch() &&
osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
dout(7) << __func__ << " in the middle of upgrading, "
mon->clog->debug() << m->get_target() << " reported immediately failed by "
<< m->get_orig_source_inst();
force_failure(target_osd, reporter);
+ mon->no_reply(op);
return true;
}
mon->clog->debug() << m->get_target() << " reported failed by "
auto m = static_cast<MOSDPGCreated*>(op->get_req());
dout(10) << __func__ << " " << *m << dendl;
auto session = m->get_session();
+ mon->no_reply(op);
if (!session) {
dout(10) << __func__ << ": no monitor session!" << dendl;
return true;
auto beacon = static_cast<MOSDBeacon*>(op->get_req());
// check caps
auto session = beacon->get_session();
+ mon->no_reply(op);
if (!session) {
dout(10) << __func__ << " no monitor session!" << dendl;
return true;
for (const auto& pg : creating_pgs.pgs) {
int acting_primary = -1;
auto pgid = pg.first;
+ if (!osdmap.pg_exists(pgid)) {
+ dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
+ << dendl;
+ continue;
+ }
auto mapped = pg.second.first;
dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
goto reply;
}
+ if (pool_opts_t::is_opt_name(var) &&
+ !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
+ ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+
selected_choices.insert(selected);
}
if (f) {
+ f->open_object_section("pool");
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
for(choices_set_t::const_iterator it = selected_choices.begin();
it != selected_choices.end(); ++it) {
choices_map_t::const_iterator i;
}
}
assert(i != ALL_CHOICES.end());
- bool pool_opt = pool_opts_t::is_opt_name(i->first);
- if (!pool_opt) {
- f->open_object_section("pool");
- f->dump_string("pool", poolstr);
- f->dump_int("pool_id", pool);
- }
switch(*it) {
case PG_NUM:
f->dump_int("pg_num", p->get_pg_num());
case WRITE_FADVISE_DONTNEED:
case NOSCRUB:
case NODEEP_SCRUB:
- f->dump_string(i->first.c_str(),
- p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
- "true" : "false");
+ f->dump_bool(i->first.c_str(),
+ p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
break;
case HIT_SET_PERIOD:
f->dump_int("hit_set_period", p->hit_set_period);
case CSUM_MIN_BLOCK:
pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
if (p->opts.is_set(key)) {
- f->open_object_section("pool");
- f->dump_string("pool", poolstr);
- f->dump_int("pool_id", pool);
if(*it == CSUM_TYPE) {
int val;
p->opts.get(pool_opts_t::CSUM_TYPE, &val);
} else {
p->opts.dump(i->first, f.get());
}
- f->close_section();
- f->flush(rdata);
- }
+ }
break;
}
- if (!pool_opt) {
- f->close_section();
- f->flush(rdata);
- }
}
-
+ f->close_section();
+ f->flush(rdata);
} else /* !f */ {
for(choices_set_t::const_iterator it = selected_choices.begin();
it != selected_choices.end(); ++it) {
string erasure_code_profile;
stringstream ss;
string rule_name;
+ int ret = 0;
if (m->auid)
- return prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
+ ret = prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
0, 0,
erasure_code_profile,
pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
else
- return prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
+ ret = prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
0, 0,
erasure_code_profile,
pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
+
+ if (ret < 0) {
+ dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
+ }
+ return ret;
}
int OSDMonitor::crush_rename_bucket(const string& srcname,
r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
crush_rule_name, &crush_rule, ss);
if (r) {
- dout(10) << " prepare_pool_crush_rule returns " << r << dendl;
+ dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
return r;
}
if (g_conf->mon_osd_crush_smoke_test) {
r = tester.test_with_fork(g_conf->mon_lease);
auto duration = ceph::coarse_mono_clock::now() - start;
if (r < 0) {
- dout(10) << " tester.test_with_fork returns " << r
+ dout(10) << "tester.test_with_fork returns " << r
<< ": " << err.str() << dendl;
*ss << "crush test failed with " << r << ": " << err.str();
return r;
unsigned size, min_size;
r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
if (r) {
- dout(10) << " prepare_pool_size returns " << r << dendl;
+ dout(10) << "prepare_pool_size returns " << r << dendl;
return r;
}
r = check_pg_num(-1, pg_num, size, ss);
if (r) {
- dout(10) << " prepare_pool_size returns " << r << dendl;
+ dout(10) << "check_pg_num returns " << r << dendl;
return r;
}
uint32_t stripe_width = 0;
r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
if (r) {
- dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
+ dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
return r;
}
err = -ENOENT;
goto reply;
}
+ if (pending_inc.old_pools.count(pgid.pool())) {
+ ss << "pool of " << pgid << " is pending removal";
+ err = -ENOENT;
+ getline(ss, rs);
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
+ return true;
+ }
enum {
OP_PG_UPMAP,
}
bool implicit_rule_creation = false;
+ int64_t expected_num_objects = 0;
string rule_name;
cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
string erasure_code_profile;
rule_name = poolstr;
}
}
+ cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
+ expected_num_objects, int64_t(0));
} else {
//NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
- rule_name = erasure_code_profile;
+ // and put expected_num_objects to rule field
+ if (erasure_code_profile != "") { // cmd is from CLI
+ if (rule_name != "") {
+ string interr;
+ expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
+ if (interr.length()) {
+ ss << "error parsing integer value '" << rule_name << "': " << interr;
+ err = -EINVAL;
+ goto reply;
+ }
+ }
+ rule_name = erasure_code_profile;
+ } else { // cmd is well-formed
+ cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
+ expected_num_objects, int64_t(0));
+ }
}
if (!implicit_rule_creation && rule_name != "") {
goto reply;
}
- int64_t expected_num_objects;
- cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
if (expected_num_objects < 0) {
ss << "'expected_num_objects' must be non-negative";
err = -EINVAL;
err = -EINVAL;
goto reply;
}
+ if (!osdmap.pg_exists(pgid)) {
+ ss << "pg " << pgid << " should not exist";
+ err = -ENOENT;
+ goto reply;
+ }
bool creating_now;
{
std::lock_guard<std::mutex> l(creating_pgs_lock);
// Apply CephFS-specific checks
const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
if (pending_fsmap.pool_in_use(base_pool_id)) {
- if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
- // If the underlying pool is erasure coded, we can't permit the
- // removal of the replicated tier that CephFS relies on to access it
- *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
+ if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
+ // If the underlying pool is erasure coded and does not allow EC
+ // overwrites, we can't permit the removal of the replicated tier that
+ // CephFS relies on to access it
+ *ss << "pool '" << base_pool_name <<
+ "' does not allow EC overwrites and is in use by CephFS"
+ " via its tier";
*err = -EBUSY;
return false;
}
pending_inc.old_pg_upmap.insert(p.first);
}
}
+ // remove any pending pg_upmap mappings for this pool
+ {
+ auto it = pending_inc.new_pg_upmap.begin();
+ while (it != pending_inc.new_pg_upmap.end()) {
+ if (it->first.pool() == (uint64_t)pool) {
+ dout(10) << __func__ << " " << pool
+ << " removing pending pg_upmap "
+ << it->first << dendl;
+ it = pending_inc.new_pg_upmap.erase(it);
+ } else {
+ it++;
+ }
+ }
+ }
// remove any pg_upmap_items mappings for this pool
for (auto& p : osdmap.pg_upmap_items) {
if (p.first.pool() == (uint64_t)pool) {
pending_inc.old_pg_upmap_items.insert(p.first);
}
}
+ // remove any pending pg_upmap mappings for this pool
+ {
+ auto it = pending_inc.new_pg_upmap_items.begin();
+ while (it != pending_inc.new_pg_upmap_items.end()) {
+ if (it->first.pool() == (uint64_t)pool) {
+ dout(10) << __func__ << " " << pool
+ << " removing pending pg_upmap_items "
+ << it->first << dendl;
+ it = pending_inc.new_pg_upmap_items.erase(it);
+ } else {
+ it++;
+ }
+ }
+ }
// remove any choose_args for this pool
CrushWrapper newcrush;
creating_pgs_t creating_pgs;
mutable std::mutex creating_pgs_lock;
- creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc);
+ creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
+ const OSDMap& nextmap);
void trim_creating_pgs(creating_pgs_t *creating_pgs,
const ceph::unordered_map<pg_t,pg_stat_t>& pgm);
unsigned scan_for_creating_pgs(
min = proj;
}
} else {
- dout(0) << "Cannot get stat of OSD " << p->first << dendl;
+ if (osdmap.is_up(p->first)) {
+ // This is a level 4 rather than an error, because we might have
+ // only just started, and not received the first stats message yet.
+ dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
+ }
}
}
return min;
MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
trim(t, get_first_committed(), trim_to);
put_first_committed(t, trim_to);
+ cached_first_committed = trim_to;
// let the service add any extra stuff
encode_trim_extra(t, trim_to);
int r = 0;
if (fd >= nevent) {
int new_size = nevent << 2;
- while (fd > new_size)
+ while (fd >= new_size)
new_size <<= 2;
ldout(cct, 20) << __func__ << " event count exceed " << nevent << ", expand to " << new_size << dendl;
r = driver->resize_events(new_size);
set(ra_dir ${ocf_dir}/resource.d/${PROJECT_NAME})
configure_file(rbd.in rbd @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/rbd DESTINATION ${ra_dir})
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/rbd DESTINATION ${ra_dir})
#include "Allocator.h"
#include "BitAllocator.h"
-#include "include/btree_interval_set.h"
class BitMapAllocator : public Allocator {
CephContext* cct;
int r = _allocate(
log_file->fnode.prefer_bdev,
cct->_conf->bluefs_max_log_runway,
- &log_file->fnode.extents);
- log_file->fnode.recalc_allocated();
+ &log_file->fnode);
assert(r == 0);
log_writer = _create_writer(log_file);
}
file_map.erase(file->fnode.ino);
file->deleted = true;
- file->fnode.recalc_allocated();
+
if (file->dirty_seq) {
assert(file->dirty_seq > log_seq_stable);
assert(dirty_files.count(file->dirty_seq));
dout(20) << __func__ << " need " << need << dendl;
mempool::bluefs::vector<bluefs_extent_t> old_extents;
- old_extents.swap(log_file->fnode.extents);
- log_file->fnode.recalc_allocated();
+ uint64_t old_allocated = 0;
+ log_file->fnode.swap_extents(old_extents, old_allocated);
while (log_file->fnode.get_allocated() < need) {
int r = _allocate(log_file->fnode.prefer_bdev,
need - log_file->fnode.get_allocated(),
- &log_file->fnode.extents);
- log_file->fnode.recalc_allocated();
+ &log_file->fnode);
assert(r == 0);
}
while (log_file->fnode.get_allocated() < need) {
int r = _allocate(log_file->fnode.prefer_bdev,
cct->_conf->bluefs_max_log_runway,
- &log_file->fnode.extents);
+ &log_file->fnode);
assert(r == 0);
- log_file->fnode.recalc_allocated();
}
dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
// allocate
int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
- &new_log->fnode.extents);
+ &new_log->fnode);
assert(r == 0);
- new_log->fnode.recalc_allocated();
new_log_writer = _create_writer(new_log);
new_log_writer->append(bl);
if (discarded + e.length <= old_log_jump_to) {
dout(10) << __func__ << " remove old log extent " << e << dendl;
discarded += e.length;
- log_file->fnode.extents.erase(log_file->fnode.extents.begin());
+ log_file->fnode.pop_front_extent();
} else {
dout(10) << __func__ << " remove front of old log extent " << e << dendl;
uint64_t drop = old_log_jump_to - discarded;
}
old_extents.push_back(temp);
}
- new_log->fnode.extents.insert(new_log->fnode.extents.end(),
- log_file->fnode.extents.begin(),
- log_file->fnode.extents.end());
+ auto from = log_file->fnode.extents.begin();
+ auto to = log_file->fnode.extents.end();
+ while (from != to) {
+ new_log->fnode.append_extent(*from);
+ ++from;
+ }
// clear the extents from old log file, they are added to new log
- log_file->fnode.extents.clear();
-
+ log_file->fnode.clear_extents();
// swap the log files. New log file is the log file now.
- log_file->fnode.extents.swap(new_log->fnode.extents);
- log_file->fnode.recalc_allocated();
- new_log->fnode.recalc_allocated();
+ new_log->fnode.swap_extents(log_file->fnode);
+
log_writer->pos = log_writer->file->fnode.size =
log_writer->pos - old_log_jump_to + new_log_jump_to;
}
int r = _allocate(log_writer->file->fnode.prefer_bdev,
cct->_conf->bluefs_max_log_runway,
- &log_writer->file->fnode.extents);
+ &log_writer->file->fnode);
assert(r == 0);
- log_writer->file->fnode.recalc_allocated();
log_t.op_file_update(log_writer->file->fnode);
}
assert(h->file->fnode.ino != 1);
int r = _allocate(h->file->fnode.prefer_bdev,
offset + length - allocated,
- &h->file->fnode.extents);
+ &h->file->fnode);
if (r < 0) {
derr << __func__ << " allocated: 0x" << std::hex << allocated
<< " offset: 0x" << offset << " length: 0x" << length << std::dec
assert(0 == "bluefs enospc");
return r;
}
- h->file->fnode.recalc_allocated();
if (cct->_conf->bluefs_preextend_wal_files &&
h->writer_type == WRITER_WAL) {
// NOTE: this *requires* that rocksdb also has log recycling
}
int BlueFS::_allocate(uint8_t id, uint64_t len,
- mempool::bluefs::vector<bluefs_extent_t> *ev)
+ bluefs_fnode_t* node)
{
dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
<< " from " << (int)id << dendl;
if (r == 0) {
uint64_t hint = 0;
- if (!ev->empty()) {
- hint = ev->back().end();
- }
+ if (!node->extents.empty() && node->extents.back().bdev == id) {
+ hint = node->extents.back().end();
+ }
extents.reserve(4); // 4 should be (more than) enough for most allocations
alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents);
}
<< "; fallback to bdev " << (int)id + 1
<< std::dec << dendl;
}
- return _allocate(id + 1, len, ev);
+ return _allocate(id + 1, len, node);
}
if (bdev[id])
derr << __func__ << " failed to allocate 0x" << std::hex << left
}
for (auto& p : extents) {
- bluefs_extent_t e = bluefs_extent_t(id, p.offset, p.length);
- if (!ev->empty() &&
- ev->back().bdev == e.bdev &&
- ev->back().end() == (uint64_t) e.offset) {
- ev->back().length += e.length;
- } else {
- ev->push_back(e);
- }
+ node->append_extent(bluefs_extent_t(id, p.offset, p.length));
}
return 0;
uint64_t allocated = f->fnode.get_allocated();
if (off + len > allocated) {
uint64_t want = off + len - allocated;
- int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode.extents);
+ int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode);
if (r < 0)
return r;
- f->fnode.recalc_allocated();
log_t.op_file_update(f->fnode);
}
return 0;
for (auto& p : file->fnode.extents) {
pending_release[p.bdev].insert(p.offset, p.length);
}
- file->fnode.extents.clear();
- file->fnode.recalc_allocated();
+
+ file->fnode.clear_extents();
}
}
assert(file->fnode.ino > 1);
bool BlueFS::wal_is_rotational()
{
- if (!bdev[BDEV_WAL] || bdev[BDEV_WAL]->is_rotational())
- return true;
- return false;
+ if (bdev[BDEV_WAL]) {
+ return bdev[BDEV_WAL]->is_rotational();
+ } else if (bdev[BDEV_DB]) {
+ return bdev[BDEV_DB]->is_rotational();
+ }
+ return bdev[BDEV_SLOW]->is_rotational();
}
void _drop_link(FileRef f);
int _allocate(uint8_t bdev, uint64_t len,
- mempool::bluefs::vector<bluefs_extent_t> *ev);
+ bluefs_fnode_t* node);
int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
int _flush(FileWriter *h, bool force);
int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);
int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
0, 0, &exts);
- if (alloc_len < (int64_t)gift) {
- derr << __func__ << " allocate failed on 0x" << std::hex << gift
- << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ if (alloc_len <= 0) {
+ dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
+ << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ alloc->unreserve(gift);
+ alloc->dump();
+ return 0;
+ } else if (alloc_len < (int64_t)gift) {
+ dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
+ << " min_alloc_size 0x" << min_alloc_size
+ << " allocated 0x" << alloc_len
+ << std::dec << dendl;
+ alloc->unreserve(gift - alloc_len);
alloc->dump();
- assert(0 == "allocate failed, wtf");
- return -ENOSPC;
}
for (auto& p : exts) {
bluestore_pextent_t e = bluestore_pextent_t(p);
mempool_thread.init();
-
mounted = true;
return 0;
mempool_thread.shutdown();
dout(20) << __func__ << " stopping kv thread" << dendl;
_kv_stop();
- _reap_collections();
_flush_cache();
dout(20) << __func__ << " closing" << dendl;
buf->available = alloc->get_free();
if (bluefs) {
- // part of our shared device is "free" according to BlueFS
- // Don't include bluestore_bluefs_min because that space can't
- // be used for any other purpose.
- buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
-
- // include dedicated db, too, if that isn't the shared device.
- if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
- buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+ // part of our shared device is "free" according to BlueFS, but we
+ // can't touch bluestore_bluefs_min of it.
+ int64_t shared_available = std::min(
+ bluefs->get_free(bluefs_shared_bdev),
+ bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
+ if (shared_available > 0) {
+ buf->available += shared_available;
}
}
void BlueStore::_queue_reap_collection(CollectionRef& c)
{
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
- std::lock_guard<std::mutex> l(reap_lock);
+ // _reap_collections and this in the same thread,
+ // so no need a lock.
removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
+
list<CollectionRef> removed_colls;
{
- std::lock_guard<std::mutex> l(reap_lock);
- removed_colls.swap(removed_collections);
+ // _queue_reap_collection and this in the same thread.
+ // So no need a lock.
+ if (!removed_collections.empty())
+ removed_colls.swap(removed_collections);
+ else
+ return;
}
- bool all_reaped = true;
-
- for (list<CollectionRef>::iterator p = removed_colls.begin();
- p != removed_colls.end();
- ++p) {
+ list<CollectionRef>::iterator p = removed_colls.begin();
+ while (p != removed_colls.end()) {
CollectionRef c = *p;
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
if (c->onode_map.map_any([&](OnodeRef o) {
if (o->flushing_count.load()) {
dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
<< " flush_txns " << o->flushing_count << dendl;
- return false;
+ return true;
}
- return true;
+ return false;
})) {
- all_reaped = false;
+ ++p;
continue;
}
c->onode_map.clear();
+ p = removed_colls.erase(p);
dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
}
-
- if (all_reaped) {
+ if (removed_colls.empty()) {
dout(10) << __func__ << " all reaped" << dendl;
+ } else {
+ removed_collections.splice(removed_collections.begin(), removed_colls);
}
}
pos += hole;
left -= hole;
}
- BlobRef bptr = lp->blob;
+ BlobRef& bptr = lp->blob;
unsigned l_off = pos - lp->logical_offset;
unsigned b_off = l_off + lp->blob_offset;
unsigned b_len = std::min(left, lp->length - l_off);
vector<bufferlist> compressed_blob_bls;
IOContext ioc(cct, NULL, true); // allow EIO
for (auto& p : blobs2read) {
- BlobRef bptr = p.first;
+ const BlobRef& bptr = p.first;
dout(20) << __func__ << " blob " << *bptr << std::hex
<< " need " << p.second << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
auto p = compressed_blob_bls.begin();
blobs2read_t::iterator b2r_it = blobs2read.begin();
while (b2r_it != blobs2read.end()) {
- BlobRef bptr = b2r_it->first;
+ const BlobRef& bptr = b2r_it->first;
dout(20) << __func__ << " blob " << *bptr << std::hex
<< " need 0x" << b2r_it->second << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
{
// update allocator with full released set
if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
- dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
+ dout(10) << __func__ << " " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
for (interval_set<uint64_t>::iterator p = txc->released.begin();
p != txc->released.end();
++p) {
}
kv_sync_thread.join();
kv_finalize_thread.join();
+ assert(removed_collections.empty());
{
std::lock_guard<std::mutex> l(kv_lock);
kv_stop = false;
return r;
}
-void BlueStore::_dump_onode(OnodeRef o, int log_level)
+void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
{
if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
return;
PerfCounters *logger = nullptr;
- std::mutex reap_lock;
list<CollectionRef> removed_collections;
RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
void _assign_nid(TransContext *txc, OnodeRef o);
uint64_t _assign_blobid(TransContext *txc);
- void _dump_onode(OnodeRef o, int log_level=30);
+ void _dump_onode(const OnodeRef& o, int log_level=30);
void _dump_extent_map(ExtentMap& em, int log_level=30);
void _dump_transaction(Transaction *t, int log_level = 30);
// later flush() occurs.
io_since_flush.store(true);
- int r = aio[i]->get_return_value();
+ long r = aio[i]->get_return_value();
if (r < 0) {
derr << __func__ << " got " << cpp_strerror(r) << dendl;
if (ioc->allow_eio && r == -EIO) {
/// return the effective length of the extent if we align to alloc_unit
uint64_t StupidAllocator::_aligned_len(
- btree_interval_set<uint64_t,allocator>::iterator p,
+ StupidAllocator::interval_set_t::iterator p,
uint64_t alloc_unit)
{
uint64_t skew = p.get_start() % alloc_unit;
std::lock_guard<std::mutex> l(lock);
dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
<< std::dec << dendl;
- btree_interval_set<uint64_t,allocator> rm;
+ interval_set_t rm;
rm.insert(offset, length);
for (unsigned i = 0; i < free.size() && !rm.empty(); ++i) {
- btree_interval_set<uint64_t,allocator> overlap;
+ interval_set_t overlap;
overlap.intersection_of(rm, free[i]);
if (!overlap.empty()) {
dout(20) << __func__ << " bin " << i << " rm 0x" << std::hex << overlap
<< std::dec << dendl;
- free[i].subtract(overlap);
+ auto it = overlap.begin();
+ auto it_end = overlap.end();
+ while (it != it_end) {
+ auto o = it.get_start();
+ auto l = it.get_len();
+
+ free[i].erase(o, l,
+ [&](uint64_t off, uint64_t len) {
+ unsigned newbin = _choose_bin(len);
+ if (newbin != i) {
+ ldout(cct, 30) << __func__ << " demoting1 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ _insert_free(off, len);
+ return false;
+ }
+ return true;
+ });
+ ++it;
+ }
+
rm.subtract(overlap);
}
}
#include <mutex>
#include "Allocator.h"
-#include "include/btree_interval_set.h"
+#include "include/btree_map.h"
+#include "include/interval_set.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"
int64_t num_reserved; ///< reserved bytes
typedef mempool::bluestore_alloc::pool_allocator<
- pair<const uint64_t,uint64_t>> allocator;
- std::vector<btree_interval_set<uint64_t,allocator>> free; ///< leading-edge copy
+ pair<const uint64_t,uint64_t>> allocator_t;
+ typedef btree::btree_map<uint64_t,uint64_t,std::less<uint64_t>,allocator_t> interval_set_map_t;
+ typedef interval_set<uint64_t,interval_set_map_t> interval_set_t;
+ std::vector<interval_set_t> free; ///< leading-edge copy
uint64_t last_alloc;
void _insert_free(uint64_t offset, uint64_t len);
uint64_t _aligned_len(
- btree_interval_set<uint64_t,allocator>::iterator p,
+ interval_set_t::iterator p,
uint64_t alloc_unit);
public:
int fd;
boost::container::small_vector<iovec,4> iov;
uint64_t offset, length;
- int rval;
+ long rval;
bufferlist bl; ///< write payload (so that it remains stable for duration)
boost::intrusive::list_member_hook<> queue_item;
bl.append(std::move(p));
}
- int get_return_value() {
+ long get_return_value() {
return rval;
}
};
<< " size 0x" << std::hex << file.size << std::dec
<< " mtime " << file.mtime
<< " bdev " << (int)file.prefer_bdev
+ << " allocated " << std::hex << file.allocated << std::dec
<< " extents " << file.extents
<< ")";
}
allocated += p.length;
}
- DENC(bluefs_fnode_t, v, p) {
+ DENC_HELPERS
+ void bound_encode(size_t& p) const {
+ _denc_friend(*this, p);
+ }
+ void encode(bufferlist::contiguous_appender& p) const {
+ DENC_DUMP_PRE(bluefs_fnode_t);
+ _denc_friend(*this, p);
+ DENC_DUMP_POST(bluefs_fnode_t);
+ }
+ void decode(buffer::ptr::iterator& p) {
+ _denc_friend(*this, p);
+ recalc_allocated();
+ }
+ template<typename T, typename P>
+ friend typename std::enable_if<
+ boost::is_same<T,bluefs_fnode_t>::value ||
+ boost::is_same<T,const bluefs_fnode_t>::value>::type
+ _denc_friend(T& v, P& p) {
DENC_START(1, 1, p);
denc_varint(v.ino, p);
denc_varint(v.size, p);
DENC_FINISH(p);
}
+ void append_extent(const bluefs_extent_t& ext) {
+ extents.push_back(ext);
+ allocated += ext.length;
+ }
+
+ void pop_front_extent() {
+ auto it = extents.begin();
+ allocated -= it->length;
+ extents.erase(it);
+ }
+
+ void swap_extents(bluefs_fnode_t& other) {
+ other.extents.swap(extents);
+ std::swap(allocated, other.allocated);
+ }
+ void swap_extents(mempool::bluefs::vector<bluefs_extent_t>& swap_to, uint64_t& new_allocated) {
+ swap_to.swap(extents);
+ std::swap(allocated, new_allocated);
+ }
+ void clear_extents() {
+ extents.clear();
+ allocated = 0;
+ }
+
mempool::bluefs::vector<bluefs_extent_t>::iterator seek(
uint64_t off, uint64_t *x_off);
void dump(Formatter *f) const;
static void generate_test_instances(list<bluefs_fnode_t*>& ls);
+
};
WRITE_CLASS_DENC(bluefs_fnode_t)
#include <ostream>
#include <bitset>
+#include <type_traits>
#include "include/types.h"
#include "include/interval_set.h"
#include "include/utime.h"
}
}
- int map(uint64_t x_off, uint64_t x_len,
- std::function<int(uint64_t,uint64_t)> f) const {
+ template<class F>
+ int map(uint64_t x_off, uint64_t x_len, F&& f) const {
auto p = extents.begin();
assert(p != extents.end());
while (x_off >= p->length) {
}
return 0;
}
+ template<class F>
void map_bl(uint64_t x_off,
bufferlist& bl,
- std::function<void(uint64_t,bufferlist&)> f) const {
+ F&& f) const {
auto p = extents.begin();
assert(p != extents.end());
while (x_off >= p->length) {
if (r < 0 && replaying) {
assert(r == -ERANGE);
derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl;
- r = pos - from;;
+ r = len;
}
assert(replaying || pos == end);
if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
}
if (bl.length() > 0) {
bufferlist::iterator bp = bl.begin();
- ::decode(hinfo, bp);
+ try {
+ ::decode(hinfo, bp);
+ } catch(...) {
+ dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl;
+ return ECUtil::HashInfoRef();
+ }
if (checks && hinfo.get_total_chunk_size() != (uint64_t)st.st_size) {
dout(0) << __func__ << ": Mismatch of total_chunk_size "
<< hinfo.get_total_chunk_size() << dendl;
void ECUtil::HashInfo::dump(Formatter *f) const
{
f->dump_unsigned("total_chunk_size", total_chunk_size);
- f->open_object_section("cumulative_shard_hashes");
+ f->open_array_section("cumulative_shard_hashes");
for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) {
f->open_object_section("hash");
f->dump_unsigned("shard", i);
f->close_section();
}
+namespace ECUtil {
+std::ostream& operator<<(std::ostream& out, const HashInfo& hi)
+{
+ ostringstream hashes;
+ for (auto hash: hi.cumulative_shard_hashes)
+ hashes << " " << hex << hash;
+ return out << "tcs=" << hi.total_chunk_size << hashes.str();
+}
+}
+
void ECUtil::HashInfo::generate_test_instances(list<HashInfo*>& o)
{
o.push_back(new HashInfo(3));
#ifndef ECUTIL_H
#define ECUTIL_H
+#include <ostream>
#include "erasure-code/ErasureCodeInterface.h"
#include "include/buffer_fwd.h"
#include "include/assert.h"
*this = rhs;
projected_total_chunk_size = ptcs;
}
+ friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi);
};
typedef ceph::shared_ptr<HashInfo> HashInfoRef;
}
-void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
+void OSDService::queue_want_pg_temp(pg_t pgid,
+ const vector<int>& want,
+ bool forced)
{
Mutex::Locker l(pg_temp_lock);
- map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
+ auto p = pg_temp_pending.find(pgid);
if (p == pg_temp_pending.end() ||
- p->second != want) {
- pg_temp_wanted[pgid] = want;
+ p->second.acting != want ||
+ forced) {
+ pg_temp_wanted[pgid] = pg_temp_t{want, forced};
}
}
void OSDService::_sent_pg_temp()
{
- for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
- p != pg_temp_wanted.end();
- ++p)
- pg_temp_pending[p->first] = p->second;
+ pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
+ make_move_iterator(end(pg_temp_wanted)));
pg_temp_wanted.clear();
}
<< pg_temp_wanted.size() << dendl;
}
+std::ostream& operator<<(std::ostream& out,
+ const OSDService::pg_temp_t& pg_temp)
+{
+ out << pg_temp.acting;
+ if (pg_temp.forced) {
+ out << " (forced)";
+ }
+ return out;
+}
+
void OSDService::send_pg_temp()
{
Mutex::Locker l(pg_temp_lock);
if (pg_temp_wanted.empty())
return;
dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
- MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
- m->pg_temp = pg_temp_wanted;
- monc->send_mon_message(m);
+ MOSDPGTemp *ms[2] = {nullptr, nullptr};
+ for (auto& pg_temp : pg_temp_wanted) {
+ auto& m = ms[pg_temp.second.forced];
+ if (!m) {
+ m = new MOSDPGTemp(osdmap->get_epoch());
+ m->forced = pg_temp.second.forced;
+ }
+ m->pg_temp.emplace(pg_temp.first,
+ pg_temp.second.acting);
+ }
+ for (auto m : ms) {
+ if (m) {
+ monc->send_mon_message(m);
+ }
+ }
_sent_pg_temp();
}
osd_plb.add_u64(
l_osd_pg_stray, "numpg_stray",
"Placement groups ready to be deleted from this osd");
+ osd_plb.add_u64(
+ l_osd_pg_removing, "numpg_removing",
+ "Placement groups queued for local deletion", "pgsr",
+ PerfCountersBuilder::PRIO_USEFUL);
osd_plb.add_u64(
l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
pg->upgrade(store);
}
+ if (pg->dne()) {
+ dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
+ pg->ch = nullptr;
+ service.pg_remove_epoch(pg->pg_id);
+ pg->unlock();
+ {
+ // Delete pg
+ RWLock::WLocker l(pg_map_lock);
+ auto p = pg_map.find(pg->get_pgid());
+ assert(p != pg_map.end() && p->second == pg);
+ dout(20) << __func__ << " removed pg " << pg << " from pg_map" << dendl;
+ pg_map.erase(p);
+ pg->put("PGMap");
+ }
+ recursive_remove_collection(cct, store, pgid, *it);
+ continue;
+ }
+
service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
// generate state for PG's current mapping
{
bool do_sub_pg_creates = false;
bool have_pending_creates = false;
- MOSDPGTemp *pgtemp = nullptr;
{
const auto max_pgs_per_osd =
(cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
}
auto pg = pending_creates_from_osd.cbegin();
while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
- if (!pgtemp) {
- pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
- }
+ dout(20) << __func__ << " pg " << pg->first << dendl;
vector<int> acting;
osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
- pgtemp->pg_temp[pg->first] = twiddle(acting);
+ service.queue_want_pg_temp(pg->first, twiddle(acting), true);
pg = pending_creates_from_osd.erase(pg);
+ do_sub_pg_creates = true;
spare_pgs--;
}
have_pending_creates = (pending_creates_from_mon > 0 ||
<< start << dendl;
do_renew_subs = true;
}
- } else if (pgtemp || do_sub_pg_creates) {
+ } else if (do_sub_pg_creates) {
// no need to subscribe the osdmap continuously anymore
// once the pgtemp and/or mon_subscribe(pg_creates) is sent
if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
monc->renew_subs();
}
- if (pgtemp) {
- pgtemp->forced = true;
- monc->send_mon_message(pgtemp);
- }
+ service.send_pg_temp();
}
void OSD::build_initial_pg_history(
logger->set(l_osd_cached_crc, buffer::get_cached_crc());
logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
logger->set(l_osd_missed_crc, buffer::get_missed_crc());
+ logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
// osd_lock is not being held, which means the OSD state
// might change when doing the monitor report
logger->set(l_osd_pg_primary, num_pg_primary);
logger->set(l_osd_pg_replica, num_pg_replica);
logger->set(l_osd_pg_stray, num_pg_stray);
+ logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
}
void OSD::activate_map()
uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
auto sdata = shard_list[shard_index];
bool queued = false;
- unsigned pushes_to_free = 0;
{
Mutex::Locker l(sdata->sdata_op_ordering_lock);
auto p = sdata->pg_slots.find(pgid);
++i) {
sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
}
- for (auto& q : p->second.to_process) {
- pushes_to_free += q.get_reserved_pushes();
- }
p->second.to_process.clear();
p->second.waiting_for_pg = false;
++p->second.requeue_seq;
queued = true;
}
}
- if (pushes_to_free > 0) {
- osd->service.release_reserved_pushes(pushes_to_free);
- }
if (queued) {
sdata->sdata_lock.Lock();
sdata->sdata_cond.SignalOne();
l_osd_pg_primary,
l_osd_pg_replica,
l_osd_pg_stray,
+ l_osd_pg_removing,
l_osd_hb_to,
l_osd_map,
l_osd_mape,
// -- pg_temp --
private:
Mutex pg_temp_lock;
- map<pg_t, vector<int> > pg_temp_wanted;
- map<pg_t, vector<int> > pg_temp_pending;
+ struct pg_temp_t {
+ pg_temp_t()
+ {}
+ pg_temp_t(vector<int> v, bool f)
+ : acting{v}, forced{f}
+ {}
+ vector<int> acting;
+ bool forced = false;
+ };
+ map<pg_t, pg_temp_t> pg_temp_wanted;
+ map<pg_t, pg_temp_t> pg_temp_pending;
void _sent_pg_temp();
+ friend std::ostream& operator<<(std::ostream&, const pg_temp_t&);
public:
- void queue_want_pg_temp(pg_t pgid, vector<int>& want);
+ void queue_want_pg_temp(pg_t pgid, const vector<int>& want,
+ bool forced = false);
void remove_want_pg_temp(pg_t pgid);
void requeue_pg_temp();
void send_pg_temp();
void _clear() override {
remove_queue.clear();
}
+ int get_remove_queue_len() {
+ lock();
+ int r = remove_queue.size();
+ unlock();
+ return r;
+ }
} remove_wq;
// -- status reporting --
}
}
+void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
+ const OSDMap& osdmap,
+ Incremental *pending_inc)
+{
+ ldout(cct, 10) << __func__ << dendl;
+ OSDMap tmpmap;
+ tmpmap.deepish_copy_from(osdmap);
+ tmpmap.apply_incremental(*pending_inc);
+
+ for (auto& p : tmpmap.pg_upmap) {
+ ldout(cct, 10) << __func__ << " pg_upmap entry "
+ << "[" << p.first << ":" << p.second << "]"
+ << dendl;
+ auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+ if (crush_rule < 0) {
+ lderr(cct) << __func__ << " unable to load crush-rule of pg "
+ << p.first << dendl;
+ continue;
+ }
+ auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+ if (type < 0) {
+ lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+ << p.first << dendl;
+ continue;
+ } else if (type == 0) {
+ ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
+ << " is osd-level, skipping"
+ << dendl;
+ continue;
+ }
+ ldout(cct, 10) << __func__ << " pg " << p.first
+ << " crush-rule-id " << crush_rule
+ << " failure-domain-type " << type
+ << dendl;
+ vector<int> raw;
+ int primary;
+ tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+ set<int> parents;
+ bool error = false;
+ bool collide = false;
+ for (auto osd : raw) {
+ auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+ if (parent >= 0) {
+ lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
+ << ", pg " << p.first
+ << dendl;
+ error = true;
+ break;
+ }
+ auto r = parents.insert(parent);
+ if (!r.second) {
+ collide = true;
+ break;
+ }
+ }
+ if (!error && collide) {
+ ldout(cct, 10) << __func__ << " removing invalid pg_upmap "
+ << "[" << p.first << ":" << p.second << "]"
+ << ", final mapping result will be: " << raw
+ << dendl;
+ auto it = pending_inc->new_pg_upmap.find(p.first);
+ if (it != pending_inc->new_pg_upmap.end()) {
+ pending_inc->new_pg_upmap.erase(it);
+ }
+ if (osdmap.pg_upmap.count(p.first)) {
+ pending_inc->old_pg_upmap.insert(p.first);
+ }
+ }
+ }
+ for (auto& p : tmpmap.pg_upmap_items) {
+ ldout(cct, 10) << __func__ << " pg_upmap_items entry "
+ << "[" << p.first << ":" << p.second << "]"
+ << dendl;
+ auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+ if (crush_rule < 0) {
+ lderr(cct) << __func__ << " unable to load crush-rule of pg "
+ << p.first << dendl;
+ continue;
+ }
+ auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+ if (type < 0) {
+ lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+ << p.first << dendl;
+ continue;
+ } else if (type == 0) {
+ ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
+ << " is osd-level, skipping"
+ << dendl;
+ continue;
+ }
+ ldout(cct, 10) << __func__ << " pg " << p.first
+ << " crush_rule_id " << crush_rule
+ << " failure_domain_type " << type
+ << dendl;
+ vector<int> raw;
+ int primary;
+ tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+ set<int> parents;
+ bool error = false;
+ bool collide = false;
+ for (auto osd : raw) {
+ auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+ if (parent >= 0) {
+ lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
+ << ", pg " << p.first
+ << dendl;
+ error = true;
+ break;
+ }
+ auto r = parents.insert(parent);
+ if (!r.second) {
+ collide = true;
+ break;
+ }
+ }
+ if (!error && collide) {
+ ldout(cct, 10) << __func__ << " removing invalid pg_upmap_items "
+ << "[" << p.first << ":" << p.second << "]"
+ << ", final mapping result will be: " << raw
+ << dendl;
+ // This is overkill, but simpler.
+ auto it = pending_inc->new_pg_upmap_items.find(p.first);
+ if (it != pending_inc->new_pg_upmap_items.end()) {
+ pending_inc->new_pg_upmap_items.erase(it);
+ }
+ if (osdmap.pg_upmap_items.count(p.first)) {
+ pending_inc->old_pg_upmap_items.insert(p.first);
+ }
+ }
+ }
+}
+
int OSDMap::apply_incremental(const Incremental &inc)
{
new_blacklist_entries = false;
ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
for (auto p : pmap) {
auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
+ if (adjusted_weight == 0) {
+ continue;
+ }
osd_weight[p.first] += adjusted_weight;
osd_weight_total += adjusted_weight;
}
#ifndef CEPH_OSDMAP_H
#define CEPH_OSDMAP_H
-#include "include/cpp-btree/btree_map.h"
-
/*
* describe properties of the OSD cluster.
* disks, disk groups, total # osds,
#include <set>
#include <map>
#include "include/memory.h"
+#include "include/btree_map.h"
using namespace std;
// forward declaration
*/
uint64_t get_up_osd_features() const;
+ void maybe_remove_pg_upmaps(CephContext *cct,
+ const OSDMap& osdmap,
+ Incremental *pending_inc);
+
int apply_incremental(const Incremental &inc);
/// try to re-use/reference addrs in oldmap from newmap
return p->get_size();
}
+ int get_pg_pool_crush_rule(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ assert(p);
+ return p->get_crush_rule();
+ }
+
private:
/// pg -> (raw osd list)
void _pg_to_raw_osds(
try {
::decode(snaps, p);
} catch (...) {
+ derr << __func__ << " decode snaps failure on " << *i << dendl;
snaps.clear();
}
set<snapid_t> _snaps(snaps.begin(), snaps.end());
<< "...repaired";
}
snap_mapper.add_oid(hoid, obj_snaps, &_t);
- r = osd->store->apply_transaction(osr.get(), std::move(t));
- if (r != 0) {
- derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
- << dendl;
+
+ // wait for repair to apply to avoid confusing other bits of the system.
+ {
+ Cond my_cond;
+ Mutex my_lock("PG::_scan_snaps my_lock");
+ int r = 0;
+ bool done;
+ t.register_on_applied_sync(
+ new C_SafeCond(&my_lock, &my_cond, &done, &r));
+ r = osd->store->apply_transaction(osr.get(), std::move(t));
+ if (r != 0) {
+ derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
+ << dendl;
+ } else {
+ my_lock.Lock();
+ while (!done)
+ my_cond.Wait(my_lock);
+ my_lock.Unlock();
+ }
}
}
}
bool PG::append_log_entries_update_missing(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
- ObjectStore::Transaction &t)
+ ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
+ boost::optional<eversion_t> roll_forward_to)
{
assert(!entries.empty());
assert(entries.begin()->version > info.last_update);
PGLogEntryHandler rollbacker{this, &t};
+ if (roll_forward_to) {
+ pg_log.roll_forward(&rollbacker);
+ }
bool invalidate_stats =
pg_log.append_new_log_entries(info.last_backfill,
info.last_backfill_bitwise,
entries,
&rollbacker);
+
+ if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
+ pg_log.roll_forward(&rollbacker);
+ }
+ if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
+ pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
+ last_rollback_info_trimmed_to_applied = *roll_forward_to;
+ }
+
info.last_update = pg_log.get_head();
if (pg_log.get_missing().num_missing() == 0) {
// advance last_complete since nothing else is missing!
info.last_complete = info.last_update;
}
-
info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
+
+ dout(20) << __func__ << "trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
+ if (trim_to)
+ pg_log.trim(*trim_to, info);
dirty_info = true;
write_if_dirty(t);
return invalidate_stats;
void PG::merge_new_log_entries(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
- ObjectStore::Transaction &t)
+ ObjectStore::Transaction &t,
+ boost::optional<eversion_t> trim_to,
+ boost::optional<eversion_t> roll_forward_to)
{
dout(10) << __func__ << " " << entries << dendl;
assert(is_primary());
- bool rebuild_missing = append_log_entries_update_missing(entries, t);
+ bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
i != actingbackfill.end();
++i) {
pg_shard_t old_acting_primary = get_primary();
pg_shard_t old_up_primary = up_primary;
bool was_old_primary = is_primary();
+ bool was_old_replica = is_replica();
acting.swap(oldacting);
up.swap(oldup);
actingbackfill.clear();
scrub_queued = false;
- // reset primary state?
+ // reset primary/replica state?
if (was_old_primary || is_primary()) {
osd->remove_want_pg_temp(info.pgid.pgid);
+ } else if (was_old_replica || is_replica()) {
+ osd->remove_want_pg_temp(info.pgid.pgid);
}
clear_primary_state();
assert(!is_primary());
update_history(oinfo.history);
+ if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
+ info.stats.stats.sum.num_scrub_errors = 0;
+ info.stats.stats.sum.num_shallow_scrub_errors = 0;
+ info.stats.stats.sum.num_deep_scrub_errors = 0;
+ dirty_info = true;
+ }
- if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
- // DEBUG: verify that the snaps are empty in snap_mapper
- if (cct->_conf->osd_debug_verify_snaps_on_info) {
- interval_set<snapid_t> p;
- p.union_of(oinfo.purged_snaps, info.purged_snaps);
- p.subtract(info.purged_snaps);
- if (!p.empty()) {
- for (interval_set<snapid_t>::iterator i = p.begin();
- i != p.end();
- ++i) {
- for (snapid_t snap = i.get_start();
- snap != i.get_len() + i.get_start();
- ++snap) {
- vector<hobject_t> hoids;
- int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
- if (r != 0 && r != -ENOENT) {
- derr << __func__ << ": snap_mapper get_next_object_to_trim returned "
- << cpp_strerror(r) << dendl;
- ceph_abort();
- } else if (r != -ENOENT) {
- assert(!hoids.empty());
- derr << __func__ << ": snap_mapper get_next_object_to_trim returned "
- << cpp_strerror(r) << " for object "
- << hoids[0] << " on snap " << snap
- << " which should have been fully trimmed " << dendl;
- ceph_abort();
- }
- }
- }
- }
- }
+ if (!(info.purged_snaps == oinfo.purged_snaps)) {
+ dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
+ << dendl;
info.purged_snaps = oinfo.purged_snaps;
dirty_info = true;
dirty_big_info = true;
map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
};
+public:
+ bool dne() { return info.dne(); }
struct RecoveryCtx {
utime_t start_time;
map<int, map<spg_t, pg_query_t> > *query_map;
bool append_log_entries_update_missing(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
- ObjectStore::Transaction &t);
+ ObjectStore::Transaction &t,
+ boost::optional<eversion_t> trim_to,
+ boost::optional<eversion_t> roll_forward_to);
/**
* Merge entries updating missing as necessary on all
*/
void merge_new_log_entries(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
- ObjectStore::Transaction &t);
+ ObjectStore::Transaction &t,
+ boost::optional<eversion_t> trim_to,
+ boost::optional<eversion_t> roll_forward_to);
void reset_interval_flush();
void start_peering_interval(
errorstream << "data_digest 0x" << std::hex << candidate.digest
<< " != data_digest 0x" << auth_oi.data_digest << std::dec
<< " from auth oi " << auth_oi;
- shard_result.set_data_digest_mismatch_oi();
+ shard_result.set_data_digest_mismatch_info();
}
}
if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
<< " != omap_digest 0x" << auth_oi.omap_digest << std::dec
<< " from auth oi " << auth_oi;
- shard_result.set_omap_digest_mismatch_oi();
+ shard_result.set_omap_digest_mismatch_info();
}
}
}
errorstream << "size " << candidate.size
<< " != size " << oi_size
<< " from auth oi " << auth_oi;
- shard_result.set_size_mismatch_oi();
+ shard_result.set_size_mismatch_info();
}
if (auth.size != candidate.size) {
if (error != CLEAN)
i != auth.attrs.end();
++i) {
// We check system keys seperately
- if (i->first == OI_ATTR || i->first == SS_ATTR)
+ if (i->first == OI_ATTR || i->first[0] != '_')
continue;
if (!candidate.attrs.count(i->first)) {
if (error != CLEAN)
i != candidate.attrs.end();
++i) {
// We check system keys seperately
- if (i->first == OI_ATTR || i->first == SS_ATTR)
+ if (i->first == OI_ATTR || i->first[0] != '_')
continue;
if (!auth.attrs.count(i->first)) {
if (error != CLEAN)
inconsistent_obj_wrapper &object_error)
{
eversion_t auth_version;
- bufferlist first_oi_bl, first_ss_bl;
+ bufferlist first_oi_bl, first_ss_bl, first_hk_bl;
// Create list of shards with primary first so it will be auth copy all
// other things being equal.
bufferlist bl;
map<string, bufferptr>::iterator k;
SnapSet ss;
- bufferlist ss_bl;
+ bufferlist ss_bl, hk_bl;
if (i->second.stat_error) {
shard_info.set_stat_error();
if (obj.is_head() || obj.is_snapdir()) {
k = i->second.attrs.find(SS_ATTR);
if (k == i->second.attrs.end()) {
- shard_info.set_ss_attr_missing();
- error_string += " ss_attr_missing";
+ shard_info.set_snapset_missing();
+ error_string += " snapset_missing";
} else {
ss_bl.push_back(k->second);
try {
}
} catch (...) {
// invalid snapset, probably corrupt
- shard_info.set_ss_attr_corrupted();
- error_string += " ss_attr_corrupted";
+ shard_info.set_snapset_corrupted();
+ error_string += " snapset_corrupted";
+ }
+ }
+ }
+
+ if (parent->get_pool().is_erasure()) {
+ ECUtil::HashInfo hi;
+ k = i->second.attrs.find(ECUtil::get_hinfo_key());
+ if (k == i->second.attrs.end()) {
+ shard_info.set_hinfo_missing();
+ error_string += " hinfo_key_missing";
+ } else {
+ hk_bl.push_back(k->second);
+ try {
+ bufferlist::iterator bliter = hk_bl.begin();
+ decode(hi, bliter);
+ if (first_hk_bl.length() == 0) {
+ first_hk_bl.append(hk_bl);
+ } else if (!object_error.has_hinfo_inconsistency() && !hk_bl.contents_equal(first_hk_bl)) {
+ object_error.set_hinfo_inconsistency();
+ error_string += " hinfo_inconsistency";
+ }
+ } catch (...) {
+ // invalid hinfo, probably corrupt
+ shard_info.set_hinfo_corrupted();
+ error_string += " hinfo_corrupted";
}
}
}
k = i->second.attrs.find(OI_ATTR);
if (k == i->second.attrs.end()) {
// no object info on object, probably corrupt
- shard_info.set_oi_attr_missing();
- error_string += " oi_attr_missing";
+ shard_info.set_info_missing();
+ error_string += " info_missing";
goto out;
}
bl.push_back(k->second);
::decode(oi, bliter);
} catch (...) {
// invalid object info, probably corrupt
- shard_info.set_oi_attr_corrupted();
- error_string += " oi_attr_corrupted";
+ shard_info.set_info_corrupted();
+ error_string += " info_corrupted";
goto out;
}
if (i->second.size != be_get_ondisk_size(oi.size)) {
dout(5) << __func__ << " size " << i->second.size << " oi size " << oi.size << dendl;
- shard_info.set_obj_size_oi_mismatch();
- error_string += " obj_size_oi_mismatch";
+ shard_info.set_obj_size_info_mismatch();
+ error_string += " obj_size_info_mismatch";
}
// Don't use this particular shard due to previous errors
// recorded digest != actual digest?
if (auth_oi.is_data_digest() && auth_object.digest_present &&
auth_oi.data_digest != auth_object.digest) {
- assert(shard_map[auth->first].has_data_digest_mismatch_oi());
+ assert(shard_map[auth->first].has_data_digest_mismatch_info());
errorstream << pgid << " recorded data digest 0x"
<< std::hex << auth_oi.data_digest << " != on disk 0x"
<< auth_object.digest << std::dec << " on " << auth_oi.soid
}
if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
auth_oi.omap_digest != auth_object.omap_digest) {
- assert(shard_map[auth->first].has_omap_digest_mismatch_oi());
+ assert(shard_map[auth->first].has_omap_digest_mismatch_info());
errorstream << pgid << " recorded omap digest 0x"
<< std::hex << auth_oi.omap_digest << " != on disk 0x"
<< auth_object.omap_digest << std::dec
bv);
if (r >= 0) {
object_info_t oi(bv);
- assert(oi.version == i.second.have);
+ assert(oi.version == i.second.have || eversion_t() == i.second.have);
} else {
assert(i.second.is_delete() || eversion_t() == i.second.have);
}
if (limit != eversion_t() &&
limit != pg_trim_to &&
pg_log.get_log().approx_size() > target) {
- size_t num_to_trim = pg_log.get_log().approx_size() - target;
- if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
+ size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
+ cct->_conf->osd_pg_log_trim_max);
+ if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
+ cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
return;
}
list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
in_progress_proxy_ops.erase(p);
}
-void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
+void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
+ vector<ceph_tid_t> *tids)
{
dout(10) << __func__ << " " << prdop->soid << dendl;
prdop->canceled = true;
// cancel objecter op, if we can
if (prdop->objecter_tid) {
- osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
+ tids->push_back(prdop->objecter_tid);
for (uint32_t i = 0; i < prdop->ops.size(); i++) {
prdop->ops[i].outdata.clear();
}
}
}
-void PrimaryLogPG::cancel_proxy_ops(bool requeue)
+void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
{
dout(10) << __func__ << dendl;
// cancel proxy reads
map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
while (p != proxyread_ops.end()) {
- cancel_proxy_read((p++)->second);
+ cancel_proxy_read((p++)->second, tids);
}
// cancel proxy writes
map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
while (q != proxywrite_ops.end()) {
- cancel_proxy_write((q++)->second);
+ cancel_proxy_write((q++)->second, tids);
}
if (requeue) {
pwop->ctx = NULL;
}
-void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
+void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
+ vector<ceph_tid_t> *tids)
{
dout(10) << __func__ << " " << pwop->soid << dendl;
pwop->canceled = true;
// cancel objecter op, if we can
if (pwop->objecter_tid) {
- osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
+ tids->push_back(pwop->objecter_tid);
delete pwop->ctx;
pwop->ctx = NULL;
proxywrite_ops.erase(pwop->objecter_tid);
// FIXME: if the src etc match, we could avoid restarting from the
// beginning.
CopyOpRef cop = copy_ops[dest];
- cancel_copy(cop, false);
+ vector<ceph_tid_t> tids;
+ cancel_copy(cop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
}
CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
{
+ vector<ceph_tid_t> tids;
dout(10) << __func__ << " " << oid << " tid " << tid
<< " " << cpp_strerror(r) << dendl;
map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
it != proxyread_ops.end();) {
if (it->second->soid == cobc->obs.oi.soid) {
- cancel_proxy_read((it++)->second);
+ cancel_proxy_read((it++)->second, &tids);
} else {
++it;
}
for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
it != proxywrite_ops.end();) {
if (it->second->soid == cobc->obs.oi.soid) {
- cancel_proxy_write((it++)->second);
+ cancel_proxy_write((it++)->second, &tids);
} else {
++it;
}
}
+ osd->objecter->op_cancel(tids, -ECANCELED);
kick_proxy_ops_blocked(cobc->obs.oi.soid);
}
kick_object_context_blocked(cobc);
}
+void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
+ vector<ceph_tid_t> tids;
+ for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
+ it != proxyread_ops.end();) {
+ if (it->second->soid == oid) {
+ cancel_proxy_read((it++)->second, &tids);
+ } else {
+ ++it;
+ }
+ }
+ for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
+ it != proxywrite_ops.end();) {
+ if (it->second->soid == oid) {
+ cancel_proxy_write((it++)->second, &tids);
+ } else {
+ ++it;
+ }
+ }
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ kick_proxy_ops_blocked(oid);
+}
+
void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
{
dout(20) << __func__ << " " << cop
agent_choose_mode();
}
-void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
+void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
+ vector<ceph_tid_t> *tids)
{
dout(10) << __func__ << " " << cop->obc->obs.oi.soid
<< " from " << cop->src << " " << cop->oloc
// cancel objecter op, if we can
if (cop->objecter_tid) {
- osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
+ tids->push_back(cop->objecter_tid);
cop->objecter_tid = 0;
if (cop->objecter_tid2) {
- osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
+ tids->push_back(cop->objecter_tid2);
cop->objecter_tid2 = 0;
}
}
cop->obc = ObjectContextRef();
}
-void PrimaryLogPG::cancel_copy_ops(bool requeue)
+void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
{
dout(10) << __func__ << dendl;
map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
while (p != copy_ops.end()) {
// requeue this op? can I queue up all of them?
- cancel_copy((p++)->second, requeue);
+ cancel_copy((p++)->second, requeue, tids);
}
}
osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
fop->dup_ops.pop_front();
}
- cancel_flush(fop, false);
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
}
/**
return -EAGAIN; // will retry
} else {
osd->logger->inc(l_osd_tier_try_flush_fail);
- cancel_flush(fop, false);
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
return -ECANCELED;
}
}
dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
close_op_ctx(ctx.release());
osd->logger->inc(l_osd_tier_try_flush_fail);
- cancel_flush(fop, false);
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
return -ECANCELED;
}
return -EINPROGRESS;
}
-void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
+void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
+ vector<ceph_tid_t> *tids)
{
dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
<< fop->objecter_tid << dendl;
if (fop->objecter_tid) {
- osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
+ tids->push_back(fop->objecter_tid);
fop->objecter_tid = 0;
}
- if (fop->blocking) {
+ if (fop->io_tids.size()) {
+ for (auto &p : fop->io_tids) {
+ tids->push_back(p.second);
+ p.second = 0;
+ }
+ }
+ if (fop->blocking && fop->obc->is_blocked()) {
fop->obc->stop_block();
kick_object_context_blocked(fop->obc);
}
flush_ops.erase(fop->obc->obs.oi.soid);
}
-void PrimaryLogPG::cancel_flush_ops(bool requeue)
+void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
{
dout(10) << __func__ << dendl;
map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
while (p != flush_ops.end()) {
- cancel_flush((p++)->second, requeue);
+ cancel_flush((p++)->second, requeue, tids);
}
}
[this, entries, repop, on_complete]() {
ObjectStore::Transaction t;
eversion_t old_last_update = info.last_update;
- merge_new_log_entries(entries, t);
+ merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
set<pg_shard_t> waiting_on;
pg_whoami.shard,
get_osdmap()->get_epoch(),
last_peering_reset,
- repop->rep_tid);
+ repop->rep_tid,
+ pg_trim_to,
+ min_last_complete_ondisk);
osd->send_message_osd_cluster(
peer.osd, m, get_osdmap()->get_epoch());
waiting_on.insert(peer);
int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
assert(r == 0);
});
+
+ calc_trim_to();
}
void PrimaryLogPG::cancel_log_updates()
op->get_req());
assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
ObjectStore::Transaction t;
- append_log_entries_update_missing(m->entries, t);
+ boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ if (m->pg_trim_to != eversion_t())
+ op_trim_to = m->pg_trim_to;
+ if (m->pg_roll_forward_to != eversion_t())
+ op_roll_forward_to = m->pg_roll_forward_to;
+
+ dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
+
+ append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
+ eversion_t new_lcod = info.last_complete;
Context *complete = new FunctionContext(
[=](int) {
op->get_req());
lock();
if (!pg_has_reset_since(msg->get_epoch())) {
+ update_last_complete_ondisk(new_lcod);
MOSDPGUpdateLogMissingReply *reply =
new MOSDPGUpdateLogMissingReply(
spg_t(info.pgid.pgid, primary_shard().shard),
pg_whoami.shard,
msg->get_epoch(),
msg->min_epoch,
- msg->get_tid());
+ msg->get_tid(),
+ new_lcod);
reply->set_priority(CEPH_MSG_PRIO_HIGH);
msg->get_connection()->send_message(reply);
}
if (it != log_entry_update_waiting_on.end()) {
if (it->second.waiting_on.count(m->get_from())) {
it->second.waiting_on.erase(m->get_from());
+ if (m->last_complete_ondisk != eversion_t()) {
+ update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
+ }
} else {
osd->clog->error()
<< info.pgid << " got reply "
scrub_clear_state();
unreg_next_scrub();
- cancel_copy_ops(false);
- cancel_flush_ops(false);
- cancel_proxy_ops(false);
+
+ vector<ceph_tid_t> tids;
+ cancel_copy_ops(false, &tids);
+ cancel_flush_ops(false, &tids);
+ cancel_proxy_ops(false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+
apply_and_flush_repops(false);
cancel_log_updates();
// we must remove PGRefs, so do this this prior to release_backoffs() callers
clear_scrub_reserved();
- cancel_copy_ops(is_primary());
- cancel_flush_ops(is_primary());
- cancel_proxy_ops(is_primary());
+ vector<ceph_tid_t> tids;
+ cancel_copy_ops(is_primary(), &tids);
+ cancel_flush_ops(is_primary(), &tids);
+ cancel_proxy_ops(is_primary(), &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
// requeue object waiters
for (auto& p : waiting_for_unreadable_object) {
vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
unsigned missing = 0;
inconsistent_snapset_wrapper soid_error, head_error;
+ unsigned soid_error_count = 0;
bufferlist last_data;
osd->clog->error() << mode << " " << info.pgid << " " << soid
<< " no '" << OI_ATTR << "' attr";
++scrubber.shallow_errors;
- soid_error.set_oi_attr_missing();
+ soid_error.set_info_missing();
} else {
bufferlist bv;
bv.push_back(p->second.attrs[OI_ATTR]);
osd->clog->error() << mode << " " << info.pgid << " " << soid
<< " can't decode '" << OI_ATTR << "' attr " << e.what();
++scrubber.shallow_errors;
- soid_error.set_oi_attr_corrupted();
- soid_error.set_oi_attr_missing(); // Not available too
+ soid_error.set_info_corrupted();
+ soid_error.set_info_missing(); // Not available too
}
}
++scrubber.shallow_errors;
soid_error.set_headless();
scrubber.store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
if (head && soid.get_head() == head->get_head())
head_error.set_clone(soid.snap);
continue;
}
// Save previous head error information
- if (head && head_error.errors)
+ if (head && (head_error.errors || soid_error_count))
scrubber.store->add_snap_error(pool.id, head_error);
// Set this as a new head object
head = soid;
missing = 0;
head_error = soid_error;
+ soid_error_count = 0;
dout(20) << __func__ << " " << mode << " new head " << head << dendl;
<< " no '" << SS_ATTR << "' attr";
++scrubber.shallow_errors;
snapset = boost::none;
- head_error.set_ss_attr_missing();
+ head_error.set_snapset_missing();
} else {
bufferlist bl;
bl.push_back(p->second.attrs[SS_ATTR]);
try {
snapset = SnapSet(); // Initialize optional<> before decoding into it
::decode(snapset.get(), blp);
+ head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
} catch (buffer::error& e) {
snapset = boost::none;
osd->clog->error() << mode << " " << info.pgid << " " << soid
<< " can't decode '" << SS_ATTR << "' attr " << e.what();
++scrubber.shallow_errors;
- head_error.set_ss_attr_corrupted();
+ head_error.set_snapset_corrupted();
}
}
osd->clog->error() << mode << " " << info.pgid << " " << soid
<< " snaps.seq not set";
++scrubber.shallow_errors;
- head_error.set_snapset_mismatch();
+ head_error.set_snapset_error();
}
}
// what's next?
++curclone;
- if (soid_error.errors)
+ if (soid_error.errors) {
scrubber.store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
+ }
}
scrub_cstat.add(stat);
log_missing(missing, head, osd->clog, info.pgid, __func__,
mode, pool.info.allow_incomplete_clones());
}
- if (head && head_error.errors)
+ if (head && (head_error.errors || soid_error_count))
scrubber.store->add_snap_error(pool.id, head_error);
for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
bool blocking; ///< whether we are blocking updates
bool removal; ///< we are removing the backend object
boost::optional<std::function<void()>> on_flush; ///< callback, may be null
+ // for chunked object
+ map<uint64_t, ceph_tid_t> io_tids;
FlushOp()
: flushed_version(0), objecter_tid(0), rval(0),
void _copy_some(ObjectContextRef obc, CopyOpRef cop);
void finish_copyfrom(CopyFromCallback *cb);
void finish_promote(int r, CopyResults *results, ObjectContextRef obc);
- void cancel_copy(CopyOpRef cop, bool requeue);
- void cancel_copy_ops(bool requeue);
+ void cancel_copy(CopyOpRef cop, bool requeue, vector<ceph_tid_t> *tids);
+ void cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids);
friend struct C_Copyfrom;
boost::optional<std::function<void()>> &&on_flush);
void finish_flush(hobject_t oid, ceph_tid_t tid, int r);
int try_flush_mark_clean(FlushOpRef fop);
- void cancel_flush(FlushOpRef fop, bool requeue);
- void cancel_flush_ops(bool requeue);
+ void cancel_flush(FlushOpRef fop, bool requeue, vector<ceph_tid_t> *tids);
+ void cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids);
/// @return false if clone is has been evicted
bool is_present_clone(hobject_t coid);
map<hobject_t, list<OpRequestRef>> in_progress_proxy_ops;
void kick_proxy_ops_blocked(hobject_t& soid);
- void cancel_proxy_ops(bool requeue);
+ void cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids);
// -- proxyread --
map<ceph_tid_t, ProxyReadOpRef> proxyread_ops;
void do_proxy_read(OpRequestRef op, ObjectContextRef obc = NULL);
void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r);
- void cancel_proxy_read(ProxyReadOpRef prdop);
+ void cancel_proxy_read(ProxyReadOpRef prdop, vector<ceph_tid_t> *tids);
friend struct C_ProxyRead;
map<ceph_tid_t, ProxyWriteOpRef> proxywrite_ops;
void do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc = NULL);
+ void cancel_and_requeue_proxy_ops(hobject_t oid);
void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r);
- void cancel_proxy_write(ProxyWriteOpRef pwop);
+ void cancel_proxy_write(ProxyWriteOpRef pwop, vector<ceph_tid_t> *tids);
friend struct C_ProxyWrite_Commit;
MapCacher::Transaction<std::string, bufferlist> *t)
{
dout(20) << __func__ << " " << oid << " " << snaps << dendl;
+ assert(!snaps.empty());
assert(check(oid));
{
object_snaps out;
int r = get_snaps(oid, &out);
- assert(r == -ENOENT);
+ if (r != -ENOENT) {
+ derr << __func__ << " found existing snaps mapped on " << oid
+ << ", removing" << dendl;
+ assert(!cct->_conf->osd_debug_verify_snaps);
+ remove_oid(oid, t);
+ }
}
object_snaps _snaps(oid, snaps);
unsigned priority,
unsigned cost,
Request item) {
- queue.enqueue(get_inner_client(cl, item), priority, cost, item);
+ queue.enqueue(get_inner_client(cl, item), priority, 0u, item);
}
// Enqueue the op in the front of the regular queue
unsigned priority,
unsigned cost,
Request item) {
- queue.enqueue_front(get_inner_client(cl, item), priority, cost, item);
+ queue.enqueue_front(get_inner_client(cl, item), priority, 0u, item);
}
// Return an op to be dispatched
unsigned priority,
unsigned cost,
Request item) override final {
- queue.enqueue(get_osd_op_type(item), priority, cost, item);
+ queue.enqueue(get_osd_op_type(item), priority, 0u, item);
}
// Enqueue the op in the front of the regular queue
unsigned priority,
unsigned cost,
Request item) override final {
- queue.enqueue_front(get_osd_op_type(item), priority, cost, item);
+ queue.enqueue_front(get_osd_op_type(item), priority, 0u, item);
}
// Returns if the queue is empty
type = PG_STATE_STALE;
else if (state == "remapped")
type = PG_STATE_REMAPPED;
- else if (state == "deep_scrub")
+ else if (state == "deep")
type = PG_STATE_DEEP_SCRUB;
else if (state == "backfilling")
type = PG_STATE_BACKFILLING;
// this was the first post-hammer thing we added; if it's missing, encode
// like hammer.
v = 21;
- }
- if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
v = 24;
}
for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
f->open_object_section("clone");
f->dump_unsigned("snap", *p);
- f->dump_unsigned("size", clone_size.find(*p)->second);
- f->dump_stream("overlap") << clone_overlap.find(*p)->second;
+ auto cs = clone_size.find(*p);
+ if (cs != clone_size.end())
+ f->dump_unsigned("size", cs->second);
+ else
+ f->dump_string("size", "????");
+ auto co = clone_overlap.find(*p);
+ if (co != clone_overlap.end())
+ f->dump_stream("overlap") << co->second;
+ else
+ f->dump_stream("overlap") << "????";
auto q = clone_snaps.find(*p);
if (q != clone_snaps.end()) {
f->open_array_section("snaps");
f->dump_stream("mtime") << mtime;
f->dump_stream("local_mtime") << local_mtime;
f->dump_unsigned("lost", (int)is_lost());
- f->dump_unsigned("flags", (int)flags);
+ vector<string> sv = get_flag_vector(flags);
+ f->open_array_section("flags");
+ for (auto str: sv)
+ f->dump_string("flags", str);
+ f->close_section();
f->open_array_section("legacy_snaps");
for (auto s : legacy_snaps) {
f->dump_unsigned("snap", s);
f->close_section();
f->dump_unsigned("truncate_seq", truncate_seq);
f->dump_unsigned("truncate_size", truncate_size);
- f->dump_unsigned("data_digest", data_digest);
- f->dump_unsigned("omap_digest", omap_digest);
+ f->dump_format("data_digest", "0x%08x", data_digest);
+ f->dump_format("omap_digest", "0x%08x", omap_digest);
f->dump_unsigned("expected_object_size", expected_object_size);
f->dump_unsigned("expected_write_size", expected_write_size);
f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
FLOOR(num_rd_kb);
FLOOR(num_wr);
FLOOR(num_wr_kb);
- FLOOR(num_scrub_errors);
FLOOR(num_shallow_scrub_errors);
FLOOR(num_deep_scrub_errors);
+ num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
FLOOR(num_objects_recovered);
FLOOR(num_bytes_recovered);
FLOOR(num_keys_recovered);
SPLIT(num_rd_kb);
SPLIT(num_wr);
SPLIT(num_wr_kb);
- SPLIT(num_scrub_errors);
- SPLIT(num_shallow_scrub_errors);
- SPLIT(num_deep_scrub_errors);
+ SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
+ SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
+ for (unsigned i = 0; i < out.size(); ++i) {
+ out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
+ out[i].num_deep_scrub_errors;
+ }
SPLIT(num_objects_recovered);
SPLIT(num_bytes_recovered);
SPLIT(num_keys_recovered);
static string get_flag_string(flag_t flags) {
string s;
+ vector<string> sv = get_flag_vector(flags);
+ for (auto ss : sv) {
+ s += string("|") + ss;
+ }
+ if (s.length())
+ return s.substr(1);
+ return s;
+ }
+ static vector<string> get_flag_vector(flag_t flags) {
+ vector<string> sv;
if (flags & FLAG_LOST)
- s += "|lost";
+ sv.insert(sv.end(), "lost");
if (flags & FLAG_WHITEOUT)
- s += "|whiteout";
+ sv.insert(sv.end(), "whiteout");
if (flags & FLAG_DIRTY)
- s += "|dirty";
+ sv.insert(sv.end(), "dirty");
if (flags & FLAG_USES_TMAP)
- s += "|uses_tmap";
+ sv.insert(sv.end(), "uses_tmap");
if (flags & FLAG_OMAP)
- s += "|omap";
+ sv.insert(sv.end(), "omap");
if (flags & FLAG_DATA_DIGEST)
- s += "|data_digest";
+ sv.insert(sv.end(), "data_digest");
if (flags & FLAG_OMAP_DIGEST)
- s += "|omap_digest";
+ sv.insert(sv.end(), "omap_digest");
if (flags & FLAG_CACHE_PIN)
- s += "|cache_pin";
+ sv.insert(sv.end(), "cache_pin");
if (flags & FLAG_MANIFEST)
- s += "|manifest";
- if (s.length())
- return s.substr(1);
- return s;
+ sv.insert(sv.end(), "manifest");
+ return sv;
}
string get_flag_string() const {
return get_flag_string(flags);
ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len
<< " already too close to prezero_pos " << prezero_pos
<< ", zeroing first" << dendl;
- waiting_for_zero = true;
+ waiting_for_zero_pos = flush_pos + len;
return;
}
if (static_cast<uint64_t>(newlen) < len) {
ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len
<< " but hit prezero_pos " << prezero_pos
<< ", will do " << flush_pos << "~" << newlen << dendl;
+ waiting_for_zero_pos = flush_pos + len;
len = newlen;
- } else {
- waiting_for_zero = false;
}
- } else {
- waiting_for_zero = false;
}
ldout(cct, 10) << "_do_flush flushing " << flush_pos << "~" << len << dendl;
pending_zero.erase(b);
}
- if (waiting_for_zero) {
- _do_flush();
+ if (waiting_for_zero_pos > flush_pos) {
+ _do_flush(waiting_for_zero_pos - flush_pos);
}
} else {
pending_zero.insert(start, len);
// protect write_buf from bufferlist _len overflow
Throttle write_buf_throttle;
- bool waiting_for_zero;
+ uint64_t waiting_for_zero_pos;
interval_set<uint64_t> pending_zero; // non-contig bits we've zeroed
std::map<uint64_t, uint64_t> pending_safe; // flush_pos -> safe_pos
// when safe through given offset
prezeroing_pos(0), prezero_pos(0), write_pos(0), flush_pos(0),
safe_pos(0), next_safe_pos(0),
write_buf_throttle(cct, "write_buf_throttle", UINT_MAX - (UINT_MAX >> 3)),
- waiting_for_zero(false),
+ waiting_for_zero_pos(0),
read_pos(0), requested_pos(0), received_pos(0),
fetch_len(0), temp_fetch_len(0),
on_readable(0), on_write_error(NULL), called_write_error(false),
expire_pos = 0;
trimming_pos = 0;
trimmed_pos = 0;
- waiting_for_zero = false;
+ waiting_for_zero_pos = 0;
}
// Asynchronous operations
return ret;
}
+int Objecter::op_cancel(const vector<ceph_tid_t>& tids, int r)
+{
+ unique_lock wl(rwlock);
+ ldout(cct,10) << __func__ << " " << tids << dendl;
+ for (auto tid : tids) {
+ _op_cancel(tid, r);
+ }
+ return 0;
+}
+
int Objecter::_op_cancel(ceph_tid_t tid, int r)
{
int ret = 0;
op->tid = 0;
m->get_redirect().combine_with_locator(op->target.target_oloc,
op->target.target_oid.name);
- op->target.flags |= CEPH_OSD_FLAG_REDIRECTED;
+ op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED | CEPH_OSD_FLAG_IGNORE_OVERLAY);
_op_submit(op, sul, NULL);
m->put();
return;
int _op_cancel(ceph_tid_t tid, int r);
public:
int op_cancel(ceph_tid_t tid, int r);
+ int op_cancel(const vector<ceph_tid_t>& tids, int r);
/**
* Any write op which is in progress at the start of this call shall no
pass
+class ArgumentMissing(ArgumentError):
+ """
+ Argument value missing in a command
+ """
+ pass
+
+
class ArgumentValid(ArgumentError):
"""
Argument value is otherwise invalid (doesn't match choices, for instance)
return d
# special-case the "0 expected 1" case
if desc.numseen == 0 and desc.n == 1:
- raise ArgumentNumber(
+ raise ArgumentMissing(
'missing required parameter {0}'.format(desc)
)
raise ArgumentNumber(
print("bestcmds_sorted: ", file=sys.stderr)
pprint.PrettyPrinter(stream=sys.stderr).pprint(bestcmds_sorted)
+ e = None
# for everything in bestcmds, look for a true match
for cmdsig in bestcmds_sorted:
for cmd in cmdsig.values():
# ignore prefix mismatches; we just haven't found
# the right command yet
pass
+ except ArgumentMissing as e:
+ if len(bestcmds) == 1:
+ found = cmd
+ break
except ArgumentTooFew:
# It looked like this matched the beginning, but it
# didn't have enough args supplied. If we're out of
# Solid mismatch on an arg (type, range, etc.)
# Stop now, because we have the right command but
# some other input is invalid
- print("Invalid command: ", e, file=sys.stderr)
- print(concise_sig(sig), ': ', cmd['help'], file=sys.stderr)
- return {}
- if found:
+ found = cmd
+ break
+ if found or e:
break
- if not found:
- print('no valid command found; 10 closest matches:', file=sys.stderr)
- for cmdsig in bestcmds[:10]:
+ if found:
+ if not valid_dict:
+ print("Invalid command:", e, file=sys.stderr)
+ print(concise_sig(sig), ': ', cmd['help'], file=sys.stderr)
+ else:
+ bestcmds = bestcmds[:10]
+ print('no valid command found; {0} closest matches:'.format(len(bestcmds)), file=sys.stderr)
+ for cmdsig in bestcmds:
for (cmdtag, cmd) in cmdsig.items():
print(concise_sig(cmd['sig']), file=sys.stderr)
- return None
-
return valid_dict
+
def find_cmd_target(childargs):
"""
Using a minimal validation, figure out whether the command
self.pg_stat = {
i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', [])
}
- self.poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
+ osd_poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
+ pg_poolids = [p['poolid'] for p in pg_dump.get('pool_stats', [])]
+ self.poolids = set(osd_poolids) & set(pg_poolids)
self.pg_up = {}
self.pg_up_by_poolid = {}
for poolid in self.poolids:
return 0.0
class Plan:
- def __init__(self, name, ms):
+ def __init__(self, name, ms, pools):
self.mode = 'unknown'
self.name = name
self.initial = ms
+ self.pools = pools
self.osd_weights = {}
self.compat_ws = {}
class Eval:
- root_ids = {} # root name -> id
- pool_name = {} # pool id -> pool name
- pool_id = {} # pool name -> id
- pool_roots = {} # pool name -> root name
- root_pools = {} # root name -> pools
- target_by_root = {} # root name -> target weight map
- count_by_pool = {}
- count_by_root = {}
- actual_by_pool = {} # pool -> by_* -> actual weight map
- actual_by_root = {} # pool -> by_* -> actual weight map
- total_by_pool = {} # pool -> by_* -> total
- total_by_root = {} # root -> by_* -> total
- stats_by_pool = {} # pool -> by_* -> stddev or avg -> value
- stats_by_root = {} # root -> by_* -> stddev or avg -> value
-
- score_by_pool = {}
- score_by_root = {}
-
- score = 0.0
-
def __init__(self, ms):
self.ms = ms
+ self.root_ids = {} # root name -> id
+ self.pool_name = {} # pool id -> pool name
+ self.pool_id = {} # pool name -> id
+ self.pool_roots = {} # pool name -> root name
+ self.root_pools = {} # root name -> pools
+ self.target_by_root = {} # root name -> target weight map
+ self.count_by_pool = {}
+ self.count_by_root = {}
+ self.actual_by_pool = {} # pool -> by_* -> actual weight map
+ self.actual_by_root = {} # pool -> by_* -> actual weight map
+ self.total_by_pool = {} # pool -> by_* -> total
+ self.total_by_root = {} # root -> by_* -> total
+ self.stats_by_pool = {} # pool -> by_* -> stddev or avg -> value
+ self.stats_by_root = {} # root -> by_* -> stddev or avg -> value
+
+ self.score_by_pool = {}
+ self.score_by_root = {}
+
+ self.score = 0.0
def show(self, verbose=False):
if verbose:
num = max(len(target), 1)
r = {}
for t in ('pgs', 'objects', 'bytes'):
+ if total[t] == 0:
+ r[t] = {
+ 'avg': 0,
+ 'stddev': 0,
+ 'sum_weight': 0,
+ 'score': 0,
+ }
+ continue
+
avg = float(total[t]) / float(num)
dev = 0.0
"perm": "rw",
},
{
- "cmd": "balancer eval name=plan,type=CephString,req=false",
- "desc": "Evaluate data distribution for the current cluster or specific plan",
+ "cmd": "balancer eval name=option,type=CephString,req=false",
+ "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan",
"perm": "r",
},
{
- "cmd": "balancer eval-verbose name=plan,type=CephString,req=false",
- "desc": "Evaluate data distribution for the current cluster or specific plan (verbosely)",
+ "cmd": "balancer eval-verbose name=option,type=CephString,req=false",
+ "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan (verbosely)",
"perm": "r",
},
{
- "cmd": "balancer optimize name=plan,type=CephString",
+ "cmd": "balancer optimize name=plan,type=CephString name=pools,type=CephString,n=N,req=false",
"desc": "Run optimizer to create a new plan",
"perm": "rw",
},
return (0, '', '')
elif command['prefix'] == 'balancer eval' or command['prefix'] == 'balancer eval-verbose':
verbose = command['prefix'] == 'balancer eval-verbose'
- if 'plan' in command:
- plan = self.plans.get(command['plan'])
+ pools = []
+ if 'option' in command:
+ plan = self.plans.get(command['option'])
if not plan:
- return (-errno.ENOENT, '', 'plan %s not found' %
- command['plan'])
- ms = plan.final_state()
+ # not a plan, does it look like a pool?
+ osdmap = self.get_osdmap()
+ valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])]
+ option = command['option']
+ if option not in valid_pool_names:
+ return (-errno.EINVAL, '', 'option "%s" not a plan or a pool' % option)
+ pools.append(option)
+ ms = MappingState(osdmap, self.get("pg_dump"), 'pool "%s"' % option)
+ else:
+ pools = plan.pools
+ ms = plan.final_state()
else:
ms = MappingState(self.get_osdmap(),
self.get("pg_dump"),
'current cluster')
- return (0, self.evaluate(ms, verbose=verbose), '')
+ return (0, self.evaluate(ms, pools, verbose=verbose), '')
elif command['prefix'] == 'balancer optimize':
- plan = self.plan_create(command['plan'])
- self.optimize(plan)
- return (0, '', '')
+ pools = []
+ if 'pools' in command:
+ pools = command['pools']
+ osdmap = self.get_osdmap()
+ valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])]
+ invalid_pool_names = []
+ for p in pools:
+ if p not in valid_pool_names:
+ invalid_pool_names.append(p)
+ if len(invalid_pool_names):
+ return (-errno.EINVAL, '', 'pools %s not found' % invalid_pool_names)
+ plan = self.plan_create(command['plan'], osdmap, pools)
+ r, detail = self.optimize(plan)
+ # remove plan if we are currently unable to find an optimization
+ # or distribution is already perfect
+ if r:
+ self.plan_rm(command['plan'])
+ return (r, '', detail)
elif command['prefix'] == 'balancer rm':
self.plan_rm(command['plan'])
return (0, '', '')
plan = self.plans.get(command['plan'])
if not plan:
return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
- self.execute(plan)
- self.plan_rm(plan)
- return (0, '', '')
+ r, detail = self.execute(plan)
+ self.plan_rm(command['plan'])
+ return (r, '', detail)
else:
return (-errno.EINVAL, '',
"Command not found '{0}'".format(command['prefix']))
if self.active and self.time_in_interval(timeofday, begin_time, end_time):
self.log.debug('Running')
name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime())
- plan = self.plan_create(name)
- if self.optimize(plan):
+ plan = self.plan_create(name, self.get_osdmap(), [])
+ r, detail = self.optimize(plan)
+ if r == 0:
self.execute(plan)
self.plan_rm(name)
self.log.debug('Sleeping for %d', sleep_interval)
self.event.wait(sleep_interval)
self.event.clear()
- def plan_create(self, name):
- plan = Plan(name, MappingState(self.get_osdmap(),
- self.get("pg_dump"),
- 'plan %s initial' % name))
+ def plan_create(self, name, osdmap, pools):
+ plan = Plan(name,
+ MappingState(osdmap,
+ self.get("pg_dump"),
+ 'plan %s initial' % name),
+ pools)
self.plans[name] = plan
return plan
if name in self.plans:
del self.plans[name]
- def calc_eval(self, ms):
+ def calc_eval(self, ms, pools):
pe = Eval(ms)
pool_rule = {}
pool_info = {}
for p in ms.osdmap_dump.get('pools',[]):
+ if len(pools) and p['pool_name'] not in pools:
+ continue
+ # skip dead or not-yet-ready pools too
+ if p['pool'] not in ms.poolids:
+ continue
pe.pool_name[p['pool']] = p['pool_name']
pe.pool_id[p['pool_name']] = p['pool']
pool_rule[p['pool_name']] = p['crush_rule']
pe.pool_roots[p['pool_name']] = []
pool_info[p['pool_name']] = p
- pools = pe.pool_id.keys()
- if len(pools) == 0:
+ if len(pool_info) == 0:
return pe
self.log.debug('pool_name %s' % pe.pool_name)
self.log.debug('pool_id %s' % pe.pool_id)
self.log.debug('pool_rule %s' % pool_rule)
osd_weight = { a['osd']: a['weight']
- for a in ms.osdmap_dump.get('osds',[]) }
+ for a in ms.osdmap_dump.get('osds',[]) if a['weight'] > 0 }
# get expected distributions by root
actual_by_root = {}
rootids = ms.crush.find_takes()
roots = []
for rootid in rootids:
- root = ms.crush.get_item_name(rootid)
- pe.root_ids[root] = rootid
- roots.append(root)
ls = ms.osdmap.get_pools_by_take(rootid)
+ want = []
+ # find out roots associating with pools we are passed in
+ for candidate in ls:
+ if candidate in pe.pool_name:
+ want.append(candidate)
+ if len(want) == 0:
+ continue
+ root = ms.crush.get_item_name(rootid)
pe.root_pools[root] = []
- for poolid in ls:
+ for poolid in want:
pe.pool_roots[pe.pool_name[poolid]].append(root)
pe.root_pools[root].append(pe.pool_name[poolid])
+ pe.root_ids[root] = rootid
+ roots.append(root)
weight_map = ms.crush.get_take_weight_osd_map(rootid)
adjusted_map = {
- osd: cw * osd_weight.get(osd, 1.0)
- for osd,cw in weight_map.iteritems()
+ osd: cw * osd_weight[osd]
+ for osd,cw in weight_map.iteritems() if osd in osd_weight and cw > 0
}
- sum_w = sum(adjusted_map.values()) or 1.0
+ sum_w = sum(adjusted_map.values())
+ assert len(adjusted_map) == 0 or sum_w > 0
pe.target_by_root[root] = { osd: w / sum_w
for osd,w in adjusted_map.iteritems() }
actual_by_root[root] = {
'objects': objects,
'bytes': bytes,
}
- for root, m in pe.total_by_root.iteritems():
+ for root in pe.total_by_root.iterkeys():
pe.count_by_root[root] = {
'pgs': {
k: float(v)
pe.total_by_root[a]
) for a, b in pe.count_by_root.iteritems()
}
+ self.log.debug('stats_by_root %s' % pe.stats_by_root)
# the scores are already normalized
pe.score_by_root = {
'bytes': pe.stats_by_root[r]['bytes']['score'],
} for r in pe.total_by_root.keys()
}
+ self.log.debug('score_by_root %s' % pe.score_by_root)
# total score is just average of normalized stddevs
pe.score = 0.0
pe.score /= 3 * len(roots)
return pe
- def evaluate(self, ms, verbose=False):
- pe = self.calc_eval(ms)
+ def evaluate(self, ms, pools, verbose=False):
+ pe = self.calc_eval(ms, pools)
return pe.show(verbose=verbose)
def optimize(self, plan):
self.log.debug('unknown %f degraded %f inactive %f misplaced %g',
unknown, degraded, inactive, misplaced)
if unknown > 0.0:
- self.log.info('Some PGs (%f) are unknown; waiting', unknown)
+ detail = 'Some PGs (%f) are unknown; try again later' % unknown
+ self.log.info(detail)
+ return -errno.EAGAIN, detail
elif degraded > 0.0:
- self.log.info('Some objects (%f) are degraded; waiting', degraded)
+ detail = 'Some objects (%f) are degraded; try again later' % degraded
+ self.log.info(detail)
+ return -errno.EAGAIN, detail
elif inactive > 0.0:
- self.log.info('Some PGs (%f) are inactive; waiting', inactive)
+ detail = 'Some PGs (%f) are inactive; try again later' % inactive
+ self.log.info(detail)
+ return -errno.EAGAIN, detail
elif misplaced >= max_misplaced:
- self.log.info('Too many objects (%f > %f) are misplaced; waiting',
- misplaced, max_misplaced)
+ detail = 'Too many objects (%f > %f) are misplaced; ' \
+ 'try again later' % (misplaced, max_misplaced)
+ self.log.info(detail)
+ return -errno.EAGAIN, detail
else:
if plan.mode == 'upmap':
return self.do_upmap(plan)
elif plan.mode == 'crush-compat':
return self.do_crush_compat(plan)
elif plan.mode == 'none':
+ detail = 'Please do "ceph balancer mode" to choose a valid mode first'
self.log.info('Idle')
+ return -errno.ENOEXEC, detail
else:
- self.log.info('Unrecognized mode %s' % plan.mode)
- return False
-
+ detail = 'Unrecognized mode %s' % plan.mode
+ self.log.info(detail)
+ return -errno.EINVAL, detail
##
def do_upmap(self, plan):
max_deviation = float(self.get_config('upmap_max_deviation', .01))
ms = plan.initial
- pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])]
+ if len(plan.pools):
+ pools = plan.pools
+ else: # all
+ pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])]
if len(pools) == 0:
- self.log.info('no pools, nothing to do')
- return False
+ detail = 'No pools available'
+ self.log.info(detail)
+ return -errno.ENOENT, detail
# shuffle pool list so they all get equal (in)attention
random.shuffle(pools)
self.log.info('pools %s' % pools)
if left <= 0:
break
self.log.info('prepared %d/%d changes' % (total_did, max_iterations))
- return True
+ if total_did == 0:
+ return -errno.EALREADY, 'Unable to find further optimization, ' \
+ 'or distribution is already perfect'
+ return 0, ''
def do_crush_compat(self, plan):
self.log.info('do_crush_compat')
max_iterations = int(self.get_config('crush_compat_max_iterations', 25))
if max_iterations < 1:
- return False
+ return -errno.EINVAL, '"crush_compat_max_iterations" must be >= 1'
step = float(self.get_config('crush_compat_step', .5))
if step <= 0 or step >= 1.0:
- return False
+ return -errno.EINVAL, '"crush_compat_step" must be in (0, 1)'
max_misplaced = float(self.get_config('max_misplaced',
default_max_misplaced))
min_pg_per_osd = 2
ms = plan.initial
osdmap = ms.osdmap
crush = osdmap.get_crush()
- pe = self.calc_eval(ms)
- if pe.score == 0:
- self.log.info('Distribution is already perfect')
- return False
+ pe = self.calc_eval(ms, plan.pools)
+ min_score_to_optimize = float(self.get_config('min_score', 0))
+ if pe.score <= min_score_to_optimize:
+ if pe.score == 0:
+ detail = 'Distribution is already perfect'
+ else:
+ detail = 'score %f <= min_score %f, will not optimize' \
+ % (pe.score, min_score_to_optimize)
+ self.log.info(detail)
+ return -errno.EALREADY, detail
# get current osd reweights
orig_osd_weight = { a['osd']: a['weight']
if b < 1.0 and b > 0.0 ]
# get current compat weight-set weights
- orig_ws = self.get_compat_weight_set_weights()
- if orig_ws is None:
- return False
+ orig_ws = self.get_compat_weight_set_weights(ms)
+ if not orig_ws:
+ return -errno.EAGAIN, 'compat weight-set not available'
orig_ws = { a: b for a, b in orig_ws.iteritems() if a >= 0 }
# Make sure roots don't overlap their devices. If so, we
overlap[osd] = 1
visited[osd] = 1
if len(overlap) > 0:
- self.log.error('error: some osds belong to multiple subtrees: %s' %
- overlap)
- return False
+ detail = 'Some osds belong to multiple subtrees: %s' % \
+ overlap.keys()
+ self.log.error(detail)
+ return -errno.EOPNOTSUPP, detail
key = 'pgs' # pgs objects or bytes
random.shuffle(roots)
for root in roots:
pools = best_pe.root_pools[root]
- pgs = len(best_pe.target_by_root[root])
- min_pgs = pgs * min_pg_per_osd
- if best_pe.total_by_root[root] < min_pgs:
+ osds = len(best_pe.target_by_root[root])
+ min_pgs = osds * min_pg_per_osd
+ if best_pe.total_by_root[root][key] < min_pgs:
self.log.info('Skipping root %s (pools %s), total pgs %d '
'< minimum %d (%d per osd)',
- root, pools, pgs, min_pgs, min_pg_per_osd)
+ root, pools,
+ best_pe.total_by_root[root][key],
+ min_pgs, min_pg_per_osd)
continue
self.log.info('Balancing root %s (pools %s) by %s' %
(root, pools, key))
# recalc
plan.compat_ws = copy.deepcopy(next_ws)
next_ms = plan.final_state()
- next_pe = self.calc_eval(next_ms)
+ next_pe = self.calc_eval(next_ms, plan.pools)
next_misplaced = next_ms.calc_misplaced_from(ms)
self.log.debug('Step result score %f -> %f, misplacing %f',
best_pe.score, next_pe.score, next_misplaced)
next_misplaced, max_misplaced, step)
else:
if next_pe.score > best_pe.score * 1.0001:
+ bad_steps += 1
if bad_steps < 5 and random.randint(0, 100) < 70:
self.log.debug('Score got worse, taking another step')
else:
if w != orig_osd_weight[osd]:
self.log.debug('osd.%d reweight %f', osd, w)
plan.osd_weights[osd] = w
- return True
+ return 0, ''
else:
self.log.info('Failed to find further optimization, score %f',
pe.score)
- return False
-
- def get_compat_weight_set_weights(self):
- # enable compat weight-set
- self.log.debug('ceph osd crush weight-set create-compat')
- result = CommandResult('')
- self.send_command(result, 'mon', '', json.dumps({
- 'prefix': 'osd crush weight-set create-compat',
- 'format': 'json',
- }), '')
- r, outb, outs = result.wait()
- if r != 0:
- self.log.error('Error creating compat weight-set')
- return
-
- result = CommandResult('')
- self.send_command(result, 'mon', '', json.dumps({
- 'prefix': 'osd crush dump',
- 'format': 'json',
- }), '')
- r, outb, outs = result.wait()
- if r != 0:
- self.log.error('Error dumping crush map')
- return
- try:
- crushmap = json.loads(outb)
- except:
- raise RuntimeError('unable to parse crush map')
+ plan.compat_ws = {}
+ return -errno.EDOM, 'Unable to find further optimization, ' \
+ 'change balancer mode and retry might help'
+
+ def get_compat_weight_set_weights(self, ms):
+ if '-1' not in ms.crush_dump.get('choose_args', {}):
+ # enable compat weight-set first
+ self.log.debug('ceph osd crush weight-set create-compat')
+ result = CommandResult('')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd crush weight-set create-compat',
+ 'format': 'json',
+ }), '')
+ r, outb, outs = result.wait()
+ if r != 0:
+ self.log.error('Error creating compat weight-set')
+ return
+
+ result = CommandResult('')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd crush dump',
+ 'format': 'json',
+ }), '')
+ r, outb, outs = result.wait()
+ if r != 0:
+ self.log.error('Error dumping crush map')
+ return
+ try:
+ crushmap = json.loads(outb)
+ except:
+ raise RuntimeError('unable to parse crush map')
+ else:
+ crushmap = ms.crush_dump
raw = crushmap.get('choose_args',{}).get('-1', [])
weight_set = {}
r, outb, outs = result.wait()
if r != 0:
self.log.error('Error creating compat weight-set')
- return
+ return r, outs
for osd, weight in plan.compat_ws.iteritems():
self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f',
for result in commands:
r, outb, outs = result.wait()
if r != 0:
- self.log.error('Error on command')
- return
+ self.log.error('execute error: r = %d, detail = %s' % (r, outs))
+ return r, outs
self.log.debug('done')
+ return 0, ''
"format": "json",
"pool": pool_name,
'rule': pool_name,
- 'erasure_code_profile': pool_name,
"pool_type": 'replicated',
'pg_num': str(pg_num),
}), "")
"""
return self._ceph_get(data_name)
+ def _stattype_to_str(self, stattype):
+
+ typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
+ if typeonly == 0:
+ return 'gauge'
+ if typeonly == self.PERFCOUNTER_LONGRUNAVG:
+ # this lie matches the DaemonState decoding: only val, no counts
+ return 'counter'
+ if typeonly == self.PERFCOUNTER_COUNTER:
+ return 'counter'
+ if typeonly == self.PERFCOUNTER_HISTOGRAM:
+ return 'histogram'
+
+ return ''
+
def get_server(self, hostname):
"""
Called by the plugin to load information about a particular
for server in self.list_servers():
for service in server['services']:
- if service['type'] not in ("mds", "osd", "mon"):
+ if service['type'] not in ("rgw", "mds", "osd", "mon"):
continue
schema = self.get_perf_schema(service['type'], service['id'])
:return: a string
"""
return self._ceph_set_uri(uri)
+
+ def have_mon_connection(self):
+ """
+ Check whether this ceph-mgr daemon has an open connection
+ to a monitor. If it doesn't, then it's likely that the
+ information we have about the cluster is out of date,
+ and/or the monitor cluster is down.
+ """
+
+ return self._ceph_have_mon_connection()
\ No newline at end of file
import errno
import math
import os
+import socket
from collections import OrderedDict
-from mgr_module import MgrModule
+from mgr_module import MgrModule, MgrStandbyModule
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
elif status == 'HEALTH_ERR':
return 2
-PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'deep', 'degraded',
- 'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
- 'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
- 'incomplete', 'stale', 'remapped', 'undersized', 'peered']
+PG_STATES = [
+ "active",
+ "clean",
+ "down",
+ "recovery_unfound",
+ "backfill_unfound",
+ "scrubbing",
+ "degraded",
+ "inconsistent",
+ "peering",
+ "repair",
+ "recovering",
+ "forced_recovery",
+ "backfill_wait",
+ "incomplete",
+ "stale",
+ "remapped",
+ "deep",
+ "backfilling",
+ "forced_backfill",
+ "backfill_toofull",
+ "recovery_wait",
+ "recovery_toofull",
+ "undersized",
+ "activating",
+ "peered",
+ "snaptrim",
+ "snaptrim_wait",
+ "snaptrim_error",
+ "creating",
+ "unknown"]
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']
DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']
-OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')
+OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
+ 'norecover', 'noscrub', 'nodeep-scrub')
-OSD_STATUS = ['weight', 'up', 'in']
-
-OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']
+FS_METADATA = ('data_pools', 'id', 'metadata_pool', 'name')
-POOL_METADATA = ('pool_id', 'name')
+MDS_METADATA = ('id', 'fs', 'hostname', 'public_addr', 'rank', 'ceph_version')
-DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
+MON_METADATA = ('id', 'hostname', 'public_addr', 'rank', 'ceph_version')
+OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'hostname', 'public_addr',
+ 'ceph_version')
-class Metric(object):
- def __init__(self, mtype, name, desc, labels=None):
- self.mtype = mtype
- self.name = name
- self.desc = desc
- self.labelnames = labels # tuple if present
- self.value = dict() # indexed by label values
-
- def set(self, value, labelvalues=None):
- # labelvalues must be a tuple
- labelvalues = labelvalues or ('',)
- self.value[labelvalues] = value
-
- def str_expfmt(self):
-
- def promethize(path):
- ''' replace illegal metric name characters '''
- result = path.replace('.', '_').replace('+', '_plus').replace('::', '_')
-
- # Hyphens usually turn into underscores, unless they are
- # trailing
- if result.endswith("-"):
- result = result[0:-1] + "_minus"
- else:
- result = result.replace("-", "_")
-
- return "ceph_{0}".format(result)
+OSD_STATUS = ['weight', 'up', 'in']
- def floatstr(value):
- ''' represent as Go-compatible float '''
- if value == float('inf'):
- return '+Inf'
- if value == float('-inf'):
- return '-Inf'
- if math.isnan(value):
- return 'NaN'
- return repr(float(value))
+OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']
- name = promethize(self.name)
- expfmt = '''
-# HELP {name} {desc}
-# TYPE {name} {mtype}'''.format(
- name=name,
- desc=self.desc,
- mtype=self.mtype,
- )
+POOL_METADATA = ('pool_id', 'name')
- for labelvalues, value in self.value.items():
- if self.labelnames:
- labels = zip(self.labelnames, labelvalues)
- labels = ','.join('%s="%s"' % (k, v) for k, v in labels)
- else:
- labels = ''
- if labels:
- fmtstr = '\n{name}{{{labels}}} {value}'
- else:
- fmtstr = '\n{name} {value}'
- expfmt += fmtstr.format(
- name=name,
- labels=labels,
- value=floatstr(value),
- )
- return expfmt
+RGW_METADATA = ('id', 'hostname', 'ceph_version')
+DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
-class Module(MgrModule):
- COMMANDS = [
- {
- "cmd": "prometheus self-test",
- "desc": "Run a self test on the prometheus module",
- "perm": "rw"
- },
- ]
- def __init__(self, *args, **kwargs):
- super(Module, self).__init__(*args, **kwargs)
- self.notified = False
- self.serving = False
+class Metrics(object):
+ def __init__(self):
self.metrics = self._setup_static_metrics()
- self.schema = OrderedDict()
- _global_instance['plugin'] = self
-
- def _stattype_to_str(self, stattype):
+ self.pending = {}
+
+ def set(self, key, value, labels=('',)):
+ '''
+ Set the value of a single Metric. This should be used for static metrics,
+ e.g. cluster health.
+ '''
+ self.metrics[key].set(value, labels)
+
+ def append(self, key, value, labels = ('',)):
+ '''
+ Append a metric to the staging area. Use this to aggregate daemon specific
+ metrics that can appear and go away as daemons are added or removed.
+ '''
+ if key not in self.pending:
+ self.pending[key] = []
+ self.pending[key].append((labels, value))
+
+ def reset(self):
+ '''
+ When metrics aggregation is done, call Metrics.reset() to apply the
+ aggregated metric. This will remove all label -> value mappings for a
+ metric and set the new mapping (from pending). This means daemon specific
+ metrics of daemons that no longer exist are removed.
+ '''
+ for k, v in self.pending.items():
+ self.metrics[k].reset(v)
+ self.pending = {}
+
+ def add_metric(self, path, metric):
+ if path not in self.metrics:
+ self.metrics[path] = metric
- typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
- if typeonly == 0:
- return 'gauge'
- if typeonly == self.PERFCOUNTER_LONGRUNAVG:
- # this lie matches the DaemonState decoding: only val, no counts
- return 'counter'
- if typeonly == self.PERFCOUNTER_COUNTER:
- return 'counter'
- if typeonly == self.PERFCOUNTER_HISTOGRAM:
- return 'histogram'
-
- return ''
def _setup_static_metrics(self):
metrics = {}
'health_status',
'Cluster health status'
)
- metrics['mon_quorum_count'] = Metric(
+ metrics['mon_quorum_status'] = Metric(
'gauge',
- 'mon_quorum_count',
- 'Monitors in quorum'
+ 'mon_quorum_status',
+ 'Monitors in quorum',
+ ('ceph_daemon',)
+ )
+ metrics['fs_metadata'] = Metric(
+ 'untyped',
+ 'fs_metadata',
+ 'FS Metadata',
+ FS_METADATA
+ )
+ metrics['mds_metadata'] = Metric(
+ 'untyped',
+ 'mds_metadata',
+ 'MDS Metadata',
+ MDS_METADATA
+ )
+ metrics['mon_metadata'] = Metric(
+ 'untyped',
+ 'mon_metadata',
+ 'MON Metadata',
+ MON_METADATA
)
metrics['osd_metadata'] = Metric(
'untyped',
'POOL Metadata',
POOL_METADATA
)
+
+ metrics['rgw_metadata'] = Metric(
+ 'untyped',
+ 'rgw_metadata',
+ 'RGW Metadata',
+ RGW_METADATA
+ )
+
+ metrics['pg_total'] = Metric(
+ 'gauge',
+ 'pg_total',
+ 'PG Total Count'
+ )
+
+ for flag in OSD_FLAGS:
+ path = 'osd_flag_{}'.format(flag)
+ metrics[path] = Metric(
+ 'untyped',
+ path,
+ 'OSD Flag {}'.format(flag)
+ )
for state in OSD_STATUS:
path = 'osd_{}'.format(state)
- self.log.debug("init: creating {}".format(path))
metrics[path] = Metric(
'untyped',
path,
)
for stat in OSD_STATS:
path = 'osd_{}'.format(stat)
- self.log.debug("init: creating {}".format(path))
metrics[path] = Metric(
'gauge',
path,
)
for state in PG_STATES:
path = 'pg_{}'.format(state)
- self.log.debug("init: creating {}".format(path))
metrics[path] = Metric(
'gauge',
path,
)
for state in DF_CLUSTER:
path = 'cluster_{}'.format(state)
- self.log.debug("init: creating {}".format(path))
metrics[path] = Metric(
'gauge',
path,
)
for state in DF_POOL:
path = 'pool_{}'.format(state)
- self.log.debug("init: creating {}".format(path))
metrics[path] = Metric(
'gauge',
path,
return metrics
- def shutdown(self):
- self.serving = False
- pass
+
+
+class Metric(object):
+ def __init__(self, mtype, name, desc, labels=None):
+ self.mtype = mtype
+ self.name = name
+ self.desc = desc
+ self.labelnames = labels # tuple if present
+ self.value = {} # indexed by label values
+
+ def set(self, value, labelvalues=None):
+ # labelvalues must be a tuple
+ labelvalues = labelvalues or ('',)
+ self.value[labelvalues] = value
+
+ def reset(self, values):
+ self.value = {}
+ for labelvalues, value in values:
+ self.value[labelvalues] = value
+
+ def str_expfmt(self):
+
+ def promethize(path):
+ ''' replace illegal metric name characters '''
+ result = path.replace('.', '_').replace('+', '_plus').replace('::', '_')
+
+ # Hyphens usually turn into underscores, unless they are
+ # trailing
+ if result.endswith("-"):
+ result = result[0:-1] + "_minus"
+ else:
+ result = result.replace("-", "_")
+
+ return "ceph_{0}".format(result)
+
+ def floatstr(value):
+ ''' represent as Go-compatible float '''
+ if value == float('inf'):
+ return '+Inf'
+ if value == float('-inf'):
+ return '-Inf'
+ if math.isnan(value):
+ return 'NaN'
+ return repr(float(value))
+
+ name = promethize(self.name)
+ expfmt = '''
+# HELP {name} {desc}
+# TYPE {name} {mtype}'''.format(
+ name=name,
+ desc=self.desc,
+ mtype=self.mtype,
+ )
+
+ for labelvalues, value in self.value.items():
+ if self.labelnames:
+ labels = zip(self.labelnames, labelvalues)
+ labels = ','.join('%s="%s"' % (k, v) for k, v in labels)
+ else:
+ labels = ''
+ if labels:
+ fmtstr = '\n{name}{{{labels}}} {value}'
+ else:
+ fmtstr = '\n{name} {value}'
+ expfmt += fmtstr.format(
+ name=name,
+ labels=labels,
+ value=floatstr(value),
+ )
+ return expfmt
+
+
+class Module(MgrModule):
+ COMMANDS = [
+ {
+ "cmd": "prometheus self-test",
+ "desc": "Run a self test on the prometheus module",
+ "perm": "rw"
+ },
+ ]
+
+ def __init__(self, *args, **kwargs):
+ super(Module, self).__init__(*args, **kwargs)
+ self.metrics = Metrics()
+ self.schema = OrderedDict()
+ _global_instance['plugin'] = self
def get_health(self):
health = json.loads(self.get('health')['json'])
- self.metrics['health_status'].set(
- health_status_to_number(health['status'])
+ self.metrics.set('health_status',
+ health_status_to_number(health['status'])
)
def get_df(self):
# maybe get the to-be-exported metrics from a config?
df = self.get('df')
for stat in DF_CLUSTER:
- path = 'cluster_{}'.format(stat)
- self.metrics[path].set(df['stats'][stat])
+ self.metrics.set('cluster_{}'.format(stat), df['stats'][stat])
for pool in df['pools']:
for stat in DF_POOL:
- path = 'pool_{}'.format(stat)
- self.metrics[path].set(pool['stats'][stat], (pool['id'],))
+ self.metrics.append('pool_{}'.format(stat),
+ pool['stats'][stat],
+ (pool['id'],))
+
+ def get_fs(self):
+ fs_map = self.get('fs_map')
+ servers = self.get_service_list()
+ active_daemons = []
+ for fs in fs_map['filesystems']:
+ # collect fs metadata
+ data_pools = ",".join([str(pool) for pool in fs['mdsmap']['data_pools']])
+ self.metrics.append('fs_metadata', 1,
+ (data_pools,
+ fs['id'],
+ fs['mdsmap']['metadata_pool'],
+ fs['mdsmap']['fs_name']))
+ for gid, daemon in fs['mdsmap']['info'].items():
+ id_ = daemon['name']
+ host_version = servers.get((id_, 'mds'), ('',''))
+ self.metrics.append('mds_metadata', 1,
+ (id_, fs['id'], host_version[0],
+ daemon['addr'], daemon['rank'],
+ host_version[1]))
def get_quorum_status(self):
mon_status = json.loads(self.get('mon_status')['json'])
- self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))
+ servers = self.get_service_list()
+ for mon in mon_status['monmap']['mons']:
+ rank = mon['rank']
+ id_ = mon['name']
+ host_version = servers.get((id_, 'mon'), ('',''))
+ self.metrics.append('mon_metadata', 1,
+ (id_, host_version[0],
+ mon['public_addr'].split(':')[0], rank,
+ host_version[1]))
+ in_quorum = int(rank in mon_status['quorum'])
+ self.metrics.append('mon_quorum_status', in_quorum,
+ ('mon_{}'.format(id_),))
def get_pg_status(self):
# TODO add per pool status?
- pg_s = self.get('pg_summary')['all']
- reported_pg_s = [(s,v) for key, v in pg_s.items() for s in
- key.split('+')]
- for state, value in reported_pg_s:
+ pg_status = self.get('pg_status')
+
+ # Set total count of PGs, first
+ self.metrics.set('pg_total', pg_status['num_pgs'])
+
+ reported_states = {}
+ for pg in pg_status['pgs_by_state']:
+ for state in pg['state_name'].split('+'):
+ reported_states[state] = reported_states.get(state, 0) + pg['count']
+
+ for state in reported_states:
path = 'pg_{}'.format(state)
try:
- self.metrics[path].set(value)
+ self.metrics.set(path, reported_states[state])
except KeyError:
self.log.warn("skipping pg in unknown state {}".format(state))
- reported_states = [s[0] for s in reported_pg_s]
+
for state in PG_STATES:
- path = 'pg_{}'.format(state)
if state not in reported_states:
try:
- self.metrics[path].set(0)
+ self.metrics.set('pg_{}'.format(state), 0)
except KeyError:
self.log.warn("skipping pg in unknown state {}".format(state))
for osd in osd_stats['osd_stats']:
id_ = osd['osd']
for stat in OSD_STATS:
- status = osd['perf_stat'][stat]
- self.metrics['osd_{}'.format(stat)].set(
- status,
- ('osd.{}'.format(id_),))
+ val = osd['perf_stat'][stat]
+ self.metrics.append('osd_{}'.format(stat), val,
+ ('osd.{}'.format(id_),))
+
+ def get_service_list(self):
+ ret = {}
+ for server in self.list_servers():
+ version = server.get('ceph_version', '')
+ host = server.get('hostname', '')
+ for service in server.get('services', []):
+ ret.update({(service['id'], service['type']): (host, version)})
+ return ret
def get_metadata_and_osd_status(self):
osd_map = self.get('osd_map')
+ osd_flags = osd_map['flags'].split(',')
+ for flag in OSD_FLAGS:
+ self.metrics.set('osd_flag_{}'.format(flag),
+ int(flag in osd_flags))
+
osd_devices = self.get('osd_map_crush')['devices']
+ servers = self.get_service_list()
for osd in osd_map['osds']:
+ # id can be used to link osd metrics and metadata
id_ = osd['osd']
+ # collect osd metadata
p_addr = osd['public_addr'].split(':')[0]
c_addr = osd['cluster_addr'].split(':')[0]
- dev_class = next((osd for osd in osd_devices if osd['id'] == id_))
- self.metrics['osd_metadata'].set(0, (
+ if p_addr == "-" or c_addr == "-":
+ self.log.info(
+ "Missing address metadata for osd {0}, skipping occupation"
+ " and metadata records for this osd".format(id_)
+ )
+ continue
+
+ dev_class = None
+ for osd_device in osd_devices:
+ if osd_device['id'] == id_:
+ dev_class = osd_device.get('class', '')
+ break
+
+ if dev_class is None:
+ self.log.info(
+ "OSD {0} is missing from CRUSH map, skipping output".format(
+ id_))
+ continue
+
+ host_version = servers.get((str(id_), 'osd'), ('',''))
+
+ self.metrics.append('osd_metadata', 1, (
c_addr,
- dev_class['class'],
- id_,
- p_addr
+ dev_class,
+ id_, host_version[0],
+ p_addr, host_version[1]
))
+
+ # collect osd status
for state in OSD_STATUS:
status = osd[state]
- self.metrics['osd_{}'.format(state)].set(
- status,
- ('osd.{}'.format(id_),))
+ self.metrics.append('osd_{}'.format(state), status,
+ ('osd.{}'.format(id_),))
+ # collect disk occupation metadata
osd_metadata = self.get_metadata("osd", str(id_))
+ if osd_metadata is None:
+ continue
dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
osd_dev_node = None
for dev_key in dev_keys:
if osd_dev_node and osd_hostname:
self.log.debug("Got dev for osd {0}: {1}/{2}".format(
id_, osd_hostname, osd_dev_node))
- self.metrics['disk_occupation'].set(0, (
+ self.metrics.set('disk_occupation', 1, (
osd_hostname,
osd_dev_node,
"osd.{0}".format(id_)
self.log.info("Missing dev node metadata for osd {0}, skipping "
"occupation record for this osd".format(id_))
+ pool_meta = []
for pool in osd_map['pools']:
- id_ = pool['pool']
- name = pool['pool_name']
- self.metrics['pool_metadata'].set(0, (id_, name))
+ self.metrics.append('pool_metadata', 1, (pool['pool'], pool['pool_name']))
+
+ # Populate rgw_metadata
+ for key, value in servers.items():
+ service_id, service_type = key
+ if service_type != 'rgw':
+ continue
+ hostname, version = value
+ self.metrics.append(
+ 'rgw_metadata',
+ 1,
+ (service_id, hostname, version)
+ )
def collect(self):
self.get_health()
self.get_df()
+ self.get_fs()
self.get_osd_stats()
self.get_quorum_status()
self.get_metadata_and_osd_status()
self.get_pg_status()
- for daemon, counters in self.get_all_perf_counters().iteritems():
+ for daemon, counters in self.get_all_perf_counters().items():
for path, counter_info in counters.items():
stattype = self._stattype_to_str(counter_info['type'])
# XXX simplify first effort: no histograms
self.log.debug('ignoring %s, type %s' % (path, stattype))
continue
- if path not in self.metrics:
- self.metrics[path] = Metric(
+ self.metrics.add_metric(path, Metric(
stattype,
path,
counter_info['description'],
("ceph_daemon",),
- )
+ ))
- self.metrics[path].set(
- counter_info['value'],
- (daemon,)
- )
+ self.metrics.append(path, counter_info['value'], (daemon,))
+ # It is sufficient to reset the pending metrics once per scrape
+ self.metrics.reset()
- return self.metrics
+ return self.metrics.metrics
def handle_command(self, cmd):
if cmd['prefix'] == 'prometheus self-test':
@cherrypy.expose
def metrics(self):
- metrics = global_instance().collect()
- cherrypy.response.headers['Content-Type'] = 'text/plain'
- if metrics:
- return self.format_metrics(metrics)
+ if global_instance().have_mon_connection():
+ metrics = global_instance().collect()
+ cherrypy.response.headers['Content-Type'] = 'text/plain'
+ if metrics:
+ return self.format_metrics(metrics)
+ else:
+ raise cherrypy.HTTPError(503, 'No MON connection')
server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
server_port = self.get_localized_config('server_port', DEFAULT_PORT)
(server_addr, server_port)
)
+ # Publish the URI that others may use to access the service we're
+ # about to start serving
+ self.set_uri('http://{0}:{1}/'.format(
+ socket.getfqdn() if server_addr == '::' else server_addr,
+ server_port
+ ))
+
cherrypy.config.update({
'server.socket_host': server_addr,
'server.socket_port': int(server_port),
'engine.autoreload.on': False
})
cherrypy.tree.mount(Root(), "/")
+ self.log.info('Starting engine...')
cherrypy.engine.start()
+ self.log.info('Engine started.')
cherrypy.engine.block()
+
+ def shutdown(self):
+ self.log.info('Stopping engine...')
+ cherrypy.engine.wait(state=cherrypy.engine.states.STARTED)
+ cherrypy.engine.exit()
+ self.log.info('Stopped engine')
+
+
+class StandbyModule(MgrStandbyModule):
+ def serve(self):
+ server_addr = self.get_localized_config('server_addr', '::')
+ server_port = self.get_localized_config('server_port', DEFAULT_PORT)
+ self.log.info("server_addr: %s server_port: %s" % (server_addr, server_port))
+ cherrypy.config.update({
+ 'server.socket_host': server_addr,
+ 'server.socket_port': int(server_port),
+ 'engine.autoreload.on': False
+ })
+
+ module = self
+
+ class Root(object):
+
+ @cherrypy.expose
+ def index(self):
+ active_uri = module.get_active_uri()
+ return '''<!DOCTYPE html>
+<html>
+ <head><title>Ceph Exporter</title></head>
+ <body>
+ <h1>Ceph Exporter</h1>
+ <p><a href='{}metrics'>Metrics</a></p>
+ </body>
+</html>'''.format(active_uri)
+
+ @cherrypy.expose
+ def metrics(self):
+ cherrypy.response.headers['Content-Type'] = 'text/plain'
+ return ''
+
+ cherrypy.tree.mount(Root(), '/', {})
+ self.log.info('Starting engine...')
+ cherrypy.engine.start()
+ self.log.info("Waiting for engine...")
+ cherrypy.engine.wait(state=cherrypy.engine.states.STOPPED)
+ self.log.info('Engine started.')
+
+ def shutdown(self):
+ self.log.info("Stopping engine...")
+ cherrypy.engine.wait(state=cherrypy.engine.states.STARTED)
+ cherrypy.engine.stop()
+ self.log.info("Stopped engine")
'zabbix_sender': '/usr/bin/zabbix_sender',
'zabbix_host': None,
'zabbix_port': 10051,
- 'identifier': None, 'interval': 60
+ 'identifier': "",
+ 'interval': 60
}
COMMANDS = [
},
{
"cmd": "zabbix send",
- "desc": "Force sending data to Zabbux",
+ "desc": "Force sending data to Zabbix",
"perm": "rw"
},
{
self.event = Event()
def init_module_config(self):
- for key, default in self.config_keys.items():
- value = self.get_localized_config(key, default)
- if value is None:
- raise RuntimeError('Configuration key {0} not set; "ceph '
- 'config-key set mgr/zabbix/{0} '
- '<value>"'.format(key))
+ self.fsid = self.get('mon_map')['fsid']
+ self.log.debug('Found Ceph fsid %s', self.fsid)
- self.set_config_option(key, value)
+ for key, default in self.config_keys.items():
+ self.set_config_option(key, self.get_config(key, default))
def set_config_option(self, option, value):
if option not in self.config_keys.keys():
if option == 'interval' and value < 10:
raise RuntimeError('interval should be set to at least 10 seconds')
+ self.log.debug('Setting in-memory config option %s to: %s', option,
+ value)
self.config[option] = value
+ return True
def get_data(self):
data = dict()
def send(self):
data = self.get_data()
- self.log.debug('Sending data to Zabbix server %s',
- self.config['zabbix_host'])
- self.log.debug(data)
+ identifier = self.config['identifier']
+ if identifier is None or len(identifier) == 0:
+ identifier = 'ceph-{0}'.format(self.fsid)
+
+ if not self.config['zabbix_host']:
+ self.log.error('Zabbix server not set, please configure using: '
+ 'ceph zabbix config-set zabbix_host <zabbix_host>')
+ return
try:
+ self.log.info(
+ 'Sending data to Zabbix server %s as host/identifier %s',
+ self.config['zabbix_host'], identifier)
+ self.log.debug(data)
+
zabbix = ZabbixSender(self.config['zabbix_sender'],
self.config['zabbix_host'],
self.config['zabbix_port'], self.log)
- zabbix.send(self.config['identifier'], data)
+
+ zabbix.send(identifier, data)
+ return True
except Exception as exc:
self.log.error('Exception when sending: %s', exc)
+ return False
+
def handle_command(self, command):
if command['prefix'] == 'zabbix config-show':
return 0, json.dumps(self.config), ''
return -errno.EINVAL, '', 'Value should not be empty or None'
self.log.debug('Setting configuration option %s to %s', key, value)
- self.set_config_option(key, value)
- self.set_localized_config(key, value)
- return 0, 'Configuration option {0} updated'.format(key), ''
+ if self.set_config_option(key, value):
+ self.set_config(key, value)
+ return 0, 'Configuration option {0} updated'.format(key), ''
+
+ return 1,\
+ 'Failed to update configuration option {0}'.format(key), ''
+
elif command['prefix'] == 'zabbix send':
- self.send()
- return 0, 'Sending data to Zabbix', ''
+ if self.send():
+ return 0, 'Sending data to Zabbix', ''
+
+ return 1, 'Failed to send data to Zabbix', ''
elif command['prefix'] == 'zabbix self-test':
self.self_test()
return 0, 'Self-test succeeded', ''
self.event.set()
def serve(self):
- self.log.debug('Zabbix module starting up')
+ self.log.info('Zabbix module starting up')
self.run = True
self.init_module_config()
- for key, value in self.config.items():
- self.log.debug('%s: %s', key, value)
-
while self.run:
self.log.debug('Waking up for new iteration')
rgw_frontend.cc
rgw_gc.cc
rgw_http_client.cc
+ rgw_http_client_curl.cc
rgw_json_enc.cc
rgw_keystone.cc
rgw_ldap.cc
${EXPAT_LIBRARIES}
${OPENLDAP_LIBRARIES} ${CRYPTO_LIBS})
+if (WITH_CURL_OPENSSL)
+ target_link_libraries(rgw_a ${OPENSSL_LIBRARIES})
+endif (WITH_CURL_OPENSSL)
+
set(radosgw_srcs
rgw_loadgen_process.cc
rgw_civetweb.cc
goto done;
}
+ /* now expected by rgw_log_op() */
+ rgw_env.set("REQUEST_METHOD", s->info.method);
+ rgw_env.set("REQUEST_URI", s->info.request_uri);
+ rgw_env.set("QUERY_STRING", "");
+
/* XXX authorize does less here then in the REST path, e.g.,
* the user's info is cached, but still incomplete */
req->log(s, "authorizing");
#include "rgw_realm_watcher.h"
#include "rgw_role.h"
#include "rgw_reshard.h"
+#include "rgw_http_client_curl.h"
using namespace std;
cout << " reshard list list all bucket resharding or scheduled to be reshared\n";
cout << " reshard process process of scheduled reshard jobs\n";
cout << " reshard cancel cancel resharding a bucket\n";
+ cout << " sync error list list sync error\n";
+ cout << " sync error trim trim sync error\n";
cout << "options:\n";
cout << " --tenant=<tenant> tenant name\n";
cout << " --uid=<id> user id\n";
cout << " (NOTE: required to delete a non-empty bucket)\n";
cout << " --sync-stats option to 'user stats', update user stats with current\n";
cout << " stats reported by user's buckets indexes\n";
+ cout << " --reset-stats option to 'user stats', reset stats in accordance with user buckets\n";
cout << " --show-log-entries=<flag> enable/disable dump of log entries on log show\n";
cout << " --show-log-sum=<flag> enable/disable dump of log summation on log show\n";
cout << " --skip-zero-entries log show only dumps entries that don't have zero value\n";
OPT_MDLOG_FETCH,
OPT_MDLOG_STATUS,
OPT_SYNC_ERROR_LIST,
+ OPT_SYNC_ERROR_TRIM,
OPT_BILOG_LIST,
OPT_BILOG_TRIM,
OPT_BILOG_STATUS,
(strcmp(prev_cmd, "error") == 0)) {
if (strcmp(cmd, "list") == 0)
return OPT_SYNC_ERROR_LIST;
+ if (strcmp(cmd, "trim") == 0)
+ return OPT_SYNC_ERROR_TRIM;
} else if (strcmp(prev_cmd, "mdlog") == 0) {
if (strcmp(cmd, "list") == 0)
return OPT_MDLOG_LIST;
int include_all = false;
int sync_stats = false;
+ int reset_stats = false;
int bypass_gc = false;
int warnings_only = false;
int inconsistent_index = false;
// do nothing
} else if (ceph_argparse_binary_flag(args, i, &sync_stats, NULL, "--sync-stats", (char*)NULL)) {
// do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &reset_stats, NULL, "--reset-stats", (char*)NULL)) {
+ // do nothing
} else if (ceph_argparse_binary_flag(args, i, &include_all, NULL, "--include-all", (char*)NULL)) {
// do nothing
} else if (ceph_argparse_binary_flag(args, i, &extra_info, NULL, "--extra-info", (char*)NULL)) {
rgw_user_init(store);
rgw_bucket_init(store->meta_mgr);
+  // RAII guard: initialize the process-wide curl state (handle pool, ssl
+  // locking) for the duration of the admin command, tear it down on exit.
+  struct rgw_curl_setup {
+    rgw_curl_setup() {
+      rgw::curl::setup_curl(boost::none);
+    }
+    ~rgw_curl_setup() {
+      rgw::curl::cleanup_curl();
+    }
+  } curl_cleanup;
+
StoreDestructor store_destructor(store);
if (raw_storage_op) {
}
if (opt_cmd == OPT_RESHARD_CANCEL) {
- RGWReshard reshard(store);
-
if (bucket_name.empty()) {
cerr << "ERROR: bucket not specified" << std::endl;
return EINVAL;
}
- cls_rgw_reshard_entry entry;
- //entry.tenant = tenant;
- entry.bucket_name = bucket_name;
- //entry.bucket_id = bucket_id;
- int ret = reshard.get(entry);
+
+ rgw_bucket bucket;
+ RGWBucketInfo bucket_info;
+ map<string, bufferlist> attrs;
+ ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket, &attrs);
if (ret < 0) {
- cerr << "Error in getting bucket " << bucket_name << ": " << cpp_strerror(-ret) << std::endl;
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWBucketReshard br(store, bucket_info, attrs);
+ int ret = br.cancel();
+ if (ret < 0) {
+ if (ret == -EBUSY) {
+ cerr << "There is ongoing resharding, please retry after " << g_conf->rgw_reshard_bucket_lock_duration <<
+ " seconds " << std::endl;
+ } else {
+ cerr << "Error canceling bucket " << bucket_name << " resharding: " << cpp_strerror(-ret) <<
+ std::endl;
+ }
return ret;
}
+ RGWReshard reshard(store);
- /* TBD stop running resharding */
+ cls_rgw_reshard_entry entry;
+ //entry.tenant = tenant;
+ entry.bucket_name = bucket_name;
+ //entry.bucket_id = bucket_id;
- ret =reshard.remove(entry);
- if (ret < 0) {
- cerr << "Error removing bucket " << bucket_name << " for resharding queue: " << cpp_strerror(-ret) <<
- std::endl;
+ ret = reshard.remove(entry);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "Error in getting bucket " << bucket_name << ": " << cpp_strerror(-ret) << std::endl;
return ret;
}
}
}
if (opt_cmd == OPT_USER_STATS) {
+ if (user_id.empty()) {
+ cerr << "ERROR: uid not specified" << std::endl;
+ return EINVAL;
+ }
+
+ string user_str = user_id.to_str();
+ if (reset_stats) {
+ if (!bucket_name.empty()){
+ cerr << "ERROR: recalculate doesn't work on buckets" << std::endl;
+ return EINVAL;
+ }
+ ret = store->cls_user_reset_stats(user_str);
+ if (ret < 0) {
+ cerr << "ERROR: could not clear user stats: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
if (sync_stats) {
if (!bucket_name.empty()) {
int ret = rgw_bucket_sync_user_stats(store, tenant, bucket_name);
}
}
- if (user_id.empty()) {
- cerr << "ERROR: uid not specified" << std::endl;
- return EINVAL;
- }
cls_user_header header;
- string user_str = user_id.to_str();
int ret = store->cls_user_get_header(user_str, &header);
if (ret < 0) {
if (ret == -ENOENT) { /* in case of ENOENT */
cerr << "ERROR: source zone not specified" << std::endl;
return EINVAL;
}
+
RGWDataSyncStatusManager sync(store, store->get_async_rados(), source_zone);
int ret = sync.init();
cerr << "ERROR: source zone not specified" << std::endl;
return EINVAL;
}
- RGWDataSyncStatusManager sync(store, store->get_async_rados(), source_zone);
- int ret = sync.init();
+ RGWSyncModuleInstanceRef sync_module;
+ int ret = store->get_sync_modules_manager()->create_instance(g_ceph_context, store->get_zone().tier_type,
+ store->get_zone_params().tier_config, &sync_module);
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWDataSyncStatusManager sync(store, store->get_async_rados(), source_zone, sync_module);
+
+ ret = sync.init();
if (ret < 0) {
cerr << "ERROR: sync.init() returned ret=" << ret << std::endl;
return -ret;
formatter->flush(cout);
}
+ if (opt_cmd == OPT_SYNC_ERROR_TRIM) {
+ utime_t start_time, end_time;
+ int ret = parse_date_str(start_date, start_time);
+ if (ret < 0)
+ return -ret;
+
+ ret = parse_date_str(end_date, end_time);
+ if (ret < 0)
+ return -ret;
+
+ if (shard_id < 0) {
+ shard_id = 0;
+ }
+
+ for (; shard_id < ERROR_LOGGER_SHARDS; ++shard_id) {
+ string oid = RGWSyncErrorLogger::get_shard_oid(RGW_SYNC_ERROR_LOG_SHARD_PREFIX, shard_id);
+ ret = store->time_log_trim(oid, start_time.to_real_time(), end_time.to_real_time(), start_marker, end_marker);
+ if (ret < 0 && ret != -ENODATA) {
+ cerr << "ERROR: sync error trim: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ if (specified_shard_id) {
+ break;
+ }
+ }
+ }
+
if (opt_cmd == OPT_BILOG_TRIM) {
if (bucket_name.empty()) {
cerr << "ERROR: bucket not specified" << std::endl;
class AsioFrontend {
RGWProcessEnv env;
+ RGWFrontendConfig* conf;
boost::asio::io_service service;
tcp::acceptor acceptor;
void accept(boost::system::error_code ec);
public:
- AsioFrontend(const RGWProcessEnv& env)
- : env(env), acceptor(service), peer_socket(service) {}
+ AsioFrontend(const RGWProcessEnv& env, RGWFrontendConfig* conf)
+ : env(env), conf(conf), acceptor(service), peer_socket(service) {}
int init();
int run();
int AsioFrontend::init()
{
- auto ep = tcp::endpoint{tcp::v4(), static_cast<unsigned short>(env.port)};
- ldout(ctx(), 4) << "frontend listening on " << ep << dendl;
+ std::string port_str;
+ conf->get_val("port", "80", &port_str);
+ unsigned short port;
+ boost::asio::ip::address addr; // default to 'any'
boost::system::error_code ec;
+
+ auto colon = port_str.find(':');
+ if (colon != port_str.npos) {
+ addr = boost::asio::ip::make_address(port_str.substr(0, colon), ec);
+ if (ec) {
+ lderr(ctx()) << "failed to parse address '" << port_str << "': " << ec.message() << dendl;
+ return -ec.value();
+ }
+ port = std::stoul(port_str.substr(colon + 1), nullptr, 0);
+ } else {
+ port = std::stoul(port_str, nullptr, 0);
+ }
+
+ tcp::endpoint ep = {addr, port};
+ ldout(ctx(), 4) << "frontend listening on " << ep << dendl;
+
acceptor.open(ep.protocol(), ec);
if (ec) {
lderr(ctx()) << "failed to open socket: " << ec.message() << dendl;
class RGWAsioFrontend::Impl : public AsioFrontend {
public:
- Impl(const RGWProcessEnv& env) : AsioFrontend(env) {}
+ Impl(const RGWProcessEnv& env, RGWFrontendConfig* conf) : AsioFrontend(env, conf) {}
};
-RGWAsioFrontend::RGWAsioFrontend(const RGWProcessEnv& env)
- : impl(new Impl(env))
+RGWAsioFrontend::RGWAsioFrontend(const RGWProcessEnv& env,
+ RGWFrontendConfig* conf)
+ : impl(new Impl(env, conf))
{
}
class Impl;
std::unique_ptr<Impl> impl;
public:
- RGWAsioFrontend(const RGWProcessEnv& env);
+ RGWAsioFrontend(const RGWProcessEnv& env, RGWFrontendConfig* conf);
~RGWAsioFrontend() override;
int init() override;
namespace auth {
namespace s3 {
-/* FIXME(rzarzynski): duplicated from rgw_rest_s3.h. */
-#define RGW_AUTH_GRACE_MINS 15
+// Accept the request timestamp only if it falls inside the grace window
+// [now - RGW_AUTH_GRACE, now + RGW_AUTH_GRACE]; logs the skew at level 10.
+bool is_time_skew_ok(time_t t)
+{
+  const auto request_tp = ceph::coarse_real_clock::from_time_t(t);
+  const auto now_tp = ceph::coarse_real_clock::now();
+
+  const bool too_old = request_tp < now_tp - RGW_AUTH_GRACE;
+  const bool too_far_ahead = request_tp > now_tp + RGW_AUTH_GRACE;
+  if (too_old || too_far_ahead) {
+    dout(10) << "NOTICE: request time skew too big." << dendl;
+    using ceph::operator<<;
+    dout(10) << "req_tp=" << request_tp << ", cur_tp=" << now_tp << dendl;
+    return false;
+  }
+
+  return true;
+}
static inline int parse_v4_query_string(const req_info& info, /* in */
boost::string_view& credential, /* out */
return -EPERM;
}
- /* Used for pre-signatured url, We shouldn't return -ERR_REQUEST_TIME_SKEWED
- * when current time <= X-Amz-Expires */
- bool qsr = false;
-
- uint64_t now_req = 0;
- uint64_t now = ceph_clock_now();
-
boost::string_view expires = info.args.get("X-Amz-Expires");
- if (!expires.empty()) {
- /* X-Amz-Expires provides the time period, in seconds, for which
- the generated presigned URL is valid. The minimum value
- you can set is 1, and the maximum is 604800 (seven days) */
- time_t exp = atoll(expires.data());
- if ((exp < 1) || (exp > 7*24*60*60)) {
- dout(10) << "NOTICE: exp out of range, exp = " << exp << dendl;
- return -EPERM;
- }
- /* handle expiration in epoch time */
- now_req = (uint64_t)internal_timegm(&date_t);
- if (now >= now_req + exp) {
- dout(10) << "NOTICE: now = " << now << ", now_req = " << now_req << ", exp = " << exp << dendl;
- return -EPERM;
- }
- qsr = true;
+ if (expires.empty()) {
+ return -EPERM;
}
-
- if ((now_req < now - RGW_AUTH_GRACE_MINS * 60 ||
- now_req > now + RGW_AUTH_GRACE_MINS * 60) && !qsr) {
- dout(10) << "NOTICE: request time skew too big." << dendl;
- dout(10) << "now_req = " << now_req << " now = " << now
- << "; now - RGW_AUTH_GRACE_MINS="
- << now - RGW_AUTH_GRACE_MINS * 60
- << "; now + RGW_AUTH_GRACE_MINS="
- << now + RGW_AUTH_GRACE_MINS * 60 << dendl;
- return -ERR_REQUEST_TIME_SKEWED;
+ /* X-Amz-Expires provides the time period, in seconds, for which
+ the generated presigned URL is valid. The minimum value
+ you can set is 1, and the maximum is 604800 (seven days) */
+ time_t exp = atoll(expires.data());
+ if ((exp < 1) || (exp > 7*24*60*60)) {
+ dout(10) << "NOTICE: exp out of range, exp = " << exp << dendl;
+ return -EPERM;
+ }
+ /* handle expiration in epoch time */
+ uint64_t req_sec = (uint64_t)internal_timegm(&date_t);
+ uint64_t now = ceph_clock_now();
+ if (now >= req_sec + exp) {
+ dout(10) << "NOTICE: now = " << now << ", req_sec = " << req_sec << ", exp = " << exp << dendl;
+ return -EPERM;
}
signedheaders = info.args.get("X-Amz-SignedHeaders");
}
date = d;
+ if (!is_time_skew_ok(internal_timegm(&t))) {
+ return -ERR_REQUEST_TIME_SKEWED;
+ }
+
return 0;
}
namespace auth {
namespace s3 {
+static constexpr auto RGW_AUTH_GRACE = std::chrono::minutes{15};
+
+// returns true if the request time is within RGW_AUTH_GRACE of the current time
+bool is_time_skew_ok(time_t t);
+
class ExternalAuthStrategy : public rgw::auth::Strategy,
public rgw::auth::RemoteApplier::Factory {
typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;
return r;
}
+// Overwrite the quota stored in the bucket's instance info with the quota
+// carried in op_state. On failure, writes a human-readable message to
+// *err_msg (when non-null) and returns the negative error code.
+int RGWBucket::set_quota(RGWBucketAdminOpState& op_state, std::string *err_msg)
+{
+  rgw_bucket bucket = op_state.get_bucket();
+  RGWBucketInfo bucket_info;
+  map<string, bufferlist> attrs;
+  RGWObjectCtx obj_ctx(store);
+  // Fetch current instance info + attrs so only the quota field changes.
+  int r = store->get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL, &attrs);
+  if (r < 0) {
+    set_err_msg(err_msg, "could not get bucket info for bucket=" + bucket.name + ": " + cpp_strerror(-r));
+    return r;
+  }
+
+  bucket_info.quota = op_state.quota;
+  // NOTE(review): second argument looks like an 'exclusive' flag (false =
+  // overwrite in place) — confirm against put_bucket_instance_info().
+  r = store->put_bucket_instance_info(bucket_info, false, real_time(), &attrs);
+  if (r < 0) {
+    set_err_msg(err_msg, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
+    return r;
+  }
+  return r;
+}
+
int RGWBucket::remove(RGWBucketAdminOpState& op_state, bool bypass_gc,
bool keep_index_consistent, std::string *err_msg)
{
return 0;
}
+// Admin-op entry point: bind op_state to a bucket handle, then delegate
+// the quota update to RGWBucket::set_quota().
+int RGWBucketAdminOp::set_quota(RGWRados *store, RGWBucketAdminOpState& op_state)
+{
+  RGWBucket bucket;
+
+  const int init_ret = bucket.init(store, op_state);
+  if (init_ret < 0) {
+    return init_ret;
+  }
+  return bucket.set_quota(op_state);
+}
void rgw_data_change::dump(Formatter *f) const
{
rgw_bucket bucket;
+ RGWQuotaInfo quota;
+
void set_fetch_stats(bool value) { stat_buckets = value; }
void set_check_objects(bool value) { check_objects = value; }
void set_fix_index(bool value) { fix_index = value; }
void set_object(std::string& object_str) {
object_name = object_str;
}
+ void set_quota(RGWQuotaInfo& value) {
+ quota = value;
+ }
+
rgw_user& get_user_id() { return uid; }
std::string& get_user_display_name() { return display_name; }
int remove(RGWBucketAdminOpState& op_state, bool bypass_gc = false, bool keep_index_consistent = true, std::string *err_msg = NULL);
int link(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
int unlink(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+ int set_quota(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
int remove_object(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
int policy_bl_to_stream(bufferlist& bl, ostream& o);
const std::list<std::string>& user_ids,
RGWFormatterFlusher& flusher,
bool warnings_only = false);
+ static int set_quota(RGWRados *store, RGWBucketAdminOpState& op_state);
};
: conn(conn),
explicit_keepalive(false),
explicit_conn_close(false),
+ got_eof_on_read(false),
txbuf(*this)
{
sockaddr *lsa = mg_get_local_addr(conn);
size_t RGWCivetWeb::read_data(char *buf, size_t len)
{
-  const int ret = mg_read(conn, buf, len);
-  if (ret < 0) {
-    throw rgw::io::Exception(EIO, std::system_category());
+  int c, ret;
+  // Once EOF has been observed, every later call returns 0 immediately.
+  if (got_eof_on_read) {
+    return 0;
+  }
+  // mg_read() may return fewer bytes than requested; keep reading into the
+  // unfilled tail of the buffer (buf + c, len - c) until the buffer is full
+  // or the stream ends. Reading repeatedly at (buf, len) would overwrite
+  // data already received and could run past the caller's request.
+  for (c = 0; c < len; c += ret) {
+    ret = mg_read(conn, buf + c, len - c);
+    if (ret < 0) {
+      throw rgw::io::Exception(EIO, std::system_category());
+    }
+    if (!ret) {
+      got_eof_on_read = true;
+      break;
+    }
  }
-  return ret;
+  return c;
}
void RGWCivetWeb::flush()
bool explicit_keepalive;
bool explicit_conn_close;
+ bool got_eof_on_read;
rgw::io::StaticOutputBufferer<> txbuf;
RGWRestfulIO client_io(dout_context, &real_client_io);
RGWRequest req(env.store->get_new_req_id());
+ int http_ret = 0;
int ret = process_request(env.store, env.rest, &req, env.uri_prefix,
- *env.auth_registry, &client_io, env.olog);
+ *env.auth_registry, &client_io, env.olog, &http_ret);
if (ret < 0) {
/* We don't really care about return code. */
dout(20) << "process_request() returned " << ret << dendl;
}
- /* Mark as processed. */
- return 1;
+ if (http_ret <= 0) {
+ /* Mark as processed. */
+ return 1;
+ }
+
+ return http_ret;
}
int RGWCivetWebFrontend::run()
int rgw_perf_start(CephContext *cct)
{
- PerfCountersBuilder plb(cct, cct->_conf->name.to_str(), l_rgw_first, l_rgw_last);
+ PerfCountersBuilder plb(cct, "rgw", l_rgw_first, l_rgw_last);
+
+ // RGW emits comparatively few metrics, so let's be generous
+ // and mark them all USEFUL to get transmission to ceph-mgr by default.
+ plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
plb.add_u64_counter(l_rgw_req, "req", "Requests");
plb.add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests");
extern bool match_policy(boost::string_view pattern, boost::string_view input,
uint32_t flag);
+void rgw_setup_saved_curl_handles();
+void rgw_release_all_curl_handles();
+
#endif
bufferlist out_bl, in_bl, temp_in_bl;
bl.copy(bl_ofs, bl_len, temp_in_bl);
bl_ofs = 0;
- int r;
+ int r = 0;
if (waiting.length() != 0) {
in_bl.append(waiting);
in_bl.append(temp_in_bl);
}
cur_ofs += bl_len;
-
off_t ch_len = std::min<off_t>(out_bl.length() - q_ofs, q_len);
- r = next->handle_data(out_bl, q_ofs, ch_len);
- if (r < 0) {
- lderr(cct) << "handle_data failed with exit code " << r << dendl;
- return r;
+ if (ch_len > 0) {
+ r = next->handle_data(out_bl, q_ofs, ch_len);
+ if (r < 0) {
+ lderr(cct) << "handle_data failed with exit code " << r << dendl;
+ return r;
+ }
+ out_bl.splice(0, q_ofs + ch_len);
+ q_len -= ch_len;
+ q_ofs = 0;
}
- q_len -= ch_len;
- q_ofs = 0;
return r;
}
RGWZoneParams& zone_params = store->get_zone_params();
- sync_module = store->get_sync_module();
+ if (sync_module == nullptr) {
+ sync_module = store->get_sync_module();
+ }
conn = store->get_zone_conn_by_id(source_zone);
if (!conn) {
syncstopped = false;
continue;
}
+ if (e.op == CLS_RGW_OP_CANCEL) {
+ continue;
+ }
if (e.state != CLS_RGW_STATE_COMPLETE) {
continue;
}
: store(_store), source_zone(_source_zone), conn(NULL), error_logger(NULL),
sync_module(nullptr),
source_log(store, async_rados, observer), num_shards(0) {}
+  // Constructor variant that lets the caller supply the sync-module
+  // instance explicitly instead of leaving sync_module null.
+  RGWDataSyncStatusManager(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
+                           const string& _source_zone, const RGWSyncModuleInstanceRef& _sync_module,
+                           rgw::BucketChangeObserver *observer = nullptr)
+    : store(_store), source_zone(_source_zone), conn(NULL), error_logger(NULL),
+      sync_module(_sync_module),
+      source_log(store, async_rados, observer), num_shards(0) {}
~RGWDataSyncStatusManager() {
finalize();
}
lsubdout(parent->get_fs()->get_context(), rgw, 15)
<< __func__
- << " offset=" << name
+ << " offset=" << ((name) ? name : "(nil)")
<< dendl;
if ((! name) &&
}
bool eof() {
+    // Only build the (possibly null) offset rendering when the rgw subsys
+    // would actually emit at level 15.
-    lsubdout(cct, rgw, 15) << "READDIR offset: " << offset
-      << " is_truncated: " << is_truncated
-      << dendl;
+    if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) {
+      // printable when offset does not hold the char* alternative, or
+      // holds a non-null char*; otherwise show "(nil)".
+      // NOTE(review): assumes offset is a boost::variant — confirm.
+      bool is_offset =
+        unlikely(! get<const char*>(&offset)) ||
+        !! get<const char*>(offset);
+      lsubdout(cct, rgw, 15) << "READDIR offset: " <<
+        ((is_offset) ? offset : "(nil)")
+        << " is_truncated: " << is_truncated
+        << dendl;
+    }
    return !is_truncated;
}
}
bool eof() {
+    // Guard the offset rendering behind should_gather() so the variant
+    // inspection only runs when rgw debug level 15 is enabled.
-    lsubdout(cct, rgw, 15) << "READDIR offset: " << offset
-      << " next marker: " << next_marker
-      << " is_truncated: " << is_truncated
-      << dendl;
+    if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) {
+      // printable when offset does not hold the char* alternative, or
+      // holds a non-null char*; otherwise show "(nil)".
+      bool is_offset =
+        unlikely(! get<const char*>(&offset)) ||
+        !! get<const char*>(offset);
+      lsubdout(cct, rgw, 15) << "READDIR offset: " <<
+        ((is_offset) ? offset : "(nil)")
+        << " next marker: " << next_marker
+        << " is_truncated: " << is_truncated
+        << dendl;
+    }
    return !is_truncated;
}
dump_value_int(name, "%f", d);
}
-void RGWFormatter_Plain::dump_string(const char *name, const std::string& s)
+// Dump a string value. Takes a string_view, which is NOT guaranteed to be
+// NUL-terminated, so the printf width must be bounded by the view's length
+// instead of relying on a terminator at s.data().
+void RGWFormatter_Plain::dump_string(const char *name, boost::string_view s)
{
-  dump_format(name, "%s", s.c_str());
+  dump_format(name, "%.*s", static_cast<int>(s.size()), s.data());
}
std::ostream& RGWFormatter_Plain::dump_stream(const char *name)
void dump_unsigned(const char *name, uint64_t u) override;
void dump_int(const char *name, int64_t u) override;
void dump_float(const char *name, double d) override;
- void dump_string(const char *name, const std::string& s) override;
+ void dump_string(const char *name, boost::string_view s) override;
std::ostream& dump_stream(const char *name) override;
void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
int get_len() const override;
}
};
+// A pooled libcurl easy handle plus the bookkeeping the cleaner thread
+// needs to expire idle handles.
+struct RGWCurlHandle {
+  int uses;           // reserved use counter (not read anywhere in view)
+  mono_time lastuse;  // stamped when the handle is returned to the pool
+  CURL* h;            // the underlying curl easy handle
+
+  RGWCurlHandle(CURL* h) : uses(0), h(h) {};
+  // Dereference yields the raw CURL* for the curl_easy_* API.
+  CURL* operator*() {
+    return this->h;
+  }
+};
+
+// Pooled handles idle for longer than MAXIDLE seconds are destroyed.
+#define MAXIDLE 5
+// Thread-safe pool of reusable curl easy handles. Runs as a Thread whose
+// entry() periodically expires idle handles (see MAXIDLE above).
+class RGWCurlHandles : public Thread {
+public:
+  Mutex cleaner_lock;                       // guards saved_curl / shutdown
+  std::vector<RGWCurlHandle*>saved_curl;    // MRU order: front = newest
+  int cleaner_shutdown;                     // nonzero once stop() ran
+  Cond cleaner_cond;                        // wakes the cleaner thread early
+
+  RGWCurlHandles() :
+    cleaner_lock{"RGWCurlHandles::cleaner_lock"},
+    cleaner_shutdown{0} {
+  }
+
+  RGWCurlHandle* get_curl_handle();
+  void release_curl_handle_now(RGWCurlHandle* curl);
+  void release_curl_handle(RGWCurlHandle* curl);
+  void flush_curl_handles();
+  void* entry();
+  void stop();
+};
+
+// Hand out a pooled handle when one is available, otherwise allocate a
+// fresh one. Returns nullptr only if curl_easy_init() fails.
+RGWCurlHandle* RGWCurlHandles::get_curl_handle() {
+  RGWCurlHandle* reused = nullptr;
+  {
+    Mutex::Locker lock(cleaner_lock);
+    if (!saved_curl.empty()) {
+      reused = saved_curl.front();
+      saved_curl.erase(saved_curl.begin());
+    }
+  }
+  if (!reused) {
+    CURL* raw = curl_easy_init();
+    if (raw) {
+      reused = new RGWCurlHandle{raw};
+    }
+    // else: allocation failed, fall through returning nullptr
+  }
+  return reused;
+}
+
+// Destroy a handle immediately, bypassing the reuse pool.
+void RGWCurlHandles::release_curl_handle_now(RGWCurlHandle* curl)
+{
+  curl_easy_cleanup(**curl);
+  delete curl;
+}
+
+// Return a handle to the pool for reuse; if shutdown has started, destroy
+// it immediately instead. The handle is reset before being pooled.
+void RGWCurlHandles::release_curl_handle(RGWCurlHandle* curl)
+{
+  if (cleaner_shutdown) {
+    release_curl_handle_now(curl);
+  } else {
+    // Reset outside the lock; only the pool insert needs cleaner_lock.
+    curl_easy_reset(**curl);
+    Mutex::Locker lock(cleaner_lock);
+    curl->lastuse = mono_clock::now();
+    // Most-recently-used handles go to the front of the vector.
+    saved_curl.insert(saved_curl.begin(), 1, curl);
+  }
+}
+
+// Cleaner thread body: wakes roughly every MAXIDLE seconds and destroys
+// handles idle for at least MAXIDLE; on shutdown, drains the pool fully.
+void* RGWCurlHandles::entry()
+{
+  RGWCurlHandle* curl;
+  // Held for the whole loop; released only while waiting on the cond.
+  Mutex::Locker lock(cleaner_lock);
+
+  for (;;) {
+    if (cleaner_shutdown) {
+      // Shutting down: exit once the pool has been emptied.
+      if (saved_curl.empty())
+        break;
+    } else {
+      // Sleep until the next sweep, or until stop() signals the cond.
+      utime_t release = ceph_clock_now() + utime_t(MAXIDLE,0);
+      cleaner_cond.WaitUntil(cleaner_lock, release);
+    }
+    mono_time now = mono_clock::now();
+    // Oldest handles sit at the back (release inserts at the front), so
+    // trim from the end while entries have been idle >= MAXIDLE seconds.
+    while (!saved_curl.empty()) {
+      auto cend = saved_curl.end();
+      --cend;
+      curl = *cend;
+      if (!cleaner_shutdown && now - curl->lastuse < std::chrono::seconds(MAXIDLE))
+        break;
+      saved_curl.erase(cend);
+      release_curl_handle_now(curl);
+    }
+  }
+  return nullptr;
+}
+
+// Flag shutdown and wake the cleaner thread so it can drain the pool.
+void RGWCurlHandles::stop()
+{
+  Mutex::Locker lock(cleaner_lock);
+  cleaner_shutdown = 1;
+  cleaner_cond.Signal();
+}
+
+// Stop the cleaner thread, wait for it to finish, and verify the pool was
+// fully drained (a non-empty pool at this point is logged as an error).
+void RGWCurlHandles::flush_curl_handles()
+{
+  stop();
+  join();
+  if (!saved_curl.empty()) {
+    dout(0) << "ERROR: " << __func__ << " failed final cleanup" << dendl;
+  }
+  saved_curl.shrink_to_fit();
+}
+
+static RGWCurlHandles *handles;
+// XXX make this part of the token cache? (but that's swift-only;
+// and this especially needs to integrate with s3...)
+
+// Create the process-wide handle pool and start its cleaner thread.
+void rgw_setup_saved_curl_handles()
+{
+  handles = new RGWCurlHandles();
+  handles->create("rgw_curl");
+}
+
+// Drain the pool (joins the cleaner thread) and free the singleton.
+void rgw_release_all_curl_handles()
+{
+  handles->flush_curl_handles();
+  delete handles;
+}
+
/*
* the simple set of callbacks will be called on RGWHTTPClient::process()
*/
last_method = (method ? method : "");
last_url = (url ? url : "");
- curl_handle = curl_easy_init();
+ auto ca = handles->get_curl_handle();
+ curl_handle = **ca;
dout(20) << "sending request to " << url << dendl;
ret = -EINVAL;
}
curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &http_status);
- curl_easy_cleanup(curl_handle);
+ handles->release_curl_handle(ca);
curl_slist_free_all(h);
return ret;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rgw_http_client_curl.h"
+#include <mutex>
+#include <vector>
+#include <curl/curl.h>
+
+#include "rgw_common.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+#ifdef WITH_CURL_OPENSSL
+#include <openssl/crypto.h>
+#endif
+
+#if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L
+namespace openssl {
+
+// Pre-OpenSSL-1.1.0 lock table: one std::mutex per CRYPTO lock id, driven
+// by rgw_ssl_locking_callback() below.
+class RGWSSLSetup
+{
+  std::vector <std::mutex> locks;
+public:
+  RGWSSLSetup(int n) : locks (n){}
+
+  // Lock the mutex for 'id'; an out-of-range id is logged, not fatal.
+  void set_lock(int id){
+    try {
+      locks.at(id).lock();
+    } catch (std::out_of_range& e) {
+      // leading space separates __func__ from the message in the log line
+      dout(0) << __func__ << " failed to set locks" << dendl;
+    }
+  }
+
+  // Unlock the mutex for 'id'; an out-of-range id is logged, not fatal.
+  void clear_lock(int id){
+    try {
+      locks.at(id).unlock();
+    } catch (std::out_of_range& e) {
+      dout(0) << __func__ << " failed to unlock" << dendl;
+    }
+  }
+};
+
+
+// OpenSSL (pre-1.1.0) locking callback: takes or releases the static lock
+// identified by 'id' depending on whether CRYPTO_LOCK is set in 'mode'.
+// 'file'/'line' are part of the callback signature and are unused here.
+void rgw_ssl_locking_callback(int mode, int id, const char *file, int line)
+{
+  static RGWSSLSetup locks(CRYPTO_num_locks());
+  if (mode & CRYPTO_LOCK)
+    locks.set_lock(id);
+  else
+    locks.clear_lock(id);
+}
+
+// OpenSSL thread-id callback: identify threads by their pthread id.
+unsigned long rgw_ssl_thread_id_callback(){
+  return (unsigned long)pthread_self();
+}
+
+// Register the locking/thread-id callbacks OpenSSL (< 1.1.0) needs for
+// thread safety. NOTE(review): the cast changes the id-callback's declared
+// signature — confirm it matches CRYPTO_set_id_callback's expected type.
+void init_ssl(){
+  CRYPTO_set_id_callback((unsigned long (*) ()) rgw_ssl_thread_id_callback);
+  CRYPTO_set_locking_callback(rgw_ssl_locking_callback);
+}
+
+} /* namespace openssl */
+#endif // WITH_CURL_OPENSSL
+
+
+namespace rgw {
+namespace curl {
+
+// Warn when libcurl lacks curl_multi_wait(); otherwise a no-op.
+static void check_curl()
+{
+#ifndef HAVE_CURL_MULTI_WAIT
+  derr << "WARNING: libcurl doesn't support curl_multi_wait()" << dendl;
+  derr << "WARNING: cross zone / region transfer performance may be affected" << dendl;
+#endif
+}
+
+#if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L
+// Thin wrapper so rgw::curl callers don't reference ::openssl directly.
+void init_ssl() {
+  ::openssl::init_ssl();
+}
+
+// Returns true when some frontend ("civetweb" or "beast") is configured
+// with an ssl_certificate — i.e. the frontend will initialize SSL itself —
+// and clears CURL_GLOBAL_SSL from curl_global_flags so curl_global_init()
+// does not initialize SSL a second time. 'm' may be boost::none.
+bool fe_inits_ssl(boost::optional <const fe_map_t&> m, long& curl_global_flags){
+  if (m) {
+    for (const auto& kv: *m){
+      if (kv.first == "civetweb" || kv.first == "beast"){
+        std::string cert;
+        kv.second->get_val("ssl_certificate","", &cert);
+        if (!cert.empty()){
+          /* TODO this flag is no op for curl > 7.57 */
+          curl_global_flags &= ~CURL_GLOBAL_SSL;
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+#endif // WITH_CURL_OPENSSL
+
+std::once_flag curl_init_flag;
+
+// One-time libcurl process setup: warn about missing curl_multi_wait(),
+// set up SSL locking unless a frontend handles SSL itself, run
+// curl_global_init() exactly once, and start the saved-handle pool.
+void setup_curl(boost::optional<const fe_map_t&> m) {
+  check_curl();
+
+  long curl_global_flags = CURL_GLOBAL_ALL;
+
+  #if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L
+  if (!fe_inits_ssl(m, curl_global_flags))
+    init_ssl();
+  #endif
+
+  // call_once guards against repeated global init from multiple callers.
+  std::call_once(curl_init_flag, curl_global_init, curl_global_flags);
+  rgw_setup_saved_curl_handles();
+}
+
+// Tear down in reverse order of setup_curl(): drain the handle pool
+// first, then run curl's global cleanup.
+void cleanup_curl() {
+  rgw_release_all_curl_handles();
+  curl_global_cleanup();
+}
+
+} /* namespace curl */
+} /* namespace rgw */
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 SUSE Linux GmBH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef RGW_HTTP_CLIENT_CURL_H
+#define RGW_HTTP_CLIENT_CURL_H
+
+#include <map>
+#include <boost/optional.hpp>
+#include "rgw_frontend.h"
+
+namespace rgw {
+namespace curl {
+// frontend name ("civetweb", "beast", ...) -> its parsed configuration
+using fe_map_t = std::multimap <std::string, RGWFrontendConfig *>;
+
+// Process-wide libcurl initialization. 'm' (may be boost::none) lets curl
+// skip SSL init when a listed frontend configures SSL itself.
+void setup_curl(boost::optional<const fe_map_t&> m);
+// Releases pooled handles and runs curl's global cleanup.
+void cleanup_curl();
+}
+}
+
+#endif
else
set_param_str(s, "HTTP_REFERER", entry.referrer);
- std::string uri(s->info.env->get("REQUEST_METHOD"));
- uri.append(" ");
-
- uri.append(s->info.env->get("REQUEST_URI"));
- const char* qs = s->info.env->get("QUERY_STRING");
- if(qs && (*qs != '\0')) {
- uri.append("?");
- uri.append(qs);
+ std::string uri;
+ if (s->info.env->exists("REQUEST_METHOD")) {
+ uri.append(s->info.env->get("REQUEST_METHOD"));
+ uri.append(" ");
}
- uri.append(" ");
- uri.append("HTTP/");
- uri.append(s->info.env->get("HTTP_VERSION"));
+ if (s->info.env->exists("REQUEST_URI")) {
+ uri.append(s->info.env->get("REQUEST_URI"));
+ }
+
+ if (s->info.env->exists("QUERY_STRING")) {
+ const char* qs = s->info.env->get("QUERY_STRING");
+ if(qs && (*qs != '\0')) {
+ uri.append("?");
+ uri.append(qs);
+ }
+ }
+
+ if (s->info.env->exists("HTTP_VERSION")) {
+ uri.append(" ");
+ uri.append("HTTP/");
+ uri.append(s->info.env->get("HTTP_VERSION"));
+ }
entry.uri = std::move(uri);
#include <errno.h>
#include <signal.h>
-#include <curl/curl.h>
-
#include <boost/intrusive_ptr.hpp>
#include "acconfig.h"
#include "rgw_request.h"
#include "rgw_process.h"
#include "rgw_frontend.h"
+#include "rgw_http_client_curl.h"
#if defined(WITH_RADOSGW_BEAST_FRONTEND)
#include "rgw_asio_frontend.h"
#endif /* WITH_RADOSGW_BEAST_FRONTEND */
_exit(0);
}
-#ifdef HAVE_CURL_MULTI_WAIT
-static void check_curl()
-{
-}
-#else
-static void check_curl()
-{
- derr << "WARNING: libcurl doesn't support curl_multi_wait()" << dendl;
- derr << "WARNING: cross zone / region transfer performance may be affected" << dendl;
-}
-#endif
class C_InitTimeout : public Context {
public:
static RGWRESTMgr *rest_filter(RGWRados *store, int dialect, RGWRESTMgr *orig)
{
  RGWSyncModuleInstanceRef sync_module = store->get_sync_module();
-  return sync_module->get_rest_filter(dialect, orig);
+  // A zone may have no sync module; dereferencing it unconditionally would
+  // crash, so fall back to the unfiltered REST manager in that case.
+  if (sync_module) {
+    return sync_module->get_rest_filter(dialect, orig);
+  } else {
+    return orig;
+  }
}
/*
g_conf->set_val_or_die("rgw_zonegroup", g_conf->rgw_region.c_str());
}
- check_curl();
-
if (g_conf->daemonize) {
global_init_daemonize(g_ceph_context);
}
}
rgw_init_resolver();
-
- curl_global_init(CURL_GLOBAL_ALL);
-
+ rgw::curl::setup_curl(fe_map);
+
#if defined(WITH_RADOSGW_FCGI_FRONTEND)
FCGX_Init();
#endif
std::string uri_prefix;
config->get_val("prefix", "", &uri_prefix);
RGWProcessEnv env{ store, &rest, olog, port, uri_prefix, auth_registry };
- fe = new RGWAsioFrontend(env);
+ fe = new RGWAsioFrontend(env, config);
}
#endif /* WITH_RADOSGW_BEAST_FRONTEND */
#if defined(WITH_RADOSGW_FCGI_FRONTEND)
rgw_tools_cleanup();
rgw_shutdown_resolver();
- curl_global_cleanup();
+ rgw::curl::cleanup_curl();
rgw_perf_stop(g_ceph_context);
}
}
+ if (op_ret == -ECANCELED) {
+ op_ret = 0;
+ }
if (op_ret == -ERR_PRECONDITION_FAILED && no_precondition_error) {
op_ret = 0;
}
const std::string& frontend_prefix,
const rgw_auth_registry_t& auth_registry,
RGWRestfulIO* const client_io,
- OpsLogSocket* const olog)
+ OpsLogSocket* const olog,
+ int* http_ret)
{
int ret = client_io->init(g_ceph_context);
rgw_log_op(store, rest, s, (op ? op->name() : "unknown"), olog);
}
- int http_ret = s->err.http_ret;
+ if (http_ret != nullptr) {
+ *http_ret = s->err.http_ret;
+ }
int op_ret = 0;
if (op) {
op_ret = op->get_ret();
}
req->log_format(s, "op status=%d", op_ret);
- req->log_format(s, "http status=%d", http_ret);
+ req->log_format(s, "http status=%d", s->err.http_ret);
if (handler)
handler->put_op(op);
dout(1) << "====== req done req=" << hex << req << dec
<< " op status=" << op_ret
- << " http_status=" << http_ret
+ << " http_status=" << s->err.http_ret
<< " ======"
<< dendl;
const std::string& frontend_prefix,
const rgw_auth_registry_t& auth_registry,
RGWRestfulIO* client_io,
- OpsLogSocket* olog);
+ OpsLogSocket* olog,
+ int* http_ret = nullptr);
extern int rgw_process_authenticated(RGWHandler_REST* handler,
RGWOp*& op,
ldout(cct, 0)
<< __func__
<< " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
- << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
+ << " (this can be due to a pool or placement group misconfiguration, e.g."
+ << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
<< dendl;
}
if (ret < 0)
if (ret < 0) {
return ret;
}
+ if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
+    // The current implementation does not follow the S3 spec and may even
+    // result in silent data corruption when copying multipart objects
+    // across pools. So reject COPY operations on encrypted objects until
+    // this is fully functional.
+ ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
+ << " has not been implemented." << dendl;
+ return -ERR_NOT_IMPLEMENTED;
+ }
src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
src_attrs.erase(RGW_ATTR_DELETE_AT);
store->remove_rgw_head_obj(op);
r = ref.ioctx.operate(ref.oid, &op);
- bool need_invalidate = false;
- if (r == -ECANCELED) {
- /* raced with another operation, we can regard it as removed */
- need_invalidate = true;
- r = 0;
- }
+
+ /* raced with another operation, object state is indeterminate */
+ const bool need_invalidate = (r == -ECANCELED);
int64_t poolid = ref.ioctx.get_id();
if (r >= 0) {
return 0;
}
+// Reset the stats held in the user's buckets object via the user objclass.
+int RGWRados::cls_user_reset_stats(const string& user_id)
+{
+  string user_buckets_oid;
+  rgw_get_buckets_obj(user_id, user_buckets_oid);
+
+  rgw_rados_ref ref;
+  rgw_raw_obj stats_obj(get_zone_params().user_uid_pool, user_buckets_oid);
+  const int ret = get_raw_obj_ref(stats_obj, &ref);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Issue the reset as a single write op against that object.
+  librados::ObjectWriteOperation op;
+  ::cls_user_reset_stats(op);
+  return ref.ioctx.operate(ref.oid, &op);
+}
+
int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
{
string buckets_obj_id;
int fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix);
int cls_user_get_header(const string& user_id, cls_user_header *header);
+ int cls_user_reset_stats(const string& user_id);
int cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx);
int cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info);
int cls_user_list_buckets(rgw_raw_obj& obj,
return ::create_new_bucket_instance(store, new_num_shards, bucket_info, bucket_attrs, new_bucket_info);
}
+// Cancel an in-progress reshard of this bucket. Takes the bucket reshard
+// lock, clears the resharding state, and releases the lock.
+int RGWBucketReshard::cancel()
+{
+  int ret = lock_bucket();
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = clear_resharding();
+
+  unlock_bucket();
+  // Propagate a clear_resharding() failure instead of always returning 0;
+  // silently reporting success here would leave stale reshard state behind.
+  return ret;
+}
+
class BucketInfoReshardUpdate
{
RGWRados *store;
int ret = cls_rgw_reshard_get(store->reshard_pool_ctx, logshard_oid, entry);
if (ret < 0) {
- lderr(store->ctx()) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+ if (ret != -ENOENT) {
+ lderr(store->ctx()) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant <<
+ " bucket=" << entry.bucket_name << dendl;
+ }
return ret;
}
RGWReshard *reshard_log = nullptr);
int abort();
int get_status(std::list<cls_rgw_bucket_instance_entry> *status);
+ int cancel();
};
class RGWReshard {
{
delimiter = s->info.args.get("delimiter");
prefix = s->info.args.get("prefix");
- string str = s->info.args.get("max-parts");
+ string str = s->info.args.get("max-uploads");
if (!str.empty())
max_uploads = atoi(str.c_str());
else
http_ret = RGWBucketAdminOp::remove_bucket(store, op_state);
}
+// Admin REST op that sets quota limits on a bucket.  Dispatched from
+// RGWHandler_Bucket::op_put() when the "quota" sub-resource is present.
+// Requires RGW_CAP_WRITE on the "buckets" capability.
+class RGWOp_Set_Bucket_Quota : public RGWRESTOp {
+
+public:
+ RGWOp_Set_Bucket_Quota() {}
+
+ int check_caps(RGWUserCaps& caps) {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute();
+
+ virtual const string name() { return "set_bucket_quota"; }
+};
+
+#define QUOTA_INPUT_MAX_LEN 1024
+
+// Set a bucket's quota.  Both "uid" and "bucket" request args are mandatory.
+// The quota itself is taken either from a JSON request body or, when no
+// usable body is present, from individual HTTP query parameters
+// (max-objects / max-size-kb / enabled), defaulting each field to the
+// bucket's current quota value.
+void RGWOp_Set_Bucket_Quota::execute()
+{
+ bool uid_arg_existed = false;
+ std::string uid_str;
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed);
+ if (! uid_arg_existed) {
+ http_ret = -EINVAL;
+ return;
+ }
+ rgw_user uid(uid_str);
+ bool bucket_arg_existed = false;
+ std::string bucket;
+ RESTArgs::get_string(s, "bucket", bucket, &bucket, &bucket_arg_existed);
+ if (! bucket_arg_existed) {
+ http_ret = -EINVAL;
+ return;
+ }
+
+ bool use_http_params;
+
+ // a non-zero content length means a JSON body was supplied; with chunked
+ // transfer encoding the length is unknown, so try the body first and fall
+ // back to query params if it turns out to be empty
+ if (s->content_length > 0) {
+ use_http_params = false;
+ } else {
+ const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+ use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+ }
+ RGWQuotaInfo quota;
+ if (!use_http_params) {
+ bool empty;
+ http_ret = rgw_rest_get_json_input(store->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+ if (http_ret < 0) {
+ if (!empty)
+ return;
+ /* was probably chunked input, but no content provided, configure via http params */
+ use_http_params = true;
+ }
+ }
+ if (use_http_params) {
+ // fetch the current bucket quota so unspecified params keep their old values
+ RGWBucketInfo bucket_info;
+ map<string, bufferlist> attrs;
+ RGWObjectCtx obj_ctx(store);
+ http_ret = store->get_bucket_info(obj_ctx, uid.tenant, bucket, bucket_info, NULL, &attrs);
+ if (http_ret < 0) {
+ return;
+ }
+ RGWQuotaInfo *old_quota = &bucket_info.quota;
+ int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
+ int64_t max_size_kb;
+ RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+ RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb);
+ quota.max_size = max_size_kb * 1024;
+ RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+ }
+
+ RGWBucketAdminOpState op_state;
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket);
+ op_state.set_quota(quota);
+
+ http_ret = RGWBucketAdminOp::set_quota(store, op_state);
+}
+
class RGWOp_Object_Remove: public RGWRESTOp {
public:
RGWOp *RGWHandler_Bucket::op_put()
{
+ if (s->info.args.sub_resource_exists("quota"))
+ return new RGWOp_Set_Bucket_Quota;
return new RGWOp_Bucket_Link;
}
dump_errno(s);
end_header(s, this, "application/xml");
+ dump_start(s);
if (op_ret == 0) {
s->formatter->open_object_section_in_ns("CopyObjectResult", XMLNS_AWS_S3);
}
return state->exists;
}
+// Handler init: capture the as-requested object name before any website
+// routing rewrites it, then defer to the base S3 handler's init.
+int RGWHandler_REST_S3Website::init(RGWRados *store, req_state *s,
+ rgw::io::BasicClient* cio)
+{
+ // save the original object name before retarget() replaces it with the
+ // result of get_effective_key(). the error_handler() needs the original
+ // object name for redirect handling
+ original_object_name = s->object.name;
+
+ return RGWHandler_REST_S3::init(store, s, cio);
+}
+
int RGWHandler_REST_S3Website::retarget(RGWOp* op, RGWOp** new_op) {
*new_op = op;
ldout(s->cct, 10) << __func__ << "Starting retarget" << dendl;
RGWBWRoutingRule rrule;
bool should_redirect =
- s->bucket_info.website_conf.should_redirect(s->object.name, http_error_code,
- &rrule);
+ s->bucket_info.website_conf.should_redirect(original_object_name,
+ http_error_code, &rrule);
if (should_redirect) {
const string& hostname = s->info.env->get("HTTP_HOST", "");
const string& protocol =
(s->info.env->get("SERVER_PORT_SECURE") ? "https" : "http");
int redirect_code = 0;
- rrule.apply_rule(protocol, hostname, s->object.name, &s->redirect,
- &redirect_code);
+ rrule.apply_rule(protocol, hostname, original_object_name,
+ &s->redirect, &redirect_code);
// Apply a custom HTTP response code
if (redirect_code > 0)
s->err.http_ret = redirect_code; // Apply a custom HTTP response code
namespace auth {
namespace s3 {
-bool AWSGeneralAbstractor::is_time_skew_ok(const utime_t& header_time,
- const bool qsr) const
-{
- /* Check for time skew first. */
- const time_t req_sec = header_time.sec();
- time_t now;
- time(&now);
-
- if ((req_sec < now - RGW_AUTH_GRACE_MINS * 60 ||
- req_sec > now + RGW_AUTH_GRACE_MINS * 60) && !qsr) {
- ldout(cct, 10) << "req_sec=" << req_sec << " now=" << now
- << "; now - RGW_AUTH_GRACE_MINS="
- << now - RGW_AUTH_GRACE_MINS * 60
- << "; now + RGW_AUTH_GRACE_MINS="
- << now + RGW_AUTH_GRACE_MINS * 60
- << dendl;
-
- ldout(cct, 0) << "NOTICE: request time skew too big now="
- << utime_t(now, 0)
- << " req_time=" << header_time
- << dendl;
- return false;
- } else {
- return true;
- }
-}
-
-
static rgw::auth::Completer::cmplptr_t
null_completer_factory(const boost::optional<std::string>& secret_key)
{
qsr = true;
boost::string_view expires = s->info.args.get("Expires");
- if (! expires.empty()) {
- /* It looks we have the guarantee that expires is a null-terminated,
- * and thus string_view::data() can be safely used. */
- const time_t exp = atoll(expires.data());
- time_t now;
- time(&now);
-
- if (now >= exp) {
- throw -EPERM;
- }
+ if (expires.empty()) {
+ throw -EPERM;
+ }
+
+ /* It looks we have the guarantee that expires is a null-terminated,
+ * and thus string_view::data() can be safely used. */
+ const time_t exp = atoll(expires.data());
+ time_t now;
+ time(&now);
+
+ if (now >= exp) {
+ throw -EPERM;
}
} else {
/* The "Authorization" HTTP header is being used. */
ldout(cct, 10) << "string_to_sign:\n"
<< rgw::crypt_sanitize::auth{s,string_to_sign} << dendl;
- if (! is_time_skew_ok(header_time, qsr)) {
+ if (!qsr && !is_time_skew_ok(header_time)) {
throw -ERR_REQUEST_TIME_SKEWED;
}
#include "rgw_auth.h"
#include "rgw_auth_filters.h"
-#define RGW_AUTH_GRACE_MINS 15
-
struct rgw_http_error {
int http_ret;
const char *s3_code;
class AWSGeneralAbstractor : public AWSEngine::VersionAbstractor {
CephContext* const cct;
- bool is_time_skew_ok(const utime_t& header_time,
- const bool qsr) const;
-
virtual boost::optional<std::string>
get_v4_canonical_headers(const req_info& info,
const boost::string_view& signedheaders,
#include "rgw_rest_s3.h"
class RGWHandler_REST_S3Website : public RGWHandler_REST_S3 {
+ std::string original_object_name; // object name before retarget()
bool web_dir() const;
protected:
int retarget(RGWOp *op, RGWOp **new_op) override;
public:
using RGWHandler_REST_S3::RGWHandler_REST_S3;
~RGWHandler_REST_S3Website() override = default;
+
+ int init(RGWRados *store, req_state *s, rgw::io::BasicClient* cio) override;
int error_handler(int err_no, string *error_content) override;
};
bool suspended;
bool system;
+ bool quota_set;
int32_t max_buckets;
RGWUserAdminOpState op_state;
RESTArgs::get_string(s, "user-caps", caps, &caps);
RESTArgs::get_bool(s, "generate-key", false, &gen_key);
RESTArgs::get_bool(s, "suspended", false, &suspended);
- RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets);
+ RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets, "a_set);
RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
RESTArgs::get_bool(s, "system", false, &system);
op_state.set_access_key(access_key);
op_state.set_secret_key(secret_key);
- if (max_buckets != RGW_DEFAULT_MAX_BUCKETS)
+ if (quota_set)
op_state.set_max_buckets(max_buckets);
if (gen_key)
endif()
endif()
+# Probe whether the compiler can build SSE4.2 intrinsics (_mm_crc32_u32),
+# so the hardware CRC32 instruction can be used in util/crc32c.cc.
+include(CheckCXXSourceCompiles)
+if(NOT MSVC)
+ set(CMAKE_REQUIRED_FLAGS "-msse4.2")
+endif()
+CHECK_CXX_SOURCE_COMPILES("
+#include <cstdint>
+#include <nmmintrin.h>
+int main() {
+ volatile uint32_t x = _mm_crc32_u32(0, 0);
+}
+" HAVE_SSE42)
+unset(CMAKE_REQUIRED_FLAGS)
+# NOTE(review): guard is `NOT WITH_SSE42` -- presumably WITH_SSE42 means the
+# flag is already applied build-wide, so it is only added per-file when it
+# is not; confirm against the surrounding build options.
+if(HAVE_SSE42 AND NOT WITH_SSE42)
+set_source_files_properties(
+ util/crc32c.cc
+ PROPERTIES COMPILE_FLAGS "-msse4.2")
+endif()
+
option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON)
if(FAIL_ON_WARNINGS)
if(MSVC)
table0_[c >> 24];
}
-#if defined(HAVE_SSE42) && defined(__GNUC__)
-#if defined(__clang__)
-#if __has_cpp_attribute(gnu::target)
-__attribute__ ((target ("sse4.2")))
-#endif
-#else // gcc supports this since 4.4
-__attribute__ ((target ("sse4.2")))
-#endif
-#endif
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
#ifdef __SSE4_2__
#ifdef __LP64__
- $ ceph-authtool kring --create-keyring
+ $ ceph-authtool kring --create-keyring --mode 0644
creating kring
$ ceph-authtool kring --add-key 'FAKEBASE64 foo'
- $ ceph-authtool kring --create-keyring
+ $ ceph-authtool kring --create-keyring --mode 0644
creating kring
$ ceph-authtool kring --add-key 'AQAK7yxNeF+nHBAA0SgSdbs8IkJrxroDeJ6SwQ== 18446744073709551615'
- $ ceph-authtool kring --create-keyring --gen-key
+ $ ceph-authtool kring --create-keyring --gen-key --mode 0644
creating kring
$ ceph-authtool --cap osd 'allow rx pool=swimming' kring
- $ ceph-authtool kring --create-keyring --gen-key
+ $ ceph-authtool kring --create-keyring --gen-key --mode 0644
creating kring
# TODO is this nice?
- $ ceph-authtool kring --create-keyring --gen-key
+ $ ceph-authtool kring --create-keyring --gen-key --mode 0644
creating kring
$ ceph-authtool --cap osd 'allow rx pool=swimming' kring
- $ ceph-authtool kring --create-keyring --gen-key
+ $ ceph-authtool kring --create-keyring --gen-key --mode 0644
creating kring
$ ceph-authtool --cap osd 'allow rx pool=swimming' kring
- $ ceph-authtool kring --create-keyring
+ $ ceph-authtool kring --create-keyring --mode 0600
creating kring
$ ceph-authtool kring --list
$ ceph-authtool kring -l
[client.admin]
\\tkey = [a-zA-Z0-9+/]+=* \(esc\) (re)
-
- $ ceph-authtool kring --create-keyring
+ $ ceph-authtool kring --create-keyring --mode 0644
creating kring
$ ceph-authtool kring --list
--cap SUBSYSTEM CAPABILITY will set the capability for given subsystem
--caps CAPSFILE will set all of capabilities associated with a
given key, for all subsystems
+ --mode MODE will set the desired file mode to the keyring
+ e.g: '0644', defaults to '0600'
[1]
--cap SUBSYSTEM CAPABILITY will set the capability for given subsystem
--caps CAPSFILE will set all of capabilities associated with a
given key, for all subsystems
+ --mode MODE will set the desired file mode to the keyring
+ e.g: '0644', defaults to '0600'
[1]
# demonstrate that manpage examples fail without config
--cap SUBSYSTEM CAPABILITY will set the capability for given subsystem
--caps CAPSFILE will set all of capabilities associated with a
given key, for all subsystems
+ --mode MODE will set the desired file mode to the keyring
+ e.g: '0644', defaults to '0600'
[1]
--- /dev/null
+ $ crushtool -i "$TESTDIR/simple.template" --add-bucket host0 host --loc cluster cluster0 -o map0 > /dev/null
+ $ crushtool -i map0 --add-bucket host1 host -o map1 > /dev/null
+ $ crushtool -i map1 --move host1 --loc cluster cluster0 -o map2 > /dev/null
+ $ crushtool -i map2 --add-item 1 1.0 device1 --loc cluster cluster0 -o map3 > /dev/null
+ $ crushtool -i map3 --move device1 --loc host host0 -o map4 > /dev/null
+ $ crushtool -d map4
+ # begin crush map
+
+ # devices
+ device 1 device1
+
+ # types
+ type 0 device
+ type 1 host
+ type 2 cluster
+
+ # buckets
+ host host0 {
+ \tid -2\t\t# do not change unnecessarily (esc)
+ \t# weight 1.000 (esc)
+ \talg straw (esc)
+ \thash 0\t# rjenkins1 (esc)
+ \titem device1 weight 1.000 (esc)
+ }
+ host host1 {
+ \tid -3\t\t# do not change unnecessarily (esc)
+ \t# weight 0.000 (esc)
+ \talg straw (esc)
+ \thash 0\t# rjenkins1 (esc)
+ }
+ cluster cluster0 {
+ \tid -1\t\t# do not change unnecessarily (esc)
+ \t# weight 1.000 (esc)
+ \talg straw (esc)
+ \thash 0\t# rjenkins1 (esc)
+ \titem host0 weight 1.000 (esc)
+ \titem host1 weight 0.000 (esc)
+ }
+
+ # rules
+ rule data {
+ \tid 0 (esc)
+ \ttype replicated (esc)
+ \tmin_size 1 (esc)
+ \tmax_size 10 (esc)
+ \tstep take cluster0 (esc)
+ \tstep chooseleaf firstn 0 type host (esc)
+ \tstep emit (esc)
+ }
+ rule metadata {
+ \tid 1 (esc)
+ \ttype replicated (esc)
+ \tmin_size 1 (esc)
+ \tmax_size 10 (esc)
+ \tstep take cluster0 (esc)
+ \tstep chooseleaf firstn 0 type host (esc)
+ \tstep emit (esc)
+ }
+ rule rbd {
+ \tid 2 (esc)
+ \ttype replicated (esc)
+ \tmin_size 1 (esc)
+ \tmax_size 10 (esc)
+ \tstep take cluster0 (esc)
+ \tstep chooseleaf firstn 0 type host (esc)
+ \tstep emit (esc)
+ }
+
+ # end crush map
-i mapfn --reweight-item name weight
reweight a given item (and adjust ancestor
weights as needed)
+ -i mapfn --add-bucket name type [--loc type name ...]
+ insert a bucket into the hierachy at the given
+ location
+ -i mapfn --move name --loc type name ...
+ move the given item to specified location
-i mapfn --reweight recalculate all bucket weights
-i mapfn --create-simple-rule name root type mode
create crush rule <name> to start from <root>,
reshard list list all bucket resharding or scheduled to be reshared
reshard process process of scheduled reshard jobs
reshard cancel cancel resharding a bucket
+ sync error list list sync error
+ sync error trim trim sync error
options:
--tenant=<tenant> tenant name
--uid=<id> user id
(NOTE: required to delete a non-empty bucket)
--sync-stats option to 'user stats', update user stats with current
stats reported by user's buckets indexes
+ --reset-stats option to 'user stats', reset stats in accordance with user buckets
--show-log-entries=<flag> enable/disable dump of log entries on log show
--show-log-sum=<flag> enable/disable dump of log summation on log show
--skip-zero-entries log show only dumps entries that don't have zero value
--path-prefix path prefix for filtering roles
--conf/-c FILE read configuration from the given configuration file
- --id/-i ID set ID portion of my name
+ --id ID set ID portion of my name
--name/-n TYPE.ID set name
--cluster NAME set cluster name (default: ceph)
--setuser USER set uid to user or uid (and gid to user's gid)
*/
#include <gtest/gtest.h>
+#include <boost/container/flat_map.hpp>
#include "include/interval_set.h"
-#include "include/btree_interval_set.h"
+#include "include/btree_map.h"
using namespace ceph;
typedef T ISet;
};
-typedef ::testing::Types< interval_set<IntervalValueType> , btree_interval_set<IntervalValueType> > IntervalSetTypes;
+typedef ::testing::Types<
+ interval_set<IntervalValueType>,
+ interval_set<IntervalValueType,
+ btree::btree_map<IntervalValueType,IntervalValueType>>,
+ interval_set<IntervalValueType,
+ boost::container::flat_map<IntervalValueType,IntervalValueType>>
+ > IntervalSetTypes;
TYPED_TEST_CASE(IntervalSetTest, IntervalSetTypes);
iset1.insert( 24, 1);
ASSERT_FALSE(iset1.subset_of(iset2));
+
+ iset2.insert( 24, 1);
+ ASSERT_TRUE(iset1.subset_of(iset2));
+
+ iset1.insert( 30, 5);
+ ASSERT_FALSE(iset1.subset_of(iset2));
+
+ iset2.insert( 30, 5);
+ ASSERT_TRUE(iset1.subset_of(iset2));
+
+ iset2.erase( 30, 1);
+ ASSERT_FALSE(iset1.subset_of(iset2));
+
+ iset1.erase( 30, 1);
+ ASSERT_TRUE(iset1.subset_of(iset2));
+
+ iset2.erase( 34, 1);
+ ASSERT_FALSE(iset1.subset_of(iset2));
+
+ iset1.erase( 34, 1);
+ ASSERT_TRUE(iset1.subset_of(iset2));
+
+ iset1.insert( 40, 5);
+ ASSERT_FALSE(iset1.subset_of(iset2));
+
+ iset2.insert( 39, 7);
+ ASSERT_TRUE(iset1.subset_of(iset2));
+
+ iset1.insert( 50, 5);
+ iset2.insert( 55, 2);
+ ASSERT_FALSE(iset1.subset_of(iset2));
+
+ ISet iset3, iset4, expected;
+ iset3.insert(5, 10);
+ iset3.insert(20, 5);
+
+ iset4.subset_of(iset3, 0, 100);
+ expected.insert(5, 10);
+ expected.insert(20, 5);
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 5, 25);
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 1, 10);
+ expected.clear();
+ expected.insert(5, 5);
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 8, 24);
+ expected.clear();
+ expected.insert(8, 7);
+ expected.insert(20, 4);
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 0, 0);
+ expected.clear();
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 0, 1);
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 0, 5);
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 25, 30);
+ ASSERT_TRUE(iset4 == expected);
+
+ iset4.clear();
+ iset4.subset_of(iset3, 26, 40);
+ ASSERT_TRUE(iset4 == expected);
}
TYPED_TEST(IntervalSetTest, span_of) {
iset1.span_of( iset2, 100, 5 );
ASSERT_EQ( iset1.num_intervals(), 0);
ASSERT_EQ( iset1.size(), 0);
-}
\ No newline at end of file
+}
collect_sys_info(&sys_info, cct);
ASSERT_TRUE(sys_info.find("distro") != sys_info.end());
- ASSERT_TRUE(sys_info.find("distro_version") != sys_info.end());
ASSERT_TRUE(sys_info.find("distro_description") != sys_info.end());
cct->put();
numtests=0
echo "checking ceph-dencoder generated test instances..."
echo "numgen type"
-for type in `ceph-dencoder list_types`; do
+ceph-dencoder list_types | while read type; do
num=`ceph-dencoder type $type count_tests`
echo "$num $type"
for n in `seq 1 1 $num 2>/dev/null`; do
- safe_type=$type
- # BitVector<2> needs some escaping to avoid bash issues with <>
- if [ "$type" = "BitVector<2>" ]; then
- safe_type="BitVector\<2\>"
- fi
pids=""
- run_in_background pids bash -c "ceph-dencoder type $safe_type select_test $n dump_json > $tmp1"
- run_in_background pids bash -c "ceph-dencoder type $safe_type select_test $n encode decode dump_json > $tmp2"
- run_in_background pids bash -c "ceph-dencoder type $safe_type select_test $n copy dump_json > $tmp3"
- run_in_background pids bash -c "ceph-dencoder type $safe_type select_test $n copy_ctor dump_json > $tmp4"
+ run_in_background pids save_stdout "$tmp1" ceph-dencoder type "$type" select_test "$n" dump_json
+ run_in_background pids save_stdout "$tmp2" ceph-dencoder type "$type" select_test "$n" encode decode dump_json
+ run_in_background pids save_stdout "$tmp3" ceph-dencoder type "$type" select_test "$n" copy dump_json
+ run_in_background pids save_stdout "$tmp4" ceph-dencoder type "$type" select_test "$n" copy_ctor dump_json
wait_background pids
if [ $? -ne 0 ]; then
# the sorted json output. this is a weaker test, but is better
# than nothing.
deterministic=0
- if ceph-dencoder type $type is_deterministic; then
+ if ceph-dencoder type "$type" is_deterministic; then
deterministic=1
fi
fi
if [ $deterministic -ne 0 ]; then
- run_in_background pids bash -c "ceph-dencoder type $safe_type select_test $n encode export $tmp1"
- run_in_background pids bash -c "ceph-dencoder type $safe_type select_test $n encode decode encode export $tmp2"
+ run_in_background pids ceph-dencoder type "$type" select_test $n encode export "$tmp1"
+ run_in_background pids ceph-dencoder type "$type" select_test $n encode decode encode export "$tmp2"
wait_background pids
if ! cmp $tmp1 $tmp2; then
TYPE(frag_info_t)
TYPE(nest_info_t)
TYPE(client_writeable_range_t)
-TYPE_FEATUREFUL(inode_t)
-TYPE_FEATUREFUL(old_inode_t)
+TYPE_FEATUREFUL(inode_t<std::allocator>)
+TYPE_FEATUREFUL(old_inode_t<std::allocator>)
TYPE(fnode_t)
TYPE(old_rstat_t)
TYPE_FEATUREFUL(session_info_t)
TEST(inode_t, compare_equal)
{
- inode_t foo;
- inode_t bar;
+ inode_t<> foo;
+ inode_t<> bar;
int compare_r;
bool divergent;
compare_r = foo.compare(bar, &divergent);
TEST(inode_t, compare_aged)
{
- inode_t foo;
- inode_t bar;
+ inode_t<> foo;
+ inode_t<> bar;
foo.ino = 1234;
foo.ctime.set_from_double(10.0);
TEST(inode_t, compare_divergent)
{
- inode_t foo;
- inode_t bar;
+ inode_t<> foo;
+ inode_t<> bar;
foo.ino = 1234;
foo.ctime.set_from_double(10.0);
#include <thread>
#include <atomic>
+/* in ms -- 1 minute */
+#define MAX_WAIT (60 * 1000)
+
+// Poll the flag every 1ms until it becomes true, failing the test via
+// ASSERT_LT once MAX_WAIT iterations (~1 minute at 1ms per poll) elapse,
+// so a missed delegation recall cannot hang the test forever.
+static void wait_for_atomic_bool(std::atomic_bool &recalled)
+{
+ int i = 0;
+
+ while (!recalled.load()) {
+ ASSERT_LT(i++, MAX_WAIT);
+ usleep(1000);
+ }
+}
+
static int set_default_deleg_timeout(struct ceph_mount_info *cmount)
{
uint32_t session_timeout = ceph_get_cap_return_timeout(cmount);
struct ceph_statx stx;
UserPerm *perms = ceph_mount_perms(cmount);
- ASSERT_EQ(ceph_ll_lookup(cmount, root, filename, &file, &stx, 0, 0, perms), 0);
- int ret;
+ ASSERT_EQ(ceph_ll_lookup(cmount, root, filename, &file, &stx, CEPH_STATX_ALL_STATS, 0, perms), 0);
+ int ret, i = 0;
for (;;) {
+ ASSERT_EQ(ceph_ll_getattr(cmount, file, &stx, CEPH_STATX_ALL_STATS, 0, perms), 0);
ret = ceph_ll_open(cmount, file, flags, &fh, perms);
if (ret != -EAGAIN)
break;
+ ASSERT_LT(i++, MAX_WAIT);
usleep(1000);
}
ASSERT_EQ(ret, 0);
struct ceph_statx stx;
UserPerm *perms = ceph_mount_perms(cmount);
- int ret;
+ int ret, i = 0;
for (;;) {
switch (cmd) {
case DelegTestRename:
}
if (ret != -EAGAIN)
break;
+ ASSERT_LT(i++, MAX_WAIT);
usleep(1000);
}
ASSERT_EQ(ret, 0);
O_RDWR|O_CREAT|O_EXCL, &file, &fh, &stx, 0, 0, perms), 0);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_WR, dummy_deleg_cb, &recalled), 0);
std::thread breaker1(open_breaker_func, tcmount, filename, O_RDWR, &opened);
- while (!recalled.load())
- usleep(1000);
+
+ wait_for_atomic_bool(recalled);
ASSERT_EQ(opened.load(), false);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_NONE, dummy_deleg_cb, &recalled), 0);
breaker1.join();
O_RDWR|O_CREAT|O_EXCL, &file, &fh, &stx, 0, 0, perms), 0);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_WR, dummy_deleg_cb, &recalled), 0);
std::thread breaker2(open_breaker_func, tcmount, filename, O_RDONLY, &opened);
- while (!recalled.load())
- usleep(1000);
+ wait_for_atomic_bool(recalled);
ASSERT_EQ(opened.load(), false);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_NONE, dummy_deleg_cb, &recalled), 0);
breaker2.join();
// ensure that r/w open breaks r/o delegation
opened.store(false);
std::thread breaker4(open_breaker_func, tcmount, filename, O_WRONLY, &opened);
- while (!recalled.load())
- usleep(1000);
+ wait_for_atomic_bool(recalled);
usleep(1000);
ASSERT_EQ(opened.load(), false);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_NONE, dummy_deleg_cb, &recalled), 0);
O_RDWR|O_CREAT|O_EXCL, &file, &fh, &stx, 0, 0, perms), 0);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_WR, dummy_deleg_cb, &recalled), 0);
std::thread breaker5(namespace_breaker_func, tcmount, DelegTestLink, filename, newname);
- while (!recalled.load())
- usleep(1000);
+ wait_for_atomic_bool(recalled);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_NONE, dummy_deleg_cb, &recalled), 0);
breaker5.join();
ASSERT_EQ(ceph_ll_close(cmount, fh), 0);
O_RDWR|O_CREAT|O_EXCL, &file, &fh, &stx, 0, 0, perms), 0);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_WR, dummy_deleg_cb, &recalled), 0);
std::thread breaker6(namespace_breaker_func, tcmount, DelegTestRename, filename, newname);
- while (!recalled.load())
- usleep(1000);
+ wait_for_atomic_bool(recalled);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_NONE, dummy_deleg_cb, &recalled), 0);
breaker6.join();
ASSERT_EQ(ceph_ll_close(cmount, fh), 0);
O_RDWR|O_CREAT|O_EXCL, &file, &fh, &stx, 0, 0, perms), 0);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_WR, dummy_deleg_cb, &recalled), 0);
std::thread breaker7(namespace_breaker_func, tcmount, DelegTestUnlink, filename, nullptr);
- while (!recalled.load())
- usleep(1000);
+ wait_for_atomic_bool(recalled);
ASSERT_EQ(ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_NONE, dummy_deleg_cb, &recalled), 0);
breaker7.join();
ASSERT_EQ(ceph_ll_close(cmount, fh), 0);
ASSERT_EQ(ceph_ll_getattr(cmount, root, &stx, 0, 0, perms), -ENOTCONN);
ceph_release(cmount);
}
+
+// Regression test: while a read delegation is being recalled (triggered by a
+// conflicting open from another mount), getattr calls on the delegated inode
+// must keep succeeding rather than deadlock or error out.
+TEST(LibCephFS, RecalledGetattr) {
+ struct ceph_mount_info *cmount1;
+ ASSERT_EQ(ceph_create(&cmount1, NULL), 0);
+ ASSERT_EQ(ceph_conf_read_file(cmount1, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount1, NULL));
+ ASSERT_EQ(ceph_mount(cmount1, "/"), 0);
+ ASSERT_EQ(set_default_deleg_timeout(cmount1), 0);
+
+ Inode *root, *file;
+ ASSERT_EQ(ceph_ll_lookup_root(cmount1, &root), 0);
+
+ // per-process filename so concurrent test runs don't collide
+ char filename[32];
+ sprintf(filename, "recalledgetattr%x", getpid());
+
+ Fh *fh;
+ struct ceph_statx stx;
+ UserPerm *perms = ceph_mount_perms(cmount1);
+
+ ASSERT_EQ(ceph_ll_create(cmount1, root, filename, 0666,
+ O_RDWR|O_CREAT|O_EXCL, &file, &fh, &stx, 0, 0, perms), 0);
+ ASSERT_EQ(ceph_ll_write(cmount1, fh, 0, sizeof(filename), filename), sizeof(filename));
+ ASSERT_EQ(ceph_ll_close(cmount1, fh), 0);
+
+ /* New mount for read delegation */
+ struct ceph_mount_info *cmount2;
+ ASSERT_EQ(ceph_create(&cmount2, NULL), 0);
+ ASSERT_EQ(ceph_conf_read_file(cmount2, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount2, NULL));
+ ASSERT_EQ(ceph_mount(cmount2, "/"), 0);
+ ASSERT_EQ(set_default_deleg_timeout(cmount2), 0);
+
+ ASSERT_EQ(ceph_ll_lookup_root(cmount2, &root), 0);
+ perms = ceph_mount_perms(cmount2);
+ ASSERT_EQ(ceph_ll_lookup(cmount2, root, filename, &file, &stx, 0, 0, perms), 0);
+
+ ASSERT_EQ(ceph_ll_open(cmount2, file, O_WRONLY, &fh, perms), 0);
+ ASSERT_EQ(ceph_ll_write(cmount2, fh, 0, sizeof(filename), filename), sizeof(filename));
+ ASSERT_EQ(ceph_ll_close(cmount2, fh), 0);
+
+ ASSERT_EQ(ceph_ll_open(cmount2, file, O_RDONLY, &fh, perms), 0);
+
+ /* Break delegation */
+ std::atomic_bool recalled(false);
+ ASSERT_EQ(ceph_ll_delegation(cmount2, fh, CEPH_DELEGATION_RD, dummy_deleg_cb, &recalled), 0);
+ ASSERT_EQ(ceph_ll_read(cmount2, fh, 0, sizeof(filename), filename), sizeof(filename));
+ ASSERT_EQ(ceph_ll_getattr(cmount2, file, &stx, CEPH_STATX_ALL_STATS, 0, perms), 0);
+ std::atomic_bool opened(false);
+ // conflicting O_WRONLY open from the other mount forces a recall of
+ // cmount2's read delegation; keep issuing getattrs while it is pending
+ std::thread breaker1(open_breaker_func, cmount1, filename, O_WRONLY, &opened);
+ int i = 0;
+ do {
+ ASSERT_EQ(ceph_ll_getattr(cmount2, file, &stx, CEPH_STATX_ALL_STATS, 0, perms), 0);
+ ASSERT_LT(i++, MAX_WAIT);
+ usleep(1000);
+ } while (!recalled.load());
+ // the breaker's open must still be blocked until the delegation is returned
+ ASSERT_EQ(opened.load(), false);
+ ASSERT_EQ(ceph_ll_getattr(cmount2, file, &stx, CEPH_STATX_ALL_STATS, 0, perms), 0);
+ ASSERT_EQ(ceph_ll_delegation(cmount2, fh, CEPH_DELEGATION_NONE, dummy_deleg_cb, nullptr), 0);
+ breaker1.join();
+ ASSERT_EQ(ceph_ll_close(cmount2, fh), 0);
+ ceph_unmount(cmount2);
+ ceph_release(cmount2);
+ ceph_unmount(cmount1);
+ ceph_release(cmount1);
+}
// test getdents
struct dirent *getdents_entries;
- getdents_entries = (struct dirent *)malloc((r + 2) * sizeof(*getdents_entries));
+ size_t getdents_entries_len = (r + 2) * sizeof(*getdents_entries);
+ getdents_entries = (struct dirent *)malloc(getdents_entries_len);
int count = 0;
std::vector<std::string> found;
while (true) {
- int len = ceph_getdents(cmount, ls_dir, (char *)getdents_entries, r * sizeof(*getdents_entries));
+ int len = ceph_getdents(cmount, ls_dir, (char *)getdents_entries, getdents_entries_len);
if (len == 0)
break;
ASSERT_GT(len, 0);
return TestMemIoCtxImpl::assert_exists(oid);
}
+ MOCK_METHOD2(create, int(const std::string&, bool));
+ int do_create(const std::string& oid, bool exclusive) {
+ return TestMemIoCtxImpl::create(oid, exclusive);
+ }
+
MOCK_METHOD3(cmpext, int(const std::string&, uint64_t, bufferlist&));
int do_cmpext(const std::string& oid, uint64_t off, bufferlist& cmp_bl) {
return TestMemIoCtxImpl::cmpext(oid, off, cmp_bl);
ON_CALL(*this, aio_watch(_, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_aio_watch));
ON_CALL(*this, aio_unwatch(_, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_aio_unwatch));
ON_CALL(*this, assert_exists(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_assert_exists));
+ ON_CALL(*this, create(_,_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_create));
ON_CALL(*this, cmpext(_,_,_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_cmpext));
ON_CALL(*this, exec(_, _, _, _, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_exec));
ON_CALL(*this, get_instance_id()).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_get_instance_id));
}
}
+ void expect_create(MockTestImageCtx &mock_image_ctx, bool exclusive) {
+ EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx), create(_, exclusive))
+ .Times(1);
+ }
+
void expect_truncate(MockTestImageCtx &mock_image_ctx, int offset, int r) {
auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx),
truncate(_, offset, _));
ASSERT_EQ(0, rbd.open(m_ioctx, image, m_image_name.c_str(), NULL));
ASSERT_EQ(0, image.snap_create("one"));
ASSERT_EQ(0, image.snap_protect("one"));
+ uint64_t features;
+ ASSERT_EQ(0, image.features(&features));
image.close();
std::string clone_name = get_temp_image_name();
int order = 0;
ASSERT_EQ(0, rbd.clone(m_ioctx, m_image_name.c_str(), "one", m_ioctx,
- clone_name.c_str(), RBD_FEATURE_LAYERING, &order));
+ clone_name.c_str(), features, &order));
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(clone_name, &ictx));
InSequence seq;
expect_get_parent_overlap(mock_image_ctx, CEPH_NOSNAP, 4096, 0);
expect_prune_parent_extents(mock_image_ctx, {{0, 4096}}, 4096, 4096);
- expect_object_may_exist(mock_image_ctx, 0, true);
+ expect_object_may_exist(mock_image_ctx, 0, false);
+ expect_object_map_update(mock_image_ctx, 0, 1, OBJECT_EXISTS, {}, false, 0);
+ expect_create(mock_image_ctx, false);
expect_truncate(mock_image_ctx, 0, 0);
C_SaferCond ctx;
{
REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
- string pool_name1 = m_pool_name;
+ string pool_name1 = create_pool(true);
string pool_name2 = create_pool(true);
string pool_name3 = create_pool(true);
+ ASSERT_NE("", pool_name1);
ASSERT_NE("", pool_name2);
ASSERT_NE("", pool_name3);
TEST_F(TestLibRBD, RenameViaLockOwner)
{
- REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
librados::IoCtx ioctx;
ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
TEST_F(TestLibRBD, SnapCreateViaLockOwner)
{
- REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+ REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
librados::IoCtx ioctx;
ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
TEST_F(TestLibRBD, SnapRemoveViaLockOwner)
{
- REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+ REQUIRE_FEATURE(RBD_FEATURE_FAST_DIFF);
librados::IoCtx ioctx;
ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
expect_committed(mock_journaler, 0);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
ASSERT_EQ(0, when_open(mock_journal));
}
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
expect_committed(mock_journaler, 3);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
MockJournalReplay mock_journal_replay;
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0, true);
+ expect_flush_commit_position(mock_journaler);
expect_shut_down_journaler(mock_journaler);
// replay failure should result in replay-restart
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
ASSERT_EQ(0, when_open(mock_journal));
std::bind(&invoke_replay_complete, _1, 0));
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, -EINVAL);
+ expect_flush_commit_position(mock_journaler);
expect_shut_down_journaler(mock_journaler);
// replay flush failure should result in replay-restart
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
ASSERT_EQ(0, when_open(mock_journal));
EXPECT_CALL(mock_journal_replay, decode(_, _)).WillOnce(Return(-EBADMSG));
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0, true);
+ expect_flush_commit_position(mock_journaler);
expect_shut_down_journaler(mock_journaler);
// replay failure should result in replay-restart
std::bind(&invoke_replay_complete, _1, 0));
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
ASSERT_EQ(0, when_open(mock_journal));
MockJournalReplay mock_journal_replay;
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
ASSERT_EQ(0, when_open(mock_journal));
mock_replay_entry);
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0, true);
+ expect_flush_commit_position(mock_journaler);
expect_shut_down_journaler(mock_journaler);
// replay write-to-disk failure should result in replay-restart
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
C_SaferCond ctx;
EXPECT_CALL(mock_journal_replay, shut_down(false, _))
.WillOnce(DoAll(SaveArg<1>(&on_flush),
InvokeWithoutArgs(this, &TestMockJournal::wake_up)));
+ expect_flush_commit_position(mock_journaler);
// replay write-to-disk failure should result in replay-restart
expect_shut_down_journaler(mock_journaler);
expect_stop_replay(mock_journaler);
expect_shut_down_replay(mock_image_ctx, mock_journal_replay, 0);
+ expect_flush_commit_position(mock_journaler);
expect_start_append(mock_journaler);
C_SaferCond ctx;
typedef boost::mt11213b gen_type;
+const uint64_t DEF_STORE_TEST_BLOCKDEV_SIZE = 10240000000;
#define dout_context g_ceph_context
#if GTEST_HAS_PARAM_TEST
g_conf->set_val("bluestore_csum_type", "crc32c");
}
+TEST_P(StoreTestSpecificAUSize, ExcessiveFragmentation) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ g_conf->set_val("bluestore_block_size",
+ stringify((uint64_t)2048 * 1024 * 1024));
+
+ ASSERT_EQ(g_conf->get_val<uint64_t>("bluefs_alloc_size"),
+ 1024 * 1024);
+
+ size_t block_size = 0x10000;
+ StartDeferred(block_size);
+
+ ObjectStore::Sequencer osr("test");
+ int r;
+ coll_t cid;
+ ghobject_t hoid1(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // create 2x400MB objects in a way that their pextents are interleaved
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 4, 'a')); // 256KB
+ uint64_t offs = 0;
+ while(offs < (uint64_t)400 * 1024 * 1024) {
+ t.write(cid, hoid1, offs, bl.length(), bl, 0);
+ t.write(cid, hoid2, offs, bl.length(), bl, 0);
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
+ offs += bl.length();
+ if( (offs % (100 * 1024 * 1024)) == 0) {
+ std::cout<<"written " << offs << std::endl;
+ }
+ }
+ }
+ std::cout<<"written 800MB"<<std::endl;
+ {
+ // Partially overwrite objects with 100MB each leaving space
+ // fragmented and occupying still unfragmented space at the end
+ // So we'll have enough free space but it'll lack long enough (e.g. 1MB)
+ // contiguous pextents.
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 4, 'a'));
+ uint64_t offs = 0;
+ while(offs < 112 * 1024 * 1024) {
+ t.write(cid, hoid1, offs, bl.length(), bl, 0);
+ t.write(cid, hoid2, offs, bl.length(), bl, 0);
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
+ // this will produce high fragmentation if original allocations
+ // were contiguous
+ offs += bl.length();
+ if( (offs % (10 * 1024 * 1024)) == 0) {
+ std::cout<<"written " << offs << std::endl;
+ }
+ }
+ }
+ {
+ // remove one of the object producing much free space
+ // and hence triggering bluefs rebalance.
+ // Which should fail as there is no long enough pextents.
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid2);
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ auto to_sleep = 5 *
+ (int)g_conf->get_val<double>("bluestore_bluefs_balance_interval");
+ std::cout<<"sleeping... " << std::endl;
+ sleep(to_sleep);
+
+ {
+ // touch another object to trigger rebalance
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid1);
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid1);
+ t.remove(cid, hoid2);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ g_conf->set_val("bluestore_block_size",
+ stringify(DEF_STORE_TEST_BLOCKDEV_SIZE));
+}
+
#endif //#if defined(HAVE_LIBAIO)
TEST_P(StoreTest, KVDBHistogramTest) {
g_ceph_context->_conf->set_val("bdev_debug_aio", "true");
// specify device size
- g_ceph_context->_conf->set_val("bluestore_block_size", "10240000000");
+ g_ceph_context->_conf->set_val("bluestore_block_size",
+ stringify(DEF_STORE_TEST_BLOCKDEV_SIZE));
g_ceph_context->_conf->set_val(
"enable_experimental_unrecoverable_data_corrupting_features", "*");
osdmap.apply_incremental(new_pool_inc);
}
unsigned int get_num_osds() { return num_osds; }
-
+ void get_crush(CrushWrapper& newcrush) {
+ bufferlist bl;
+ osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ bufferlist::iterator p = bl.begin();
+ newcrush.decode(p);
+ }
+ int crush_move(const string &name, const vector<string> &argvec) {
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+ CrushWrapper newcrush;
+ get_crush(newcrush);
+ if (!newcrush.name_exists(name)) {
+ return -ENOENT;
+ }
+ int id = newcrush.get_item_id(name);
+ int err;
+ if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
+ if (id >= 0) {
+ err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
+ } else {
+ err = newcrush.move_bucket(g_ceph_context, id, loc);
+ }
+ if (err >= 0) {
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ osdmap.apply_incremental(pending_inc);
+ err = 0;
+ }
+ } else {
+ // already there
+ err = 0;
+ }
+ return err;
+ }
+ int crush_rule_create_replicated(const string &name,
+ const string &root,
+ const string &type) {
+ if (osdmap.crush->rule_exists(name)) {
+ return osdmap.crush->get_rule_id(name);
+ }
+ CrushWrapper newcrush;
+ get_crush(newcrush);
+ string device_class;
+ stringstream ss;
+ int ruleno = newcrush.add_simple_rule(
+ name, root, type, device_class,
+ "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
+ if (ruleno >= 0) {
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ osdmap.apply_incremental(pending_inc);
+ }
+ return ruleno;
+ }
void test_mappings(int pool,
int num,
vector<int> *any,
ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
}
+TEST_F(OSDMapTest, CleanPGUpmaps) {
+ set_up_map();
+
+ // build a crush rule of type host
+ const int expected_host_num = 3;
+ int osd_per_host = get_num_osds() / expected_host_num;
+ ASSERT_GE(2, osd_per_host);
+ int index = 0;
+ for (int i = 0; i < (int)get_num_osds(); i++) {
+ if (i && i % osd_per_host == 0) {
+ ++index;
+ }
+ stringstream osd_name;
+ stringstream host_name;
+ vector<string> move_to;
+ osd_name << "osd." << i;
+ host_name << "host-" << index;
+ move_to.push_back("root=default");
+ string host_loc = "host=" + host_name.str();
+ move_to.push_back(host_loc);
+ int r = crush_move(osd_name.str(), move_to);
+ ASSERT_EQ(0, r);
+ }
+ const string upmap_rule = "upmap";
+ int upmap_rule_no = crush_rule_create_replicated(
+ upmap_rule, "default", "host");
+ ASSERT_LT(0, upmap_rule_no);
+
+ // create a replicated pool which references the above rule
+ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+ new_pool_inc.new_pool_max = osdmap.get_pool_max();
+ new_pool_inc.fsid = osdmap.get_fsid();
+ pg_pool_t empty;
+ uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
+ pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
+ p->size = 2;
+ p->set_pg_num(64);
+ p->set_pgp_num(64);
+ p->type = pg_pool_t::TYPE_REPLICATED;
+ p->crush_rule = upmap_rule_no;
+ p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+ new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
+ osdmap.apply_incremental(new_pool_inc);
+
+ pg_t rawpg(0, upmap_pool_id);
+ pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+ vector<int> up;
+ int up_primary;
+ osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+ ASSERT_LT(1U, up.size());
+ {
+ // validate we won't have two OSDs from a same host
+ int parent_0 = osdmap.crush->get_parent_of_type(up[0],
+ osdmap.crush->get_type_id("host"));
+ int parent_1 = osdmap.crush->get_parent_of_type(up[1],
+ osdmap.crush->get_type_id("host"));
+ ASSERT_TRUE(parent_0 != parent_1);
+ }
+
+ {
+ // TEST pg_upmap
+ {
+ // STEP-1: enumerate all children of up[0]'s parent,
+ // replace up[1] with one of them (other than up[0])
+ int parent = osdmap.crush->get_parent_of_type(up[0],
+ osdmap.crush->get_type_id("host"));
+ set<int> candidates;
+ osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates);
+ ASSERT_LT(1U, candidates.size());
+ int replaced_by = -1;
+ for (auto c: candidates) {
+ if (c != up[0]) {
+ replaced_by = c;
+ break;
+ }
+ }
+ ASSERT_NE(-1, replaced_by);
+ // generate a new pg_upmap item and apply
+ vector<int32_t> new_pg_upmap;
+ new_pg_upmap.push_back(up[0]);
+ new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+ new_pg_upmap.begin(), new_pg_upmap.end());
+ osdmap.apply_incremental(pending_inc);
+ {
+ // validate pg_upmap is there
+ vector<int> new_up;
+ int new_up_primary;
+ osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+ ASSERT_TRUE(up.size() == new_up.size());
+ ASSERT_TRUE(new_up[0] == new_pg_upmap[0]);
+ ASSERT_TRUE(new_up[1] == new_pg_upmap[1]);
+ // and we shall have two OSDs from a same host now..
+ int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+ osdmap.crush->get_type_id("host"));
+ int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+ osdmap.crush->get_type_id("host"));
+ ASSERT_TRUE(parent_0 == parent_1);
+ }
+ }
+ {
+ // STEP-2: apply cure
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+ osdmap.apply_incremental(pending_inc);
+ {
+ // validate pg_upmap is gone (reverted)
+ vector<int> new_up;
+ int new_up_primary;
+ osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+ ASSERT_TRUE(new_up == up);
+ ASSERT_TRUE(new_up_primary == up_primary);
+ }
+ }
+ }
+
+ {
+ // TEST pg_upmap_items
+ // enumerate all used hosts first
+ set<int> parents;
+ for (auto u: up) {
+ int parent = osdmap.crush->get_parent_of_type(u,
+ osdmap.crush->get_type_id("host"));
+ ASSERT_GT(0, parent);
+ parents.insert(parent);
+ }
+ int candidate_parent = 0;
+ set<int> candidate_children;
+ vector<int> up_after_out;
+ {
+ // STEP-1: try mark out up[1] and all other OSDs from the same host
+ int parent = osdmap.crush->get_parent_of_type(up[1],
+ osdmap.crush->get_type_id("host"));
+ set<int> children;
+ osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+ &children);
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ for (auto c: children) {
+ pending_inc.new_weight[c] = CEPH_OSD_OUT;
+ }
+ OSDMap tmpmap;
+ tmpmap.deepish_copy_from(osdmap);
+ tmpmap.apply_incremental(pending_inc);
+ vector<int> new_up;
+ int new_up_primary;
+ tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+ // verify that we'll have OSDs from a different host..
+ int will_choose = -1;
+ for (auto o: new_up) {
+ int parent = tmpmap.crush->get_parent_of_type(o,
+ osdmap.crush->get_type_id("host"));
+ if (!parents.count(parent)) {
+ will_choose = o;
+ candidate_parent = parent; // record
+ break;
+ }
+ }
+ ASSERT_LT(-1, will_choose); // it is an OSD!
+ ASSERT_TRUE(candidate_parent != 0);
+ osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
+ &candidate_children);
+ ASSERT_TRUE(candidate_children.count(will_choose));
+ candidate_children.erase(will_choose);
+ ASSERT_TRUE(!candidate_children.empty());
+ up_after_out = new_up; // needed for verification..
+ }
+ {
+ // STEP-2: generating a new pg_upmap_items entry by
+ // replacing up[0] with one coming from candidate_children
+ int victim = up[0];
+ int replaced_by = *candidate_children.begin();
+ vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+ new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
+ // apply
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ pending_inc.new_pg_upmap_items[pgid] =
+ mempool::osdmap::vector<pair<int32_t,int32_t>>(
+ new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+ osdmap.apply_incremental(pending_inc);
+ {
+ // validate pg_upmap_items is there
+ vector<int> new_up;
+ int new_up_primary;
+ osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+ ASSERT_TRUE(up.size() == new_up.size());
+ ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
+ new_up.end());
+ // and up[1] too
+ ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
+ new_up.end());
+ }
+ }
+ {
+ // STEP-3: mark out up[1] and all other OSDs from the same host
+ int parent = osdmap.crush->get_parent_of_type(up[1],
+ osdmap.crush->get_type_id("host"));
+ set<int> children;
+ osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+ &children);
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ for (auto c: children) {
+ pending_inc.new_weight[c] = CEPH_OSD_OUT;
+ }
+ osdmap.apply_incremental(pending_inc);
+ {
+ // validate we have two OSDs from the same host now..
+ vector<int> new_up;
+ int new_up_primary;
+ osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+ ASSERT_TRUE(up.size() == new_up.size());
+ int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+ osdmap.crush->get_type_id("host"));
+ int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+ osdmap.crush->get_type_id("host"));
+ ASSERT_TRUE(parent_0 == parent_1);
+ }
+ }
+ {
+ // STEP-4: apply cure
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+ osdmap.apply_incremental(pending_inc);
+ {
+ // validate pg_upmap_items is gone (reverted)
+ vector<int> new_up;
+ int new_up_primary;
+ osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+ ASSERT_TRUE(new_up == up_after_out);
+ }
+ }
+ }
+}
+
TEST(PGTempMap, basic)
{
PGTempMap m;
EXPECT_TRUE(dirty_big_info);
}
- // If our log is empty, the incoming log needs to have not been trimmed.
- {
- clear();
-
- pg_log_t olog;
- pg_info_t oinfo;
- pg_shard_t fromosd;
- pg_info_t info;
- list<hobject_t> remove_snap;
- bool dirty_info = false;
- bool dirty_big_info = false;
-
- // olog has been trimmed
- olog.tail = eversion_t(1, 1);
-
- TestHandler h(remove_snap);
- PrCtl unset_dumpable;
- ASSERT_DEATH(merge_log(oinfo, olog, fromosd, info, &h,
- dirty_info, dirty_big_info), "");
- }
-
}
TEST_F(PGLogTest, proc_replica_log) {
import os
import re
+import sys
import json
+from StringIO import StringIO
def get_command_descriptions(what):
CEPH_BIN = os.environ['CEPH_BIN']
def assert_valid_command(self, args):
result = validate_command(sigdict, args)
- assert_not_equal(result,None)
- assert_not_equal(result,{})
+ assert_not_in(result, [{}, None])
def check_1_natural_arg(self, prefix, command):
self.assert_valid_command([prefix, command, '1'])
command,
'toomany']))
+ def capture_output(self, args, stdout=None, stderr=None):
+ if stdout:
+ stdout = StringIO()
+ sys.stdout = stdout
+ if stderr:
+ stderr = StringIO()
+ sys.stderr = stderr
+ ret = validate_command(sigdict, args)
+ if stdout:
+ stdout = stdout.getvalue().strip()
+ if stderr:
+ stderr = stderr.getvalue().strip()
+ return ret, stdout, stderr
+
class TestBasic:
# ArgumentPrefix("no match for {0}".format(s)) is not able to convert
# unicode str parameter into str. and validate_command() should not
# choke on it.
- assert_is_none(validate_command(sigdict, [u'章鱼和鱿鱼']))
- assert_is_none(validate_command(sigdict, [u'–w']))
+ assert_equal({}, validate_command(sigdict, [u'章鱼和鱿鱼']))
+ assert_equal({}, validate_command(sigdict, [u'–w']))
# actually we always pass unicode strings to validate_command() in "ceph"
# CLI, but we also use bytestrings in our tests, so make sure it does not
# break.
- assert_is_none(validate_command(sigdict, ['章鱼和鱿鱼']))
- assert_is_none(validate_command(sigdict, ['–w']))
+ assert_equal({}, validate_command(sigdict, ['章鱼和鱿鱼']))
+ assert_equal({}, validate_command(sigdict, ['–w']))
class TestPG(TestArgparse):
assert_equal({}, validate_command(sigdict, ['pg', 'debug',
'invalid']))
+ def test_pg_missing_args_output(self):
+ ret, _, stderr = self.capture_output(['pg'], stderr=True)
+ assert_equal({}, ret)
+ assert_regexp_matches(stderr, re.compile('no valid command found.* closest matches'))
+
+ def test_pg_wrong_arg_output(self):
+ ret, _, stderr = self.capture_output(['pg', 'map', 'bad-pgid'],
+ stderr=True)
+ assert_equal({}, ret)
+ assert_in("Invalid command", stderr)
+
class TestAuth(TestArgparse):
'pause', 'toomany']))
def test_cluster_snap(self):
- assert_equal(None, validate_command(sigdict, ['osd', 'cluster_snap']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'cluster_snap']))
def test_down(self):
self.check_1_or_more_string_args('osd', 'down')
('ns1', 'ns1-c'), ('ns1', 'ns1-d')])
def test_xattrs(self):
- xattrs = dict(a=b'1', b=b'2', c=b'3', d=b'a\0b', e=b'\0')
+ xattrs = dict(a=b'1', b=b'2', c=b'3', d=b'a\0b', e=b'\0', f='')
self.ioctx.write('abc', b'')
for key, value in xattrs.items():
self.ioctx.set_xattr('abc', key, value)
eq(stored_xattrs, xattrs)
def test_obj_xattrs(self):
- xattrs = dict(a=b'1', b=b'2', c=b'3', d=b'a\0b', e=b'\0')
+ xattrs = dict(a=b'1', b=b'2', c=b'3', d=b'a\0b', e=b'\0', f='')
self.ioctx.write('abc', b'')
obj = list(self.ioctx.list_objects())[0]
for key, value in xattrs.items():
IMG_SIZE = 8 << 20 # 8 MiB
IMG_ORDER = 22 # 4 MiB objects
+os.environ["RBD_FORCE_ALLOW_V1"] = "1"
+
def setup_module():
global rados
rados = Rados(conffile='')
rados/RadosImport.cc
rados/PoolDump.cc
${PROJECT_SOURCE_DIR}/src/common/util.cc
- ${PROJECT_SOURCE_DIR}/src/common/obj_bencher.cc)
+ ${PROJECT_SOURCE_DIR}/src/common/obj_bencher.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ECUtil.cc)
add_executable(rados ${rados_srcs})
target_link_libraries(rados librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} radosstriper)
install(TARGETS rados DESTINATION bin)
<< " -a BASE64, --add-key BASE64 will add an encoded key to the keyring\n"
<< " --cap SUBSYSTEM CAPABILITY will set the capability for given subsystem\n"
<< " --caps CAPSFILE will set all of capabilities associated with a\n"
- << " given key, for all subsystems"
+ << " given key, for all subsystems\n"
+ << " --mode MODE will set the desired file mode to the keyring\n"
+ << " e.g: '0644', defaults to '0600'"
<< std::endl;
exit(1);
}
bool print_key = false;
bool create_keyring = false;
bool set_auid = false;
+ int mode = 0600; // keyring file mode
std::vector<const char*>::iterator i;
/* Handle options unique to ceph-authtool
exit(1);
}
set_auid = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--mode", (char*)NULL)) {
+ std::string err;
+ mode = strict_strtoll(val.c_str(), 8, &err);
+ if (!err.empty()) {
+ cerr << "Option --mode requires an argument" << std::endl;
+ exit(1);
+ }
} else if (fn.empty()) {
fn = *i++;
} else {
if (modified) {
bufferlist bl;
keyring.encode_plaintext(bl);
- r = bl.write_file(fn.c_str(), 0600);
+ r = bl.write_file(fn.c_str(), mode);
if (r < 0) {
cerr << "could not write " << fn << std::endl;
exit(1);
return 0;
}
+int do_trim_pg_log(ObjectStore *store, const coll_t &coll,
+ pg_info_t &info, const spg_t &pgid,
+ ObjectStore::Sequencer &osr, epoch_t map_epoch,
+ PastIntervals &past_intervals)
+{
+ ghobject_t oid = pgid.make_pgmeta_oid();
+ struct stat st;
+ int r = store->stat(coll, oid, &st);
+ assert(r == 0);
+ assert(st.st_size == 0);
+
+ cerr << "Log bounds are: " << "(" << info.log_tail << ","
+ << info.last_update << "]" << std::endl;
+
+ uint64_t max_entries = g_ceph_context->_conf->osd_max_pg_log_entries;
+ if (info.last_update.version - info.log_tail.version <= max_entries) {
+ cerr << "Log not larger than osd_max_pg_log_entries " << max_entries << std::endl;
+ return 0;
+ }
+
+ assert(info.last_update.version > max_entries);
+ version_t trim_to = info.last_update.version - max_entries;
+ size_t trim_at_once = g_ceph_context->_conf->osd_pg_log_trim_max;
+ eversion_t new_tail;
+ bool done = false;
+
+ while (!done) {
+ // gather keys so we can delete them in a batch without
+ // affecting the iterator
+ set<string> keys_to_trim;
+ {
+ ObjectMap::ObjectMapIterator p = store->get_omap_iterator(coll, oid);
+ if (!p)
+ break;
+ for (p->seek_to_first(); p->valid(); p->next(false)) {
+ if (p->key()[0] == '_')
+ continue;
+ if (p->key() == "can_rollback_to")
+ continue;
+ if (p->key() == "divergent_priors")
+ continue;
+ if (p->key() == "rollback_info_trimmed_to")
+ continue;
+ if (p->key() == "may_include_deletes_in_missing")
+ continue;
+ if (p->key().substr(0, 7) == string("missing"))
+ continue;
+ if (p->key().substr(0, 4) == string("dup_"))
+ continue;
+
+ bufferlist bl = p->value();
+ bufferlist::iterator bp = bl.begin();
+ pg_log_entry_t e;
+ try {
+ e.decode_with_checksum(bp);
+ } catch (const buffer::error &e) {
+ cerr << "Error reading pg log entry: " << e << std::endl;
+ }
+ if (debug) {
+ cerr << "read entry " << e << std::endl;
+ }
+ if (e.version.version > trim_to) {
+ done = true;
+ break;
+ }
+ keys_to_trim.insert(p->key());
+ new_tail = e.version;
+ if (keys_to_trim.size() >= trim_at_once)
+ break;
+ }
+
+ if (!p->valid())
+ done = true;
+ } // deconstruct ObjectMapIterator
+
+ // delete the keys
+ if (!dry_run && !keys_to_trim.empty()) {
+ cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl;
+ ObjectStore::Transaction t;
+ t.omap_rmkeys(coll, oid, keys_to_trim);
+ int r = store->apply_transaction(&osr, std::move(t));
+ if (r) {
+ cerr << "Error trimming logs " << cpp_strerror(r) << std::endl;
+ }
+ }
+ }
+
+ // update pg info with new tail
+ if (!dry_run && new_tail != eversion_t()) {
+ info.log_tail = new_tail;
+ ObjectStore::Transaction t;
+ int ret = write_info(t, map_epoch, info, past_intervals);
+ if (ret)
+ return ret;
+ ret = store->apply_transaction(&osr, std::move(t));
+ if (ret) {
+ cerr << "Error updating pg info " << cpp_strerror(ret) << std::endl;
+ }
+ }
+
+ // compact the db since we just removed a bunch of data
+ cerr << "Finished trimming, now compacting..." << std::endl;
+ if (!dry_run)
+ store->compact();
+ return 0;
+}
+
const int OMAP_BATCH_SIZE = 25;
void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map<string, bufferlist> &oset)
{
("journal-path", po::value<string>(&jpath),
"path to journal, use if tool can't find it")
("pgid", po::value<string>(&pgidstr),
- "PG id, mandatory for info, log, remove, export, export-remove, rm-past-intervals, mark-complete, and mandatory for apply-layout-settings if --pool is not specified")
+ "PG id, mandatory for info, log, remove, export, export-remove, rm-past-intervals, mark-complete, trim-pg-log, and mandatory for apply-layout-settings if --pool is not specified")
("pool", po::value<string>(&pool),
"Pool name, mandatory for apply-layout-settings if --pgid is not specified")
("op", po::value<string>(&op),
"Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
- "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, apply-layout-settings, update-mon-db, dump-import]")
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, apply-layout-settings, update-mon-db, dump-import, trim-pg-log]")
("epoch", po::value<unsigned>(&epoch),
"epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
("file", po::value<string>(&file),
// The ops which require --pgid option are checked here and
// mentioned in the usage for --pgid.
if ((op == "info" || op == "log" || op == "remove" || op == "export"
- || op == "export-remove" || op == "rm-past-intervals" || op == "mark-complete") &&
+ || op == "export-remove" || op == "rm-past-intervals"
+ || op == "mark-complete" || op == "trim-pg-log") &&
pgidstr.length() == 0) {
cerr << "Must provide pgid" << std::endl;
usage(desc);
// If not an object command nor any of the ops handled below, then output this usage
// before complaining about a bad pgid
- if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") {
+ if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete" && op != "trim-pg-log") {
cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
- "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, dump-import)"
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, dump-import, trim-pg-log)"
<< std::endl;
usage(desc);
ret = 1;
fs->apply_transaction(osr, std::move(*t));
}
cout << "Marking complete succeeded" << std::endl;
+ } else if (op == "trim-pg-log") {
+ ret = do_trim_pg_log(fs, coll, info, pgid, *osr,
+ map_epoch, past_intervals);
+ if (ret < 0) {
+ cerr << "Error trimming pg log: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ cout << "Finished trimming pg log" << std::endl;
+ goto out;
} else {
assert(!"Should have already checked for valid --op");
}
int nlink;
bool is_dir;
link_info_t() : version(0), nlink(0), is_dir(false) {}
- link_info_t(inodeno_t di, frag_t df, const string& n, const inode_t i) :
+ link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::mempool_inode& i) :
dirino(di), frag(df), name(n),
version(i.version), nlink(i.nlink), is_dir(S_IFDIR & i.mode) {}
dirfrag_t dirfrag() const {
} else {
r = dumper.dump(path.c_str());
}
- dumper.shutdown();
}
return r;
} else {
r = resetter.reset(mds_role_t(role_selector.get_ns(), rank));
}
- resetter.shutdown();
return r;
}
new_inode.xattrs = fb.xattrs;
new_inode.dirfragtree = fb.dirfragtree;
new_inode.snap_blob = fb.snapbl;
- new_inode.symlink = fb.symlink;
+ new_inode.symlink = mempool::mds_co::string(boost::string_view(fb.symlink));
new_inode.old_inodes = fb.old_inodes;
// Serialize InodeStore
objecter(NULL),
lock("MDSUtility::lock"),
finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"),
- waiting_for_mds_map(NULL)
+ waiting_for_mds_map(NULL),
+ inited(false)
{
monc = new MonClient(g_ceph_context);
messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
MDSUtility::~MDSUtility()
{
+ if (inited) {
+ shutdown();
+ }
delete objecter;
delete monc;
delete messenger;
finisher.start();
+ inited = true;
return 0;
}
Context *waiting_for_mds_map;
+ bool inited;
public:
MDSUtility();
~MDSUtility() override;
std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
}
- data_scan.shutdown();
return rc;
}
std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
}
- jt.shutdown();
-
return rc;
}
std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
}
- tt.shutdown();
-
return rc;
}
#include <errno.h>
#include <fstream>
+#include <type_traits>
#include "common/debug.h"
#include "common/errno.h"
cout << " -i mapfn --reweight-item name weight\n";
cout << " reweight a given item (and adjust ancestor\n"
<< " weights as needed)\n";
+ cout << " -i mapfn --add-bucket name type [--loc type name ...]\n"
+ << " insert a bucket into the hierarchy at the given\n"
+ << " location\n";
+ cout << " -i mapfn --move name --loc type name ...\n"
+ << " move the given item to specified location\n";
cout << " -i mapfn --reweight recalculate all bucket weights\n";
cout << " -i mapfn --create-simple-rule name root type mode\n"
<< " create crush rule <name> to start from <root>,\n"
int size;
};
+int do_add_bucket(CephContext* cct,
+ const char* me,
+ CrushWrapper& crush,
+ const string& add_name,
+ const string& add_type,
+ const map<string,string>& add_loc) {
+ int bucketno;
+ if (crush.name_exists(add_name)) {
+ cerr << me << " bucket '" << add_name << "' already exists" << std::endl;
+ return -EEXIST;
+ }
+ int type = crush.get_type_id(add_type);
+ if (type <= 0) {
+ cerr << me << " bad bucket type: " << add_type << std::endl;
+ return -EINVAL;
+ }
+ int r = 0;
+ r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT, type, 0, nullptr, nullptr, &bucketno);
+ if (r < 0) {
+ cerr << me << " unable to add bucket: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ r = crush.set_item_name(bucketno, add_name);
+ if (r < 0) {
+ cerr << me << " bad bucket name: " << add_name << std::endl;
+ return r;
+ }
+ if (!add_loc.empty()) {
+ if (!crush.check_item_loc(cct, bucketno, add_loc, (int*)nullptr)) {
+ r = crush.move_bucket(cct, bucketno, add_loc);
+ if (r < 0) {
+ cerr << me << " error moving bucket '" << add_name << "' to " << add_loc << std::endl;
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+// return 1 for no change, 0 for successful change, negative on error
+int do_move_item(CephContext* cct,
+ const char *me,
+ CrushWrapper& crush,
+ const string& name,
+ const map<string,string>& loc)
+{
+ if (!crush.name_exists(name)) {
+ cerr << me << " item '" << name << "' does not exist" << std::endl;
+ return -ENOENT;
+ }
+ int id = crush.get_item_id(name);
+ if (loc.empty()) {
+ cerr << me << " expecting additional --loc argument to --move" << std::endl;
+ return -EINVAL;
+ }
+ if (crush.check_item_loc(cct, id, loc, (int*)nullptr)) {
+ // it's already there
+ cerr << me << " item '" << name << "' already at " << loc << std::endl;
+ return 1;
+ }
+ if (id >= 0) {
+ switch (int r = crush.create_or_move_item(cct, id, 0, name, loc)) {
+ case 0:
+ return 1;
+ case 1:
+ return 0;
+ default:
+ return r;
+ }
+ } else {
+ return crush.move_bucket(cct, id, loc);
+ }
+}
+
int main(int argc, const char **argv)
{
vector<const char*> args;
argv_to_vec(argc, argv, args);
const char *me = argv[0];
- std::string infn, srcfn, outfn, add_name, remove_name, reweight_name;
+ std::string infn, srcfn, outfn, add_name, add_type, remove_name, reweight_name;
+ std::string move_name;
bool compile = false;
bool decompile = false;
bool check = false;
bool reweight = false;
int add_item = -1;
+ bool add_bucket = false;
bool update_item = false;
+ bool move_item = false;
bool add_rule = false;
std::string rule_name, rule_root, rule_type, rule_mode, rule_device_class;
bool del_rule = false;
}
add_name.assign(*i);
i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--add-bucket", (char*)NULL)) {
+ add_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --add-bucket" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_type = *i;
+ i = args.erase(i);
+ add_bucket = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--move", (char*)NULL)) {
+ move_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ move_item = true;
} else if (ceph_argparse_witharg(args, i, &val, err, "--create-simple-rule", (char*)NULL)) {
rule_name.assign(val);
if (!err.str().empty()) {
return EXIT_FAILURE;
}
if (!check && !compile && !decompile && !build && !test && !reweight && !adjust && !tree && !dump &&
- add_item < 0 && !add_rule && !del_rule && full_location < 0 &&
+ add_item < 0 && !add_bucket && !move_item && !add_rule && !del_rule && full_location < 0 &&
remove_name.empty() && reweight_name.empty()) {
cerr << "no action specified; -h for help" << std::endl;
return EXIT_FAILURE;
}
}
+ if (add_bucket) {
+ int r = do_add_bucket(cct.get(), me, crush, add_name, add_type, add_loc);
+ if (!r) {
+ modified = true;
+ } else {
+ return r;
+ }
+ }
+
+ if (move_item) {
+ int r = do_move_item(cct.get(), me, crush, move_name, add_loc);
+ if (!r) {
+ modified = true;
+ } else {
+ return r;
+ }
+ }
if (add_rule) {
if (crush.rule_exists(rule_name)) {
cerr << "rule " << rule_name << " already exists" << std::endl;
#include "PoolDump.h"
#include "RadosImport.h"
+#include "osd/ECUtil.h"
+
using namespace librados;
// two steps seem to be necessary to do this right
f.dump_string("error", "stat_error");
if (err.has_read_error())
f.dump_string("error", "read_error");
- if (err.has_data_digest_mismatch_oi())
- f.dump_string("error", "data_digest_mismatch_oi");
- if (err.has_omap_digest_mismatch_oi())
- f.dump_string("error", "omap_digest_mismatch_oi");
- if (err.has_size_mismatch_oi())
- f.dump_string("error", "size_mismatch_oi");
+ if (err.has_data_digest_mismatch_info())
+ f.dump_string("error", "data_digest_mismatch_info");
+ if (err.has_omap_digest_mismatch_info())
+ f.dump_string("error", "omap_digest_mismatch_info");
+ if (err.has_size_mismatch_info())
+ f.dump_string("error", "size_mismatch_info");
if (err.has_ec_hash_error())
f.dump_string("error", "ec_hash_error");
if (err.has_ec_size_error())
f.dump_string("error", "ec_size_error");
- if (err.has_oi_attr_missing())
- f.dump_string("error", "oi_attr_missing");
- if (err.has_oi_attr_corrupted())
- f.dump_string("error", "oi_attr_corrupted");
- if (err.has_obj_size_oi_mismatch())
- f.dump_string("error", "obj_size_oi_mismatch");
- if (err.has_ss_attr_missing())
- f.dump_string("error", "ss_attr_missing");
- if (err.has_ss_attr_corrupted())
- f.dump_string("error", "ss_attr_corrupted");
+ if (err.has_info_missing())
+ f.dump_string("error", "info_missing");
+ if (err.has_info_corrupted())
+ f.dump_string("error", "info_corrupted");
+ if (err.has_obj_size_info_mismatch())
+ f.dump_string("error", "obj_size_info_mismatch");
+ if (err.has_snapset_missing())
+ f.dump_string("error", "snapset_missing");
+ if (err.has_snapset_corrupted())
+ f.dump_string("error", "snapset_corrupted");
+ if (err.has_hinfo_missing())
+ f.dump_string("error", "hinfo_missing");
+ if (err.has_hinfo_corrupted())
+ f.dump_string("error", "hinfo_corrupted");
f.close_section();
}
f.dump_format("data_digest", "0x%08x", shard.data_digest);
}
- if (!shard.has_oi_attr_missing() && !shard.has_oi_attr_corrupted() &&
- inc.has_object_info_inconsistency()) {
- object_info_t oi;
- bufferlist bl;
+ if ((inc.union_shards.has_info_missing()
+ || inc.union_shards.has_info_corrupted()
+ || inc.has_object_info_inconsistency()
+ || shard.has_obj_size_info_mismatch()) &&
+ !shard.has_info_missing()) {
map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(OI_ATTR);
assert(k != shard.attrs.end()); // Can't be missing
- bufferlist::iterator bliter = k->second.begin();
- ::decode(oi, bliter); // Can't be corrupted
- f.dump_stream("object_info") << oi;
+ if (!shard.has_info_corrupted()) {
+ object_info_t oi;
+ bufferlist bl;
+ bufferlist::iterator bliter = k->second.begin();
+ ::decode(oi, bliter); // Can't be corrupted
+ f.open_object_section("object_info");
+ oi.dump(&f);
+ f.close_section();
+ } else {
+ bool b64;
+ f.dump_string("object_info", cleanbin(k->second, b64));
+ }
}
- if (!shard.has_ss_attr_missing() && !shard.has_ss_attr_corrupted() &&
- inc.has_snapset_inconsistency()) {
- SnapSet ss;
- bufferlist bl;
+ if ((inc.union_shards.has_snapset_missing()
+ || inc.union_shards.has_snapset_corrupted()
+ || inc.has_snapset_inconsistency()) &&
+ !shard.has_snapset_missing()) {
map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(SS_ATTR);
assert(k != shard.attrs.end()); // Can't be missing
- bufferlist::iterator bliter = k->second.begin();
- decode(ss, bliter); // Can't be corrupted
- f.dump_stream("snapset") << ss;
+ if (!shard.has_snapset_corrupted()) {
+ SnapSet ss;
+ bufferlist bl;
+ bufferlist::iterator bliter = k->second.begin();
+ decode(ss, bliter); // Can't be corrupted
+ f.open_object_section("snapset");
+ ss.dump(&f);
+ f.close_section();
+ } else {
+ bool b64;
+ f.dump_string("snapset", cleanbin(k->second, b64));
+ }
+ }
+ if ((inc.union_shards.has_hinfo_missing()
+ || inc.union_shards.has_hinfo_corrupted()
+ || inc.has_hinfo_inconsistency()) &&
+ !shard.has_hinfo_missing()) {
+ map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(ECUtil::get_hinfo_key());
+ assert(k != shard.attrs.end()); // Can't be missing
+ if (!shard.has_hinfo_corrupted()) {
+ ECUtil::HashInfo hi;
+ bufferlist bl;
+ bufferlist::iterator bliter = k->second.begin();
+ decode(hi, bliter); // Can't be corrupted
+ f.open_object_section("hashinfo");
+ hi.dump(&f);
+ f.close_section();
+ } else {
+ bool b64;
+ f.dump_string("hashinfo", cleanbin(k->second, b64));
+ }
}
- if (inc.has_attr_name_mismatch() || inc.has_attr_value_mismatch()
- || inc.union_shards.has_oi_attr_missing()
- || inc.union_shards.has_oi_attr_corrupted()
- || inc.union_shards.has_ss_attr_missing()
- || inc.union_shards.has_ss_attr_corrupted()) {
+ if (inc.has_attr_name_mismatch() || inc.has_attr_value_mismatch()) {
f.open_array_section("attrs");
for (auto kv : shard.attrs) {
+ // System attribute handled above
+ if (kv.first == OI_ATTR || kv.first[0] != '_')
+ continue;
f.open_object_section("attr");
- f.dump_string("name", kv.first);
+ // Skip leading underscore since only giving user attrs
+ f.dump_string("name", kv.first.substr(1));
bool b64;
f.dump_string("value", cleanbin(kv.second, b64));
f.dump_bool("Base64", b64);
f.dump_string("error", "attr_name_mismatch");
if (err.has_snapset_inconsistency())
f.dump_string("error", "snapset_inconsistency");
+ if (err.has_hinfo_inconsistency())
+ f.dump_string("error", "hinfo_inconsistency");
f.close_section();
}
assert(k != shard.attrs.end()); // Can't be missing
bufferlist::iterator bliter = k->second.begin();
::decode(oi, bliter); // Can't be corrupted
- f.dump_stream("selected_object_info") << oi;
+ f.open_object_section("selected_object_info");
+ oi.dump(&f);
+ f.close_section();
break;
}
}
{
dump_object_id(inc.object, f);
+ if (inc.ss_bl.length()) {
+ SnapSet ss;
+ bufferlist bl = inc.ss_bl;
+ bufferlist::iterator bliter = bl.begin();
+ decode(ss, bliter); // Can't be corrupted
+ f.open_object_section("snapset");
+ ss.dump(&f);
+ f.close_section();
+ }
f.open_array_section("errors");
- if (inc.ss_attr_missing())
- f.dump_string("error", "ss_attr_missing");
- if (inc.ss_attr_corrupted())
- f.dump_string("error", "ss_attr_corrupted");
- if (inc.oi_attr_missing())
- f.dump_string("error", "oi_attr_missing");
- if (inc.oi_attr_corrupted())
- f.dump_string("error", "oi_attr_corrupted");
- if (inc.snapset_mismatch())
- f.dump_string("error", "snapset_mismatch");
+ if (inc.snapset_missing())
+ f.dump_string("error", "snapset_missing");
+ if (inc.snapset_corrupted())
+ f.dump_string("error", "snapset_corrupted");
+ if (inc.info_missing())
+ f.dump_string("error", "info_missing");
+ if (inc.info_corrupted())
+ f.dump_string("error", "info_corrupted");
+ if (inc.snapset_error())
+ f.dump_string("error", "snapset_error");
if (inc.head_mismatch())
f.dump_string("error", "head_mismatch");
if (inc.headless())
cls::journal::ObjectPosition master = m_master_position;
uint64_t mirror_tag_tid = m_mirror_position.tag_tid;
- while (master.tag_tid != mirror_tag_tid) {
+ while (master.tag_tid > mirror_tag_tid) {
auto tag_it = m_tag_cache.find(master.tag_tid);
if (tag_it == m_tag_cache.end()) {
send_update_tag_cache(master.tag_tid, mirror_tag_tid);
}
librbd::journal::TagData &tag_data = tag_it->second;
m_entries_behind_master += master.entry_tid;
- master = cls::journal::ObjectPosition(0, tag_data.predecessor.tag_tid,
- tag_data.predecessor.entry_tid);
+ master = {0, tag_data.predecessor.tag_tid, tag_data.predecessor.entry_tid};
+ }
+ if (master.tag_tid == mirror_tag_tid &&
+ master.entry_tid > m_mirror_position.entry_tid) {
+ m_entries_behind_master += master.entry_tid - m_mirror_position.entry_tid;
}
- m_entries_behind_master += master.entry_tid - m_mirror_position.entry_tid;
dout(20) << "clearing tags not needed any more (below mirror position)"
<< dendl;
template <typename I>
void ReplayStatusFormatter<I>::send_update_tag_cache(uint64_t master_tag_tid,
uint64_t mirror_tag_tid) {
-
- dout(20) << "master_tag_tid=" << master_tag_tid << ", mirror_tag_tid="
- << mirror_tag_tid << dendl;
-
- if (master_tag_tid == mirror_tag_tid) {
+ if (master_tag_tid <= mirror_tag_tid ||
+ m_tag_cache.find(master_tag_tid) != m_tag_cache.end()) {
Context *on_finish = nullptr;
{
Mutex::Locker locker(m_lock);
return;
}
+ dout(20) << "master_tag_tid=" << master_tag_tid << ", mirror_tag_tid="
+ << mirror_tag_tid << dendl;
+
FunctionContext *ctx = new FunctionContext(
[this, master_tag_tid, mirror_tag_tid](int r) {
handle_update_tag_cache(master_tag_tid, mirror_tag_tid, r);
tag_data.predecessor.mirror_uuid !=
librbd::Journal<>::ORPHAN_MIRROR_UUID) {
dout(20) << "hit remote image non-primary epoch" << dendl;
- tag_data.predecessor.tag_tid = mirror_tag_tid;
- } else if (tag_data.predecessor.tag_tid == 0) {
- // We failed. Don't consider this fatal, just terminate retrieving.
- dout(20) << "making fake tag" << dendl;
- tag_data.predecessor.tag_tid = mirror_tag_tid;
+ tag_data.predecessor = {};
}
dout(20) << "decoded tag " << master_tag_tid << ": " << tag_data << dendl;
- m_tag_cache.insert(std::make_pair(master_tag_tid, tag_data));
+ m_tag_cache[master_tag_tid] = tag_data;
send_update_tag_cache(tag_data.predecessor.tag_tid, mirror_tag_tid);
}
void ObjectCopyRequest<I>::handle_read_object(int r) {
dout(20) << ": r=" << r << dendl;
+ auto snap_seq = m_snap_sync_ops.begin()->first.second;
+ if (r == -ENOENT && m_read_whole_object[snap_seq]) {
+ dout(5) << ": object missing when forced to read whole object"
+ << dendl;
+ r = 0;
+ }
+
if (r == -ENOENT) {
m_retry_snap_set = m_snap_set;
m_retry_missing_read = true;
uint64_t end_size;
bool exists;
librados::snap_t clone_end_snap_id;
+ bool read_whole_object;
calc_snap_set_diff(cct, m_snap_set, start_remote_snap_id,
end_remote_snap_id, &diff, &end_size, &exists,
- &clone_end_snap_id);
+ &clone_end_snap_id, &read_whole_object);
+
+ if (read_whole_object) {
+ dout(1) << ": need to read full object" << dendl;
+ diff.insert(0, m_remote_image_ctx->layout.object_size);
+ exists = true;
+ end_size = m_remote_image_ctx->layout.object_size;
+ clone_end_snap_id = end_remote_snap_id;
+ }
dout(20) << ": "
<< "start_remote_snap=" << start_remote_snap_id << ", "
// do not read past the sync point snapshot
clone_end_snap_id = remote_sync_pont_snap_id;
}
+ m_read_whole_object[clone_end_snap_id] = read_whole_object;
// object write/zero, or truncate
// NOTE: a single snapshot clone might represent multiple snapshots, but
bool m_retry_missing_read = false;
librados::snap_set_t m_retry_snap_set;
+ std::map<librados::snap_t, bool> m_read_whole_object;
SnapSyncOps m_snap_sync_ops;
SnapObjectStates m_snap_object_states;
goto close_ret;
}
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0)
+ goto close_fd;
+
+ r = rados.connect();
+ if (r < 0)
+ goto close_fd;
+
+ r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx);
+ if (r < 0)
+ goto close_fd;
+
+ r = rbd.open(io_ctx, image, cfg->imgname.c_str());
+ if (r < 0)
+ goto close_fd;
+
+ if (cfg->exclusive) {
+ r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
+ if (r < 0) {
+ cerr << "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r)
+ << std::endl;
+ goto close_fd;
+ }
+ }
+
+ if (!cfg->snapname.empty()) {
+ r = image.snap_set(cfg->snapname.c_str());
+ if (r < 0)
+ goto close_fd;
+ }
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ goto close_fd;
+
if (cfg->devpath.empty()) {
char dev[64];
bool try_load_module = true;
read_only = 1;
}
- r = rados.init_with_context(g_ceph_context);
- if (r < 0)
- goto close_nbd;
-
- r = rados.connect();
- if (r < 0)
- goto close_nbd;
-
- r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx);
- if (r < 0)
- goto close_nbd;
-
- r = rbd.open(io_ctx, image, cfg->imgname.c_str());
- if (r < 0)
- goto close_nbd;
-
- if (cfg->exclusive) {
- r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
- if (r < 0) {
- cerr << "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r)
- << std::endl;
- goto close_nbd;
- }
- }
-
- if (!cfg->snapname.empty()) {
- r = image.snap_set(cfg->snapname.c_str());
- if (r < 0)
- goto close_nbd;
- }
-
- r = image.stat(info, sizeof(info));
- if (r < 0)
- goto close_nbd;
-
r = ioctl(nbd, NBD_SET_BLKSIZE, RBD_NBD_BLKSIZE);
if (r < 0) {
r = -errno;
export NO_INDEX=--no-index
fi
-pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX --use-wheel --find-links=file://$(pwd)/wheelhouse 'tox >=1.9'
+pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX --find-links=file://$(pwd)/wheelhouse 'tox >=1.9'
if test -f requirements.txt ; then
- pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX --use-wheel --find-links=file://$(pwd)/wheelhouse -r requirements.txt
+ pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX --find-links=file://$(pwd)/wheelhouse -r requirements.txt
fi
ExecStart=/usr/bin/ceph-mgr -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
+RestartSec=10
StartLimitInterval=30min
StartLimitBurst=3