cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.2.0)
+set(VERSION 12.2.1)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
New commands "pg cancel-force-recovery" and "pg cancel-force-backfill"
restore default recovery/backfill priority of previously forced pgs.
+
+12.2.1
+------
+
+* Clusters will need to upgrade to 12.2.1 before upgrading to any
+ Mimic 13.y.z version (either a development release or an eventual
+ stable Mimic release).
+
+- *CephFS*:
+
+ * Limiting MDS cache via a memory limit is now supported using the new
+ mds_cache_memory_limit config option (1GB by default). A cache reservation
+ can also be specified using mds_cache_reservation as a percentage of the
+ limit (5% by default). Limits by inode count are still supported using
+ mds_cache_size. Setting mds_cache_size to 0 (the default) disables the
+ inode limit.
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
-pkgver=12.2.0
+pkgver=12.2.1
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
xmlstarlet
yasm
"
-source="ceph-12.2.0.tar.bz2"
+source="ceph-12.2.1.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
-builddir=$srcdir/ceph-12.2.0
+builddir=$srcdir/ceph-12.2.1
build() {
export CEPH_BUILD_VIRTUALENV=$builddir
# main package definition
#################################################################################
Name: ceph
-Version: 12.2.0
+Version: 12.2.1
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
Group: System/Filesystems
%endif
URL: http://ceph.com/
-Source0: http://ceph.com/download/ceph-12.2.0.tar.bz2
+Source0: http://ceph.com/download/ceph-12.2.1.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
%if 0%{?suse_version}
Group: System/Filesystems
%endif
+Requires: fuse
%description fuse
FUSE based client for Ceph distributed network file system
# common
#################################################################################
%prep
-%autosetup -p1 -n ceph-12.2.0
+%autosetup -p1 -n ceph-12.2.1
%build
%if 0%{with cephfs_java}
%{_libdir}/librbd_tp.so.*
%endif
-%post -n librbd1
-/sbin/ldconfig
-mkdir -p /usr/lib64/qemu/
-ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
+%post -n librbd1 -p /sbin/ldconfig
%postun -n librbd1 -p /sbin/ldconfig
%if 0%{?suse_version}
Group: System/Filesystems
%endif
+Requires: fuse
%description fuse
FUSE based client for Ceph distributed network file system
%{_libdir}/librbd_tp.so.*
%endif
-%post -n librbd1
-/sbin/ldconfig
-mkdir -p /usr/lib64/qemu/
-ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
+%post -n librbd1 -p /sbin/ldconfig
%postun -n librbd1 -p /sbin/ldconfig
+ceph (12.2.1-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Tue, 26 Sep 2017 16:27:06 +0000
+
ceph (12.2.0-1) stable; urgency=medium
* New upstream release
Message: "Client *name* failing to respond to cache pressure"
Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY
-Description: Clients maintain a metadata cache. Items (such as inodes)
-in the client cache are also pinned in the MDS cache, so when the MDS
-needs to shrink its cache (to stay within ``mds_cache_size``), it
-sends messages to clients to shrink their caches too. If the client
-is unresponsive or buggy, this can prevent the MDS from properly staying
-within its ``mds_cache_size`` and it may eventually run out of memory
-and crash. This message appears if a client has taken more than
-``mds_recall_state_timeout`` (default 60s) to comply.
+Description: Clients maintain a metadata cache. Items (such as inodes) in the
+client cache are also pinned in the MDS cache, so when the MDS needs to shrink
+its cache (to stay within ``mds_cache_size`` or ``mds_cache_memory_limit``), it
+sends messages to clients to shrink their caches too. If the client is
+unresponsive or buggy, this can prevent the MDS from properly staying within
+its cache limits and it may eventually run out of memory and crash. This
+message appears if a client has taken more than ``mds_recall_state_timeout``
+(default 60s) to comply.
Message: "Client *name* failing to advance its oldest client/flush tid"
Code: MDS_HEALTH_CLIENT_OLDEST_TID, MDS_HEALTH_CLIENT_OLDEST_TID_MANY
Message: "Too many inodes in cache"
Code: MDS_HEALTH_CACHE_OVERSIZED
-Description: The MDS is not succeeding in trimming its cache to comply
-with the limit set by the administrator. If the MDS cache becomes too large,
-the daemon may exhaust available memory and crash.
-This message appears if the actual cache size (in inodes) is at least 50%
-greater than ``mds_cache_size`` (default 100000).
-
+Description: The MDS is not succeeding in trimming its cache to comply with the
+limit set by the administrator. If the MDS cache becomes too large, the daemon
+may exhaust available memory and crash. By default, this message appears if
+the actual cache size (in inodes or memory) is at least 50% greater than
+``mds_cache_size`` (default 100000) or ``mds_cache_memory_limit`` (default
+1GB). Modify ``mds_health_cache_threshold`` to set the warning ratio.
:Type: 64-bit Integer Unsigned
:Default: ``1ULL << 40``
+``mds cache memory limit``
+
+:Description: The memory limit the MDS should enforce for its cache.
+ Administrators should use this instead of ``mds cache size``.
+:Type: 64-bit Integer Unsigned
+:Default: ``1073741824``
+
+``mds cache reservation``
+
+:Description: The cache reservation (memory or inodes) for the MDS cache to maintain.
+ Once the MDS begins dipping into its reservation, it will recall
+ client state until its cache size shrinks to restore the
+ reservation.
+:Type: Float
+:Default: ``0.05``
``mds cache size``
-:Description: The number of inodes to cache.
+:Description: The number of inodes to cache. A value of 0 indicates an
+ unlimited number. It is recommended to use
+ ``mds_cache_memory_limit`` to limit the amount of memory the MDS
+ cache uses.
:Type: 32-bit Integer
-:Default: ``100000``
-
+:Default: ``0``
``mds cache mid``
Usage::
- ceph osd new {<id>} {<uuid>} -i {<secrets.json>}
+ ceph osd new {<uuid>} {<id>} -i {<secrets.json>}
The secrets JSON file is optional but if provided, is expected to maintain
a form of the following format::
Ceph block devices are thin-provisioned, resizable and store data striped over
multiple OSDs in a Ceph cluster. Ceph block devices leverage
:abbr:`RADOS (Reliable Autonomic Distributed Object Store)` capabilities
-such as snapshotting, replication and consistency. Ceph's
-:abbr:`RADOS (Reliable Autonomic Distributed Object Store)` Block Devices (RBD)
+such as snapshotting, replication and consistency. Ceph's
+:abbr:`RADOS (Reliable Autonomic Distributed Object Store)` Block Devices (RBD)
interact with OSDs using kernel modules or the ``librbd`` library.
.. ditaa:: +------------------------+ +------------------------+
| OSDs | | Monitors |
+------------------------+ +------------------------+
-.. note:: Kernel modules can use Linux page caching. For ``librbd``-based
+.. note:: Kernel modules can use Linux page caching. For ``librbd``-based
applications, Ceph supports `RBD Caching`_.
Ceph's block devices deliver high performance with infinite scalability to
to operate the `Ceph RADOS Gateway`_, the `Ceph FS filesystem`_, and Ceph block
devices simultaneously.
-.. important:: To use Ceph Block Devices, you must have access to a running
+.. important:: To use Ceph Block Devices, you must have access to a running
Ceph cluster.
.. toctree::
Commands <rados-rbd-cmds>
Kernel Modules <rbd-ko>
Snapshots<rbd-snapshot>
- Mirroring <rbd-mirroring>
+ Mirroring <rbd-mirroring>
+ iSCSI Gateway <iscsi-overview>
QEMU <qemu-rbd>
libvirt <libvirt>
Cache Settings <rbd-config-ref/>
APIs <api/index>
-
-
-
.. _RBD Caching: ../rbd-config-ref/
.. _kernel modules: ../rbd-ko/
.. _QEMU: ../qemu-rbd/
--- /dev/null
+----------------------------------
+The iSCSI Initiator for VMware ESX
+----------------------------------
+
+**Prerequisite:**
+
+- VMware ESX 6.0 or later
+
+**iSCSI Discovery and Multipath Device Setup:**
+
+#. From vSphere, open the Storage Adapters, on the Configuration tab. Right click
+ on the iSCSI Software Adapter and select Properties.
+
+#. In the General tab click the "Advanced" button and in the "Advanced Settings"
+ set RecoveryTimeout to 25.
+
+#. If CHAP was setup on the iSCSI gateway, in the General tab click the "CHAP…"
+ button. If CHAP is not being used, skip to step 4.
+
+#. On the CHAP Credentials windows, select “Do not use CHAP unless required by target”,
+ and enter the "Name" and "Secret" values used on the initial setup for the iSCSI
+ gateway, then click on the "OK" button.
+
+#. On the Dynamic Discovery tab, click the "Add…" button, and enter the IP address
+ and port of one of the iSCSI target portals. Click on the "OK" button.
+
+#. Close the iSCSI Initiator Properties window. A prompt will ask to rescan the
+ iSCSI software adapter. Select Yes.
+
+#. In the Details pane, the LUN on the iSCSI target will be displayed. Right click
+ on a device and select "Manage Paths".
+
+#. On the Manage Paths window, select “Most Recently Used (VMware)” for the policy
+ path selection. Close and repeat for the other disks.
+
+Now the disks can be used for datastores.
--- /dev/null
+------------------------------------------------
+The iSCSI Initiator for Red Hat Enterprise Linux
+------------------------------------------------
+
+**Prerequisite:**
+
+- Package ``iscsi-initiator-utils-6.2.0.873-35`` or newer must be
+ installed
+
+- Package ``device-mapper-multipath-0.4.9-99`` or newer must be
+ installed
+
+**Installing:**
+
+Install the iSCSI initiator and multipath tools:
+
+ ::
+
+ # yum install iscsi-initiator-utils
+ # yum install device-mapper-multipath
+
+**Configuring:**
+
+#. Create the default ``/etc/multipath.conf`` file and enable the
+   ``multipathd`` service:
+
+ ::
+
+ # mpathconf --enable --with_multipathd y
+
+#. Add the following to ``/etc/multipath.conf`` file:
+
+ ::
+
+ devices {
+ device {
+ vendor "LIO-ORG"
+ hardware_handler "1 alua"
+ path_grouping_policy "failover"
+ path_selector "queue-length 0"
+ failback 60
+ path_checker tur
+ prio alua
+ prio_args exclusive_pref_bit
+                fast_io_fail_tmo 25
+ no_path_retry queue
+ }
+ }
+
+#. Restart the ``multipathd`` service:
+
+ ::
+
+ # systemctl reload multipathd
+
+**iSCSI Discovery and Setup:**
+
+#. Discover the target portals:
+
+ ::
+
+      # iscsiadm -m discovery -t st -p 192.168.56.101
+ 192.168.56.101:3260,1 iqn.2003-01.org.linux-iscsi.rheln1
+ 192.168.56.102:3260,2 iqn.2003-01.org.linux-iscsi.rheln1
+
+#. Login to target:
+
+ ::
+
+ # iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -l
+
+**Multipath IO Setup:**
+
+The multipath daemon (``multipathd``) will set up devices automatically
+based on the ``multipath.conf`` settings. Running the ``multipath``
+command shows devices set up in a failover configuration with a priority
+group for each path.
+
+::
+
+ # multipath -ll
+ mpathbt (360014059ca317516a69465c883a29603) dm-1 LIO-ORG ,IBLOCK
+ size=1.0G features='0' hwhandler='1 alua' wp=rw
+ |-+- policy='queue-length 0' prio=50 status=active
+ | `- 28:0:0:1 sde 8:64 active ready running
+ `-+- policy='queue-length 0' prio=10 status=enabled
+ `- 29:0:0:1 sdc 8:32 active ready running
+
+You should now be able to use the RBD image like you would a normal
+multipath’d iSCSI disk.
--- /dev/null
+-----------------------------------------
+The iSCSI Initiator for Microsoft Windows
+-----------------------------------------
+
+**Prerequisite:**
+
+- Microsoft Windows 2016
+
+**iSCSI Initiator, Discovery and Setup:**
+
+#. Install the iSCSI initiator driver and MPIO tools.
+
+#. Launch the MPIO program, click on the “Discover Multi-Paths” tab and select “Add
+ support for iSCSI devices”.
+
+#. On the iSCSI Initiator Properties window, on the "Discovery" tab, add a target
+ portal. Enter the IP address or DNS name and Port of the Ceph iSCSI gateway.
+
+#. On the “Targets” tab, select the target and click on “Connect”.
+
+#. On the “Connect To Target” window, select the “Enable multi-path” option, and
+ click the “Advanced” button.
+
+#. Under the "Connect using" section, select a “Target portal IP”. Select the
+ “Enable CHAP login on” and enter the "Name" and "Target secret" values from the
+ Ceph iSCSI Ansible client credentials section, and click OK.
+
+#. Repeat steps 5 and 6 for each target portal defined when setting up
+ the iSCSI gateway.
+
+**Multipath IO Setup:**
+
+Configuring the MPIO load balancing policy, setting the timeout and
+retry options are done using PowerShell with the ``mpclaim`` command. The
+rest is done in the MPIO tool.
+
+.. note::
+ It is recommended to increase the ``PDORemovePeriod`` option to 120
+ seconds from PowerShell. This value might need to be adjusted based
+ on the application. When all paths are down, and 120 seconds
+ expires, the operating system will start failing IO requests.
+
+::
+
+ Set-MPIOSetting -NewPDORemovePeriod 120
+
+::
+
+ mpclaim.exe -l -m 1
+
+::
+
+ mpclaim -s -m
+ MSDSM-wide Load Balance Policy: Fail Over Only
+
+#. Using the MPIO tool, from the “Targets” tab, click on the
+ “Devices...” button.
+
+#. From the Devices window, select a disk and click the
+ “MPIO...” button.
+
+#. On the "Device Details" window the paths to each target portal are
+   displayed. If using the ``ceph-ansible`` setup method, the
+   iSCSI gateway will use ALUA to tell the iSCSI initiator which path
+   and iSCSI gateway should be used as the primary path. The Load
+   Balancing Policy “Fail Over Only” must be selected.
+
+::
+
+ mpclaim -s -d $MPIO_DISK_ID
+
+.. note::
+ For the ``ceph-ansible`` setup method, there will be one
+ Active/Optimized path which is the path to the iSCSI gateway node
+ that owns the LUN, and there will be an Active/Unoptimized path for
+ each other iSCSI gateway node.
+
+**Tuning:**
+
+Consider using the following registry settings:
+
+- Windows Disk Timeout
+
+ ::
+
+ HKEY_LOCAL_MACHINE\System\CurrentControlSet\Services\Disk
+
+ ::
+
+ TimeOutValue = 65
+
+- Microsoft iSCSI Initiator Driver
+
+ ::
+
+ HKEY_LOCAL_MACHINE\\SYSTEM\CurrentControlSet\Control\Class\{4D36E97B-E325-11CE-BFC1-08002BE10318}\<Instance_Number>\Parameters
+
+ ::
+ LinkDownTime = 25
+ SRBTimeoutDelta = 15
--- /dev/null
+--------------------------------
+Configuring the iSCSI Initiators
+--------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ The iSCSI Initiator for Red Hat Enterprise Linux <iscsi-initiator-rhel>
+ The iSCSI Initiator for Microsoft Windows <iscsi-initiator-win>
+ The iSCSI Initiator for VMware ESX <iscsi-initiator-esx>
--- /dev/null
+-----------------------------
+Monitoring the iSCSI gateways
+-----------------------------
+
+Ceph provides an additional tool for iSCSI gateway environments
+to monitor performance of exported RADOS Block Device (RBD) images.
+
+The ``gwtop`` tool is a ``top``-like tool that displays aggregated
+performance metrics of RBD images that are exported to clients over
+iSCSI. The metrics are sourced from a Performance Metrics Domain Agent
+(PMDA). Information from the Linux-IO target (LIO) PMDA is used to list
+each exported RBD image with the connected client and its associated I/O
+metrics.
+
+**Requirements:**
+
+- A running Ceph iSCSI gateway
+
+**Installing:**
+
+#. As ``root``, install the ``ceph-iscsi-tools`` package on each iSCSI
+ gateway node:
+
+ ::
+
+ # yum install ceph-iscsi-tools
+
+#. As ``root``, install the performance co-pilot package on each iSCSI
+ gateway node:
+
+ ::
+
+ # yum install pcp
+
+#. As ``root``, install the LIO PMDA package on each iSCSI gateway node:
+
+ ::
+
+ # yum install pcp-pmda-lio
+
+#. As ``root``, enable and start the performance co-pilot service on
+ each iSCSI gateway node:
+
+ ::
+
+ # systemctl enable pmcd
+ # systemctl start pmcd
+
+#. As ``root``, register the ``pcp-pmda-lio`` agent:
+
+ ::
+
+ cd /var/lib/pcp/pmdas/lio
+ ./Install
+
+By default, ``gwtop`` assumes the iSCSI gateway configuration object is
+stored in a RADOS object called ``gateway.conf`` in the ``rbd`` pool.
+This configuration defines the iSCSI gateways to contact for gathering
+the performance statistics. This can be overridden by using either the
+``-g`` or ``-c`` flags. See ``gwtop --help`` for more details.
+
+The LIO configuration determines which type of performance statistics to
+extract from performance co-pilot. When ``gwtop`` starts it looks at the
+LIO configuration, and if it finds user-space disks, then ``gwtop``
+selects the LIO collector automatically.
+
+**Example ``gwtop`` Outputs**
+
+For kernel RBD-based devices:
+
+::
+
+ gwtop 2/2 Gateways CPU% MIN: 4 MAX: 5 Network Total In: 2M Out: 3M 10:20:09
+ Capacity: 8G Disks: 8 IOPS: 500 Clients: 1 Ceph: HEALTH_OK OSDs: 3
+ Pool.Image Src Device Size r/s w/s rMB/s wMB/s await r_await w_await Client
+ iscsi.t1703 rbd0 500M 0 0 0.00 0.00 0.00 0.00 0.00
+ iscsi.testme1 rbd5 500M 0 0 0.00 0.00 0.00 0.00 0.00
+ iscsi.testme2 rbd2 500M 0 0 0.00 0.00 0.00 0.00 0.00
+ iscsi.testme3 rbd3 500M 0 0 0.00 0.00 0.00 0.00 0.00
+ iscsi.testme5 rbd1 500M 0 0 0.00 0.00 0.00 0.00 0.00
+ rbd.myhost_1 T rbd4 4G 500 0 1.95 0.00 2.37 2.37 0.00 rh460p(CON)
+ rbd.test_2 rbd6 1G 0 0 0.00 0.00 0.00 0.00 0.00
+ rbd.testme rbd7 500M 0 0 0.00 0.00 0.00 0.00 0.00
+
+For user backed storage (TCMU) devices:
+
+::
+
+ gwtop 2/2 Gateways CPU% MIN: 4 MAX: 5 Network Total In: 2M Out: 3M 10:20:00
+ Capacity: 8G Disks: 8 IOPS: 503 Clients: 1 Ceph: HEALTH_OK OSDs: 3
+ Pool.Image Src Size iops rMB/s wMB/s Client
+ iscsi.t1703 500M 0 0.00 0.00
+ iscsi.testme1 500M 0 0.00 0.00
+ iscsi.testme2 500M 0 0.00 0.00
+ iscsi.testme3 500M 0 0.00 0.00
+ iscsi.testme5 500M 0 0.00 0.00
+ rbd.myhost_1 T 4G 504 1.95 0.00 rh460p(CON)
+ rbd.test_2 1G 0 0.00 0.00
+ rbd.testme 500M 0 0.00 0.00
+
+In the *Client* column, ``(CON)`` means the iSCSI initiator (client) is
+currently logged into the iSCSI gateway. If ``-multi-`` is displayed,
+then multiple clients are mapped to the single RBD image.
--- /dev/null
+==================
+Ceph iSCSI Gateway
+==================
+
+The iSCSI gateway integrates Ceph Storage with the iSCSI standard to provide
+a Highly Available (HA) iSCSI target that exports RADOS Block Device (RBD) images
+as SCSI disks. The iSCSI protocol allows clients (initiators) to send SCSI commands
+to SCSI storage devices (targets) over a TCP/IP network. This allows for heterogeneous
+clients, such as Microsoft Windows, to access the Ceph Storage cluster.
+
+Each iSCSI gateway runs the Linux IO target kernel subsystem (LIO) to provide the
+iSCSI protocol support. LIO utilizes a userspace passthrough (TCMU) to interact
+with Ceph's librbd library and expose RBD images to iSCSI clients. With Ceph’s
+iSCSI gateway you can effectively run a fully integrated block-storage
+infrastructure with all the features and benefits of a conventional Storage Area
+Network (SAN).
+
+.. ditaa::
+ Cluster Network
+ +-------------------------------------------+
+ | | | |
+ +-------+ +-------+ +-------+ +-------+
+ | | | | | | | |
+ | OSD 1 | | OSD 2 | | OSD 3 | | OSD N |
+ | {s}| | {s}| | {s}| | {s}|
+ +-------+ +-------+ +-------+ +-------+
+ | | | |
+ +--------->| | +---------+ | |<---------+
+ : | | | RBD | | | :
+ | +----------------| Image |----------------+ |
+ | Public Network | {d} | |
+ | +---------+ |
+ | |
+ | +-------------------+ |
+ | +--------------+ | iSCSI Initators | +--------------+ |
+ | | iSCSI GW | | +-----------+ | | iSCSI GW | |
+ +-->| RBD Module |<--+ | Various | +-->| RBD Module |<--+
+ | | | | Operating | | | |
+ +--------------+ | | Systems | | +--------------+
+ | +-----------+ |
+ +-------------------+
+
+
+.. toctree::
+ :maxdepth: 1
+
+ Requirements <iscsi-requirements>
+ Configuring the iSCSI Target <iscsi-targets>
+ Configuring the iSCSI Initiator <iscsi-initiators>
+ Monitoring the iSCSI Gateways <iscsi-monitoring>
--- /dev/null
+==========================
+iSCSI Gateway Requirements
+==========================
+
+To implement the Ceph iSCSI gateway there are a few requirements. It is recommended
+to use two to four iSCSI gateway nodes for a highly available Ceph iSCSI gateway
+solution.
+
+For hardware recommendations, see the `Hardware Recommendation page <http://docs.ceph.com/docs/master/start/hardware-recommendations/>`_
+for more details.
+
+.. note::
+   On the iSCSI gateway nodes, the memory footprint of the RBD images
+   can grow to a large size. Plan memory requirements accordingly based
+   on the number of RBD images mapped.
+
+There are no specific iSCSI gateway options for the Ceph Monitors or
+OSDs, but it is important to lower the default timers for detecting
+down OSDs to reduce the possibility of initiator timeouts. The following
+configuration options are suggested for each OSD node in the storage
+cluster::
+
+ [osd]
+ osd heartbeat grace = 20
+ osd heartbeat interval = 5
+
+- Online Updating Using the Ceph Monitor
+
+ ::
+
+ ceph tell <daemon_type>.<id> injectargs '--<parameter_name> <new_value>'
+
+ ::
+
+ ceph tell osd.0 injectargs '--osd_heartbeat_grace 20'
+ ceph tell osd.0 injectargs '--osd_heartbeat_interval 5'
+
+- Online Updating on the OSD Node
+
+ ::
+
+ ceph daemon <daemon_type>.<id> config set osd_client_watch_timeout 15
+
+ ::
+
+ ceph daemon osd.0 config set osd_heartbeat_grace 20
+ ceph daemon osd.0 config set osd_heartbeat_interval 5
+
+For more details on setting Ceph's configuration options, see the `Configuration page <http://docs.ceph.com/docs/master/rados/configuration/>`_.
--- /dev/null
+==========================================
+Configuring the iSCSI Target using Ansible
+==========================================
+
+The Ceph iSCSI gateway is the iSCSI target node and also a Ceph client
+node. The Ceph iSCSI gateway can be a standalone node or be colocated on
+a Ceph Object Store Disk (OSD) node. Completing the following steps will
+install and configure the Ceph iSCSI gateway for basic operation.
+
+**Requirements:**
+
+- A running Ceph Luminous (12.2.x) cluster or newer
+
+- RHEL/CentOS 7.4; or Linux kernel v4.14 or newer
+
+- The ``ceph-iscsi-config`` package installed on all the iSCSI gateway nodes
+
+**Installing:**
+
+#. On the Ansible installer node, which could be either the administration node
+ or a dedicated deployment node, perform the following steps:
+
+ #. As ``root``, install the ``ceph-ansible`` package:
+
+ ::
+
+ # yum install ceph-ansible
+
+ #. Add an entry in ``/etc/ansible/hosts`` file for the gateway group:
+
+ ::
+
+ [ceph-iscsi-gw]
+ ceph-igw-1
+ ceph-igw-2
+
+.. note::
+ If co-locating the iSCSI gateway with an OSD node, then add the OSD node to the
+ ``[ceph-iscsi-gw]`` section.
+
+**Configuring:**
+
+The ``ceph-ansible`` package places a file in the ``/usr/share/ceph-ansible/group_vars/``
+directory called ``ceph-iscsi-gw.sample``. Create a copy of this sample file named
+``ceph-iscsi-gw.yml``. Review the following Ansible variables and descriptions,
+and update accordingly.
+
++--------------------------------------+--------------------------------------+
+| Variable | Meaning/Purpose |
++======================================+======================================+
+| ``seed_monitor`` | Each gateway needs access to the |
+| | ceph cluster for rados and rbd |
+| | calls. This means the iSCSI gateway |
+| | must have an appropriate |
+| | ``/etc/ceph/`` directory defined. |
+| | The ``seed_monitor`` host is used to |
+| | populate the iSCSI gateway’s |
+| | ``/etc/ceph/`` directory. |
++--------------------------------------+--------------------------------------+
+| ``cluster_name`` | Define a custom storage cluster |
+| | name. |
++--------------------------------------+--------------------------------------+
+| ``gateway_keyring`` | Define a custom keyring name. |
++--------------------------------------+--------------------------------------+
+| ``deploy_settings`` | If set to ``true``, then deploy the |
+| | settings when the playbook is ran. |
++--------------------------------------+--------------------------------------+
+| ``perform_system_checks`` | This is a boolean value that checks |
+| | for multipath and lvm configuration |
+| | settings on each gateway. It must be |
+| | set to true for at least the first |
+| | run to ensure multipathd and lvm are |
+| | configured properly. |
++--------------------------------------+--------------------------------------+
+| ``gateway_iqn`` | This is the iSCSI IQN that all the |
+| | gateways will expose to clients. |
+| | This means each client will see the |
+| | gateway group as a single subsystem. |
++--------------------------------------+--------------------------------------+
+| ``gateway_ip_list`` | The ip list defines the IP addresses |
+| | that will be used on the front end |
+| | network for iSCSI traffic. This IP |
+| | will be bound to the active target |
+| | portal group on each node, and is |
+| | the access point for iSCSI traffic. |
+| | Each IP should correspond to an IP |
+| | available on the hosts defined in |
+| | the ``ceph-iscsi-gw`` host group in |
+| | ``/etc/ansible/hosts``. |
++--------------------------------------+--------------------------------------+
+| ``rbd_devices`` | This section defines the RBD images |
+| | that will be controlled and managed |
+| | within the iSCSI gateway |
+| | configuration. Parameters like |
+| | ``pool`` and ``image`` are self |
+| | explanatory. Here are the other |
+| | parameters: ``size`` = This defines |
+| | the size of the RBD. You may |
+| | increase the size later, by simply |
+| | changing this value, but shrinking |
+| | the size of an RBD is not supported |
+| | and is ignored. ``host`` = This is |
+| | the iSCSI gateway host name that |
+| | will be responsible for the rbd |
+| | allocation/resize. Every defined |
+| | ``rbd_device`` entry must have a |
+| | host assigned. ``state`` = This is |
+| | typical Ansible syntax for whether |
+| | the resource should be defined or |
+| | removed. A request with a state of |
+| | absent will first be checked to |
+| | ensure the rbd is not mapped to any |
+| | client. If the RBD is unallocated, |
+| | it will be removed from the iSCSI |
+| | gateway and deleted from the |
+| | configuration. |
++--------------------------------------+--------------------------------------+
+| ``client_connections`` | This section defines the iSCSI |
+| | client connection details together |
+| | with the LUN (RBD image) masking. |
+| | Currently only CHAP is supported as |
+| | an authentication mechanism. Each |
+| | connection defines an ``image_list`` |
+| | which is a comma separated list of |
+| | the form |
+| | ``pool.rbd_image[,pool.rbd_image]``. |
+| | RBD images can be added and removed |
+| | from this list, to change the client |
+| | masking. Note that there are no |
+| | checks done to limit RBD sharing |
+| | across client connections. |
++--------------------------------------+--------------------------------------+
+
+.. note::
+ When using the ``gateway_iqn`` variable, and for Red Hat Enterprise Linux
+ clients, installing the ``iscsi-initiator-utils`` package is required for
+ retrieving the gateway’s IQN name. The iSCSI initiator name is located in the
+ ``/etc/iscsi/initiatorname.iscsi`` file.
+
+**Deploying:**
+
+On the Ansible installer node, perform the following steps.
+
+#. As ``root``, execute the Ansible playbook:
+
+ ::
+
+ # cd /usr/share/ceph-ansible
+ # ansible-playbook ceph-iscsi-gw.yml
+
+ .. note::
+ The Ansible playbook will handle RPM dependencies, RBD creation
+ and Linux IO configuration.
+
+#. Verify the configuration from an iSCSI gateway node:
+
+ ::
+
+ # gwcli ls
+
+ .. note::
+ For more information on using the ``gwcli`` command to install and configure
+ a Ceph iSCSI gateaway, see the `Configuring the iSCSI Target using the Command Line Interface`_
+ section.
+
+ .. important::
+ Attempting to use the ``targetcli`` tool to change the configuration will
+ result in the following issues, such as ALUA misconfiguration and path failover
+ problems. There is the potential to corrupt data, to have mismatched
+ configuration across iSCSI gateways, and to have mismatched WWN information,
+ which will lead to client multipath problems.
+
+**Service Management:**
+
+The ``ceph-iscsi-config`` package installs the configuration management
+logic and a Systemd service called ``rbd-target-gw``. When the Systemd
+service is enabled, the ``rbd-target-gw`` will start at boot time and
+will restore the Linux IO state. The Ansible playbook disables the
+target service during the deployment. Below are the outcomes when
+interacting with the ``rbd-target-gw`` Systemd service.
+
+::
+
+ # systemctl <start|stop|restart|reload> rbd-target-gw
+
+- ``reload``
+
+ A reload request will force ``rbd-target-gw`` to reread the
+ configuration and apply it to the current running environment. This
+ is normally not required, since changes are deployed in parallel from
+ Ansible to all iSCSI gateway nodes
+
+- ``stop``
+
+ A stop request will close the gateway’s portal interfaces, dropping
+ connections to clients and wipe the current LIO configuration from
+ the kernel. This returns the iSCSI gateway to a clean state. When
+ clients are disconnected, active I/O is rescheduled to the other
+ iSCSI gateways by the client side multipathing layer.
+
+**Administration:**
+
+Within the ``/usr/share/ceph-ansible/group_vars/ceph-iscsi-gw`` file
+there are a number of operational workflows that the Ansible playbook
+supports.
+
+.. warning::
+ Before removing RBD images from the iSCSI gateway configuration,
+ follow the standard procedures for removing a storage device from
+ the operating system.
+
++--------------------------------------+--------------------------------------+
+| I want to… | Update the ``ceph-iscsi-gw`` file |
+| | by… |
++======================================+======================================+
+| Add more RBD images | Adding another entry to the |
+| | ``rbd_devices`` section with the new |
+| | image. |
++--------------------------------------+--------------------------------------+
+| Resize an existing RBD image | Updating the size parameter within |
+| | the ``rbd_devices`` section. Client |
+| | side actions are required to pick up |
+| | the new size of the disk. |
++--------------------------------------+--------------------------------------+
+| Add a client | Adding an entry to the |
+| | ``client_connections`` section. |
++--------------------------------------+--------------------------------------+
+| Add another RBD to a client | Adding the relevant RBD |
+| | ``pool.image`` name to the |
+| | ``image_list`` variable for the |
+| | client. |
++--------------------------------------+--------------------------------------+
+| Remove an RBD from a client | Removing the RBD ``pool.image`` name |
+| | from the clients ``image_list`` |
+| | variable. |
++--------------------------------------+--------------------------------------+
+| Remove an RBD from the system | Changing the RBD entry state |
+| | variable to ``absent``. The RBD |
+| | image must be unallocated from the |
+| | operating system first for this to |
+| | succeed. |
++--------------------------------------+--------------------------------------+
+| Change the clients CHAP credentials | Updating the relevant CHAP details |
+| | in ``client_connections``. This will |
+| | need to be coordinated with the |
+| | clients. For example, the client |
+| | issues an iSCSI logout, the |
+| | credentials are changed by the |
+| | Ansible playbook, the credentials |
+| | are changed at the client, then the |
+| | client performs an iSCSI login. |
++--------------------------------------+--------------------------------------+
+| Remove a client | Updating the relevant |
+| | ``client_connections`` item with a |
+| | state of ``absent``. Once the |
+| | Ansible playbook is ran, the client |
+| | will be purged from the system, but |
+| | the disks will remain defined to |
+| | Linux IO for potential reuse. |
++--------------------------------------+--------------------------------------+
+
+Once a change has been made, rerun the Ansible playbook to apply the
+change across the iSCSI gateway nodes.
+
+::
+
+ # ansible-playbook ceph-iscsi-gw.yml
+
+**Removing the Configuration:**
+
+The ``ceph-ansible`` package provides an Ansible playbook to
+remove the iSCSI gateway configuration and related RBD images. The
+Ansible playbook is ``/usr/share/ceph-ansible/purge_gateways.yml``. When
+this Ansible playbook is run, you will be prompted for the type of
+purge to perform:
+
+*lio* :
+
+In this mode the LIO configuration is purged on all iSCSI gateways that
+are defined. Disks that were created are left untouched within the Ceph
+storage cluster.
+
+*all* :
+
+When ``all`` is chosen, the LIO configuration is removed together with
+**all** RBD images that were defined within the iSCSI gateway
+environment; other unrelated RBD images will not be removed. Ensure the
+correct mode is chosen, because this operation will delete data.
+
+.. warning::
+ A purge operation is destructive action against your iSCSI gateway
+ environment.
+
+.. warning::
+ A purge operation will fail, if RBD images have snapshots or clones
+ and are exported through the Ceph iSCSI gateway.
+
+::
+
+ [root@rh7-iscsi-client ceph-ansible]# ansible-playbook purge_gateways.yml
+ Which configuration elements should be purged? (all, lio or abort) [abort]: all
+
+
+ PLAY [Confirm removal of the iSCSI gateway configuration] *********************
+
+
+ GATHERING FACTS ***************************************************************
+ ok: [localhost]
+
+
+ TASK: [Exit playbook if user aborted the purge] *******************************
+ skipping: [localhost]
+
+
+ TASK: [set_fact ] *************************************************************
+ ok: [localhost]
+
+
+ PLAY [Removing the gateway configuration] *************************************
+
+
+ GATHERING FACTS ***************************************************************
+ ok: [ceph-igw-1]
+ ok: [ceph-igw-2]
+
+
+ TASK: [igw_purge | purging the gateway configuration] *************************
+ changed: [ceph-igw-1]
+ changed: [ceph-igw-2]
+
+
+ TASK: [igw_purge | deleting configured rbd devices] ***************************
+ changed: [ceph-igw-1]
+ changed: [ceph-igw-2]
+
+
+ PLAY RECAP ********************************************************************
+ ceph-igw-1 : ok=3 changed=2 unreachable=0 failed=0
+ ceph-igw-2 : ok=3 changed=2 unreachable=0 failed=0
+ localhost : ok=2 changed=0 unreachable=0 failed=0
+
+
+.. _Configuring the iSCSI Target using the Command Line Interface: ../iscsi-target-cli
--- /dev/null
+=============================================================
+Configuring the iSCSI Target using the Command Line Interface
+=============================================================
+
+The Ceph iSCSI gateway is the iSCSI target node and also a Ceph client
+node. The Ceph iSCSI gateway can be a standalone node or be colocated on
+a Ceph Object Store Disk (OSD) node. Completing the following steps will
+install and configure the Ceph iSCSI gateway for basic operation.
+
+**Requirements:**
+
+- A running Ceph Luminous or later storage cluster
+
+- RHEL/CentOS 7.4; or Linux kernel v4.14 or newer
+
+- The following packages must be installed from your Linux distribution's software repository:
+
+ - ``targetcli-2.1.fb47`` or newer package
+
+ - ``python-rtslib-2.1.fb64`` or newer package
+
+ - ``tcmu-runner-1.3.0`` or newer package
+
+ - ``ceph-iscsi-config-2.3`` or newer package
+
+ - ``ceph-iscsi-cli-2.5`` or newer package
+
+ .. important::
+ If previous versions of these packages exist, then they must
+ be removed first before installing the newer versions.
+
+Do the following steps on the Ceph iSCSI gateway node before proceeding
+to the *Installing* section:
+
+#. If the Ceph iSCSI gateway is not colocated on an OSD node, then copy
+ the Ceph configuration files, located in ``/etc/ceph/``, from a
+ running Ceph node in the storage cluster to the iSCSI Gateway node.
+ The Ceph configuration files must exist on the iSCSI gateway node
+ under ``/etc/ceph/``.
+
+#. Install and configure the `Ceph Command-line
+ Interface <http://docs.ceph.com/docs/master/start/quick-rbd/#install-ceph>`_
+
+#. If needed, open TCP ports 3260 and 5000 on the firewall.
+
+#. Create a new or use an existing RADOS Block Device (RBD).
+
+**Installing:**
+
+#. As ``root``, on all iSCSI gateway nodes, install the
+ ``ceph-iscsi-cli`` package:
+
+ ::
+
+ # yum install ceph-iscsi-cli
+
+#. As ``root``, on all iSCSI gateway nodes, install the ``tcmu-runner``
+ package:
+
+ ::
+
+ # yum install tcmu-runner
+
+#. As ``root``, on an iSCSI gateway node, create a file named
+ ``iscsi-gateway.cfg`` in the ``/etc/ceph/`` directory:
+
+ ::
+
+ # touch /etc/ceph/iscsi-gateway.cfg
+
+ #. Edit the ``iscsi-gateway.cfg`` file and add the following lines:
+
+ ::
+
+ [config]
+ # Name of the Ceph storage cluster. A suitable Ceph configuration file allowing
+ # access to the Ceph storage cluster from the gateway node is required, if not
+ # colocated on an OSD node.
+ cluster_name = ceph
+
+ # Place a copy of the ceph cluster's admin keyring in the gateway's /etc/ceph
+ # drectory and reference the filename here
+ gateway_keyring = ceph.client.admin.keyring
+
+
+ # API settings.
+ # The API supports a number of options that allow you to tailor it to your
+ # local environment. If you want to run the API under https, you will need to
+ # create cert/key files that are compatible for each iSCSI gateway node, that is
+ # not locked to a specific node. SSL cert and key files *must* be called
+ # 'iscsi-gateway.crt' and 'iscsi-gateway.key' and placed in the '/etc/ceph/' directory
+ # on *each* gateway node. With the SSL files in place, you can use 'api_secure = true'
+ # to switch to https mode.
+
+ # To support the API, the bear minimum settings are:
+ api_secure = false
+
+ # Additional API configuration options are as follows, defaults shown.
+ # api_user = admin
+ # api_password = admin
+ # api_port = 5001
+ # trusted_ip_list = 192.168.0.10,192.168.0.11
+
+ .. important::
+ The ``iscsi-gateway.cfg`` file must be identical on all iSCSI gateway nodes.
+
+ #. As ``root``, copy the ``iscsi-gateway.cfg`` file to all iSCSI
+ gateway nodes.
+
+#. As ``root``, on all iSCSI gateway nodes, enable and start the API
+ service:
+
+ ::
+
+ # systemctl enable rbd-target-api
+ # systemctl start rbd-target-api
+
+**Configuring:**
+
+#. As ``root``, on an iSCSI gateway node, start the iSCSI gateway
+ command-line interface:
+
+ ::
+
+ # gwcli
+
+#. Creating the iSCSI gateways:
+
+ ::
+
+ >/iscsi-target create iqn.2003-01.com.redhat.iscsi-gw:<target_name>
+ > goto gateways
+ > create <iscsi_gw_name> <IP_addr_of_gw>
+ > create <iscsi_gw_name> <IP_addr_of_gw>
+
+#. Adding a RADOS Block Device (RBD):
+
+ ::
+
+ > cd /iscsi-target/iqn.2003-01.com.redhat.iscsi-gw:<target_name>/disks/
+ >/disks/ create pool=<pool_name> image=<image_name> size=<image_size>m|g|t
+
+#. Creating a client:
+
+ ::
+
+ > goto hosts
+ > create iqn.1994-05.com.redhat:<client_name>
+ > auth chap=<user_name>/<password> | nochap
+
+
+ .. warning::
+ CHAP must always be configured. Without CHAP, the target will
+ reject any login requests.
+
+#. Adding disks to a client:
+
+ ::
+
+ >/iscsi-target..eph-igw/hosts> cd iqn.1994-05.com.redhat:<client_name>
+ > disk add <pool_name>.<image_name>
+
+The next step is to configure the iSCSI initiators.
--- /dev/null
+=============
+iSCSI Targets
+=============
+
+Traditionally, block-level access to a Ceph storage cluster has been
+limited to QEMU and ``librbd``, which is a key enabler for adoption
+within OpenStack environments. Starting with the Ceph Luminous release,
+block-level access is expanding to offer standard iSCSI support allowing
+wider platform usage, and potentially opening new use cases.
+
+- RHEL/CentOS 7.4; or Linux kernel v4.14 or newer
+
+- A working Ceph Storage cluster, deployed with ``ceph-ansible`` or using the command-line interface
+
+- iSCSI gateways nodes, which can either be colocated with OSD nodes or on dedicated nodes
+
+- Separate network subnets for iSCSI front-end traffic and Ceph back-end traffic
+
+Ansible and the command-line interface are the available deployment
+methods for installing and configuring the Ceph
+iSCSI gateway:
+
+.. toctree::
+ :maxdepth: 1
+
+ Using Ansible <iscsi-target-ansible>
+ Using the Command Line Interface <iscsi-target-cli>
--- /dev/null
+overrides:
+ thrashosds:
+ bdev_inject_crash: 2
+ bdev_inject_crash_probability: .5
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore fsck on mount: true
+ bluestore allocator: bitmap
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+# this doesn't work with failures bc the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
+ ceph-deploy:
+ fs: xfs
+ bluestore: yes
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+
osd:
osd objectstore: bluestore
bluestore block size: 96636764160
- debug bluestore: 30
- debug bdev: 20
+ debug bluestore: 20
debug bluefs: 20
debug rocksdb: 10
bluestore compression mode: aggressive
osd:
osd objectstore: bluestore
bluestore block size: 96636764160
- debug bluestore: 30
- debug bdev: 20
+ debug bluestore: 20
debug bluefs: 20
debug rocksdb: 10
bluestore fsck on mount: true
osd:
osd objectstore: bluestore
bluestore block size: 96636764160
- debug bluestore: 30
- debug bdev: 20
+ debug bluestore: 20
debug bluefs: 20
debug rocksdb: 10
bluestore fsck on mount: true
PROGNAME=$(basename $0)
-# xfstests is downloaded from this git repository and then built.
-# XFSTESTS_REPO="git://oss.sgi.com/xfs/cmds/xfstests.git"
-XFSTESTS_REPO="git://git.ceph.com/xfstests.git"
-XFSTESTS_VERSION="facff609afd6a2ca557c2b679e088982026aa188"
-XFSPROGS_REPO="git://oss.sgi.com/xfs/cmds/xfsprogs"
-XFSPROGS_VERSION="v3.2.2"
-XFSDUMP_REPO="git://oss.sgi.com/xfs/cmds/xfsdump"
-XFSDUMP_VERSION="v3.1.4"
-
# Default command line option values
COUNT="1"
EXPUNGE_FILE=""
DO_RANDOMIZE="" # false
-FS_TYPE="xfs"
+FSTYP="xfs"
SCRATCH_DEV="" # MUST BE SPECIFIED
TEST_DEV="" # MUST BE SPECIFIED
TESTS="-g auto" # The "auto" group is supposed to be "known good"
-# We no longer need to set the stripe unit in XFS_MKFS_OPTIONS because recent
-# versions of mkfs.xfs autodetect it.
-
# print an error message and quit with non-zero status
function err() {
if [ $# -gt 0 ]; then
-f|--fs-type)
fs_type_valid "$2" ||
usage "invalid fs_type '$2'"
- FS_TYPE="$2"
+ FSTYP="$2"
shift
;;
-r|--randomize)
################################################################
-[ -n "${TESTDIR}" ] || usage "TESTDIR env variable must be set"
-
-# Set up some environment for normal teuthology test setup.
-# This really should not be necessary but I found it was.
-export CEPH_ARGS="--conf ${TESTDIR}/ceph.conf"
-export CEPH_ARGS="${CEPH_ARGS} --keyring ${TESTDIR}/data/client.0.keyring"
-export CEPH_ARGS="${CEPH_ARGS} --name client.0"
-
-export LD_LIBRARY_PATH="${TESTDIR}/binary/usr/local/lib:${LD_LIBRARY_PATH}"
-export PATH="${TESTDIR}/binary/usr/local/bin:${PATH}"
-export PATH="${TESTDIR}/binary/usr/local/sbin:${PATH}"
-
-################################################################
-
-# Filesystem-specific mkfs options--set if not supplied
-#export XFS_MKFS_OPTIONS="${XFS_MKFS_OPTIONS:--f -l su=65536}"
-export EXT4_MKFS_OPTIONS="${EXT4_MKFS_OPTIONS:--F}"
-export BTRFS_MKFS_OPTION # No defaults
-
-XFSTESTS_DIR="/var/lib/xfstests" # Where the tests live
-XFSPROGS_DIR="/tmp/cephtest/xfsprogs-install"
-XFSDUMP_DIR="/tmp/cephtest/xfsdump-install"
-export PATH="${XFSPROGS_DIR}/sbin:${XFSDUMP_DIR}/sbin:${PATH}"
-
-# download, build, and install xfstests
-function install_xfstests() {
- arg_count 0 $#
-
- local multiple=""
- local ncpu
-
- pushd "${TESTDIR}"
-
- git clone "${XFSTESTS_REPO}"
-
- cd xfstests
- git checkout "${XFSTESTS_VERSION}"
-
- ncpu=$(getconf _NPROCESSORS_ONLN 2>&1)
- [ -n "${ncpu}" -a "${ncpu}" -gt 1 ] && multiple="-j ${ncpu}"
-
- make realclean
- make ${multiple}
- make -k install
-
- popd
-}
-
-# remove previously-installed xfstests files
-function remove_xfstests() {
- arg_count 0 $#
-
- rm -rf "${TESTDIR}/xfstests"
- rm -rf "${XFSTESTS_DIR}"
-}
-
-# create a host options file that uses the specified devices
-function setup_host_options() {
- arg_count 0 $#
- export MNTDIR="/tmp/cephtest"
-
- # Create mount points for the test and scratch filesystems
- mkdir -p ${MNTDIR}
- local test_dir="$(mktemp -d ${MNTDIR}/test_dir.XXXXXXXXXX)"
- local scratch_dir="$(mktemp -d ${MNTDIR}/scratch_mnt.XXXXXXXXXX)"
-
- # Write a host options file that uses these devices.
- # xfstests uses the file defined by HOST_OPTIONS as the
- # place to get configuration variables for its run, and
- # all (or most) of the variables set here are required.
- export HOST_OPTIONS="$(mktemp ${TESTDIR}/host_options.XXXXXXXXXX)"
- cat > "${HOST_OPTIONS}" <<-!
- # Created by ${PROGNAME} on $(date)
- # HOST_OPTIONS="${HOST_OPTIONS}"
- TEST_DEV="${TEST_DEV}"
- SCRATCH_DEV="${SCRATCH_DEV}"
- TEST_DIR="${test_dir}"
- SCRATCH_MNT="${scratch_dir}"
- FSTYP="${FS_TYPE}"
- export TEST_DEV SCRATCH_DEV TEST_DIR SCRATCH_MNT FSTYP
- #
- export XFS_MKFS_OPTIONS="${XFS_MKFS_OPTIONS}"
- !
-
- # Now ensure we are using the same values
- . "${HOST_OPTIONS}"
-}
-
-# remove the host options file, plus the directories it refers to
-function cleanup_host_options() {
- arg_count 0 $#
-
- rm -rf "${TEST_DIR}" "${SCRATCH_MNT}"
- rm -f "${HOST_OPTIONS}"
-}
-
# run mkfs on the given device using the specified filesystem type
function do_mkfs() {
arg_count 1 $#
local options
case "${FSTYP}" in
- xfs) options="${XFS_MKFS_OPTIONS}" ;;
- ext4) options="${EXT4_MKFS_OPTIONS}" ;;
- btrfs) options="${BTRFS_MKFS_OPTIONS}" ;;
+ xfs) options="-f" ;;
+ ext4) options="-F" ;;
+ btrfs) options="-f" ;;
esac
"mkfs.${FSTYP}" ${options} "${dev}" ||
err "unable to make ${FSTYP} file system on device \"${dev}\""
}
-# mount the given device on the given mount point
-function do_mount() {
- arg_count 2 $#
-
- local dev="${1}"
- local dir="${2}"
-
- mount "${dev}" "${dir}" ||
- err "unable to mount file system \"${dev}\" on \"${dir}\""
-}
-
-# unmount a previously-mounted device
-function do_umount() {
- arg_count 1 $#
-
- local dev="${1}"
-
- if mount | grep "${dev}" > /dev/null; then
- if ! umount "${dev}"; then
- err "unable to unmount device \"${dev}\""
- fi
- else
- # Report it but don't error out
- echo "device \"${dev}\" was not mounted" >&2
- fi
-}
-
-# do basic xfstests setup--make and mount the test and scratch filesystems
-function setup_xfstests() {
- arg_count 0 $#
-
- # TEST_DEV can persist across test runs, but for now we
- # don't bother. I believe xfstests prefers its devices to
- # have been already been formatted for the desired
- # filesystem type--it uses blkid to identify things or
- # something. So we mkfs both here for a fresh start.
- do_mkfs "${TEST_DEV}"
- do_mkfs "${SCRATCH_DEV}"
-
- # I believe the test device is expected to be mounted; the
- # scratch doesn't need to be (but it doesn't hurt).
- do_mount "${TEST_DEV}" "${TEST_DIR}"
- do_mount "${SCRATCH_DEV}" "${SCRATCH_MNT}"
-}
-
-# clean up changes made by setup_xfstests
-function cleanup_xfstests() {
- arg_count 0 $#
-
- # Unmount these in case a test left them mounted (plus
- # the corresponding setup function mounted them...)
- do_umount "${TEST_DEV}"
- do_umount "${SCRATCH_DEV}"
- rmdir "${TEST_DIR}"
- rmdir "${SCRATCH_MNT}"
- rmdir "${MNTDIR}"
-}
-
-function install_xfsprogs() {
- arg_count 0 $#
-
- pushd "${TESTDIR}"
- git clone ${XFSPROGS_REPO}
- cd xfsprogs
- git checkout ${XFSPROGS_VERSION}
- libtoolize -c `libtoolize -n -i >/dev/null 2>/dev/null && echo -i` -f
- cp include/install-sh .
- aclocal -I m4
- autoconf
- ./configure --prefix=${XFSPROGS_DIR}
- make install
- popd
-}
-
-function install_xfsdump() {
- arg_count 0 $#
-
- pushd "${TESTDIR}"
- git clone ${XFSDUMP_REPO}
- cd xfsdump
- git checkout ${XFSDUMP_VERSION}
-
- # somebody took #define min and #define max out, which breaks the build on
- # ubuntu. we back out this commit here, though that may cause problems with
- # this script down the line.
- git revert -n 5a2985233c390d59d2a9757b119cb0e001c87a96
- libtoolize -c `libtoolize -n -i >/dev/null 2>/dev/null && echo -i` -f
- cp include/install-sh .
- aclocal -I m4
- autoconf
- ./configure --prefix=${XFSDUMP_DIR}
- (make -k install || true) # that's right, the install process is broken too
- popd
-}
-
-function remove_xfsprogs() {
- arg_count 0 $#
-
- rm -rf ${TESTDIR}/xfsprogs
- rm -rf ${XFSPROGS_DIR}
-}
-
-function remove_xfsdump() {
- arg_count 0 $#
-
- rm -rf ${TESTDIR}/xfsdump
- rm -rf ${XFSDUMP_DIR}
-}
-
-
# top-level setup routine
function setup() {
arg_count 0 $#
- setup_host_options
- install_xfsprogs
- install_xfsdump
- install_xfstests
- setup_xfstests
+ wget -P "${TESTDIR}" http://download.ceph.com/qa/xfstests.tar.gz
+ tar zxf "${TESTDIR}/xfstests.tar.gz" -C "$(dirname "${XFSTESTS_DIR}")"
+ mkdir "${TEST_DIR}"
+ mkdir "${SCRATCH_MNT}"
+ do_mkfs "${TEST_DEV}"
}
# top-level (final) cleanup routine
function cleanup() {
arg_count 0 $#
- cd /
- remove_xfsprogs
- remove_xfsdump
- cleanup_xfstests
- remove_xfstests
- cleanup_host_options
+ # ensure teuthology can clean up the logs
+ chmod -R a+rw "${TESTDIR}/archive"
+
+ findmnt "${TEST_DEV}" && umount "${TEST_DEV}"
+ [ -d "${SCRATCH_MNT}" ] && rmdir "${SCRATCH_MNT}"
+ [ -d "${TEST_DIR}" ] && rmdir "${TEST_DIR}"
+ rm -rf "${XFSTESTS_DIR}"
+ rm -f "${TESTDIR}/xfstests.tar.gz"
}
-trap cleanup EXIT ERR HUP INT QUIT
# ################################################################
start_date="$(date)"
-
parseargs "$@"
+[ -n "${TESTDIR}" ] || usage "TESTDIR env variable must be set"
+[ -d "${TESTDIR}/archive" ] || usage "\$TESTDIR/archive directory must exist"
+TESTDIR="$(readlink -e "${TESTDIR}")"
+[ -n "${EXPUNGE_FILE}" ] && EXPUNGE_FILE="$(readlink -e "${EXPUNGE_FILE}")"
+XFSTESTS_DIR="/var/lib/xfstests" # hardcoded into dbench binary
+TEST_DIR="/mnt/test_dir"
+SCRATCH_MNT="/mnt/scratch_mnt"
+MKFS_OPTIONS=""
+EXT_MOUNT_OPTIONS="-o block_validity"
+
+trap cleanup EXIT ERR HUP INT QUIT
setup
+export TEST_DEV
+export TEST_DIR
+export SCRATCH_DEV
+export SCRATCH_MNT
+export FSTYP
+export MKFS_OPTIONS
+export EXT_MOUNT_OPTIONS
+
pushd "${XFSTESTS_DIR}"
for (( i = 1 ; i <= "${COUNT}" ; i++ )); do
[ "${COUNT}" -gt 1 ] && echo "=== Iteration "$i" starting at: $(date)"
+ RESULT_BASE="${TESTDIR}/archive/results-${i}"
+ mkdir "${RESULT_BASE}"
+ export RESULT_BASE
+
EXPUNGE=""
[ -n "${EXPUNGE_FILE}" ] && EXPUNGE="-E ${EXPUNGE_FILE}"
[ -n "${DO_RANDOMIZE}" ] && RANDOMIZE="-r"
# -T output timestamps
- ./check -T ${RANDOMIZE} ${EXPUNGE} ${TESTS}
- status=$?
+ PATH="${PWD}/bin:${PATH}" ./check -T ${RANDOMIZE} ${EXPUNGE} ${TESTS}
+ findmnt "${TEST_DEV}" && umount "${TEST_DEV}"
[ "${COUNT}" -gt 1 ] && echo "=== Iteration "$i" complete at: $(date)"
done
echo "This xfstests run started at: ${start_date}"
echo "xfstests run completed at: $(date)"
[ "${COUNT}" -gt 1 ] && echo "xfstests run consisted of ${COUNT} iterations"
-
-exit "${status}"
+echo OK
+++ /dev/null
-#!/bin/bash
-#
-# This is a wrapper around run_xfstests.sh to provide an expunge file
-# suitable for krbd xfstests runs.
-
-set -x
-
-[ -n "${TESTDIR}" ] || export TESTDIR="/tmp/cephtest"
-[ -d "${TESTDIR}" ] || mkdir "${TESTDIR}"
-
-SCRIPT="run_xfstests.sh"
-
-if [ -z "${URL_BASE}" ]; then
- URL_BASE="https://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa"
-fi
-
-cd "${TESTDIR}"
-
-wget -O "${SCRIPT}" "${URL_BASE}/${SCRIPT}"
-chmod +x "${SCRIPT}"
-
-EXPUNGE="$(mktemp expunge.XXXXXXXXXX)"
-cat > "${EXPUNGE}" <<-!
- # mv - moved here from the old version of run_xfstests.sh
- # and rbd_xfstests.yaml
- # wasn't run - like 'mv', but wasn't specifically excluded
- # new test - didn't exist in the xfstests version that was
- # used by the old version of this script
-
- generic/038
- generic/042 # zeroes out only the last 4k of test file, but expects
- # only zeros in the entire file. bug in test?
- generic/046 # _count_extents in common/rc assumes backticks do not
- # remove newlines. This breaks parsing on some
- # platforms.
- generic/050 # blockdev --setro right after mkfs returns EBUSY
- generic/078 # RENAME_WHITEOUT was enabled in kernel commit 7dcf5c, but causes
- # a BUG for now
- generic/081 # ubuntu lvm2 doesn't suport --yes argument
- generic/083 # mkfs.xfs -dxize=104857600,agcount=6 fails
- # when sunit=swidth=8192
- generic/093 # not for Linux
- generic/097 # not for Linux
- generic/099 # not for Linux
- generic/204 # stripe size throws off test's math for when to
- # expect ENOSPC
- generic/231 # broken for disk and rbd by xfs kernel commit 4162bb
- generic/247 # race between DIO and mmap writes
- # see (https://lists.01.org/pipermail/lkp/2015-March/002459.html)
-
- shared/272 # not for xfs
- shared/289 # not for xfs
-
- xfs/007 # sector size math
- xfs/030 # mkfs.xfs -dsize=100m,agcount=6 fails
- # when sunit=swidth=8192
- xfs/032 # xfs_copy cleans up with pthread_kill (RHBA-2015-0537)
- xfs/042 # stripe size throws off test's math when filling FS
- xfs/051
- xfs/057 # test for IRIX
- xfs/058 # test for IRIX
- xfs/069 # _filter_bmap in common/punch parses incorrectly if
- # blocks are not stripe-aligned
- xfs/070 # extra output from xfs_repair
- xfs/071 # xfs_repair issue on large offsets (RHBA-2015-0537)
- xfs/073
- xfs/081 # very small mkfs breaks test with sunit=swidth-8192
- xfs/095 # not for Linux
- xfs/096 # checks various mkfs options and chokes on sunit/swidth
- xfs/104 # can't suppress sunit/swidth warnings on mkfs
- xfs/109 # can't suppress sunit/swidth warnings on mkfs
- xfs/167
- xfs/178 # test explicitly checks for stripe width of 0
- xfs/191 # tests NFSv4
- xfs/197 # tests 32-bit machines
- xfs/205 # very small mkfs breaks tests with sunit=swidth=8192
- xfs/242 # _filter_bmap in common/punch parses incorrectly if
- # blocks are not stripe-aligned
- xfs/261 # bug in mount_xfs involving creation of new quota files
- xfs/279 # sector size math (logical v. physical: BZ836433?)
- xfs/297 # XXX: temporarily expunged due to length
- xfs/300 # SELinux
-!
-
-./"${SCRIPT}" -x "$(readlink -f "${EXPUNGE}")" "$@"
-STATUS=$?
-
-rm -f "${EXPUNGE}"
-rm -f "${SCRIPT}"
-
-exit "${STATUS}"
ceph osd crush tree --show-shadow | grep 'class_1' || return 1
ceph osd crush rule create-replicated class_1_rule default host class_1 || return 1
ceph osd crush class rename class_1 class_2
+ ceph osd crush class rename class_1 class_2 # idempotent
ceph osd crush class ls | grep 'class_1' && return 1
ceph osd crush tree --show-shadow | grep 'class_1' && return 1
ceph osd crush class ls | grep 'class_2' || return 1
--- /dev/null
+#!/usr/bin/env bash
+#
+# Copyright (C) 2017 Tencent <contact@tencent.com>
+#
+# Author: Chang Liu <liuchang0812@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+ local dir=$1
+ shift
+
+ export CEPH_MON="127.0.0.1:7113" # git grep '\<7113\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ setup $dir || return 1
+ $func $dir || return 1
+ teardown $dir || return 1
+ done
+}
+
+function TEST_ceph_df() {
+ local dir=$1
+ setup $dir || return 1
+
+ run_mon $dir a || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ run_osd $dir 2 || return 1
+ run_osd $dir 3 || return 1
+ run_osd $dir 4 || return 1
+ run_osd $dir 5 || return 1
+ run_mgr $dir x || return 1
+
+ profile+=" plugin=jerasure"
+ profile+=" technique=reed_sol_van"
+ profile+=" k=4"
+ profile+=" m=2"
+ profile+=" crush-failure-domain=osd"
+
+ ceph osd erasure-code-profile set ec42profile ${profile}
+
+ local rep_poolname=testcephdf_replicate
+ local ec_poolname=testcephdf_erasurecode
+ create_pool $rep_poolname 6 6 replicated
+ create_pool $ec_poolname 6 6 erasure ec42profile
+
+ local global_avail=`ceph df -f json | jq '.stats.total_avail_bytes'`
+ local rep_avail=`ceph df -f json | jq '.pools | map(select(.name == "$rep_poolname"))[0].stats.max_avail'`
+ local ec_avail=`ceph df -f json | jq '.pools | map(select(.name == "$ec_poolname"))[0].stats.max_avail'`
+
+ echo "${global_avail} >= ${rep_avail}*3" | bc || return 1
+ echo "${global_avail} >= ${ec_avail}*1.5" | bc || return 1
+
+ ceph osd pool delete $rep_poolname $rep_poolname --yes-i-really-really-mean-it
+ ceph osd pool delete $ec_poolname $ec_poolname --yes-i-really-really-mean-it
+ ceph osd erasure-code-profile rm ec42profile
+ teardown $dir || return 1
+}
+
+main osd-pool-df "$@"
run_osd $dir 1 || return 1
run_osd $dir 2 || return 1
- local poolname=testquoa
+ local poolname=testquota
create_pool $poolname 20
local objects=`ceph df detail | grep -w $poolname|awk '{print $3}'`
local bytes=`ceph df detail | grep -w $poolname|awk '{print $4}'`
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace
+ - object missing on disk
+ - error reading table object
+ - error reading sessionmap
+ - unmatched fragstat
+ - unmatched rstat
+ - was unreadable, recreating it now
+ - Scrub error on inode
+ - Metadata damage detected
+ - MDS_FAILED
+ - MDS_DAMAGE
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_recovery_pool
- failing to respond to cache pressure
- slow requests are blocked
- failing to respond to capability release
+ - MDS cache is too large
- \(MDS_CLIENT_OLDEST_TID\)
+ - \(MDS_CACHE_OVERSIZED\)
tasks:
- cephfs_test_runner:
- was unreadable, recreating it now
- Scrub error on inode
- Metadata damage detected
+ - inconsistent rstat on inode
tasks:
- cephfs_test_runner:
ceph:
log-whitelist:
- evicting unresponsive client
+ - POOL_APP_NOT_ENABLED
- force file system read-only
- bad backtrace
- MDS in read-only mode
+ - \(MDS_READ_ONLY\)
tasks:
- failing to respond to cache pressure
- slow requests are blocked
- failing to respond to capability release
+ - MDS cache is too large
+ - \(MDS_CLIENT_OLDEST_TID\)
+ - \(MDS_CACHE_OVERSIZED\)
tasks:
- cephfs_test_runner:
log-whitelist:
- but it is still running
- slow request
+ - evicting unresponsive client
tasks:
- cephfs_test_runner:
- was unreadable, recreating it now
- Scrub error on inode
- Metadata damage detected
+ - inconsistent rstat on inode
tasks:
- cephfs_test_runner:
-
+overrides:
+ ceph:
+ log-whitelist:
+ - not responding, replacing
+ - \(MDS_INSUFFICIENT_STANDBY\)
tasks:
- cephfs_test_runner:
fail_on_skip: false
modules:
- tasks.cephfs.test_failover
-
--- /dev/null
+../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
--- /dev/null
+../../../tasks/thrashosds-health.yaml
\ No newline at end of file
--- /dev/null
+../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
- install:
- ceph:
- rbd.xfstests:
- client.0:
+ client.0: &ref
test_image: 'test_image-0'
+ test_size: 5120 # MB
scratch_image: 'scratch_image-0'
- tests: '-g auto'
+ scratch_size: 5120 # MB
+ fs_type: ext4
+ tests: '-g auto -x clone'
+ exclude:
+ - generic/042
+ - generic/392
+ - generic/044
+ - generic/045
+ - generic/046
+ - generic/223
+ - ext4/304
+ - generic/050 # krbd BLKROSET bug
+ - generic/388
+ - generic/405
+ - generic/422
+ - generic/448
randomize: true
client.1:
+ <<: *ref
test_image: 'test_image-1'
scratch_image: 'scratch_image-1'
- tests: '-g auto'
- randomize: true
cluster: ceph
- exec:
mon.a:
+ - sleep 15
+ - ceph osd dump | grep purged_snapdirs
- ceph pg dump -f json-pretty
- "ceph pg dump sum -f json-pretty | grep num_legacy_snapsets | head -1 | grep ': 0'"
overrides:
osd: # force bluestore since it's required for ec overwrites
osd objectstore: bluestore
bluestore block size: 96636764160
- debug bluestore: 30
- debug bdev: 20
- debug bluefs: 20
- debug rocksdb: 10
enable experimental unrecoverable data corrupting features: "*"
osd debug randomize hobject sort order: false
# this doesn't work with failures bc the log writes are not atomic across the two backends
osd: # force bluestore since it's required for ec overwrites
osd objectstore: bluestore
bluestore block size: 96636764160
- debug bluestore: 30
- debug bdev: 20
- debug bluefs: 20
- debug rocksdb: 10
enable experimental unrecoverable data corrupting features: "*"
osd debug randomize hobject sort order: false
# this doesn't work with failures bc the log writes are not atomic across the two backends
-tasks:
-- ceph:
+overrides:
+ ceph:
conf:
client:
rbd cache: false
meta:
-- desc: 2 ceph clusters with 3 mons and 3 osds each
+- desc: 2 ceph clusters with 1 mon and 3 osds each
roles:
- - cluster1.mon.a
- - cluster1.mon.b
- cluster1.mgr.x
- cluster1.osd.0
- cluster1.osd.1
- cluster1.osd.2
- - cluster2.mon.c
- cluster1.client.0
- cluster2.client.0
-- - cluster1.mon.c
- - cluster2.mon.a
- - cluster2.mon.b
+- - cluster2.mon.a
- cluster2.mgr.x
- cluster2.osd.0
- cluster2.osd.1
osd: # force bluestore since it's required for ec overwrites
osd objectstore: bluestore
bluestore block size: 96636764160
- debug bluestore: 30
- debug bdev: 20
- debug bluefs: 20
- debug rocksdb: 10
enable experimental unrecoverable data corrupting features: "*"
osd debug randomize hobject sort order: false
# this doesn't work with failures bc the log writes are not atomic across the two backends
conf:
client:
debug rgw: 20
+ rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo=
+ rgw crypt require ssl: false
rgw:
compression type: random
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes,
+ with a separate client 0,1,2 third node.
+ Use xfs beneath the osds.
+ CephFS tests running on client 2,3
+roles:
+- - mon.a
+ - mgr.x
+ - mds.a
+ - osd.0
+ - osd.1
+- - mon.b
+ - mon.c
+ - osd.2
+ - osd.3
+- - client.0
+ - client.1
+ - client.2
+ - client.3
+overrides:
+ ceph:
+ log-whitelist:
+ - scrub mismatch
+ - ScrubResult
+ - wrongly marked
+ - (POOL_APP_NOT_ENABLED)
+ - overall HEALTH_
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: "*"
+ mon:
+ mon warn on osd down out interval zero: false
+ osd:
+ osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ fs: xfs
--- /dev/null
+meta:
+- desc: |
+ install ceph/luminous latest
+ run workload and upgrade-sequence in parallel
+ upgrade the client node
+tasks:
+- install:
+ branch: luminous
+- print: "**** done installing luminous"
+- ceph:
+ log-whitelist:
+ - overall HEALTH_
+ - \(FS_
+ - \(MDS_
+ - \(OSD_
+ - \(MON_DOWN\)
+ - \(CACHE_POOL_
+ - \(POOL_
+ - \(MGR_DOWN\)
+ - \(PG_
+ - \(SMALLER_PGP_NUM\)
+ - Monitor daemon marked osd
+ - Behind on trimming
+ - Manager daemon
+ conf:
+ global:
+ mon warn on pool no app: false
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+ - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph"
+- install.upgrade:
+ mon.a:
+ mon.b:
+- print: "**** done install.upgrade both hosts"
+- parallel:
+ - workload
+ - upgrade-sequence
+- print: "**** done parallel"
+- install.upgrade:
+ client.0:
+- print: "**** done install.upgrade on client.0"
--- /dev/null
+meta:
+- desc: |
+ run a cephfs stress test
+ mount ceph-fuse on client.2 before running workunit
+workload:
+ full_sequential:
+ - sequential:
+ - ceph-fuse:
+ - print: "**** done ceph-fuse 2-workload"
+ - workunit:
+ clients:
+ client.2:
+ - suites/blogbench.sh
+ - print: "**** done suites/blogbench.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+  run randomized correctness test for rados operations
+ on an erasure-coded pool
+workload:
+ full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+meta:
+- desc: |
+ object class functional tests
+workload:
+ full_sequential:
+ - workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - cls
+ - print: "**** done cls 2-workload"
--- /dev/null
+meta:
+- desc: |
+ generate read/write load with rados objects ranging from 1MB to 25MB
+workload:
+ full_sequential:
+ - workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rados/load-gen-big.sh
+ - print: "**** done rados/load-gen-big.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd C and C++ api tests
+workload:
+ full_sequential:
+ - workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ - print: "**** done rbd/test_librbd.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd python api tests
+workload:
+ full_sequential:
+ - workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ - print: "**** done rbd/test_librbd_python.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ upgrade the ceph cluster
+upgrade-sequence:
+ sequential:
+ - ceph.restart:
+ daemons: [mon.a, mon.b, mon.c, mgr.x]
+ - ceph.restart:
+ daemons: [osd.0, osd.1, osd.2, osd.3]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - ceph.restart:
+ daemons: [mds.a]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - print: "**** done ceph.restart all"
--- /dev/null
+meta:
+- desc: |
+ upgrade the ceph cluster,
+  upgrade in two steps
+ step one ordering: mon.a, osd.0, osd.1, mds.a
+ step two ordering: mon.b, mon.c, osd.2, osd.3
+  ceph is expected to be in a healthy state after each step
+upgrade-sequence:
+ sequential:
+ - ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [mon.b, mon.c, mgr.x]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [osd.0, osd.1]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart: [mds.a]
+ - sleep:
+ duration: 60
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [osd.2, osd.3]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - sleep:
+ duration: 60
--- /dev/null
+meta:
+- desc: |
+ run a cephfs stress test
+ mount ceph-fuse on client.3 before running workunit
+tasks:
+- sequential:
+ - ceph-fuse:
+ - print: "**** done ceph-fuse 5-final-workload"
+ - workunit:
+ clients:
+ client.3:
+ - suites/blogbench.sh
+ - print: "**** done suites/blogbench.sh 5-final-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshots
+tasks:
+ - rados:
+ clients: [client.1]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ - print: "**** done rados 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ generate read/write load with rados objects ranging from 1 byte to 1MB
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rados/load-gen-mix.sh
+ - print: "**** done rados/load-gen-mix.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ librados C and C++ api tests
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+tasks:
+ - mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+ - print: "**** done mon_thrash 4-final-workload"
+ - workunit:
+ branch: luminous
+ clients:
+ client.1:
+ - rados/test.sh
+ - print: "**** done rados/test.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ rbd object class functional tests
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - cls/test_cls_rbd.sh
+ - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+ - print: "**** done rbd/import_export.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+overrides:
+ rgw:
+ frontend: civetweb
+tasks:
+ - rgw: [client.1]
+ - print: "**** done rgw 4-final-workload"
+ - swift:
+ client.1:
+ rgw_server: client.1
+ - print: "**** done swift 4-final-workload"
--- /dev/null
+../../../../distros/supported/
\ No newline at end of file
--- /dev/null
+../stress-split/objectstore/
\ No newline at end of file
--- /dev/null
+../stress-split/0-cluster/
\ No newline at end of file
--- /dev/null
+../stress-split/2-partial-upgrade/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ randomly kill and revive osd
+ small chance to increase the number of pgs
+overrides:
+ ceph:
+ log-whitelist:
+ - but it is still running
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - log bound mismatch
+tasks:
+- parallel:
+ - stress-tasks
+stress-tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 4
+ chance_thrash_cluster_full: 0
+ chance_thrash_pg_upmap: 0
+ chance_thrash_pg_upmap_items: 0
+ chance_force_recovery: 0
+- print: "**** done thrashosds 3-thrash"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on an erasure coded pool
+stress-tasks:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+../stress-split/5-finish-upgrade.yaml
\ No newline at end of file
--- /dev/null
+#
+# k=3 implies a stripe_width of 1376*3 = 4128, which is different from
+# the default value of 4096. It is also not a multiple of 1024*1024 and
+# creates situations where rounding rules during recovery become
+# necessary.
+#
+meta:
+- desc: |
+ randomized correctness test for rados operations on an erasure coded pool
+ using the jerasure plugin with k=3 and m=1
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: jerasure31profile
+ plugin: jerasure
+ k: 3
+ m: 1
+ technique: reed_sol_van
+ crush-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+../../../../distros/supported/
\ No newline at end of file
--- /dev/null
+../stress-split/objectstore/
\ No newline at end of file
--- /dev/null
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - machine:
+ disk: 100 # GB
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes,
+ with a separate client-only node.
+ Use xfs beneath the osds.
+overrides:
+ ceph:
+ fs: xfs
+ log-whitelist:
+ - overall HEALTH_
+ - \(MON_DOWN\)
+ - \(MGR_DOWN\)
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: "*"
+ mon:
+ mon warn on osd down out interval zero: false
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+- - osd.3
+ - osd.4
+ - osd.5
+- - client.0
--- /dev/null
+meta:
+- desc: install ceph/luminous latest
+tasks:
+- install:
+ branch: luminous
+- print: "**** done install luminous"
+- ceph:
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+ - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph "
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon warn on osd down out interval zero: false
--- /dev/null
+meta:
+- desc: |
+ install upgrade ceph/-x on one node only
+ 1st half
+ restart : osd.0,1,2
+tasks:
+- install.upgrade:
+ osd.0:
+- print: "**** done install.upgrade osd.0"
+- ceph.restart:
+ daemons: [mon.a,mon.b,mon.c,mgr.x,osd.0,osd.1,osd.2]
+- print: "**** done ceph.restart 1st half"
--- /dev/null
+meta:
+- desc: |
+ randomly kill and revive osd
+ small chance to increase the number of pgs
+overrides:
+ ceph:
+ log-whitelist:
+ - but it is still running
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - log bound mismatch
+tasks:
+- parallel:
+ - stress-tasks
+stress-tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ chance_thrash_cluster_full: 0
+ chance_thrash_pg_upmap: 0
+ chance_thrash_pg_upmap_items: 0
+ disable_objectstore_tool_tests: true
+ chance_force_recovery: 0
+- print: "**** done thrashosds 3-thrash"
--- /dev/null
+meta:
+- desc: |
+ run randomized correctness test for rados operations
+ generate write load with rados bench
+stress-tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+- print: "**** done radosbench 7-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic cls tests for rbd
+stress-tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - cls/test_cls_rbd.sh
+- print: "**** done cls/test_cls_rbd.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+stress-tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd C and C++ api tests
+stress-tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+- print: "**** done rbd/test_librbd.sh 7-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool,
+ using only reads, writes, and deletes
+stress-tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 45
+ write: 45
+ delete: 10
+- print: "**** done rados/readwrite 5-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+stress-tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+- print: "**** done rados/snaps-few-objects 5-workload"
--- /dev/null
+tasks:
+- install.upgrade:
+ osd.3:
+ client.0:
+- ceph.restart:
+ daemons: [osd.3, osd.4, osd.5]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+
--- /dev/null
+meta:
+- desc: |
+ librbd python api tests
+tasks:
+- workunit:
+ branch: luminous
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+- print: "**** done rbd/test_librbd_python.sh 9-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+tasks:
+- rgw:
+ client.0:
+- print: "**** done rgw 9-workload"
+- swift:
+ client.0:
+ rgw_server: client.0
+- print: "**** done swift 9-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
--- /dev/null
+../../../../distros/supported/
\ No newline at end of file
--- /dev/null
+../../../../../objectstore/bluestore.yaml
\ No newline at end of file
--- /dev/null
+../../../../../objectstore/filestore-xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
if mdss.remotes:
log.info('Setting up CephFS filesystem...')
- fs = Filesystem(ctx, create='cephfs')
+ fs = Filesystem(ctx, name='cephfs', create=True)
is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
Scrubber(self.ceph_manager, self.config)
self.choose_action()()
time.sleep(delay)
+ self.all_up()
if self.random_eio > 0:
self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
'injectargs', '--', '--filestore_debug_random_read_err=0.0')
# Environment references
mounts = None
fs = None
+ recovery_fs = None
ceph_cluster = None
mds_cluster = None
mgr_cluster = None
# FIXME weird explicit naming
mount_a = None
mount_b = None
+ recovery_mount = None
# Declarative test requirements: subclasses should override these to indicate
# their special needs. If not met, tests will be skipped.
# Whether to create the default filesystem during setUp
REQUIRE_FILESYSTEM = True
+ # requires REQUIRE_FILESYSTEM = True
+ REQUIRE_RECOVERY_FILESYSTEM = False
+
LOAD_SETTINGS = []
def setUp(self):
self.mds_cluster.mds_fail()
self.mds_cluster.delete_all_filesystems()
self.fs = None # is now invalid!
+ self.recovery_fs = None
# In case the previous filesystem had filled up the RADOS cluster, wait for that
# flag to pass.
self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
if self.REQUIRE_FILESYSTEM:
- self.fs = self.mds_cluster.newfs(True)
+ self.fs = self.mds_cluster.newfs(create=True)
self.fs.mds_restart()
# In case some test messed with auth caps, reset them
self.mounts[i].mount()
self.mounts[i].wait_until_mounted()
+ if self.REQUIRE_RECOVERY_FILESYSTEM:
+ if not self.REQUIRE_FILESYSTEM:
+ raise case.SkipTest("Recovery filesystem requires a primary filesystem as well")
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
+ 'enable_multiple', 'true',
+ '--yes-i-really-mean-it')
+ self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
+ self.recovery_fs.set_metadata_overlay(True)
+ self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
+ self.recovery_fs.create()
+ self.recovery_fs.getinfo(refresh=True)
+ self.recovery_fs.mds_restart()
+ self.recovery_fs.wait_for_daemons()
+
# Load an config settings of interest
for setting in self.LOAD_SETTINGS:
setattr(self, setting, float(self.fs.mds_asok(
import datetime
import re
import errno
+import random
from teuthology.exceptions import CommandFailedError
from teuthology import misc
else:
cb(mds_id)
+ def get_config(self, key, service_type=None):
+ """
+ get_config specialization of service_type="mds"
+ """
+ if service_type != "mds":
+ return super(MDSCluster, self).get_config(key, service_type)
+
+ # Some tests stop MDS daemons, don't send commands to a dead one:
+ service_id = random.sample(filter(lambda i: self.mds_daemons[i].running(), self.mds_daemons), 1)[0]
+ return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
def mds_stop(self, mds_id=None):
"""
Stop the MDS daemon process(se). If it held a rank, that rank
self._one_or_all(mds_id, _fail_restart)
- def newfs(self, name):
- return Filesystem(self._ctx, create=name)
+ def newfs(self, name='cephfs', create=True):
+ return Filesystem(self._ctx, name=name, create=create)
def status(self):
return FSStatus(self.mon_manager)
This object is for driving a CephFS filesystem. The MDS daemons driven by
MDSCluster may be shared with other Filesystems.
"""
- def __init__(self, ctx, fscid=None, create=None):
+ def __init__(self, ctx, fscid=None, name=None, create=False):
super(Filesystem, self).__init__(ctx)
+ self.name = name
self.id = None
- self.name = None
self.metadata_pool_name = None
+ self.metadata_overlay = False
+ self.data_pool_name = None
self.data_pools = None
client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
self.client_id = client_list[0]
self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1]
- if create is not None:
+ if name is not None:
if fscid is not None:
raise RuntimeError("cannot specify fscid when creating fs")
- if create is True:
- self.name = 'cephfs'
- else:
- self.name = create
- if not self.legacy_configured():
+ if create and not self.legacy_configured():
self.create()
- elif fscid is not None:
- self.id = fscid
- self.getinfo(refresh = True)
+ else:
+ if fscid is not None:
+ self.id = fscid
+ self.getinfo(refresh = True)
# Stash a reference to the first created filesystem on ctx, so
# that if someone drops to the interactive shell they can easily
self.get_pool_names(status = status, refresh = refresh)
return status
+ def set_metadata_overlay(self, overlay):
+ if self.id is not None:
+ raise RuntimeError("cannot specify fscid when configuring overlay")
+ self.metadata_overlay = overlay
+
def deactivate(self, rank):
if rank < 0:
raise RuntimeError("invalid rank")
self.name = "cephfs"
if self.metadata_pool_name is None:
self.metadata_pool_name = "{0}_metadata".format(self.name)
- data_pool_name = "{0}_data".format(self.name)
+ if self.data_pool_name is None:
+ data_pool_name = "{0}_data".format(self.name)
+ else:
+ data_pool_name = self.data_pool_name
log.info("Creating filesystem '{0}'".format(self.name))
self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
self.metadata_pool_name, pgs_per_fs_pool.__str__())
- self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
- data_pool_name, pgs_per_fs_pool.__str__())
- self.mon_manager.raw_cluster_cmd('fs', 'new',
- self.name, self.metadata_pool_name, data_pool_name)
+ if self.metadata_overlay:
+ self.mon_manager.raw_cluster_cmd('fs', 'new',
+ self.name, self.metadata_pool_name, data_pool_name,
+ '--allow-dangerous-metadata-overlay')
+ else:
+ self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+ data_pool_name, pgs_per_fs_pool.__str__())
+ self.mon_manager.raw_cluster_cmd('fs', 'new',
+ self.name, self.metadata_pool_name, data_pool_name)
self.check_pool_application(self.metadata_pool_name)
self.check_pool_application(data_pool_name)
# Turn off spurious standby count warnings from modifying max_mds in tests.
def get_metadata_pool_name(self):
return self.metadata_pool_name
+ def set_data_pool_name(self, name):
+ if self.id is not None:
+ raise RuntimeError("can't set filesystem name if its fscid is set")
+ self.data_pool_name = name
+
def get_namespace_id(self):
return self.id
pass
# The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
- # which depend on the cache size and overall ratio
+ # which depend on the caps outstanding, cache size and overall ratio
self.wait_until_equal(
lambda: self.get_session(mount_a_client_id)['num_caps'],
- int(cache_size * 0.8),
- timeout=600,
- reject_fn=lambda x: x < int(cache_size*.8))
+ int(open_files * 0.2),
+ timeout=30,
+ reject_fn=lambda x: x < int(open_files*0.2))
@needs_trimming
def test_client_pin_root(self):
self.assertFalse(lock_holder.finished)
self.assertFalse(lock_taker.finished)
- mount_a_client_id = self.mount_a.get_global_id()
- self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+ try:
+ mount_a_client_id = self.mount_a.get_global_id()
+ self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
- # Evicting mount_a should let mount_b's attepmt to take the lock
- # suceed
- self.wait_until_true(
- lambda: lock_taker.finished,
- timeout=10)
+ # Evicting mount_a should let mount_b's attempt to take the lock
+ # succeed
+ self.wait_until_true(lambda: lock_taker.finished, timeout=10)
+ finally:
+ # teardown() doesn't quite handle this case cleanly, so help it out
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+
+ # Bring the client back
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
def test_dir_fsync(self):
self._test_fsync(True);
mds_map = self.fs.get_mds_map()
return rank in mds_map['damaged']
- def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+ def _rebuild_metadata(self, workload, workers=1):
"""
That when all objects in metadata pool are removed, we can rebuild a metadata pool
based on the contents of a data pool, and a client can see and read our files.
# First, inject some files
- other_fs = other_pool + '-fs' if other_pool else None
workload.write()
# Unmount the client and flush the journal: the tool should also cope with
self.mount_a.umount_wait()
workload.flush()
- # Create the alternate pool if requested
- if other_pool:
- self.fs.rados(['mkpool', other_pool])
- self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
- 'enable_multiple', 'true',
- '--yes-i-really-mean-it')
- self.fs.mon_manager.raw_cluster_cmd('fs', 'new', other_fs,
- other_pool,
- self.fs.get_data_pool_name(),
- '--allow-dangerous-metadata-overlay')
- self.fs.data_scan(['init', '--force-init', '--filesystem',
- other_fs, '--alternate-pool', other_pool])
- self.fs.mon_manager.raw_cluster_cmd('-s')
- self.fs.table_tool([other_fs + ":0", "reset", "session"])
- self.fs.table_tool([other_fs + ":0", "reset", "snap"])
- self.fs.table_tool([other_fs + ":0", "reset", "inode"])
-
# Stop the MDS
self.fs.mds_stop()
self.fs.mds_fail()
self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
'--yes-i-really-mean-it')
- if other_pool is None:
- self.fs.mds_restart()
+ self.fs.mds_restart()
def get_state(mds_id):
info = self.mds_cluster.get_mds_info(mds_id)
return info['state'] if info is not None else None
- if other_pool is None:
- self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
- for mds_id in self.fs.mds_ids:
- self.wait_until_equal(
- lambda: get_state(mds_id),
- "up:standby",
- timeout=60)
+ self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+ for mds_id in self.fs.mds_ids:
+ self.wait_until_equal(
+ lambda: get_state(mds_id),
+ "up:standby",
+ timeout=60)
self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
# Normal reset should fail when no objects are present, we'll use --force instead
self.fs.journal_tool(["journal", "reset"])
- if other_pool:
- self.fs.mds_stop()
- self.fs.data_scan(['scan_extents', '--alternate-pool',
- other_pool, '--filesystem', self.fs.name,
- self.fs.get_data_pool_name()])
- self.fs.data_scan(['scan_inodes', '--alternate-pool',
- other_pool, '--filesystem', self.fs.name,
- '--force-corrupt', '--force-init',
- self.fs.get_data_pool_name()])
- self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
- 'recover_dentries', 'list',
- '--alternate-pool', other_pool])
-
- self.fs.data_scan(['init', '--force-init', '--filesystem',
- self.fs.name])
- self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
- '--force-corrupt', '--force-init',
- self.fs.get_data_pool_name()])
- self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
- 'recover_dentries', 'list'])
-
- self.fs.journal_tool(['--rank=' + other_fs + ":0", 'journal',
- 'reset', '--force'])
- self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
- 'reset', '--force'])
- self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
- other_fs + ":0")
- else:
- self.fs.journal_tool(["journal", "reset", "--force"])
- self.fs.data_scan(["init"])
- self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
- self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
+ self.fs.journal_tool(["journal", "reset", "--force"])
+ self.fs.data_scan(["init"])
+ self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
+ self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
# Mark the MDS repaired
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
# Start the MDS
self.fs.mds_restart()
self.fs.wait_for_daemons()
- if other_pool:
- for mds_id in self.fs.mds_ids:
- self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
- 'injectargs', '--debug-mds=20')
- self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
- 'scrub_path', '/',
- 'recursive', 'repair')
log.info(str(self.mds_cluster.status()))
# Mount a client
- self.mount_a.mount(mount_fs_name=other_fs)
+ self.mount_a.mount()
self.mount_a.wait_until_mounted()
# See that the files are present and correct
def test_stashed_layout(self):
self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
- def test_rebuild_simple_altpool(self):
- self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a), other_pool="recovery")
-
def _dirfrag_keys(self, object_id):
- self.other_pool = 'recovery'
- self.other_fs = self.other_pool + '-fs'
keys_str = self.fs.rados(["listomapkeys", object_id])
if keys_str:
return keys_str.split("\n")
self.assert_session_count(2, ls_data)
self.mount_a.kill()
- self.mount_a.kill()
- self.mount_b.kill_cleanup()
- self.mount_b.kill_cleanup()
+ self.mount_a.kill_cleanup()
time.sleep(self.mds_session_autoclose * 1.5)
ls_data = self.fs.mds_asok(['session', 'ls'])
fs_avail = output.split('\n')[1].split()[3]
fs_avail = float(fs_avail) * 1024
- ratio = (raw_avail / pool_size) / fs_avail
+ ratio = raw_avail / fs_avail
assert 0.9 < ratio < 1.1
--- /dev/null
+
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+import json
+
+import logging
+import os
+from textwrap import dedent
+import traceback
+from collections import namedtuple, defaultdict
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+ def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
+ self._orig_fs = orig_fs
+ self._recovery_fs = recovery_fs
+ self._orig_mount = orig_mount
+ self._recovery_mount = recovery_mount
+ self._initial_state = None
+
+ # Accumulate backtraces for every failed validation, and return them. Backtraces
+ # are rather verbose, but we only see them when something breaks, and they
+ # let us see which check failed without having to decorate each check with
+ # a string
+ self._errors = []
+
+ def assert_equal(self, a, b):
+ try:
+ if a != b:
+ raise AssertionError("{0} != {1}".format(a, b))
+ except AssertionError as e:
+ self._errors.append(
+ ValidationError(e, traceback.format_exc(3))
+ )
+
+ def write(self):
+ """
+ Write the workload files to the mount
+ """
+ raise NotImplementedError()
+
+ def validate(self):
+ """
+ Read from the mount and validate that the workload files are present (i.e. have
+ survived or been reconstructed from the test scenario)
+ """
+ raise NotImplementedError()
+
+ def damage(self):
+ """
+ Damage the filesystem pools in ways that will be interesting to recover from. By
+ default just wipe everything in the metadata pool
+ """
+ # Delete every object in the metadata pool
+ objects = self._orig_fs.rados(["ls"]).split("\n")
+ for o in objects:
+ self._orig_fs.rados(["rm", o])
+
+ def flush(self):
+ """
+ Called after client unmount, after write: flush whatever you want
+ """
+ self._orig_fs.mds_asok(["flush", "journal"])
+ self._recovery_fs.mds_asok(["flush", "journal"])
+
+
+class SimpleOverlayWorkload(OverlayWorkload):
+ """
+ Single file, single directory, check that it gets recovered and so does its size
+ """
+ def write(self):
+ self._orig_mount.run_shell(["mkdir", "subdir"])
+ self._orig_mount.write_n_mb("subdir/sixmegs", 6)
+ self._initial_state = self._orig_mount.stat("subdir/sixmegs")
+
+ def validate(self):
+ self._recovery_mount.run_shell(["ls", "subdir"])
+ st = self._recovery_mount.stat("subdir/sixmegs")
+ self.assert_equal(st['st_size'], self._initial_state['st_size'])
+ return self._errors
+
+class TestRecoveryPool(CephFSTestCase):
+ MDSS_REQUIRED = 2
+ CLIENTS_REQUIRED = 2
+ REQUIRE_RECOVERY_FILESYSTEM = True
+
+ def is_marked_damaged(self, rank):
+ mds_map = self.fs.get_mds_map()
+ return rank in mds_map['damaged']
+
+ def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+ """
+ That when all objects in metadata pool are removed, we can rebuild a metadata pool
+ based on the contents of a data pool, and a client can see and read our files.
+ """
+
+ # First, inject some files
+
+ workload.write()
+
+ # Unmount the client and flush the journal: the tool should also cope with
+ # situations where there is dirty metadata, but we'll test that separately
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+ workload.flush()
+
+ # Create the alternate pool if requested
+ recovery_fs = self.recovery_fs.name
+ recovery_pool = self.recovery_fs.get_metadata_pool_name()
+ self.recovery_fs.data_scan(['init', '--force-init',
+ '--filesystem', recovery_fs,
+ '--alternate-pool', recovery_pool])
+ self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
+ self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
+ self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
+ self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
+
+ # Stop the MDS
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # After recovery, we need the MDS to not be strict about stats (in production these options
+ # are off by default, but in QA we need to explicitly disable them)
+ self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+ self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+ # Apply any data damage the workload wants
+ workload.damage()
+
+ # Reset the MDS map in case multiple ranks were in play: recovery procedure
+ # only understands how to rebuild metadata under rank 0
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+ '--yes-i-really-mean-it')
+
+ def get_state(mds_id):
+ info = self.mds_cluster.get_mds_info(mds_id)
+ return info['state'] if info is not None else None
+
+ self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+ self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+ self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+ # Run the recovery procedure
+ if False:
+ with self.assertRaises(CommandFailedError):
+ # Normal reset should fail when no objects are present, we'll use --force instead
+ self.fs.journal_tool(["journal", "reset"])
+
+ self.fs.mds_stop()
+ self.fs.data_scan(['scan_extents', '--alternate-pool',
+ recovery_pool, '--filesystem', self.fs.name,
+ self.fs.get_data_pool_name()])
+ self.fs.data_scan(['scan_inodes', '--alternate-pool',
+ recovery_pool, '--filesystem', self.fs.name,
+ '--force-corrupt', '--force-init',
+ self.fs.get_data_pool_name()])
+ self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+ 'recover_dentries', 'list',
+ '--alternate-pool', recovery_pool])
+
+ self.fs.data_scan(['init', '--force-init', '--filesystem',
+ self.fs.name])
+ self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
+ '--force-corrupt', '--force-init',
+ self.fs.get_data_pool_name()])
+ self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+ 'recover_dentries', 'list'])
+
+ self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal',
+ 'reset', '--force'])
+ self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
+ 'reset', '--force'])
+ self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
+ recovery_fs + ":0")
+
+ # Mark the MDS repaired
+ self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+ # Start the MDS
+ self.fs.mds_restart()
+ self.recovery_fs.mds_restart()
+ self.fs.wait_for_daemons()
+ self.recovery_fs.wait_for_daemons()
+ for mds_id in self.recovery_fs.mds_ids:
+ self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
+ 'injectargs', '--debug-mds=20')
+ self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
+ 'scrub_path', '/',
+ 'recursive', 'repair')
+ log.info(str(self.mds_cluster.status()))
+
+ # Mount a client
+ self.mount_a.mount()
+ self.mount_b.mount(mount_fs_name=recovery_fs)
+ self.mount_a.wait_until_mounted()
+ self.mount_b.wait_until_mounted()
+
+ # See that the files are present and correct
+ errors = workload.validate()
+ if errors:
+ log.error("Validation errors found: {0}".format(len(errors)))
+ for e in errors:
+ log.error(e.exception)
+ log.error(e.backtrace)
+ raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+ errors[0].exception, errors[0].backtrace
+ ))
+
+ def test_rebuild_simple(self):
+ self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
+ self.mount_a, self.mount_b))
import logging
+import json
from tasks.mgr.mgr_test_case import MgrTestCase
timeout=10
)
+ # Both daemons should have fully populated metadata
+ # (regression test for http://tracker.ceph.com/issues/21260)
+ meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ "mgr", "metadata"))
+ id_to_meta = dict([(i['id'], i) for i in meta])
+ for i in [original_active] + original_standbys:
+ self.assertIn(i, id_to_meta)
+ self.assertIn('ceph_version', id_to_meta[i])
+
# We should be able to fail back over again: the exercises
# our re-initialization of the python runtime within
# a single process lifetime.
#
# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
#
+# to run this standalone:
+# python qa/tasks/radosgw_admin.py [USER] HOSTNAME
+#
import copy
import json
import logging
import time
import datetime
+import Queue
+import bunch
+
+import sys
from cStringIO import StringIO
import boto.exception
import boto.s3.connection
import boto.s3.acl
+from boto.utils import RequestHook
import httplib2
log = logging.getLogger(__name__)
+def usage_acc_findentry2(entries, user, add=True):
+    # Return the usage entry dict for `user` from the `entries` list.
+    # When `add` is True a fresh entry ({'user': ..., 'buckets': []}) is
+    # created and appended on a miss; otherwise None is returned.
+    for e in entries:
+        if e['user'] == user:
+            return e
+    if not add:
+        return None
+    e = {'user': user, 'buckets': []}
+    entries.append(e)
+    return e
+def usage_acc_findsum2(summaries, user, add=True):
+    # Return the per-user summary dict for `user` from `summaries`.
+    # When `add` is True a zeroed summary (empty categories list plus a
+    # zeroed 'total' counter set) is created and appended on a miss;
+    # otherwise None is returned.
+    for e in summaries:
+        if e['user'] == user:
+            return e
+    if not add:
+        return None
+    e = {'user': user, 'categories': [],
+        'total': {'bytes_received': 0,
+            'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 }}
+    summaries.append(e)
+    return e
+def usage_acc_update2(x, out, b_in, err):
+    # Fold one request into counter dict `x`.  `out` is the request's
+    # Content-Length and `b_in` the response's content-length as captured
+    # by the request hook.  NOTE(review): `b_in` feeds 'bytes_sent' and
+    # `out` feeds 'bytes_received' -- presumably matching how the rgw
+    # usage log labels directions; confirm against rgw's accounting.
+    x['bytes_sent'] += b_in
+    x['bytes_received'] += out
+    x['ops'] += 1
+    if not err:
+        x['successful_ops'] += 1
+def usage_acc_validate_fields(r, x, x2, what):
+    # Compare locally-expected counters `x` against server-reported `x2`,
+    # appending human-readable problem strings to the shared list `r`.
+    # A server count lower than expected is flagged; a missing/bad field
+    # aborts the field loop immediately via the bare `return`.
+    q=[]
+    for field in ['bytes_sent', 'bytes_received', 'ops', 'successful_ops']:
+        try:
+            if x2[field] < x[field]:
+                q.append("field %s: %d < %d" % (field, x2[field], x[field]))
+        except Exception as ex:
+            r.append( "missing/bad field " + field + " in " + what + " " + str(ex))
+            return
+    if len(q) > 0:
+        r.append("incomplete counts in " + what + ": " + ", ".join(q))
+class usage_acc:
+    # Client-side accumulator of expected rgw usage statistics, kept in
+    # the same shape as 'radosgw-admin usage show' output so the two can
+    # be compared directly later.
+    def __init__(self):
+        # 'entries': per-user records with per-bucket category counters;
+        # 'summary': per-user category counters plus a running total.
+        self.results = {'entries': [], 'summary': []}
+    def findentry(self, user):
+        # Find-or-create the entry record for `user`.
+        return usage_acc_findentry2(self.results['entries'], user)
+    def findsum(self, user):
+        # Find-or-create the summary record for `user`.
+        return usage_acc_findsum2(self.results['summary'], user)
+    def e2b(self, e, bucket, add=True):
+        # Return the bucket record named `bucket` within entry `e`,
+        # creating it when `add` is True; otherwise None on a miss.
+        for b in e['buckets']:
+            if b['bucket'] == bucket:
+                return b
+        if not add:
+            return None
+        b = {'bucket': bucket, 'categories': []}
+        e['buckets'].append(b)
+        return b
+    def c2x(self, c, cat, add=True):
+        # Return the counter record for category `cat` in category list
+        # `c`, creating a zeroed one when `add` is True; otherwise None.
+        for x in c:
+            if x['category'] == cat:
+                return x
+        if not add:
+            return None
+        x = {'bytes_received': 0, 'category': cat,
+            'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 }
+        c.append(x)
+        return x
+    def update(self, c, cat, user, out, b_in, err):
+        # Fold one request into category list `c`; remember the bucket
+        # owner on the first successful create_bucket.
+        x = self.c2x(c, cat)
+        usage_acc_update2(x, out, b_in, err)
+        if not err and cat == 'create_bucket' and not x.has_key('owner'):
+            x['owner'] = user
+    def make_entry(self, cat, bucket, user, out, b_in, err):
+        # Record one request in the per-bucket entry AND in the user
+        # summary (per-category plus total).  Failed bucket creates are
+        # skipped entirely.
+        if cat == 'create_bucket' and err:
+            return
+        e = self.findentry(user)
+        b = self.e2b(e, bucket)
+        self.update(b['categories'], cat, user, out, b_in, err)
+        s = self.findsum(user)
+        x = self.c2x(s['categories'], cat)
+        usage_acc_update2(x, out, b_in, err)
+        x = s['total']
+        usage_acc_update2(x, out, b_in, err)
+    def generate_make_entry(self):
+        # Unbound-style adapter so requestlog_queue can invoke make_entry
+        # without holding a reference to this instance's method directly.
+        return lambda cat,bucket,user,out,b_in,err: self.make_entry(cat, bucket, user, out, b_in, err)
+    def get_usage(self):
+        # Expected usage, in the same shape the server reports.
+        return self.results
+ def compare_results(self, results):
+ if not results.has_key('entries') or not results.has_key('summary'):
+ return ['Missing entries or summary']
+ r = []
+ for e in self.results['entries']:
+ try:
+ e2 = usage_acc_findentry2(results['entries'], e['user'], False)
+ except Exception as ex:
+ r.append("malformed entry looking for user "
+ + e['user'] + " " + str(ex))
+ break
+ if e2 == None:
+ r.append("missing entry for user " + e['user'])
+ continue
+ for b in e['buckets']:
+ c = b['categories']
+ if b['bucket'] == 'nosuchbucket':
+ print "got here"
+ try:
+ b2 = self.e2b(e2, b['bucket'], False)
+ if b2 != None:
+ c2 = b2['categories']
+ except Exception as ex:
+ r.append("malformed entry looking for bucket "
+ + b['bucket'] + " in user " + e['user'] + " " + str(ex))
+ break
+ if b2 == None:
+ r.append("can't find bucket " + b['bucket']
+ + " in user " + e['user'])
+ continue
+ for x in c:
+ try:
+ x2 = self.c2x(c2, x['category'], False)
+ except Exception as ex:
+ r.append("malformed entry looking for "
+ + x['category'] + " in bucket " + b['bucket']
+ + " user " + e['user'] + " " + str(ex))
+ break
+ usage_acc_validate_fields(r, x, x2, "entry: category "
+ + x['category'] + " bucket " + b['bucket']
+ + " in user " + e['user'])
+ for s in self.results['summary']:
+ c = s['categories']
+ try:
+ s2 = usage_acc_findsum2(results['summary'], s['user'], False)
+ except Exception as ex:
+ r.append("malformed summary looking for user " + e['user']
+ + " " + str(ex))
+ break
+ if s2 == None:
+ r.append("missing summary for user " + e['user'] + " " + str(ex))
+ continue
+ try:
+ c2 = s2['categories']
+ except Exception as ex:
+ r.append("malformed summary missing categories for user "
+ + e['user'] + " " + str(ex))
+ break
+ for x in c:
+ try:
+ x2 = self.c2x(c2, x['category'], False)
+ except Exception as ex:
+ r.append("malformed summary looking for "
+ + x['category'] + " user " + e['user'] + " " + str(ex))
+ break
+ usage_acc_validate_fields(r, x, x2, "summary: category "
+ + x['category'] + " in user " + e['user'])
+ x = s['total']
+ try:
+ x2 = s2['total']
+ except Exception as ex:
+ r.append("malformed summary looking for totals for user "
+ + e['user'] + " " + str(ex))
+ break
+ usage_acc_validate_fields(r, x, x2, "summary: totals for user" + e['user'])
+ return r
+
+def ignore_this_entry(cat, bucket, user, out, b_in, err):
+    # No-op adder for requestlog_queue.log_and_clear, used to drain
+    # captured requests without counting them toward expected usage.
+    pass
+class requestlog_queue():
+    # Captures boto request/response pairs (installed as a boto request
+    # hook) on a bounded queue, then replays them into a usage adder.
+    def __init__(self, add):
+        # `add` is the default accounting callback, e.g. the lambda from
+        # usage_acc.generate_make_entry().
+        self.q = Queue.Queue(1000)
+        self.adder = add
+    def handle_request_data(self, request, response, error=False):
+        # boto hook entry point.  Any HTTP status outside [200, 400) is
+        # recorded as an error for accounting purposes.
+        now = datetime.datetime.now()
+        if error:
+            pass
+        elif response.status < 200 or response.status >= 400:
+            error = True
+        self.q.put(bunch.Bunch({'t': now, 'o': request, 'i': response, 'e': error}))
+    def clear(self):
+        # Drop all queued records; takes the queue's internal mutex so
+        # the underlying deque can be cleared safely.
+        with self.q.mutex:
+            self.q.queue.clear()
+    def log_and_clear(self, cat, bucket, user, add_entry = None):
+        # Drain the queue, logging each captured request and feeding it
+        # to `add_entry` (defaults to the adder given at construction)
+        # as category `cat` against `bucket`/`user`.
+        while not self.q.empty():
+            j = self.q.get()
+            bytes_out = 0
+            if 'Content-Length' in j.o.headers:
+                bytes_out = int(j.o.headers['Content-Length'])
+            bytes_in = 0
+            if 'content-length' in j.i.msg.dict:
+                bytes_in = int(j.i.msg.dict['content-length'])
+            log.info('RL: %s %s %s bytes_out=%d bytes_in=%d failed=%r'
+                % (cat, bucket, user, bytes_out, bytes_in, j.e))
+            if add_entry == None:
+                add_entry = self.adder
+            add_entry(cat, bucket, user, bytes_out, bytes_in, j.e)
+
def create_presigned_url(conn, method, bucket_name, key_name, expiration):
return conn.generate_url(expires_in=expiration,
method=method,
calling_format=boto.s3.connection.OrdinaryCallingFormat(),
)
+ acc = usage_acc()
+ rl = requestlog_queue(acc.generate_make_entry())
+ connection.set_request_hook(rl)
+ connection2.set_request_hook(rl)
+
# legend (test cases can be easily grep-ed out)
# TESTCASE 'testname','object','method','operation','assertion'
+
+ # TESTCASE 'usage-show0' 'usage' 'show' 'all usage' 'succeeds'
+ (err, summary0) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+
# TESTCASE 'info-nosuch','user','info','non-existent user','fails'
(err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
assert err
# create a first bucket
bucket = connection.create_bucket(bucket_name)
+ rl.log_and_clear("create_bucket", bucket_name, user1)
+
# TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
(err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
assert len(out) == 1
assert out[0] == bucket_name
+ bucket_list = connection.get_all_buckets()
+ assert len(bucket_list) == 1
+ assert bucket_list[0].name == bucket_name
+
+ rl.log_and_clear("list_buckets", '', user1)
+
# TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list'
(err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True)
assert len(out) >= 1
# TESTCASE 'max-bucket-limit,'bucket','create','4 buckets','5th bucket fails due to max buckets == 4'
bucket2 = connection.create_bucket(bucket_name + '2')
+ rl.log_and_clear("create_bucket", bucket_name + '2', user1)
bucket3 = connection.create_bucket(bucket_name + '3')
+ rl.log_and_clear("create_bucket", bucket_name + '3', user1)
bucket4 = connection.create_bucket(bucket_name + '4')
+ rl.log_and_clear("create_bucket", bucket_name + '4', user1)
# the 5th should fail.
failed = False
try:
except Exception:
failed = True
assert failed
+ rl.log_and_clear("create_bucket", bucket_name + '5', user1)
# delete the buckets
bucket2.delete()
+ rl.log_and_clear("delete_bucket", bucket_name + '2', user1)
bucket3.delete()
+ rl.log_and_clear("delete_bucket", bucket_name + '3', user1)
bucket4.delete()
+ rl.log_and_clear("delete_bucket", bucket_name + '4', user1)
# TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
(err, out) = rgwadmin(ctx, client, [
# use some space
key = boto.s3.key.Key(bucket)
key.set_contents_from_string('one')
+ rl.log_and_clear("put_obj", bucket_name, user1)
# TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
(err, out) = rgwadmin(ctx, client, [
# reclaim it
key.delete()
+ rl.log_and_clear("delete_obj", bucket_name, user1)
# TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error'
(err, out) = rgwadmin(ctx, client,
denied = True
assert not denied
+ rl.log_and_clear("put_obj", bucket_name, user1)
# delete the object
key.delete()
+ rl.log_and_clear("delete_obj", bucket_name, user1)
# link the bucket to another user
(err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)],
object_name = 'four'
key = boto.s3.key.Key(bucket, object_name)
key.set_contents_from_string(object_name)
+ rl.log_and_clear("put_obj", bucket_name, user1)
+
+ # fetch it too (for usage stats presently)
+ s = key.get_contents_as_string()
+ rl.log_and_clear("get_obj", bucket_name, user1)
+ assert s == object_name
+ # list bucket too (for usage stats presently)
+ keys = list(bucket.list())
+ rl.log_and_clear("list_bucket", bucket_name, user1)
+ assert len(keys) == 1
+ assert keys[0].name == object_name
# now delete it
(err, out) = rgwadmin(ctx, client,
# TODO: show log by bucket+date
- # need to wait for all usage data to get flushed, should take up to 30 seconds
- timestamp = time.time()
- while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj']) # last operation we did is delete obj, wait for it to flush
- if get_user_successful_ops(out, user1) > 0:
- break
- time.sleep(1)
-
- assert time.time() - timestamp <= (20 * 60)
-
- # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
- (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
- assert len(out['entries']) > 0
- assert len(out['summary']) > 0
-
- user_summary = get_user_summary(out, user1)
-
- total = user_summary['total']
- assert total['successful_ops'] > 0
-
- # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
- check_status=True)
- assert len(out['entries']) > 0
- assert len(out['summary']) > 0
- user_summary = out['summary'][0]
- for entry in user_summary['categories']:
- assert entry['successful_ops'] > 0
- assert user_summary['user'] == user1
-
- # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
- test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
- for cat in test_categories:
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
- check_status=True)
- assert len(out['summary']) > 0
- user_summary = out['summary'][0]
- assert user_summary['user'] == user1
- assert len(user_summary['categories']) == 1
- entry = user_summary['categories'][0]
- assert entry['category'] == cat
- assert entry['successful_ops'] > 0
-
- # the usage flush interval is 30 seconds, wait that much an then some
- # to make sure everything has been flushed
- time.sleep(35)
-
- # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
- (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
- check_status=True)
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
- check_status=True)
- assert len(out['entries']) == 0
- assert len(out['summary']) == 0
-
# TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
(err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
check_status=True)
# TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+ denied = False
try:
key = boto.s3.key.Key(bucket)
key.set_contents_from_string('five')
except boto.exception.S3ResponseError as e:
+ denied = True
assert e.status == 403
+ assert denied
+ rl.log_and_clear("put_obj", bucket_name, user1)
+
# TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
(err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1],
check_status=True)
# TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
key = boto.s3.key.Key(bucket)
key.set_contents_from_string('six')
+ rl.log_and_clear("put_obj", bucket_name, user1)
# TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection'
big_key = boto.s3.key.Key(bucket)
big_key.set_contents_from_string(test_string)
+ rl.log_and_clear("put_obj", bucket_name, user1)
# now delete the head
big_key.delete()
+ rl.log_and_clear("delete_obj", bucket_name, user1)
# wait a bit to give the garbage collector time to cycle
time.sleep(15)
bucket.delete()
except boto.exception.S3ResponseError as e:
assert e.status == 409
+ rl.log_and_clear("delete_bucket", bucket_name, user1)
key.delete()
+ rl.log_and_clear("delete_obj", bucket_name, user1)
bucket.delete()
+ rl.log_and_clear("delete_bucket", bucket_name, user1)
# TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
bucket = connection.create_bucket(bucket_name)
+ rl.log_and_clear("create_bucket", bucket_name, user1)
# create an object
key = boto.s3.key.Key(bucket)
key.set_contents_from_string('seven')
+ rl.log_and_clear("put_obj", bucket_name, user1)
# should be private already but guarantee it
key.set_acl('private')
+ rl.log_and_clear("put_acls", bucket_name, user1)
(err, out) = rgwadmin(ctx, client,
['policy', '--bucket', bucket.name, '--object', key.key],
check_status=True, format='xml')
acl = get_acl(key)
+ rl.log_and_clear("get_acls", bucket_name, user1)
assert acl == out.strip('\n')
# add another grantee by making the object public read
key.set_acl('public-read')
+ rl.log_and_clear("put_acls", bucket_name, user1)
(err, out) = rgwadmin(ctx, client,
['policy', '--bucket', bucket.name, '--object', key.key],
check_status=True, format='xml')
acl = get_acl(key)
+ rl.log_and_clear("get_acls", bucket_name, user1)
assert acl == out.strip('\n')
# TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
bucket = connection.create_bucket(bucket_name)
+ rl.log_and_clear("create_bucket", bucket_name, user1)
key_name = ['eight', 'nine', 'ten', 'eleven']
for i in range(4):
key = boto.s3.key.Key(bucket)
key.set_contents_from_string(key_name[i])
+ rl.log_and_clear("put_obj", bucket_name, user1)
(err, out) = rgwadmin(ctx, client,
['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'],
# TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
bucket = connection.create_bucket(bucket_name)
+ rl.log_and_clear("create_bucket", bucket_name, user1)
key = boto.s3.key.Key(bucket)
(err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
# TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
bucket = connection.create_bucket(bucket_name)
+ rl.log_and_clear("create_bucket", bucket_name, user1)
key = boto.s3.key.Key(bucket)
key.set_contents_from_string('twelve')
+ rl.log_and_clear("put_obj", bucket_name, user1)
+
+ time.sleep(35)
+
+ # need to wait for all usage data to get flushed, should take up to 30 seconds
+ timestamp = time.time()
+    while time.time() - timestamp <= (2 * 60): # wait up to 2 minutes
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj']) # one of the operations we did is delete_obj, should be present.
+ if get_user_successful_ops(out, user1) > 0:
+ break
+ time.sleep(1)
+
+ assert time.time() - timestamp <= (20 * 60)
+
+ # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+ assert len(out['entries']) > 0
+ assert len(out['summary']) > 0
+
+ r = acc.compare_results(out)
+ if len(r) != 0:
+ sys.stderr.write(("\n".join(r))+"\n")
+ assert(len(r) == 0)
+
+ user_summary = get_user_summary(out, user1)
+
+ total = user_summary['total']
+ assert total['successful_ops'] > 0
+
+ # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+ check_status=True)
+ assert len(out['entries']) > 0
+ assert len(out['summary']) > 0
+ user_summary = out['summary'][0]
+ for entry in user_summary['categories']:
+ assert entry['successful_ops'] > 0
+ assert user_summary['user'] == user1
+
+ # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+ test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+ for cat in test_categories:
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
+ check_status=True)
+ assert len(out['summary']) > 0
+ user_summary = out['summary'][0]
+ assert user_summary['user'] == user1
+ assert len(user_summary['categories']) == 1
+ entry = user_summary['categories'][0]
+ assert entry['category'] == cat
+ assert entry['successful_ops'] > 0
+
+ # should be all through with connection. (anything using connection
+ # should be BEFORE the usage stuff above.)
+ rl.log_and_clear("(before-close)", '-', '-', ignore_this_entry)
+ connection.close()
+ connection = None
+
+    # the usage flush interval is 30 seconds, wait that much and then some
+    # to make sure everything has been flushed
+ time.sleep(35)
+
+ # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+ (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
+ check_status=True)
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+ check_status=True)
+ assert len(out['entries']) == 0
+ assert len(out['summary']) == 0
(err, out) = rgwadmin(ctx, client,
['user', 'rm', '--uid', user1, '--purge-data' ],
(err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
assert len(out) > 0
assert len(out['placement_pools']) == orig_placement_pools + 1
+
+ zonecmd = ['zone', 'placement', 'rm',
+ '--rgw-zone', 'default',
+ '--placement-id', 'new-placement']
+
+ (err, out) = rgwadmin(ctx, client, zonecmd, check_status=True)
+
+import sys
+from tasks.radosgw_admin import task
+from teuthology.config import config
+from teuthology.orchestra import cluster, remote
+import argparse;
+
+def main():
+    # Standalone driver: run the radosgw_admin task against a live host
+    # without going through teuthology scheduling.
+    # usage: radosgw_admin.py [user] host
+    if len(sys.argv) == 3:
+        user = sys.argv[1] + "@"
+        host = sys.argv[2]
+    elif len(sys.argv) == 2:
+        user = ""
+        host = sys.argv[1]
+    else:
+        sys.stderr.write("usage: radosgw_admin.py [user] host\n")
+        exit(1)
+    client0 = remote.Remote(user + host)
+    # Reuse the teuthology config object as a minimal ctx.
+    ctx = config
+    ctx.cluster=cluster.Cluster(remotes=[(client0,
+        [ 'ceph.client.rgw.%s' % (host), ]),])
+
+    ctx.rgw = argparse.Namespace()
+    endpoints = {}
+    # assumes the gateway listens on port 80 of the target host
+    endpoints['ceph.client.rgw.%s' % host] = (host, 80)
+    ctx.rgw.role_endpoints = endpoints
+    ctx.rgw.realm = None
+    ctx.rgw.regions = {'region0': { 'api name': 'api1',
+        'is master': True, 'master zone': 'r0z0',
+        'zones': ['r0z0', 'r0z1'] }}
+    ctx.rgw.config = {'ceph.client.rgw.%s' % host: {'system user': {'name': '%s-system-user' % host}}}
+    task(config, None)
+    exit()
import contextlib
import logging
import os
+import tempfile
from cStringIO import StringIO
from teuthology.orchestra import run
scratch_dev: 'scratch_dev'
fs_type: 'xfs'
tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+ exclude:
+ - generic/42
randomize: true
"""
with parallel() as p:
fs_type = properties.get('fs_type')
tests = properties.get('tests')
+ exclude_list = properties.get('exclude')
randomize = properties.get('randomize')
-
(remote,) = ctx.cluster.only(role).remotes.keys()
# Fetch the test script
test_root = teuthology.get_testdir(ctx)
- test_script = 'run_xfstests_krbd.sh'
+ test_script = 'run_xfstests.sh'
test_path = os.path.join(test_root, test_script)
xfstests_url = properties.get('xfstests_url')
log.info(' scratch device: {dev}'.format(dev=scratch_dev))
log.info(' using fs_type: {fs_type}'.format(fs_type=fs_type))
log.info(' tests to run: {tests}'.format(tests=tests))
+ log.info(' exclude list: {}'.format(' '.join(exclude_list)))
log.info(' randomize: {randomize}'.format(randomize=randomize))
+ if exclude_list:
+ with tempfile.NamedTemporaryFile(bufsize=0, prefix='exclude') as exclude_file:
+ for test in exclude_list:
+ exclude_file.write("{}\n".format(test))
+ remote.put_file(exclude_file.name, exclude_file.name)
+
# Note that the device paths are interpreted using
# readlink -f <path> in order to get their canonical
# pathname (so it matches what the kernel remembers).
args = [
'/usr/bin/sudo',
'TESTDIR={tdir}'.format(tdir=testdir),
- 'URL_BASE={url}'.format(url=xfstests_url),
'adjust-ulimits',
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
'-t', test_dev,
'-s', scratch_dev,
]
+ if exclude_list:
+ args.extend(['-x', exclude_file.name])
if randomize:
args.append('-r')
if tests:
scratch_format: 1
fs_type: 'xfs'
tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+ exclude:
+ - generic/42
randomize: true
xfstests_branch: master
xfstests_url: 'https://raw.github.com/ceph/branch/master/qa'
fs_type=properties.get('fs_type', 'xfs'),
randomize=properties.get('randomize', False),
tests=properties.get('tests'),
+ exclude=properties.get('exclude', []),
xfstests_url=xfstests_url,
)
create_replicated_pool(remote, data_pool, 64, cluster_name, 'rgw')
if ctx.rgw.cache_pools:
create_cache_pool(remote, data_pool, data_pool + '.cache', 64,
- 64*1024*1024, cluster_name, 'rgw')
+ 64*1024*1024, cluster_name)
log.debug('Pools created')
yield
'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
])
-def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph", application=None):
+def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"):
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum), '--cluster', cluster_name
])
'sudo', 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name,
str(size), '--cluster', cluster_name
])
- if application:
- remote.run(args=[
- 'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
- ])
def cmd_erasure_code_profile(profile_name, profile):
"""
# FIXME: unimplemented
pass
- def newfs(self, name):
- return LocalFilesystem(self._ctx, create=name)
+ def newfs(self, name='cephfs', create=True):
+ return LocalFilesystem(self._ctx, name=name, create=create)
class LocalMgrCluster(LocalCephCluster, MgrCluster):
class LocalFilesystem(Filesystem, LocalMDSCluster):
- def __init__(self, ctx, fscid=None, create=None):
+ def __init__(self, ctx, fscid=None, name='cephfs', create=False):
# Deliberately skip calling parent constructor
self._ctx = ctx
self.id = None
self.name = None
self.metadata_pool_name = None
+ self.metadata_overlay = False
+ self.data_pool_name = None
self.data_pools = None
# Hack: cheeky inspection of ceph.conf to see what MDSs exist
self._conf = defaultdict(dict)
- if create is not None:
+ if name is not None:
if fscid is not None:
raise RuntimeError("cannot specify fscid when creating fs")
- if create is True:
- self.name = 'cephfs'
- else:
- self.name = create
- self.create()
- elif fscid is not None:
- self.id = fscid
- self.getinfo(refresh=True)
+ if create and not self.legacy_configured():
+ self.create()
+ else:
+ if fscid is not None:
+ self.id = fscid
+ self.getinfo(refresh=True)
# Stash a reference to the first created filesystem on ctx, so
# that if someone drops to the interactive shell they can easily
ceph osd pool application set app_for_test rbd key1 value1
ceph osd pool application set app_for_test rbd key2 value2
ceph osd pool application set app_for_test rgw key1 value1
+ ceph osd pool application get app_for_test rbd key1 | grep 'value1'
+ ceph osd pool application get app_for_test rbd key2 | grep 'value2'
+ ceph osd pool application get app_for_test rgw key1 | grep 'value1'
ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1","key2":"value2"},"rgw":{"key1":"value1"}}'
ceph osd crush rule ls | grep foo
ceph osd crush rule rename foo foo-asdf
+ceph osd crush rule rename foo foo-asdf # idempotent
ceph osd crush rule rename bar bar-asdf
ceph osd crush rule ls | grep 'foo-asdf'
ceph osd crush rule ls | grep 'bar-asdf'
ceph osd crush rule rm foo 2>&1 | grep 'does not exist'
ceph osd crush rule rm bar 2>&1 | grep 'does not exist'
ceph osd crush rule rename foo-asdf foo
+ceph osd crush rule rename foo-asdf foo # idempotent
ceph osd crush rule rename bar-asdf bar
ceph osd crush rule ls | expect_false grep 'foo-asdf'
ceph osd crush rule ls | expect_false grep 'bar-asdf'
# 1M sparse, 1M data
rbd rm sparse1 || true
rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -i '2048k'
+rbd ls -l | grep sparse1 | grep -Ei '(2M|2048k)'
[ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
# export, compare contents and on-disk size
# 1M data, 1M sparse
rbd rm sparse2 || true
rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -i '2048k'
+rbd ls -l | grep sparse2 | grep -Ei '(2M|2048k)'
[ $tiered -eq 1 -o "$(objects sparse2)" = '0' ]
rbd export sparse2 ${TMPDIR}/sparse2.out
compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
truncate ${TMPDIR}/sparse1 -s 10M
# import from stdin just for fun, verify still sparse
rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -i '10240k'
+rbd ls -l | grep sparse1 | grep -Ei '(10M|10240k)'
[ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
rbd export sparse1 ${TMPDIR}/sparse1.out
compare_files_and_ondisk_sizes ${TMPDIR}/sparse1 ${TMPDIR}/sparse1.out
dd if=/dev/urandom bs=2M count=1 of=${TMPDIR}/sparse2 oflag=append conv=notrunc
# again from stding
rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -i '4096k'
+rbd ls -l | grep sparse2 | grep -Ei '(4M|4096k)'
[ $tiered -eq 1 -o "$(objects sparse2)" = '0 2 3' ]
rbd export sparse2 ${TMPDIR}/sparse2.out
compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
type var_run_t;
type random_device_t;
type urandom_device_t;
- type setfiles_t;
+ type setfiles_t;
+ type nvme_device_t;
class sock_file unlink;
class lnk_file read;
class dir read;
class file { getattr read open };
+ class blk_file { getattr ioctl open read write };
}
########################################
sysnet_dns_name_resolve(ceph_t)
+allow ceph_t nvme_device_t:blk_file { getattr ioctl open read write };
+
# basis for future security review
allow ceph_t ceph_var_run_t:sock_file { create unlink write setattr };
allow ceph_t self:capability { sys_rawio chown };
-32ce2a3ae5239ee33d6150705cdb24d43bab910c
-v12.2.0
+3e7492b9ada8bdc9a5cd0feafd42fbca27f9c38e
+v12.2.1
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})
endif(WITH_CCACHE AND CCACHE_FOUND)
- # We really want to have the CRC32 calculation in RocksDB accelerated
- # with SSE 4.2. For details refer to rocksdb/util/crc32c.cc.
- if (HAVE_INTEL_SSE4_2)
- list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_CXX_FLAGS=${SIMD_COMPILE_FLAGS})
- else()
- list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_SSE42=OFF)
- endif()
+ # SSE 4.2 is enabled by default in rocksdb's crc32c. For details refer to
+ # rocksdb/util/crc32c.cc.
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_AR=${CMAKE_AR})
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
from __future__ import print_function
import argparse
-import os
from textwrap import dedent
from ceph_volume import process, conf, decorators
-from ceph_volume.util import system
+from ceph_volume.util import system, disk
from ceph_volume.systemd import systemctl
from . import api
# blow up with a KeyError if this doesn't exist
osd_fsid = osd_lv.tags['ceph.osd_fsid']
if not osd_journal_lv:
- osd_journal = osd_lv.tags.get('ceph.journal_device')
+        # must be a disk partition, by querying blkid by the uuid we are ensuring that the
+        # device path is always correct
+ osd_journal = disk.get_device_from_partuuid(osd_lv.tags['ceph.journal_uuid'])
else:
- osd_journal = osd_journal.lv_path
+ osd_journal = osd_lv.tags['ceph.journal_device']
if not osd_journal:
raise RuntimeError('unable to detect an lv or device journal for OSD %s' % osd_id)
if not system.is_mounted(source, destination=destination):
process.run(['sudo', 'mount', '-v', source, destination])
- # ensure that the symlink for the journal is there
- if not os.path.exists(osd_journal):
- source = osd_journal
- destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id)
- process.run(['sudo', 'ln', '-s', source, destination])
+ # always re-do the symlink regardless if it exists, so that the journal
+ # device path that may have changed can be mapped correctly every time
+ destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id)
+ process.run(['sudo', 'ln', '-snf', osd_journal, destination])
# make sure that the journal has proper permissions
system.chown(osd_journal)
def activate(self, args):
lvs = api.Volumes()
# filter them down for the OSD ID and FSID we need to activate
- lvs.filter(lv_tags={'ceph.osd_id': args.osd_id, 'ceph.osd_fsid': args.osd_fsid})
+ if args.osd_id and args.osd_fsid:
+ lvs.filter(lv_tags={'ceph.osd_id': args.osd_id, 'ceph.osd_fsid': args.osd_fsid})
+ elif args.osd_fsid and not args.osd_id:
+ lvs.filter(lv_tags={'ceph.osd_fsid': args.osd_fsid})
if not lvs:
raise RuntimeError('could not find osd.%s with fsid %s' % (args.osd_id, args.osd_fsid))
activate_filestore(lvs)
set of utilities for interacting with LVM.
"""
from ceph_volume import process
-from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError
+from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
def _output_parser(output, fields):
;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
"""
- fields = 'lv_tags,lv_path,lv_name,vg_name'
+ fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
stdout, stderr, returncode = process.call(
['sudo', 'lvs', '--noheadings', '--separator=";"', '-o', fields]
)
return _output_parser(stdout, fields)
-def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+def get_api_pvs():
+ """
+ Return the list of physical volumes configured for lvm and available in the
+ system using flags to include common metadata associated with them like the uuid
+
+ Command and delimited output should look like::
+
+ $ sudo pvs --noheadings --separator=';' -o pv_name,pv_tags,pv_uuid
+ /dev/sda1;;
+ /dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
+
+ """
+ fields = 'pv_name,pv_tags,pv_uuid'
+
+ # note the use of `pvs -a` which will return every physical volume including
+ # ones that have not been initialized as "pv" by LVM
+ stdout, stderr, returncode = process.call(
+ ['sudo', 'pvs', '-a', '--noheadings', '--separator=";"', '-o', fields]
+ )
+
+ return _output_parser(stdout, fields)
+
+
+def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
"""
Return a matching lv for the current system, requiring ``lv_name``,
``vg_name``, ``lv_path`` or ``tags``. Raises an error if more than one lv
but it can also lead to multiple lvs being found, since a lot of metadata
is shared between lvs of a distinct OSD.
"""
- if not any([lv_name, vg_name, lv_path, lv_tags]):
+ if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
return None
lvs = Volumes()
- return lvs.get(lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_tags=lv_tags)
+ return lvs.get(
+ lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_uuid=lv_uuid,
+ lv_tags=lv_tags
+ )
+
+
+def get_pv(pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+ Return a matching pv (physical volume) for the current system, requiring
+ ``pv_name``, ``pv_uuid``, or ``pv_tags``. Raises an error if more than one
+ pv is found.
+ """
+ if not any([pv_name, pv_uuid, pv_tags]):
+ return None
+ pvs = PVolumes()
+ return pvs.get(pv_name=pv_name, pv_uuid=pv_uuid, pv_tags=pv_tags)
+
+
+def create_pv(device):
+ """
+ Create a physical volume from a device, useful when devices need to be later mapped
+ to journals.
+ """
+ process.run([
+ 'sudo',
+ 'pvcreate',
+ '-v', # verbose
+ '-f', # force it
+ '--yes', # answer yes to any prompts
+ device
+ ])
def create_lv(name, group, size=None, **tags):
# actual filtered list if any filters were applied
if vg_tags:
tag_filtered = []
- for k, v in vg_tags.items():
- for volume in filtered:
- if volume.tags.get(k) == str(v):
- if volume not in tag_filtered:
- tag_filtered.append(volume)
- # return the tag_filtered volumes here, the `filtered` list is no
- # longer useable
+ for volume in filtered:
+ matches = all(volume.tags.get(k) == str(v) for k, v in vg_tags.items())
+ if matches:
+ tag_filtered.append(volume)
return tag_filtered
return filtered
"""
self[:] = []
- def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+ def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
"""
The actual method that filters using a new list. Useful so that other
methods that do not want to alter the contents of the list (e.g.
if vg_name:
filtered = [i for i in filtered if i.vg_name == vg_name]
+ if lv_uuid:
+ filtered = [i for i in filtered if i.lv_uuid == lv_uuid]
+
if lv_path:
filtered = [i for i in filtered if i.lv_path == lv_path]
# actual filtered list if any filters were applied
if lv_tags:
tag_filtered = []
- for k, v in lv_tags.items():
- for volume in filtered:
- if volume.tags.get(k) == str(v):
- if volume not in tag_filtered:
- tag_filtered.append(volume)
- # return the tag_filtered volumes here, the `filtered` list is no
- # longer useable
+ for volume in filtered:
+ # all the tags we got need to match on the volume
+ matches = all(volume.tags.get(k) == str(v) for k, v in lv_tags.items())
+ if matches:
+ tag_filtered.append(volume)
return tag_filtered
return filtered
- def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+ def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
"""
Filter out volumes on top level attributes like ``lv_name`` or by
``lv_tags`` where a dict is required. For example, to find a volume
lv_tags={'ceph.osd_id': '0'}
"""
- if not any([lv_name, vg_name, lv_path, lv_tags]):
- raise TypeError('.filter() requires lv_name, vg_name, lv_path, or tags (none given)')
+ if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+ raise TypeError('.filter() requires lv_name, vg_name, lv_path, lv_uuid, or tags (none given)')
# first find the filtered volumes with the values in self
filtered_volumes = self._filter(
lv_name=lv_name,
vg_name=vg_name,
lv_path=lv_path,
+ lv_uuid=lv_uuid,
lv_tags=lv_tags
)
# then purge everything
# and add the filtered items
self.extend(filtered_volumes)
- def get(self, lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+ def get(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
"""
This is a bit expensive, since it will try to filter out all the
matching items in the list, filter them out applying anything that was
but it can also lead to multiple lvs being found, since a lot of metadata
is shared between lvs of a distinct OSD.
"""
- if not any([lv_name, vg_name, lv_path, lv_tags]):
+ if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
return None
lvs = self._filter(
lv_name=lv_name,
vg_name=vg_name,
lv_path=lv_path,
+ lv_uuid=lv_uuid,
lv_tags=lv_tags
)
if not lvs:
return lvs[0]
+class PVolumes(list):
+ """
+ A list of all known (physical) volumes for the current system, with the ability
+ to filter them via keyword arguments.
+ """
+
+ def __init__(self):
+ self._populate()
+
+ def _populate(self):
+ # get all the pvs in the current system
+ for pv_item in get_api_pvs():
+ self.append(PVolume(**pv_item))
+
+ def _purge(self):
+ """
+ Deplete all the items in the list, used internally only so that we can
+ dynamically allocate the items when filtering without the concern of
+ messing up the contents
+ """
+ self[:] = []
+
+ def _filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+ The actual method that filters using a new list. Useful so that other
+ methods that do not want to alter the contents of the list (e.g.
+ ``self.find``) can operate safely.
+ """
+ filtered = [i for i in self]
+ if pv_name:
+ filtered = [i for i in filtered if i.pv_name == pv_name]
+
+ if pv_uuid:
+ filtered = [i for i in filtered if i.pv_uuid == pv_uuid]
+
+ # at this point, `filtered` has either all the physical volumes in self
+ # or is an actual filtered list if any filters were applied
+ if pv_tags:
+ tag_filtered = []
+ for pvolume in filtered:
+ matches = all(pvolume.tags.get(k) == str(v) for k, v in pv_tags.items())
+ if matches:
+ tag_filtered.append(pvolume)
+ # return the tag_filtered pvolumes here, the `filtered` list is no
+ # longer usable
+ return tag_filtered
+
+ return filtered
+
+ def filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+ Filter out volumes on top level attributes like ``pv_name`` or by
+ ``pv_tags`` where a dict is required. For example, to find a physical volume
+ that has an OSD ID of 0, the filter would look like::
+
+ pv_tags={'ceph.osd_id': '0'}
+
+ """
+ if not any([pv_name, pv_uuid, pv_tags]):
+ raise TypeError('.filter() requires pv_name, pv_uuid, or pv_tags (none given)')
+ # first find the filtered volumes with the values in self
+ filtered_volumes = self._filter(
+ pv_name=pv_name,
+ pv_uuid=pv_uuid,
+ pv_tags=pv_tags
+ )
+ # then purge everything
+ self._purge()
+ # and add the filtered items
+ self.extend(filtered_volumes)
+
+ def get(self, pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+ This is a bit expensive, since it will try to filter out all the
+ matching items in the list, filter them out applying anything that was
+ added and return the matching item.
+
+ This method does *not* alter the list, and it will raise an error if
+ multiple pvs are matched
+
+ It is useful to use ``tags`` when trying to find a specific physical volume,
+ but it can also lead to multiple pvs being found, since a lot of metadata
+ is shared between pvs of a distinct OSD.
+ """
+ if not any([pv_name, pv_uuid, pv_tags]):
+ return None
+ pvs = self._filter(
+ pv_name=pv_name,
+ pv_uuid=pv_uuid,
+ pv_tags=pv_tags
+ )
+ if not pvs:
+ return None
+ if len(pvs) > 1:
+ raise MultiplePVsError(pv_name)
+ return pvs[0]
+
+
class VolumeGroup(object):
"""
Represents an LVM group, with some top-level attributes like ``vg_name``
'--addtag', '%s=%s' % (key, value), self.lv_path
]
)
+
+
+class PVolume(object):
+ """
+ Represents a Physical Volume from LVM, with some top-level attributes like
+ ``pv_name`` and parsed tags as a dictionary of key/value pairs.
+ """
+
+ def __init__(self, **kw):
+ for k, v in kw.items():
+ setattr(self, k, v)
+ self.pv_api = kw
+ self.name = kw['pv_name']
+ self.tags = parse_tags(kw['pv_tags'])
+
+ def __str__(self):
+ return '<%s>' % self.pv_api['pv_name']
+
+ def __repr__(self):
+ return self.__str__()
+
+ def set_tags(self, tags):
+ """
+ :param tags: A dictionary of tag names and values, like::
+
+ {
+ "ceph.osd_fsid": "aaa-fff-bbbb",
+ "ceph.osd_id": "0"
+ }
+
+ At the end of all modifications, the tags are refreshed to reflect
+ LVM's most current view.
+ """
+ for k, v in tags.items():
+ self.set_tag(k, v)
+ # after setting all the tags, refresh them for the current object, use the
+ # pv_* identifiers to filter because those shouldn't change
+ pv_object = get_pv(pv_name=self.pv_name, pv_uuid=self.pv_uuid)
+ self.tags = pv_object.tags
+
+ def set_tag(self, key, value):
+ """
+ Set the key/value pair as an LVM tag. Does not "refresh" the values of
+ the current object for its tags. Meant to be a "fire and forget" type
+ of modification.
+
+ **warning**: Altering tags on a PV has to be done ensuring that the
+ device is actually the one intended. ``pv_name`` is *not* a persistent
+ value, only ``pv_uuid`` is. Using ``pv_uuid`` is the best way to make
+ sure the device getting changed is the one needed.
+ """
+ # remove it first if it exists
+ if self.tags.get(key):
+ current_value = self.tags[key]
+ tag = "%s=%s" % (key, current_value)
+ process.call(['sudo', 'pvchange', '--deltag', tag, self.pv_name])
+
+ process.call(
+ [
+ 'sudo', 'pvchange',
+ '--addtag', '%s=%s' % (key, value), self.pv_name
+ ]
+ )
import os
from textwrap import dedent
from ceph_volume.util import prepare as prepare_utils
-from ceph_volume.util import system
-from ceph_volume import conf, decorators
+from ceph_volume.util import system, disk
+from ceph_volume import conf, decorators, terminal
from . import api
from .common import prepare_parser
def __init__(self, argv):
self.argv = argv
+ def get_journal_ptuuid(self, argument):
+ uuid = disk.get_partuuid(argument)
+ if not uuid:
+ terminal.error('blkid could not detect a PARTUUID for device: %s' % argument)
+ raise RuntimeError('unable to use device for a journal')
+ return uuid
+
def get_journal_lv(self, argument):
"""
Perform some parsing of the value of ``--journal`` so that the process
if not args.journal:
raise RuntimeError('--journal is required when using --filestore')
- journal_device = None
- journal_lv = self.get_journal_lv(args.journal)
- # check if we have an actual path to a device, which is allowed
- if not journal_lv:
- if os.path.exists(args.journal):
- journal_device = args.journal
- else:
- raise RuntimeError(
- '--journal specified an invalid or non-existent device: %s' % args.journal
- )
- # Otherwise the journal_device is the path to the lv
- else:
+ journal_lv = self.get_journal_lv(args.journal)
+ if journal_lv:
journal_device = journal_lv.lv_path
+ journal_uuid = journal_lv.lv_uuid
+ # we can only set tags on an lv, the pv (if any) can't as we
+ # aren't making it part of an lvm group (vg)
journal_lv.set_tags({
'ceph.type': 'journal',
'ceph.osd_fsid': fsid,
'ceph.osd_id': osd_id,
'ceph.cluster_fsid': cluster_fsid,
'ceph.journal_device': journal_device,
+ 'ceph.journal_uuid': journal_uuid,
'ceph.data_device': data_lv.lv_path,
+ 'ceph.data_uuid': data_lv.lv_uuid,
})
+ # allow a file
+ elif os.path.isfile(args.journal):
+ journal_uuid = ''
+ journal_device = args.journal
+
+ # otherwise assume this is a regular disk partition
+ else:
+ journal_uuid = self.get_journal_ptuuid(args.journal)
+ journal_device = args.journal
+
data_lv.set_tags({
'ceph.type': 'data',
'ceph.osd_fsid': fsid,
'ceph.osd_id': osd_id,
'ceph.cluster_fsid': cluster_fsid,
'ceph.journal_device': journal_device,
+ 'ceph.journal_uuid': journal_uuid,
'ceph.data_device': data_lv.lv_path,
+ 'ceph.data_uuid': data_lv.lv_uuid,
})
prepare_filestore(
def parse_osd_uuid(string):
osd_id = '%s-' % parse_osd_id(string)
# remove the id first
- osd_uuid = string.split(osd_id)[-1]
+ osd_uuid = string.split(osd_id, 1)[-1]
if not osd_uuid:
raise SuffixParsingError('OSD uuid', string)
return osd_uuid
return 'This command needs to be executed with sudo or as root'
+class MultiplePVsError(Exception):
+
+ def __init__(self, pv_name):
+ self.pv_name = pv_name
+
+ def __str__(self):
+ msg = "Got more than 1 result looking for physical volume: %s" % self.pv_name
+ return msg
+
+
class MultipleLVsError(Exception):
def __init__(self, lv_name, lv_path):
import pytest
+from ceph_volume.devices.lvm import api
class Capture(object):
@pytest.fixture
def capture():
return Capture()
+
+
+@pytest.fixture
+def volumes(monkeypatch):
+ monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
+ volumes = api.Volumes()
+ volumes._purge()
+ return volumes
+
+
+@pytest.fixture
+def volume_groups(monkeypatch):
+ monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
+ vgs = api.VolumeGroups()
+ vgs._purge()
+ return vgs
+
+
+@pytest.fixture
+def is_root(monkeypatch):
+ """
+ Patch ``os.getuid()`` so that ceph-volume's decorators that ensure a user
+ is root (or is sudoing to superuser) can continue as-is
+ """
+ monkeypatch.setattr('os.getuid', lambda: 0)
--- /dev/null
+import pytest
+from ceph_volume.devices.lvm import activate, api
+
+
+class Args(object):
+
+ def __init__(self, **kw):
+ for k, v in kw.items():
+ setattr(self, k, v)
+
+
+class TestActivate(object):
+
+ # these tests are very functional, hence the heavy patching, it is hard to
+ # test the negative side effect with an actual functional run, so we must
+ # setup a perfect scenario for this test to check it can really work
+ # with/without osd_id
+ def test_no_osd_id_matches_fsid(self, is_root, volumes, monkeypatch, capture):
+ FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.osd_fsid=1234")
+ volumes.append(FooVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ monkeypatch.setattr(activate, 'activate_filestore', capture)
+ args = Args(osd_id=None, osd_fsid='1234')
+ activate.Activate([]).activate(args)
+ assert capture.calls[0]['args'][0] == [FooVolume]
+
+ def test_no_osd_id_no_matching_fsid(self, is_root, volumes, monkeypatch, capture):
+ FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.osd_fsid=11234")
+ volumes.append(FooVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ monkeypatch.setattr(activate, 'activate_filestore', capture)
+ args = Args(osd_id=None, osd_fsid='1234')
+ with pytest.raises(RuntimeError):
+ activate.Activate([]).activate(args)
return volumes
+@pytest.fixture
+def pvolumes(monkeypatch):
+ monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+ pvolumes = api.PVolumes()
+ pvolumes._purge()
+ return pvolumes
+
+
@pytest.fixture
def volume_groups(monkeypatch):
monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
monkeypatch.setattr(api, 'Volumes', lambda: volumes)
assert api.get_lv(lv_name='foo') == FooVolume
+ def test_single_lv_is_matched_by_uuid(self, volumes, monkeypatch):
+ FooVolume = api.Volume(
+ lv_name='foo', lv_path='/dev/vg/foo',
+ lv_uuid='1111', lv_tags="ceph.type=data")
+ volumes.append(FooVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ assert api.get_lv(lv_uuid='1111') == FooVolume
+
+
+class TestGetPV(object):
+
+ def test_nothing_is_passed_in(self):
+ # so we return a None
+ assert api.get_pv() is None
+
+ def test_single_pv_is_not_matched(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(FooPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_uuid='foo') is None
+
+ def test_single_pv_is_matched(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(FooPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_uuid='0000') == FooPVolume
+
+ def test_single_pv_is_matched_by_uuid(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(
+ pv_name='/dev/vg/foo',
+ pv_uuid='1111', pv_tags="ceph.type=data")
+ pvolumes.append(FooPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_uuid='1111') == FooPVolume
+
+
+class TestPVolumes(object):
+
+ def test_filter_by_tag_does_not_match_one(self, pvolumes, monkeypatch):
+ pv_tags = "ceph.type=journal,ceph.osd_id=1,ceph.fsid=000-aaa"
+ FooPVolume = api.PVolume(
+ pv_name='/dev/vg/foo',
+ pv_uuid='1111', pv_tags=pv_tags)
+ pvolumes.append(FooPVolume)
+ pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '2'})
+ assert pvolumes == []
+
+ def test_filter_by_tags_matches(self, pvolumes, monkeypatch):
+ pv_tags = "ceph.type=journal,ceph.osd_id=1"
+ FooPVolume = api.PVolume(
+ pv_name='/dev/vg/foo',
+ pv_uuid='1111', pv_tags=pv_tags)
+ pvolumes.append(FooPVolume)
+ pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '1'})
+ assert pvolumes == [FooPVolume]
+
class TestGetVG(object):
assert len(volumes) == 1
assert volumes[0].lv_name == 'volume1'
+ def test_filter_by_tag_does_not_match_one(self, volumes):
+ lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.osd_id=1,ceph.type=journal')
+ volumes.append(osd)
+ volumes.append(journal)
+ # note the different osd_id!
+ volumes.filter(lv_tags={'ceph.type': 'data', 'ceph.osd_id': '2'})
+ assert volumes == []
+
def test_filter_by_vg_name(self, volumes):
lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
osd = api.Volume(lv_name='volume1', vg_name='ceph_vg', lv_tags=lv_tags)
assert len(volumes) == 1
assert volumes[0].lv_name == 'volume1'
+ def test_filter_by_lv_uuid(self, volumes):
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+ volumes.append(osd)
+ volumes.append(journal)
+ volumes.filter(lv_uuid='1111')
+ assert len(volumes) == 1
+ assert volumes[0].lv_name == 'volume1'
+
+ def test_filter_by_lv_uuid_nothing_found(self, volumes):
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+ volumes.append(osd)
+ volumes.append(journal)
+ volumes.filter(lv_uuid='22222')
+ assert volumes == []
+
def test_filter_requires_params(self, volumes):
with pytest.raises(TypeError):
volumes.filter()
assert len(volume_groups) == 1
assert volume_groups[0].vg_name == 'volume1'
+ def test_filter_by_tag_does_not_match_one(self, volume_groups):
+ vg_tags = "ceph.group=dmcache,ceph.disk_type=ssd"
+ osd = api.VolumeGroup(vg_name='volume1', vg_path='/dev/vg/lv', vg_tags=vg_tags)
+ volume_groups.append(osd)
+ volume_groups.filter(vg_tags={'ceph.group': 'data', 'ceph.disk_type': 'ssd'})
+ assert volume_groups == []
+
def test_filter_by_vg_name(self, volume_groups):
vg_tags = "ceph.type=data,ceph.fsid=000-aaa"
osd = api.VolumeGroup(vg_name='ceph_vg', vg_tags=vg_tags)
with pytest.raises(exceptions.SuffixParsingError):
trigger.parse_osd_uuid('ljahs-dfa-slkjhdfa-foo')
+ def test_robust_double_id_in_uuid(self):
+ # it is possible to have the id in the SHA1, this should
+ # be fine parsing that
+ result = trigger.parse_osd_uuid("1-abc959fd-1ec9-4864-b141-3154f9b9f8ed")
+ assert result == 'abc959fd-1ec9-4864-b141-3154f9b9f8ed'
+
(0..CLIENTS - 1).each do |i|
config.vm.define "#{LABEL_PREFIX}client#{i}" do |client|
client.vm.box = CLIENT_BOX
- client.vm.hostname = "#{LABEL_PREFIX}ceph-client#{i}"
+ client.vm.hostname = "#{LABEL_PREFIX}client#{i}"
if ASSIGN_STATIC_IP
client.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.4#{i}"
# Parallels
client.vm.provider "parallels" do |prl|
- prl.name = "ceph-client#{i}"
+ prl.name = "client#{i}"
prl.memory = "#{MEMORY}"
end
config.vm.define "#{LABEL_PREFIX}rgw#{i}" do |rgw|
rgw.vm.box = BOX
rgw.vm.box_url = BOX_URL
- rgw.vm.hostname = "#{LABEL_PREFIX}ceph-rgw#{i}"
+ rgw.vm.hostname = "#{LABEL_PREFIX}rgw#{i}"
if ASSIGN_STATIC_IP
rgw.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.5#{i}"
# Parallels
rgw.vm.provider "parallels" do |prl|
- prl.name = "ceph-rgw#{i}"
+ prl.name = "rgw#{i}"
prl.memory = "#{MEMORY}"
end
config.vm.define "nfs#{i}" do |nfs|
nfs.vm.box = BOX
nfs.vm.box_url = BOX_URL
- nfs.vm.hostname = "ceph-nfs#{i}"
+ nfs.vm.hostname = "nfs#{i}"
if ASSIGN_STATIC_IP
nfs.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.6#{i}"
# Parallels
nfs.vm.provider "parallels" do |prl|
- prl.name = "ceph-nfs#{i}"
+ prl.name = "nfs#{i}"
prl.memory = "#{MEMORY}"
end
config.vm.define "#{LABEL_PREFIX}mds#{i}" do |mds|
mds.vm.box = BOX
mds.vm.box_url = BOX_URL
- mds.vm.hostname = "#{LABEL_PREFIX}ceph-mds#{i}"
+ mds.vm.hostname = "#{LABEL_PREFIX}mds#{i}"
if ASSIGN_STATIC_IP
mds.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.7#{i}"
end
# Parallels
mds.vm.provider "parallels" do |prl|
- prl.name = "ceph-mds#{i}"
+ prl.name = "mds#{i}"
prl.memory = "#{MEMORY}"
end
config.vm.define "#{LABEL_PREFIX}rbd_mirror#{i}" do |rbd_mirror|
rbd_mirror.vm.box = BOX
rbd_mirror.vm.box_url = BOX_URL
- rbd_mirror.vm.hostname = "#{LABEL_PREFIX}ceph-rbd-mirror#{i}"
+ rbd_mirror.vm.hostname = "#{LABEL_PREFIX}rbd-mirror#{i}"
if ASSIGN_STATIC_IP
rbd_mirror.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.8#{i}"
end
# Parallels
rbd_mirror.vm.provider "parallels" do |prl|
- prl.name = "ceph-rbd-mirror#{i}"
+ prl.name = "rbd-mirror#{i}"
prl.memory = "#{MEMORY}"
end
config.vm.define "#{LABEL_PREFIX}iscsi_gw#{i}" do |iscsi_gw|
iscsi_gw.vm.box = BOX
iscsi_gw.vm.box_url = BOX_URL
- iscsi_gw.vm.hostname = "#{LABEL_PREFIX}ceph-iscsi-gw#{i}"
+ iscsi_gw.vm.hostname = "#{LABEL_PREFIX}iscsi-gw#{i}"
if ASSIGN_STATIC_IP
iscsi_gw.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.9#{i}"
end
# Parallels
iscsi_gw.vm.provider "parallels" do |prl|
- prl.name = "ceph-iscsi-gw#{i}"
+ prl.name = "iscsi-gw#{i}"
prl.memory = "#{MEMORY}"
end
config.vm.define "#{LABEL_PREFIX}mon#{i}" do |mon|
mon.vm.box = BOX
mon.vm.box_url = BOX_URL
- mon.vm.hostname = "#{LABEL_PREFIX}ceph-mon#{i}"
+ mon.vm.hostname = "#{LABEL_PREFIX}mon#{i}"
if ASSIGN_STATIC_IP
mon.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.1#{i}"
# Parallels
mon.vm.provider "parallels" do |prl|
- prl.name = "ceph-mon#{i}"
+ prl.name = "mon#{i}"
prl.memory = "#{MEMORY}"
end
config.vm.define "#{LABEL_PREFIX}osd#{i}" do |osd|
osd.vm.box = BOX
osd.vm.box_url = BOX_URL
- osd.vm.hostname = "#{LABEL_PREFIX}ceph-osd#{i}"
+ osd.vm.hostname = "#{LABEL_PREFIX}osd#{i}"
if ASSIGN_STATIC_IP
osd.vm.network :private_network,
ip: "#{PUBLIC_SUBNET}.10#{i}"
# Parallels
osd.vm.provider "parallels" do |prl|
- prl.name = "ceph-osd#{i}"
+ prl.name = "osd#{i}"
prl.memory = "#{MEMORY}"
(0..1).each do |d|
prl.customize ["set", :id,
journal_size: 100
osd_objectstore: "filestore"
osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
copy_admin_key: true
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
- - data: test_volume
- journal: /dev/sdc
+ - data: data-lv1
+ journal: /dev/sdc1
data_vg: test_group
+ - data: data-lv2
+ journal: journal1
+ data_vg: test_group
+ journal_vg: journals
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
VAGRANT_CWD = {changedir}
CEPH_VOLUME_DEBUG = 1
deps=
- ansible==2.2.3
+ ansible==2.3.1
testinfra==1.6.0
pytest-xdist
changedir=
journal_size: 100
osd_objectstore: "filestore"
osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
copy_admin_key: true
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
- - data: test_volume
- journal: /dev/sdc
+ - data: data-lv1
+ journal: /dev/sdc1
data_vg: test_group
+ - data: data-lv2
+ journal: journal1
+ data_vg: test_group
+ journal_vg: journals
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }
--- /dev/null
+from ceph_volume import process
+
+
+def get_partuuid(device):
+ """
+ If a device is a partition, it will probably have a PARTUUID on it that
+ will persist and can be queried against `blkid` later to detect the actual
+ device
+ """
+ out, err, rc = process.call(
+ ['sudo', 'blkid', '-s', 'PARTUUID', '-o', 'value', device]
+ )
+ return ' '.join(out).strip()
+
+
+def get_device_from_partuuid(partuuid):
+ """
+ If a device has a partuuid, query blkid so that it can tell us what that
+ device is
+ """
+ out, err, rc = process.call(
+ ['sudo', 'blkid', '-t', 'PARTUUID="%s"' % partuuid, '-o', 'device']
+ )
+ return ' '.join(out).strip()
def mdsids():
- ret, outbuf, outs = json_command(cluster_handle, prefix='mds dump',
+ ret, outbuf, outs = json_command(cluster_handle, prefix='fs dump',
argdict={'format': 'json'})
if ret:
raise RuntimeError('Can\'t contact mon for mds list')
d = json.loads(outbuf.decode('utf-8'))
l = []
- infodict = d['info']
- for mdsdict in infodict.values():
- l.append(mdsdict['name'])
+ for info in d['standbys']:
+ l.append(info['name'])
+ for fs in d['filesystems']:
+ for info in fs['mdsmap']['info'].values():
+ l.append(info['name'])
return l
return l
+def ids_by_service(service):
+ ids = {"mon": monids,
+ "osd": osdids,
+ "mds": mdsids,
+ "mgr": mgrids}
+ return ids[service]()
+
+
def validate_target(target):
"""
this function will return true iff target is a correct
if len(target) == 2:
# for case "service.id"
service_name, service_id = target[0], target[1]
- exist_ids = []
- if service_name == "mon":
- exist_ids = monids()
- elif service_name == "osd":
- exist_ids = osdids()
- elif service_name == "mds":
- exist_ids = mdsids()
- elif service_name == "mgr":
- exist_ids = mgrids()
- else:
+ try:
+ exist_ids = ids_by_service(service_name)
+ except KeyError:
print('WARN: {0} is not a legal service name, should be one of mon/osd/mds/mgr'.format(service_name),
file=sys.stderr)
return False
# of the form 'cmdNNN' followed by an array of argument descriptors)
# as part of the validated argument JSON object
- targets = [target]
-
if target[1] == '*':
- if target[0] == 'osd':
- targets = [(target[0], o) for o in osdids()]
- elif target[0] == 'mon':
- targets = [(target[0], m) for m in monids()]
+ service = target[0]
+ targets = [(service, o) for o in ids_by_service(service)]
+ else:
+ targets = [target]
final_ret = 0
for target in targets:
if (cct->_conf->client_acl_type == "posix_acl")
acl_type = POSIX_ACL;
- lru.lru_set_max(cct->_conf->client_cache_size);
lru.lru_set_midpoint(cct->_conf->client_cache_mid);
// file handles
// *** FIXME ***
// empty lru
- lru.lru_set_max(0);
trim_cache();
assert(lru.lru_get_size() == 0);
void Client::trim_cache(bool trim_kernel_dcache)
{
- ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << lru.lru_get_max() << dendl;
+ uint64_t max = cct->_conf->client_cache_size;
+ ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
unsigned last = 0;
while (lru.lru_get_size() != last) {
last = lru.lru_get_size();
- if (lru.lru_get_size() <= lru.lru_get_max()) break;
+ if (!unmounting && lru.lru_get_size() <= max) break;
// trim!
Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
trim_dentry(dn);
}
- if (trim_kernel_dcache && lru.lru_get_size() > lru.lru_get_max())
+ if (trim_kernel_dcache && lru.lru_get_size() > max)
_invalidate_kernel_dcache();
// hose root?
MetaSession *tsession = _get_or_open_mds_session(peer_mds);
if (in->caps.count(peer_mds)) {
Cap *tcap = in->caps[peer_mds];
- if (tcap->cap_id != m->peer.cap_id ||
+ if (tcap->cap_id == m->peer.cap_id &&
ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
tcap->cap_id = m->peer.cap_id;
tcap->seq = m->peer.seq - 1;
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
const UserPerm& perms)
{
- ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
+ ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
int r = _getattr_for_perm(in, perms);
if (r < 0)
goto out;
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
- ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
+ ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
unsigned want = 0;
if ((flags & O_ACCMODE) == O_WRONLY)
int Client::may_lookup(Inode *dir, const UserPerm& perms)
{
- ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
+ ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
int r = _getattr_for_perm(dir, perms);
if (r < 0)
goto out;
int Client::may_create(Inode *dir, const UserPerm& perms)
{
- ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
+ ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
int r = _getattr_for_perm(dir, perms);
if (r < 0)
goto out;
int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
{
- ldout(cct, 20) << __func__ << *dir << "; " << "; name " << name << "; " << perms << dendl;
+ ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
int r = _getattr_for_perm(dir, perms);
if (r < 0)
goto out;
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
- ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
+ ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
int r = _getattr_for_perm(in, perms);
if (r < 0)
goto out;
{
Mutex::Locker lock(client_lock);
- assert(initialized);
+ if (!initialized)
+ return -ENOTCONN;
int r;
r = authenticate();
{
Mutex::Locker lock(client_lock);
- assert(mounted); // caller is confused?
+ if (unmounting)
+ return;
ldout(cct, 2) << "unmounting" << dendl;
unmounting = true;
wait_sync_caps(last_flush_tid);
// empty lru cache
- lru.lru_set_max(0);
trim_cache();
while (lru.lru_get_size() > 0 ||
tout(cct) << relexisting << std::endl;
tout(cct) << relpath << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
filepath existing(relexisting);
InodeRef in, dir;
tout(cct) << "unlink" << std::endl;
tout(cct) << relpath << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (std::string(relpath) == "/")
return -EISDIR;
tout(cct) << relfrom << std::endl;
tout(cct) << relto << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (std::string(relfrom) == "/" || std::string(relto) == "/")
return -EBUSY;
tout(cct) << mode << std::endl;
ldout(cct, 10) << "mkdir: " << relpath << dendl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (std::string(relpath) == "/")
return -EEXIST;
tout(cct) << relpath << std::endl;
tout(cct) << mode << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
//get through existing parts of path
filepath path(relpath);
unsigned int i;
tout(cct) << "rmdir" << std::endl;
tout(cct) << relpath << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (std::string(relpath) == "/")
return -EBUSY;
tout(cct) << mode << std::endl;
tout(cct) << rdev << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (std::string(relpath) == "/")
return -EEXIST;
tout(cct) << target << std::endl;
tout(cct) << relpath << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (std::string(relpath) == "/")
return -EEXIST;
tout(cct) << "readlink" << std::endl;
tout(cct) << relpath << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms, false);
mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
mask &= ~CEPH_SETATTR_MODE;
ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
- } else if (kill_sguid && S_ISREG(in->mode)) {
+ } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
/* Must squash the any setuid/setgid bits with an ownership change */
- in->mode &= ~S_ISUID;
- if ((in->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
- in->mode &= ~S_ISGID;
+ in->mode &= ~(S_ISUID|S_ISGID);
mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
}
stat_to_statx(attr, &stx);
mask &= ~CEPH_SETATTR_BTIME;
+
+ if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
+ mask &= ~CEPH_SETATTR_UID;
+ }
+  if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
+ mask &= ~CEPH_SETATTR_GID;
+ }
+
return _setattrx(in, &stx, mask, perms);
}
tout(cct) << relpath << std::endl;
tout(cct) << mask << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms);
tout(cct) << relpath << std::endl;
tout(cct) << mask << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
tout(cct) << fd << std::endl;
tout(cct) << mask << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << fd << std::endl;
tout(cct) << mask << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
Mutex::Locker lock(client_lock);
tout(cct) << "stat" << std::endl;
tout(cct) << relpath << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms, true, mask);
Mutex::Locker lock(client_lock);
tout(cct) << "statx" << std::endl;
tout(cct) << relpath << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
Mutex::Locker lock(client_lock);
tout(cct) << "lstat" << std::endl;
tout(cct) << relpath << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
// don't follow symlinks
tout(cct) << "chmod" << std::endl;
tout(cct) << relpath << std::endl;
tout(cct) << mode << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms);
tout(cct) << "fchmod" << std::endl;
tout(cct) << fd << std::endl;
tout(cct) << mode << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << "lchmod" << std::endl;
tout(cct) << relpath << std::endl;
tout(cct) << mode << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
// don't follow symlinks
tout(cct) << relpath << std::endl;
tout(cct) << new_uid << std::endl;
tout(cct) << new_gid << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms);
struct stat attr;
attr.st_uid = new_uid;
attr.st_gid = new_gid;
- int mask = 0;
- if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
- if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
- return _setattr(in, &attr, mask, perms);
+ return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
tout(cct) << fd << std::endl;
tout(cct) << new_uid << std::endl;
tout(cct) << new_gid << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << relpath << std::endl;
tout(cct) << new_uid << std::endl;
tout(cct) << new_gid << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
// don't follow symlinks
tout(cct) << relpath << std::endl;
tout(cct) << buf->modtime << std::endl;
tout(cct) << buf->actime << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms);
tout(cct) << relpath << std::endl;
tout(cct) << buf->modtime << std::endl;
tout(cct) << buf->actime << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
// don't follow symlinks
tout(cct) << fd << std::endl;
tout(cct) << operation << std::endl;
tout(cct) << owner << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
Mutex::Locker lock(client_lock);
tout(cct) << "opendir" << std::endl;
tout(cct) << relpath << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms, true);
void Client::rewinddir(dir_result_t *dirp)
{
Mutex::Locker lock(client_lock);
-
ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
+
+ if (unmounting)
+ return;
+
dir_result_t *d = static_cast<dir_result_t*>(dirp);
_readdir_drop_dirp_buffer(d);
d->reset();
ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;
+ if (unmounting)
+ return;
+
if (offset == dirp->offset)
return;
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
dir_result_t *dirp = static_cast<dir_result_t*>(d);
ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
tout(cct) << relpath << std::endl;
tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *fh = NULL;
#if defined(__linux__) && defined(O_PATH)
Mutex::Locker lock(client_lock);
ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
+ if (unmounting)
+ return -ENOTCONN;
+
MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
filepath path(ino);
req->set_filepath(path);
Mutex::Locker lock(client_lock);
ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;
+ if (unmounting)
+ return -ENOTCONN;
+
MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
filepath path(ino);
req->set_filepath(path);
Mutex::Locker lock(client_lock);
ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (!ino->dn_set.empty()) {
// if we exposed the parent here, we'd need to check permissions,
// but right now we just rely on the MDS doing so in make_request
Mutex::Locker lock(client_lock);
ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
+ if (unmounting)
+ return -ENOTCONN;
+
MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
req->set_filepath2(filepath(parent->ino));
req->set_filepath(filepath(ino->ino));
tout(cct) << "close" << std::endl;
tout(cct) << fd << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *fh = get_filehandle(fd);
if (!fh)
return -EBADF;
tout(cct) << offset << std::endl;
tout(cct) << whence << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << size << std::endl;
tout(cct) << offset << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << size << std::endl;
tout(cct) << offset << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *fh = get_filehandle(fd);
if (!fh)
return -EBADF;
tout(cct) << fd << std::endl;
tout(cct) << offset << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *fh = get_filehandle(fd);
if (!fh)
return -EBADF;
return r;
/* clear the setuid/setgid bits, if any */
- if (unlikely((in->mode & S_ISUID) ||
- (in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) {
+ if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
struct ceph_statx stx = { 0 };
put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
tout(cct) << fd << std::endl;
tout(cct) << length << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << fd << std::endl;
tout(cct) << syncdataonly << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
tout(cct) << fd << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
tout(cct) << fd << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
Mutex::Locker lock(client_lock);
tout(cct) << "chdir" << std::endl;
tout(cct) << relpath << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms);
void Client::getcwd(string& dir, const UserPerm& perms)
{
Mutex::Locker l(client_lock);
- _getcwd(dir, perms);
+ if (!unmounting)
+ _getcwd(dir, perms);
}
int Client::statfs(const char *path, struct statvfs *stbuf,
Mutex::Locker l(client_lock);
tout(cct) << "statfs" << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
ceph_statfs stats;
C_SaferCond cond;
int Client::sync_fs()
{
Mutex::Locker l(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
return _sync_fs();
}
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
{
Mutex::Locker l(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perm);
Inode *snapdir = open_snapdir(in.get());
return _mkdir(snapdir, name, 0, perm);
}
+
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
Mutex::Locker l(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms);
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
int Client::get_caps_issued(const char *path, const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath p(path);
InodeRef in;
int r = path_walk(p, &in, perms, true);
tout(cct) << "ll_lookup" << std::endl;
tout(cct) << name << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
int r = 0;
if (!cct->_conf->fuse_default_permissions) {
r = may_lookup(parent, perms);
tout(cct) << "ll_lookupx" << std::endl;
tout(cct) << name << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
int r = 0;
if (!cct->_conf->fuse_default_permissions) {
r = may_lookup(parent, perms);
unsigned int want, unsigned int flags, const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
filepath fp(name, 0);
InodeRef in;
int rc;
tout(cct) << ino.val << std::endl;
tout(cct) << count << std::endl;
+ // Ignore forget if we're no longer mounted
+ if (unmounting)
+ return true;
+
if (ino == 1) return true; // ignore forget on root.
bool last = false;
Inode *Client::ll_get_inode(ino_t ino)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return NULL;
+
vinodeno_t vino = _map_faked_ino(ino);
unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
if (p == inode_map.end())
Inode *Client::ll_get_inode(vinodeno_t vino)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return NULL;
+
unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
if (p == inode_map.end())
return NULL;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
if (res == 0)
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
int res = 0;
unsigned mask = statx_to_mask(flags, want);
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef target(in);
int res = _ll_setattrx(in, stx, mask, perms, &target);
if (res == 0) {
stat_to_statx(attr, &stx);
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef target(in);
int res = _ll_setattrx(in, &stx, mask, perms, &target);
if (res == 0) {
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
if (r < 0)
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
if (r < 0)
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
if (r < 0)
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
if (r < 0)
int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, true);
if (r < 0)
const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, false);
if (r < 0)
int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
_setxattr_maybe_wait_for_osdmap(name, value, size);
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, true);
if (r < 0)
_setxattr_maybe_wait_for_osdmap(name, value, size);
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
InodeRef in;
int r = Client::path_walk(path, &in, perms, false);
if (r < 0)
_setxattr_maybe_wait_for_osdmap(name, value, size);
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_readlink " << vino << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vparent = _get_vino(parent);
ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
unsigned caps = statx_to_mask(flags, want);
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vparent = _get_vino(parent);
ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vparent = _get_vino(parent);
ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vparent = _get_vino(parent);
ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vparent = _get_vino(parent);
ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vparent = _get_vino(parent);
ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vparent = _get_vino(parent);
vinodeno_t vnewparent = _get_vino(newparent);
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
vinodeno_t vnewparent = _get_vino(newparent);
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
Mutex::Locker lock(client_lock);
+
entity_addr_t g;
bool exists = objecter->with_osdmap([&](const OSDMap& o) {
if (!o.exists(osd))
*addr = ntohl(nb_addr);
return 0;
}
+
uint32_t Client::ll_stripe_unit(Inode *in)
{
Mutex::Locker lock(client_lock);
file_layout_t* layout)
{
Mutex::Locker lock(client_lock);
+
inodeno_t ino = ll_get_inodeno(in);
uint32_t object_size = layout->object_size;
uint32_t su = layout->stripe_unit;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_opendir " << vino << dendl;
ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
tout(cct) << "ll_releasedir" << std::endl;
tout(cct) << (unsigned long)dirp << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
_closedir(dirp);
return 0;
}
tout(cct) << "ll_fsyncdir" << std::endl;
tout(cct) << (unsigned long)dirp << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _fsync(dirp->inode.get(), false);
}
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = _get_vino(in);
ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
Mutex::Locker lock(client_lock);
InodeRef in;
+ if (unmounting)
+ return -ENOTCONN;
+
int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
fhp, perms);
if (r >= 0) {
Mutex::Locker lock(client_lock);
InodeRef in;
+ if (unmounting)
+ return -ENOTCONN;
int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
if (r >= 0) {
tout(cct) << offset << std::endl;
tout(cct) << whence << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _lseek(fh, offset, whence);
}
tout(cct) << off << std::endl;
tout(cct) << len << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _read(fh, off, len, bl);
}
file_layout_t* layout)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
vinodeno_t vino = ll_get_vino(in);
object_t oid = file_object_t(vino.ino, blockid);
C_SaferCond onfinish;
Cond cond;
bool done;
int r = 0;
- Context *onsafe;
+ Context *onsafe = nullptr;
if (length == 0) {
return -EINVAL;
/* lock just in time */
client_lock.Lock();
+ if (unmounting) {
+ client_lock.Unlock();
+ delete onsafe;
+ return -ENOTCONN;
+ }
objecter->write(oid,
object_locator_t(layout->pool_id),
tout(cct) << off << std::endl;
tout(cct) << len << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
int r = _write(fh, off, len, data, NULL, 0);
ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
<< dendl;
tout(cct) << "ll_flush" << std::endl;
tout(cct) << (unsigned long)fh << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _flush(fh);
}
tout(cct) << "ll_fsync" << std::endl;
tout(cct) << (unsigned long)fh << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
int r = _fsync(fh, syncdataonly);
if (r) {
// If we're returning an error, clear it from the FH
tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
tout(cct) << (unsigned long)fh << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _fallocate(fh, mode, offset, length);
}
Mutex::Locker lock(client_lock);
tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *fh = get_filehandle(fd);
if (!fh)
return -EBADF;
tout(cct) << "ll_release (fh)" << std::endl;
tout(cct) << (unsigned long)fh << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
if (ll_unclosed_fh_set.count(fh))
ll_unclosed_fh_set.erase(fh);
return _release_fh(fh);
ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _getlk(fh, fl, owner);
}
ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _setlk(fh, fl, owner, sleep);
}
ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
+ if (unmounting)
+ return -ENOTCONN;
+
return _flock(fh, cmd, owner);
}
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
filepath path(relpath);
InodeRef in;
int r = path_walk(path, &in, perms);
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
int64_t Client::get_default_pool_id()
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
/* first data pool is the default */
return mdsmap->get_first_data_pool();
}
int64_t Client::get_pool_id(const char *pool_name)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
pool_name);
}
string Client::get_pool_name(int64_t pool)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return string();
+
return objecter->with_osdmap([pool](const OSDMap& o) {
return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
});
int Client::get_pool_replication(int64_t pool)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
return objecter->with_osdmap([pool](const OSDMap& o) {
return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
});
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
if (id < 0)
return -EINVAL;
return objecter->with_osdmap([&](const OSDMap& o) {
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
return objecter->with_osdmap([&](const OSDMap& o) {
if (!o.exists(osd))
return -ENOENT;
{
Mutex::Locker lock(client_lock);
+ if (unmounting)
+ return -ENOTCONN;
+
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
int Client::get_local_osd()
{
Mutex::Locker lock(client_lock);
+
+ if (unmounting)
+ return -ENOTCONN;
+
objecter->with_osdmap([this](const OSDMap& o) {
if (o.get_epoch() != local_osd_epoch) {
local_osd = o.find_osd_on_ip(messenger->get_myaddr());
if (cur == root_ancestor)
break;
+ // deleted inode
+ if (cur->nlink == 0) {
+ cur = root_ancestor;
+ break;
+ }
+
MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
filepath path(cur->ino);
req->set_filepath(path);
{
Mutex::Locker lock(client_lock);
- if (changed.count("client_cache_size") ||
- changed.count("client_cache_mid")) {
- lru.lru_set_max(cct->_conf->client_cache_size);
+ if (changed.count("client_cache_mid")) {
lru.lru_set_midpoint(cct->_conf->client_cache_mid);
}
if (changed.count("client_acl_type")) {
friend void intrusive_ptr_release(Inode *in);
//int get_cache_size() { return lru.lru_get_size(); }
- //void set_cache_size(int m) { lru.lru_set_max(m); }
/**
* Don't call this with in==NULL, use get_or_create for that
}
#define GET_GROUPS(perms, req) { \
- if (cfuse->client->cct->_conf->fuse_set_user_groups) { \
+ if (g_conf->get_val<bool>("fuse_set_user_groups")) { \
gid_t *gids = NULL; \
int count = getgroups(req, &gids); \
perms.init_gids(gids, count); \
* Input:
* @param start_after which name to begin listing after
* (use the empty string to start at the beginning)
- * @param max_return the maximum number of names to list(if 0 means no limit)
+ * @param max_return the maximum number of names to list
* Output:
* @param value
return -EINVAL;
}
+ // TODO remove implicit support for zero during the N-release
+ if (max_return == 0) {
+ max_return = RBD_MAX_KEYS_READ;
+ }
+
map<string, bufferlist> data;
string last_read = metadata_key_for_name(start_after);
- int max_read = max_return ? MIN(RBD_MAX_KEYS_READ, max_return) : RBD_MAX_KEYS_READ;
- bool more;
+ bool more = true;
- do {
+ while (more && data.size() < max_return) {
map<string, bufferlist> raw_data;
+ int max_read = MIN(RBD_MAX_KEYS_READ, max_return - data.size());
int r = cls_cxx_map_get_vals(hctx, last_read, RBD_METADATA_KEY_PREFIX,
max_read, &raw_data, &more);
if (r < 0) {
CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str());
return r;
}
- if (raw_data.empty())
- break;
- map<string, bufferlist>::iterator it = raw_data.begin();
- for (; it != raw_data.end(); ++it)
- data[metadata_name_from_key(it->first)].swap(it->second);
-
- if (!more)
- break;
+ for (auto& kv : raw_data) {
+ data[metadata_name_from_key(kv.first)].swap(kv.second);
+ }
- last_read = raw_data.rbegin()->first;
- if (max_return)
- max_read = MIN(RBD_MAX_KEYS_READ, max_return - data.size());
- } while (more);
+ if (!raw_data.empty()) {
+ last_read = raw_data.rbegin()->first;
+ }
+ }
::encode(data, *out);
return 0;
{
finish_pending_string();
os << m_ss.str();
+ if (m_line_break_enabled)
+ os << "\n";
m_ss.clear();
m_ss.str("");
}
* we should NOT output a newline. This primarily triggers on HTTP redirects */
if (m_pretty && !m_ss_str.empty())
os << "\n";
+ else if (m_line_break_enabled)
+ os << "\n";
m_ss.clear();
m_ss.str("");
}
Formatter();
virtual ~Formatter();
+ virtual void enable_line_break() = 0;
virtual void flush(std::ostream& os) = 0;
void flush(bufferlist &bl);
virtual void reset() = 0;
void set_status(int status, const char* status_name) override {};
void output_header() override {};
void output_footer() override {};
+ void enable_line_break() override { m_line_break_enabled = true; }
void flush(std::ostream& os) override;
using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
void reset() override;
std::stringstream m_ss, m_pending_string;
std::list<json_formatter_stack_entry_d> m_stack;
bool m_is_pending_string;
+ bool m_line_break_enabled = false;
};
class XMLFormatter : public Formatter {
void output_header() override;
void output_footer() override;
+ void enable_line_break() override { m_line_break_enabled = true; }
void flush(std::ostream& os) override;
using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
void reset() override;
const bool m_underscored;
std::string m_pending_string_name;
bool m_header_done;
+ bool m_line_break_enabled = false;
};
class TableFormatter : public Formatter {
void set_status(int status, const char* status_name) override {};
void output_header() override {};
void output_footer() override {};
+ void enable_line_break() override {};
void flush(std::ostream& os) override;
using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
void reset() override;
#include <atomic>
#include "common/LogEntry.h"
#include "common/Mutex.h"
+#include "include/health.h"
class LogClient;
class MLog;
void debug(std::stringstream &s) {
do_log(CLOG_DEBUG, s);
}
+ /**
+ * Convenience function mapping health status to
+ * the appropriate cluster log severity.
+ */
+ LogClientTemp health(health_status_t health) {
+ switch(health) {
+ case HEALTH_OK:
+ return info();
+ case HEALTH_WARN:
+ return warn();
+ case HEALTH_ERR:
+ return error();
+ default:
+ // Invalid health_status_t value
+ ceph_abort();
+ }
+ }
LogClientTemp info() {
return LogClientTemp(CLOG_INFO, *this);
}
explicit GetdescsHook(AdminSocket *as) : m_as(as) {}
bool call(string command, cmdmap_t &cmdmap, string format, bufferlist& out) override {
int cmdnum = 0;
- JSONFormatter jf(false);
+ JSONFormatter jf;
jf.open_object_section("command_descriptions");
for (map<string,string>::iterator p = m_as->m_descs.begin();
p != m_as->m_descs.end();
cmdnum++;
}
jf.close_section(); // command_descriptions
+ jf.enable_line_break();
ostringstream ss;
jf.flush(ss);
out.append(ss.str());
for (int ix = 0; ix < n_lanes; ++ix,
lane_ix = next_evict_lane()) {
Lane& lane = qlane[lane_ix];
+ lane.lock.lock();
/* if object at LRU has refcnt==1, it may be reclaimable */
Object* o = &(lane.q.back());
if (can_reclaim(o)) {
return o;
} else {
// XXX can't make unreachable (means what?)
- lane.lock.lock();
--(o->lru_refcnt);
o->lru_flags &= ~FLAG_EVICTING;
/* unlock in next block */
OPTION(fuse_multithreaded, OPT_BOOL)
OPTION(fuse_require_active_mds, OPT_BOOL) // if ceph_fuse requires active mds server
OPTION(fuse_syncfs_on_mksnap, OPT_BOOL)
-OPTION(fuse_set_user_groups, OPT_BOOL) // if ceph_fuse fills in group lists or not
OPTION(client_try_dentry_invalidate, OPT_BOOL) // the client should try to use dentry invaldation instead of remounting, on kernels it believes that will work for
OPTION(client_die_on_failed_remount, OPT_BOOL)
OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
// max xattr kv pairs size for each dir/file
OPTION(mds_max_xattr_pairs_size, OPT_U32)
-OPTION(mds_cache_size, OPT_INT)
-OPTION(mds_cache_mid, OPT_FLOAT)
OPTION(mds_max_file_recover, OPT_U32)
OPTION(mds_dir_max_commit_size, OPT_INT) // MB
OPTION(mds_dir_keys_per_op, OPT_INT)
OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
-OPTION(mds_health_cache_threshold, OPT_FLOAT) // warn on cache size if it exceeds mds_cache_size by this factor
OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart
// make it (mds_session_timeout - mds_beacon_grace)
OPTION(mds_tick_interval, OPT_FLOAT)
OPTION(rados_osd_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
OPTION(rados_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
-OPTION(rbd_op_threads, OPT_INT)
-OPTION(rbd_op_thread_timeout, OPT_INT)
-OPTION(rbd_non_blocking_aio, OPT_BOOL) // process AIO ops from a worker thread to prevent blocking
-OPTION(rbd_cache, OPT_BOOL) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
-OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushs so that writeback is safe
-OPTION(rbd_cache_size, OPT_LONGLONG) // cache size in bytes
-OPTION(rbd_cache_max_dirty, OPT_LONGLONG) // dirty limit in bytes - set to 0 for write-through caching
-OPTION(rbd_cache_target_dirty, OPT_LONGLONG) // target dirty limit in bytes
-OPTION(rbd_cache_max_dirty_age, OPT_FLOAT) // seconds in cache before writeback starts
-OPTION(rbd_cache_max_dirty_object, OPT_INT) // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
-OPTION(rbd_cache_block_writes_upfront, OPT_BOOL) // whether to block writes to the cache before the aio_write call completes (true))
-OPTION(rbd_concurrent_management_ops, OPT_INT) // how many operations can be in flight for a management operation like deleting or resizing an image
-OPTION(rbd_balance_snap_reads, OPT_BOOL)
-OPTION(rbd_localize_snap_reads, OPT_BOOL)
-OPTION(rbd_balance_parent_reads, OPT_BOOL)
-OPTION(rbd_localize_parent_reads, OPT_BOOL)
-OPTION(rbd_readahead_trigger_requests, OPT_INT) // number of sequential requests necessary to trigger readahead
-OPTION(rbd_readahead_max_bytes, OPT_LONGLONG) // set to 0 to disable readahead
-OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG) // how many bytes are read in total before readahead is disabled
-OPTION(rbd_clone_copy_on_read, OPT_BOOL)
-OPTION(rbd_blacklist_on_break_lock, OPT_BOOL) // whether to blacklist clients whose lock was broken
-OPTION(rbd_blacklist_expire_seconds, OPT_INT) // number of seconds to blacklist - set to 0 for OSD default
-OPTION(rbd_request_timed_out_seconds, OPT_INT) // number of seconds before maint request times out
-OPTION(rbd_skip_partial_discard, OPT_BOOL) // when trying to discard a range inside an object, set to true to skip zeroing the range.
-OPTION(rbd_enable_alloc_hint, OPT_BOOL) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need
-OPTION(rbd_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
-OPTION(rbd_blkin_trace_all, OPT_BOOL) // create a blkin trace for all RBD requests
-OPTION(rbd_validate_pool, OPT_BOOL) // true if empty pools should be validated for RBD compatibility
-OPTION(rbd_validate_names, OPT_BOOL) // true if image specs should be validated
-OPTION(rbd_auto_exclusive_lock_until_manual_request, OPT_BOOL) // whether to automatically acquire/release exclusive lock until it is explicitly requested, i.e. before we know the user of librbd is properly using the lock API
-OPTION(rbd_mirroring_resync_after_disconnect, OPT_BOOL) // automatically start image resync after mirroring is disconnected due to being laggy
-OPTION(rbd_mirroring_replay_delay, OPT_INT) // time-delay in seconds for rbd-mirror asynchronous replication
-
-OPTION(rbd_default_pool, OPT_STR) // default pool for storing images
-
-/*
- * The following options change the behavior for librbd's image creation methods that
- * don't require all of the parameters. These are provided so that older programs
- * can take advantage of newer features without being rewritten to use new versions
- * of the image creation functions.
- *
- * rbd_create()/RBD::create() are affected by all of these options.
- *
- * rbd_create2()/RBD::create2() and rbd_clone()/RBD::clone() are affected by:
- * - rbd_default_order
- * - rbd_default_stripe_count
- * - rbd_default_stripe_size
- *
- * rbd_create3()/RBD::create3() and rbd_clone2/RBD::clone2() are only
- * affected by rbd_default_order.
- */
-OPTION(rbd_default_format, OPT_INT)
-OPTION(rbd_default_order, OPT_INT)
-OPTION(rbd_default_stripe_count, OPT_U64) // changing requires stripingv2 feature
-OPTION(rbd_default_stripe_unit, OPT_U64) // changing to non-object size requires stripingv2 feature
-OPTION(rbd_default_data_pool, OPT_STR) // optional default pool for storing image data blocks
-
-/**
- * RBD features are only applicable for v2 images. This setting accepts either
- * an integer bitmask value or comma-delimited string of RBD feature names.
- * This setting is always internally stored as an integer bitmask value. The
- * mapping between feature bitmask value and feature name is as follows:
- *
- * +1 -> layering
- * +2 -> striping
- * +4 -> exclusive-lock
- * +8 -> object-map
- * +16 -> fast-diff
- * +32 -> deep-flatten
- * +64 -> journaling
- * +128 -> data-pool
- */
-SAFE_OPTION(rbd_default_features, OPT_STR)
-
-OPTION(rbd_default_map_options, OPT_STR) // default rbd map -o / --options
-
-/**
- * RBD journal options.
- */
-OPTION(rbd_journal_order, OPT_U32) // bits to shift to compute journal object max size, between 12 and 64
-OPTION(rbd_journal_splay_width, OPT_U32) // number of active journal objects
-OPTION(rbd_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds
-OPTION(rbd_journal_object_flush_interval, OPT_INT) // maximum number of pending commits per journal object
-OPTION(rbd_journal_object_flush_bytes, OPT_INT) // maximum number of pending bytes per journal object
-OPTION(rbd_journal_object_flush_age, OPT_DOUBLE) // maximum age (in seconds) for pending commits
-OPTION(rbd_journal_pool, OPT_STR) // pool for journal objects
-OPTION(rbd_journal_max_payload_bytes, OPT_U32) // maximum journal payload size before splitting
-OPTION(rbd_journal_max_concurrent_object_sets, OPT_INT) // maximum number of object sets a journal client can be behind before it is automatically unregistered
-
-/**
- * RBD Mirror options
- */
-OPTION(rbd_mirror_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds
-OPTION(rbd_mirror_journal_poll_age, OPT_DOUBLE) // maximum age (in seconds) between successive journal polls
-OPTION(rbd_mirror_journal_max_fetch_bytes, OPT_U32) // maximum bytes to read from each journal data object per fetch
-OPTION(rbd_mirror_sync_point_update_age, OPT_DOUBLE) // number of seconds between each update of the image sync point object number
-OPTION(rbd_mirror_concurrent_image_syncs, OPT_U32) // maximum number of image syncs in parallel
-OPTION(rbd_mirror_pool_replayers_refresh_interval, OPT_INT) // interval to refresh peers in rbd-mirror daemon
-OPTION(rbd_mirror_delete_retry_interval, OPT_DOUBLE) // interval to check and retry the failed requests in deleter
-OPTION(rbd_mirror_image_state_check_interval, OPT_INT) // interval to get images from pool watcher and set sources in replayer
-OPTION(rbd_mirror_leader_heartbeat_interval, OPT_INT) // interval (in seconds) between mirror leader heartbeats
-OPTION(rbd_mirror_leader_max_missed_heartbeats, OPT_INT) // number of missed heartbeats for non-lock owner to attempt to acquire lock
-OPTION(rbd_mirror_leader_max_acquire_attempts_before_break, OPT_INT) // number of failed attempts to acquire lock after missing heartbeats before breaking lock
-
OPTION(nss_db_path, OPT_STR) // path to nss db
.set_default(50)
.set_description(""),
+ Option("mon_health_log_update_period", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(5)
+ .set_description("Minimum time in seconds between log messages about "
+ "each health check")
+ .set_min(0),
+
Option("mon_data_avail_crit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description(""),
.set_description(""),
Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
- .set_default(3000)
+ .set_default(1500)
.set_description("minimum number of entries to maintain in the PG log")
.add_service("osd")
.add_see_also("osd_max_pg_log_entries")
.set_description(""),
Option("bluefs_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
- .set_default("bitmap")
+ .set_default("stupid")
.set_description(""),
Option("bluefs_preextend_wal_files", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_description("Key value database to use for bluestore"),
Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
- .set_default("bitmap")
- .add_tag("mkfs")
- .set_description(""),
+ .set_default("stupid")
+ .set_enum_allowed({"bitmap", "stupid"})
+ .set_description("Allocator policy"),
Option("bluestore_freelist_blocks_per_key", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(128)
.set_description(""),
Option("rgw_dynamic_resharding", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
- .set_default(false)
+ .set_default(true)
.set_description(""),
Option("rgw_max_objs_per_shard", Option::TYPE_INT, Option::LEVEL_ADVANCED)
return std::vector<Option>({
Option("rbd_default_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("rbd")
- .set_description("")
+ .set_description("default pool for storing new images")
.set_validator([](std::string *value, std::string *error_message){
boost::regex pattern("^[^@/]+$");
if (!boost::regex_match (*value, pattern)) {
Option("rbd_default_data_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description("")
+ .set_description("default pool for storing data blocks for new images")
.set_validator([](std::string *value, std::string *error_message){
boost::regex pattern("^[^@/]*$");
if (!boost::regex_match (*value, pattern)) {
Option("rbd_default_features", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("layering,exclusive-lock,object-map,fast-diff,deep-flatten")
- .set_description("")
+ .set_description("default v2 image features for new images")
+ .set_long_description(
+ "RBD features are only applicable for v2 images. This setting accepts "
+ "either an integer bitmask value or comma-delimited string of RBD "
+ "feature names. This setting is always internally stored as an integer "
+ "bitmask value. The mapping between feature bitmask value and feature "
+ "name is as follows: +1 -> layering, +2 -> striping, "
+ "+4 -> exclusive-lock, +8 -> object-map, +16 -> fast-diff, "
+ "+32 -> deep-flatten, +64 -> journaling, +128 -> data-pool")
.set_safe()
.set_validator([](std::string *value, std::string *error_message){
static const std::map<std::string, uint64_t> FEATURE_MAP = {
Option("rbd_op_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(1)
- .set_description(""),
+ .set_description("number of threads to utilize for internal processing"),
Option("rbd_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(60)
- .set_description(""),
+ .set_description("time in seconds for detecting a hung thread"),
Option("rbd_non_blocking_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("process AIO ops from a dispatch thread to prevent blocking"),
Option("rbd_cache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("whether to enable caching (writeback unless rbd_cache_max_dirty is 0)"),
Option("rbd_cache_writethrough_until_flush", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("whether to make writeback caching writethrough until "
+ "flush is called, to be sure the user of librbd will send "
+ "flushes so that writeback is safe"),
Option("rbd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(32<<20)
- .set_description(""),
+ .set_description("cache size in bytes"),
Option("rbd_cache_max_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(24<<20)
- .set_description(""),
+ .set_description("dirty limit in bytes - set to 0 for write-through caching"),
Option("rbd_cache_target_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(16<<20)
- .set_description(""),
+ .set_description("target dirty limit in bytes"),
Option("rbd_cache_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(1.0)
- .set_description(""),
+ .set_description("seconds in cache before writeback starts"),
Option("rbd_cache_max_dirty_object", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("dirty limit for objects - set to 0 for auto calculate from rbd_cache_size"),
Option("rbd_cache_block_writes_upfront", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("whether to block writes to the cache before the aio_write call completes"),
Option("rbd_concurrent_management_ops", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10)
.set_min(1)
- .set_description(""),
+ .set_description("how many operations can be in flight for a management operation like deleting or resizing an image"),
Option("rbd_balance_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("distribute snap read requests to random OSD"),
Option("rbd_localize_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("localize snap read requests to closest OSD"),
Option("rbd_balance_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("distribute parent read requests to random OSD"),
Option("rbd_localize_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("localize parent requests to closest OSD"),
Option("rbd_readahead_trigger_requests", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(10)
- .set_description(""),
+ .set_description("number of sequential requests necessary to trigger readahead"),
Option("rbd_readahead_max_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(512 * 1024)
- .set_description(""),
+ .set_description("set to 0 to disable readahead"),
Option("rbd_readahead_disable_after_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(50 * 1024 * 1024)
- .set_description(""),
+ .set_description("how many bytes are read in total before readahead is disabled"),
Option("rbd_clone_copy_on_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("copy-up parent image blocks to clone upon read request"),
Option("rbd_blacklist_on_break_lock", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("whether to blacklist clients whose lock was broken"),
Option("rbd_blacklist_expire_seconds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("number of seconds to blacklist - set to 0 for OSD default"),
Option("rbd_request_timed_out_seconds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(30)
- .set_description(""),
+ .set_description("number of seconds before maintenance request times out"),
Option("rbd_skip_partial_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("when trying to discard a range inside an object, set to true to skip zeroing the range"),
Option("rbd_enable_alloc_hint", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("when writing a object, it will issue a hint to osd backend to indicate the expected size object need"),
Option("rbd_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("true if LTTng-UST tracepoints should be enabled"),
Option("rbd_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("create a blkin trace for all RBD requests"),
Option("rbd_validate_pool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("validate empty pools for RBD compatibility"),
Option("rbd_validate_names", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("validate new image names for RBD compatibility"),
Option("rbd_auto_exclusive_lock_until_manual_request", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
- .set_description(""),
+ .set_description("automatically acquire/release exclusive lock until it is explicitly requested"),
Option("rbd_mirroring_resync_after_disconnect", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
- .set_description(""),
+ .set_description("automatically start image resync after mirroring is disconnected due to being laggy"),
Option("rbd_mirroring_replay_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("time-delay in seconds for rbd-mirror asynchronous replication"),
Option("rbd_default_format", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(2)
- .set_description(""),
+ .set_description("default image format for new images"),
Option("rbd_default_order", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(22)
- .set_description(""),
+ .set_description("default order (data block object size) for new images"),
Option("rbd_default_stripe_count", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("default stripe count for new images"),
Option("rbd_default_stripe_unit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("default stripe width for new images"),
Option("rbd_default_map_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description(""),
+ .set_description("default krbd map options"),
Option("rbd_journal_order", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_min(12)
.set_default(24)
- .set_description(""),
+ .set_description("default order (object size) for journal data objects"),
Option("rbd_journal_splay_width", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(4)
- .set_description(""),
+ .set_description("number of active journal objects"),
Option("rbd_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5)
- .set_description(""),
+ .set_description("commit time interval, seconds"),
Option("rbd_journal_object_flush_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("maximum number of pending commits per journal object"),
Option("rbd_journal_object_flush_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("maximum number of pending bytes per journal object"),
Option("rbd_journal_object_flush_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("maximum age (in seconds) for pending commits"),
Option("rbd_journal_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description(""),
+ .set_description("pool for journal objects"),
Option("rbd_journal_max_payload_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(16384)
- .set_description(""),
+ .set_description("maximum journal payload size before splitting"),
Option("rbd_journal_max_concurrent_object_sets", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
- .set_description(""),
+ .set_description("maximum number of object sets a journal client can be behind before it is automatically unregistered"),
+ });
+}
+static std::vector<Option> get_rbd_mirror_options() {
+ return std::vector<Option>({
Option("rbd_mirror_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5)
- .set_description(""),
+ .set_description("commit time interval, seconds"),
Option("rbd_mirror_journal_poll_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(5)
- .set_description(""),
+ .set_description("maximum age (in seconds) between successive journal polls"),
Option("rbd_mirror_journal_max_fetch_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(32768)
- .set_description(""),
+ .set_description("maximum bytes to read from each journal data object per fetch"),
Option("rbd_mirror_sync_point_update_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(30)
- .set_description(""),
+ .set_description("number of seconds between each update of the image sync point object number"),
Option("rbd_mirror_concurrent_image_syncs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(5)
- .set_description(""),
+ .set_description("maximum number of image syncs in parallel"),
Option("rbd_mirror_pool_replayers_refresh_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(30)
- .set_description(""),
+ .set_description("interval to refresh peers in rbd-mirror daemon"),
Option("rbd_mirror_delete_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(30)
- .set_description(""),
+ .set_description("interval to check and retry the failed requests in deleter"),
Option("rbd_mirror_image_state_check_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(30)
.set_min(1)
- .set_description(""),
+ .set_description("interval to get images from pool watcher and set sources in replayer"),
Option("rbd_mirror_leader_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_min(1)
- .set_description(""),
+ .set_description("interval (in seconds) between mirror leader heartbeats"),
Option("rbd_mirror_leader_max_missed_heartbeats", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(2)
- .set_description(""),
+ .set_description("number of missed heartbeats for non-lock owner to attempt to acquire lock"),
Option("rbd_mirror_leader_max_acquire_attempts_before_break", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(3)
- .set_description(""),
+ .set_description("number of failed attempts to acquire lock after missing heartbeats before breaking lock"),
});
}
.set_description(""),
Option("mds_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
- .set_default(100000)
- .set_description(""),
+ .set_default(0)
+ .set_description("maximum number of inodes in MDS cache (<=0 is unlimited)")
+ .set_long_description("This tunable is no longer recommended. Use mds_cache_memory_limit."),
+
+ Option("mds_cache_memory_limit", Option::TYPE_UINT, Option::LEVEL_BASIC)
+ .set_default(1*(1LL<<30))
+ .set_description("target maximum memory usage of MDS cache")
+ .set_long_description("This sets a target maximum memory usage of the MDS cache and is the primary tunable to limit the MDS memory usage. The MDS will try to stay under a reservation of this limit (by default 95%; 1 - mds_cache_reservation) by trimming unused metadata in its cache and recalling cached items in the client caches. It is possible for the MDS to exceed this limit due to slow recall from clients. The mds_health_cache_threshold (150%) sets a cache full threshold for when the MDS signals a cluster health warning."),
+
+ Option("mds_cache_reservation", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.05)
+ .set_description("amount of memory to reserve"),
+
+ Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.5)
+ .set_description("threshold for cache size to generate health warning"),
Option("mds_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.7)
.set_default(10)
.set_description(""),
- Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(1.5)
- .set_description(""),
-
Option("mds_reconnect_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(45)
.set_description(""),
.set_description(""),
Option("fuse_set_user_groups", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
- .set_default(false)
- .set_description(""),
+ .set_default(true)
+ .set_description("check for ceph-fuse to consider supplementary groups for permissions"),
Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
- .set_default(true)
+ .set_default(false)
.set_description(""),
Option("client_die_on_failed_remount", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
ingest(get_rgw_options(), "rgw");
ingest(get_rbd_options(), "rbd");
+ ingest(get_rbd_mirror_options(), "rbd-mirror");
ingest(get_mds_options(), "mds");
ingest(get_mds_client_options(), "mds_client");
#include <sys/mount.h>
#endif
+#include <string>
+
+#include <stdio.h>
+
int64_t unit_to_bytesize(string val, ostream *pss)
{
if (val.empty()) {
result = "Base64:" + result;
return result;
}
+
+// Render a byte count as a human-readable string with a binary-scale
+// suffix, e.g. 1024 -> "1kB", 1048576 -> "1MB".  Scaling truncates
+// (integer right-shift by 10 per step), so 1536 renders as "1kB".
+std::string bytes2str(uint64_t count) {
+  // Suffix table; the trailing "\0" entry terminates the loop at "E".
+  static char s[][2] = {"\0", "k", "M", "G", "T", "P", "E", "\0"};
+  int i = 0;
+  // Divide by 1024 until below one unit or out of suffixes.
+  while (count >= 1024 && *s[i+1]) {
+    count >>= 10;
+    i++;
+  }
+  char str[128];
+  // NOTE(review): PRIu64 requires <inttypes.h>; presumably pulled in
+  // transitively in this TU -- confirm the include is present.
+  snprintf(str, sizeof str, "%" PRIu64 "%sB", count, s[i]);
+  return std::string(str);
+}
  int find_rule(int ruleset, int type, int size) const {
    if (!crush) return -1;
-    if (!have_uniform_rules) {
-      return crush_find_rule(crush, ruleset, type, size);
-    } else {
-      if (ruleset < (int)crush->max_rules &&
-	  crush->rules[ruleset])
-	return ruleset;
-      return -1;
+    // Fast path: with uniform rules a ruleset maps 1:1 onto a rule,
+    // but only take the shortcut when that rule's mask actually
+    // matches the requested type and size range; otherwise fall
+    // through to the generic search instead of silently returning a
+    // rule that does not satisfy the request.
+    if (have_uniform_rules &&
+	ruleset < (int)crush->max_rules &&
+	crush->rules[ruleset] &&
+	crush->rules[ruleset]->mask.type == type &&
+	crush->rules[ruleset]->mask.min_size <= size &&
+	crush->rules[ruleset]->mask.max_size >= size) {
+      return ruleset;
    }
+    // Generic scan over all rules; returns -1 when none matches.
+    return crush_find_rule(crush, ruleset, type, size);
  }
bool ruleset_exists(const int ruleset) const {
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ALLOC_PTR_H
+#define CEPH_ALLOC_PTR_H
+
+#include <memory>
+
+// A unique_ptr-like owner that lazily default-constructs its pointee:
+// get()/operator*/operator-> allocate a new element_type on first use
+// when the pointer is currently null (ptr is mutable to allow this
+// from const accessors).  Move-only, like std::unique_ptr.
+template <class T>
+class alloc_ptr
+{
+public:
+  typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer;
+  typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type;
+
+  alloc_ptr() : ptr() {}
+
+  // Forwarding constructor: accepts anything a unique_ptr can be
+  // built from (raw pointer, unique_ptr, nullptr, ...).
+  template<class U>
+  alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {}
+
+  // Move-only semantics.  [fix] these were previously declared
+  // against alloc_ptr<pointer> -- a different instantiation -- and
+  // the move-assignment copied from a const rvalue and returned
+  // nothing, which is ill-formed once instantiated.
+  alloc_ptr(alloc_ptr&& rhs) : ptr(std::move(rhs.ptr)) {}
+  alloc_ptr(const alloc_ptr& rhs) = delete;
+  alloc_ptr& operator=(alloc_ptr&& rhs) {
+    ptr = std::move(rhs.ptr);
+    return *this;
+  }
+  // [fix] copy-assignment copied a unique_ptr (deleted operation);
+  // delete it explicitly to match the deleted copy constructor.
+  alloc_ptr& operator=(const alloc_ptr& rhs) = delete;
+
+  void swap(alloc_ptr& rhs) {
+    ptr.swap(rhs.ptr);
+  }
+  element_type* release() {
+    return ptr.release();
+  }
+  void reset(element_type *p = nullptr) {
+    ptr.reset(p);
+  }
+  // Lazy accessors: allocate on demand so a null alloc_ptr can be
+  // dereferenced safely.
+  element_type* get() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return ptr.get();
+  }
+  element_type& operator*() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return *ptr;
+  }
+  element_type* operator->() const {
+    if (!ptr)
+      ptr.reset(new element_type);
+    return ptr.get();
+  }
+  // True iff an element is currently allocated (does NOT allocate).
+  operator bool() const {
+    return !!ptr;
+  }
+
+  // Compare by pointee value.  [fix] the previous versions invoked
+  // std::less<element_type>(*lhs, *rhs), constructing the functor
+  // with two arguments instead of calling it -- a compile error on
+  // instantiation.  Note: dereferencing allocates via operator* when
+  // either side is null.
+  friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs < *rhs;
+  }
+  friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs <= *rhs;
+  }
+  friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs > *rhs;
+  }
+  friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs >= *rhs;
+  }
+  friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs == *rhs;
+  }
+  friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+    return *lhs != *rhs;
+  }
+private:
+  mutable std::unique_ptr<element_type> ptr;
+};
+
+#endif
#include "assert.h"
#include "encoding_btree.h"
-template<typename T>
+template<typename T,
+ typename Alloc = std::allocator<std::pair<const T, T>>>
class btree_interval_set {
public:
- typedef btree::btree_map<T,T> map_t;
+ typedef btree::btree_map<T,T, std::less<T>, Alloc> map_t;
class const_iterator;
return m.size();
}
- typename btree_interval_set<T>::iterator begin() {
- return typename btree_interval_set<T>::iterator(m.begin());
+ typename btree_interval_set<T,Alloc>::iterator begin() {
+ return typename btree_interval_set<T,Alloc>::iterator(m.begin());
}
- typename btree_interval_set<T>::iterator lower_bound(T start) {
- return typename btree_interval_set<T>::iterator(find_inc_m(start));
+ typename btree_interval_set<T,Alloc>::iterator lower_bound(T start) {
+ return typename btree_interval_set<T,Alloc>::iterator(find_inc_m(start));
}
- typename btree_interval_set<T>::iterator end() {
- return typename btree_interval_set<T>::iterator(m.end());
+ typename btree_interval_set<T,Alloc>::iterator end() {
+ return typename btree_interval_set<T,Alloc>::iterator(m.end());
}
- typename btree_interval_set<T>::const_iterator begin() const {
- return typename btree_interval_set<T>::const_iterator(m.begin());
+ typename btree_interval_set<T,Alloc>::const_iterator begin() const {
+ return typename btree_interval_set<T,Alloc>::const_iterator(m.begin());
}
- typename btree_interval_set<T>::const_iterator lower_bound(T start) const {
- return typename btree_interval_set<T>::const_iterator(find_inc(start));
+ typename btree_interval_set<T,Alloc>::const_iterator lower_bound(T start) const {
+ return typename btree_interval_set<T,Alloc>::const_iterator(find_inc(start));
}
- typename btree_interval_set<T>::const_iterator end() const {
- return typename btree_interval_set<T>::const_iterator(m.end());
+ typename btree_interval_set<T,Alloc>::const_iterator end() const {
+ return typename btree_interval_set<T,Alloc>::const_iterator(m.end());
}
// helpers
};
-template<class T>
-inline std::ostream& operator<<(std::ostream& out, const btree_interval_set<T> &s) {
+template<class T, class A>
+inline std::ostream& operator<<(std::ostream& out, const btree_interval_set<T,A> &s) {
out << "[";
const char *prequel = "";
- for (typename btree_interval_set<T>::const_iterator i = s.begin();
+ for (auto i = s.begin();
i != s.end();
++i)
{
return out;
}
-template<class T>
-inline void encode(const btree_interval_set<T>& s, bufferlist& bl)
+template<class T,typename A>
+inline void encode(const btree_interval_set<T,A>& s, bufferlist& bl)
{
s.encode(bl);
}
-template<class T>
-inline void decode(btree_interval_set<T>& s, bufferlist::iterator& p)
+template<class T,typename A>
+inline void decode(btree_interval_set<T,A>& s, bufferlist::iterator& p)
{
s.decode(p);
}
--it;
return *this;
}
+ const std::pair<const Key,T>& operator*() {
+ return *it;
+ }
const std::pair<const Key,T>* operator->() {
return it.operator->();
}
--it;
return *this;
}
+ std::pair<const Key,T>& operator*() {
+ return *it;
+ }
std::pair<const Key,T>* operator->() {
return it.operator->();
}
#ifndef CEPH_COUNTER_H
#define CEPH_COUNTER_H
+#include <atomic>
+
template <typename T>
class Counter {
public:
~Counter() {
_count()--;
}
- static unsigned long count() {
+ static uint64_t count() {
return _count();
}
- static unsigned long increments() {
+ static uint64_t increments() {
return _increments();
}
- static unsigned long decrements() {
+ static uint64_t decrements() {
return increments()-count();
}
private:
- static unsigned long &_count() {
- static unsigned long c;
+ static std::atomic<uint64_t> &_count() {
+ static std::atomic<uint64_t> c;
return c;
}
- static unsigned long &_increments() {
- static unsigned long i;
+ static std::atomic<uint64_t> &_increments() {
+ static std::atomic<uint64_t> i;
return i;
}
};
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
template<typename T>
inline void decode(boost::optional<T> &p, bufferlist::iterator &bp)
{
  __u8 present;
  ::decode(present, bp);
  if (present) {
-    T t;
-    p = t;
+    // Value-initialize directly into the optional (avoids assigning
+    // from a possibly-uninitialized temporary, which is what the
+    // removed -Wmaybe-uninitialized suppression papered over).
+    p = T{};
    decode(p.get(), bp);
+  } else {
+    // An absent value must clear any previous contents of p.
+    p = boost::none;
  }
}
#pragma GCC diagnostic pop
typename std::map<T,T>::const_iterator pa = a.m.begin();
typename std::map<T,T>::const_iterator pb = b.m.begin();
-
+ typename decltype(m)::iterator mi = m.begin();
+
while (pa != a.m.end() && pb != b.m.end()) {
// passing?
if (pa->first + pa->second <= pb->first)
{ pa++; continue; }
if (pb->first + pb->second <= pa->first)
{ pb++; continue; }
+
+ if (*pa == *pb) {
+ do {
+ mi = m.insert(mi, *pa);
+ _size += pa->second;
+ ++pa;
+ ++pb;
+ } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb);
+ continue;
+ }
+
T start = MAX(pa->first, pb->first);
T en = MIN(pa->first+pa->second, pb->first+pb->second);
assert(en > start);
- insert(start, en-start);
+ typename decltype(m)::value_type i{start, en - start};
+ mi = m.insert(mi, i);
+ _size += i.second;
if (pa->first+pa->second > pb->first+pb->second)
pb++;
else
#ifndef CEPH_LRU_H
#define CEPH_LRU_H
+#include <math.h>
#include <stdint.h>
#include "common/config.h"
-
-
+#include "xlist.h"
class LRUObject {
- private:
- LRUObject *lru_next, *lru_prev;
- bool lru_pinned;
- class LRU *lru;
- class LRUList *lru_list;
-
- public:
- LRUObject() {
- lru_next = lru_prev = NULL;
- lru_list = 0;
- lru_pinned = false;
- lru = 0;
- }
+public:
+ LRUObject() : lru(), lru_link(this), lru_pinned(false) { }
+ ~LRUObject();
// pin/unpin item in cache
- void lru_pin();
+ void lru_pin();
void lru_unpin();
bool lru_is_expireable() const { return !lru_pinned; }
friend class LRU;
- friend class LRUList;
+private:
+ class LRU *lru;
+ xlist<LRUObject *>::item lru_link;
+ bool lru_pinned;
};
+class LRU {
+public:
+ LRU() : num_pinned(0), midpoint(0.6) {}
-class LRUList {
- private:
- LRUObject *head, *tail;
- uint32_t len;
+ uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); }
+ uint64_t lru_get_top() const { return top.size(); }
+ uint64_t lru_get_bot() const{ return bottom.size(); }
+ uint64_t lru_get_pintail() const { return pintail.size(); }
+ uint64_t lru_get_num_pinned() const { return num_pinned; }
- public:
- LRUList() {
- head = tail = 0;
- len = 0;
- }
+ void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); }
- uint32_t get_length() const { return len; }
-
- LRUObject *get_head() {
- return head;
- }
- LRUObject *get_tail() {
- return tail;
- }
-
- void clear() {
- while (len > 0) {
- remove(get_head());
+ void lru_clear() {
+ while (!top.empty()) {
+ lru_remove(top.front());
}
- }
-
- void insert_head(LRUObject *o) {
- o->lru_next = head;
- o->lru_prev = NULL;
- if (head) {
- head->lru_prev = o;
- } else {
- tail = o;
+ while (!bottom.empty()) {
+ lru_remove(bottom.front());
}
- head = o;
- o->lru_list = this;
- len++;
- }
- void insert_tail(LRUObject *o) {
- o->lru_next = NULL;
- o->lru_prev = tail;
- if (tail) {
- tail->lru_next = o;
- } else {
- head = o;
+ while (!pintail.empty()) {
+ lru_remove(pintail.front());
}
- tail = o;
- o->lru_list = this;
- len++;
- }
-
- void remove(LRUObject *o) {
- assert(o->lru_list == this);
- if (o->lru_next)
- o->lru_next->lru_prev = o->lru_prev;
- else
- tail = o->lru_prev;
- if (o->lru_prev)
- o->lru_prev->lru_next = o->lru_next;
- else
- head = o->lru_next;
- o->lru_next = o->lru_prev = NULL;
- o->lru_list = 0;
- assert(len>0);
- len--;
- }
-
-};
-
-
-class LRU {
- protected:
- LRUList lru_top, lru_bot, lru_pintail;
- uint32_t lru_num, lru_num_pinned;
- uint32_t lru_max; // max items
- double lru_midpoint;
-
- friend class LRUObject;
- //friend class MDCache; // hack
-
- public:
- LRU(int max = 0) {
- lru_num = 0;
- lru_num_pinned = 0;
- lru_midpoint = .6;
- lru_max = max;
- }
-
- uint32_t lru_get_size() const { return lru_num; }
- uint32_t lru_get_top() const { return lru_top.get_length(); }
- uint32_t lru_get_bot() const{ return lru_bot.get_length(); }
- uint32_t lru_get_pintail() const { return lru_pintail.get_length(); }
- uint32_t lru_get_max() const { return lru_max; }
- uint32_t lru_get_num_pinned() const { return lru_num_pinned; }
-
- void lru_set_max(uint32_t m) { lru_max = m; }
- void lru_set_midpoint(float f) { lru_midpoint = f; }
-
- void lru_clear() {
- lru_top.clear();
- lru_bot.clear();
- lru_pintail.clear();
- lru_num = 0;
+ assert(num_pinned == 0);
}
// insert at top of lru
void lru_insert_top(LRUObject *o) {
- //assert(!o->lru_in_lru);
- //o->lru_in_lru = true;
assert(!o->lru);
o->lru = this;
- lru_top.insert_head( o );
- lru_num++;
- if (o->lru_pinned) lru_num_pinned++;
- lru_adjust();
+ top.push_front(&o->lru_link);
+ if (o->lru_pinned) num_pinned++;
+ adjust();
}
// insert at mid point in lru
void lru_insert_mid(LRUObject *o) {
- //assert(!o->lru_in_lru);
- //o->lru_in_lru = true;
assert(!o->lru);
o->lru = this;
- lru_bot.insert_head(o);
- lru_num++;
- if (o->lru_pinned) lru_num_pinned++;
+ bottom.push_front(&o->lru_link);
+ if (o->lru_pinned) num_pinned++;
+ adjust();
}
// insert at bottom of lru
void lru_insert_bot(LRUObject *o) {
assert(!o->lru);
o->lru = this;
- lru_bot.insert_tail(o);
- lru_num++;
- if (o->lru_pinned) lru_num_pinned++;
+ bottom.push_back(&o->lru_link);
+ if (o->lru_pinned) num_pinned++;
+ adjust();
}
- /*
- // insert at bottom of lru
- void lru_insert_pintail(LRUObject *o) {
- assert(!o->lru);
- o->lru = this;
-
- assert(o->lru_pinned);
-
- lru_pintail.insert_head(o);
- lru_num++;
- lru_num_pinned += o->lru_pinned;
- }
- */
-
-
-
-
- // adjust top/bot balance, as necessary
- void lru_adjust() {
- if (!lru_max) return;
-
- unsigned toplen = lru_top.get_length();
- unsigned topwant = (unsigned)(lru_midpoint * ((double)lru_max - lru_num_pinned));
- while (toplen > 0 &&
- toplen > topwant) {
- // remove from tail of top, stick at head of bot
- // FIXME: this could be way more efficient by moving a whole chain of items.
-
- LRUObject *o = lru_top.get_tail();
- lru_top.remove(o);
- lru_bot.insert_head(o);
- toplen--;
- }
- }
-
-
// remove an item
LRUObject *lru_remove(LRUObject *o) {
- // not in list
- //assert(o->lru_in_lru);
- //if (!o->lru_in_lru) return o; // might have expired and been removed that way.
if (!o->lru) return o;
-
- assert((o->lru_list == &lru_pintail) ||
- (o->lru_list == &lru_top) ||
- (o->lru_list == &lru_bot));
- o->lru_list->remove(o);
-
- lru_num--;
- if (o->lru_pinned) lru_num_pinned--;
- o->lru = 0;
+ auto list = o->lru_link.get_list();
+ assert(list == &top || list == &bottom || list == &pintail);
+ o->lru_link.remove_myself();
+ if (o->lru_pinned) num_pinned--;
+ o->lru = nullptr;
+ adjust();
return o;
}
// touch item -- move to head of lru
bool lru_touch(LRUObject *o) {
- lru_remove(o);
- lru_insert_top(o);
+ if (!o->lru) {
+ lru_insert_top(o);
+ } else {
+ assert(o->lru == this);
+ auto list = o->lru_link.get_list();
+ assert(list == &top || list == &bottom || list == &pintail);
+ top.push_front(&o->lru_link);
+ adjust();
+ }
return true;
}
// touch item -- move to midpoint (unless already higher)
bool lru_midtouch(LRUObject *o) {
- if (o->lru_list == &lru_top) return false;
-
- lru_remove(o);
- lru_insert_mid(o);
+ if (!o->lru) {
+ lru_insert_mid(o);
+ } else {
+ assert(o->lru == this);
+ auto list = o->lru_link.get_list();
+ assert(list == &top || list == &bottom || list == &pintail);
+ if (list == &top) return false;
+ bottom.push_front(&o->lru_link);
+ adjust();
+ }
return true;
}
// touch item -- move to bottom
bool lru_bottouch(LRUObject *o) {
- lru_remove(o);
- lru_insert_bot(o);
+ if (!o->lru) {
+ lru_insert_bot(o);
+ } else {
+ assert(o->lru == this);
+ auto list = o->lru_link.get_list();
+ assert(list == &top || list == &bottom || list == &pintail);
+ bottom.push_back(&o->lru_link);
+ adjust();
+ }
return true;
}
void lru_touch_entire_pintail() {
// promote entire pintail to the top lru
- while (lru_pintail.get_length() > 0) {
- LRUObject *o = lru_pintail.get_head();
- lru_pintail.remove(o);
- lru_top.insert_tail(o);
+ while (pintail.size() > 0) {
+ top.push_back(&pintail.front()->lru_link);
+ adjust();
}
}
-
// expire -- expire a single item
LRUObject *lru_get_next_expire() {
- LRUObject *p;
-
// look through tail of bot
- while (lru_bot.get_length()) {
- p = lru_bot.get_tail();
+ while (bottom.size()) {
+ LRUObject *p = bottom.back();
if (!p->lru_pinned) return p;
// move to pintail
- lru_bot.remove(p);
- lru_pintail.insert_head(p);
+ pintail.push_front(&p->lru_link);
+ adjust();
}
// ok, try head then
- while (lru_top.get_length()) {
- p = lru_top.get_tail();
+ while (top.size()) {
+ LRUObject *p = top.back();
if (!p->lru_pinned) return p;
// move to pintail
- lru_top.remove(p);
- lru_pintail.insert_head(p);
+ pintail.push_front(&p->lru_link);
+ adjust();
}
// no luck!
return NULL;
}
-
void lru_status() {
- //generic_dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << dendl;
+ //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl;
}
+protected:
+ // adjust top/bot balance, as necessary
+ void adjust() {
+ uint64_t toplen = top.size();
+ uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned));
+ /* move items from below midpoint (bottom) to top: move midpoint forward */
+ for (uint64_t i = toplen; i < topwant; i++) {
+ top.push_back(&bottom.front()->lru_link);
+ }
+ /* or: move items from above midpoint (top) to bottom: move midpoint backwards */
+ for (uint64_t i = toplen; i > topwant; i--) {
+ bottom.push_front(&top.back()->lru_link);
+ }
+ }
+
+ uint64_t num_pinned;
+ double midpoint;
+
+ friend class LRUObject;
+private:
+ typedef xlist<LRUObject *> LRUList;
+ LRUList top, bottom, pintail;
};
+inline LRUObject::~LRUObject() {
+ if (lru) {
+ lru->lru_remove(this);
+ }
+}
inline void LRUObject::lru_pin() {
if (lru && !lru_pinned) {
- lru->lru_num_pinned++;
- lru->lru_adjust();
+ lru->num_pinned++;
}
lru_pinned = true;
}
inline void LRUObject::lru_unpin() {
if (lru && lru_pinned) {
- lru->lru_num_pinned--;
+ lru->num_pinned--;
// move from pintail -> bot
- if (lru_list == &lru->lru_pintail) {
- lru->lru_pintail.remove(this);
- lru->lru_bot.insert_tail(this);
+ if (lru_link.get_list() == &lru->pintail) {
+ lru->lru_bottouch(this);
}
- lru->lru_adjust();
}
lru_pinned = false;
}
(This is just because we need to name some static variables and we
can't use :: in a variable name.)
+XXX Note: the new operator hard-codes the allocation size to the size of the
+object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot
+incorporate mempools into a base class without also defining a helper/factory
+for the child class as well (as the base class is usually smaller than the
+child class).
+
In order to use the STL containers, simply use the namespaced variant
of the container type. For example,
f(osdmap) \
f(osdmap_mapping) \
f(pgmap) \
+ f(mds_co) \
f(unittest_1) \
f(unittest_2)
#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */
/* these are hidden in 'ceph status' view */
#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \
CEPH_OSDMAP_REQUIRE_KRAKEN | \
CEPH_OSDMAP_REQUIRE_LUMINOUS | \
CEPH_OSDMAP_RECOVERY_DELETES | \
- CEPH_OSDMAP_SORTBITWISE)
+ CEPH_OSDMAP_SORTBITWISE | \
+ CEPH_OSDMAP_PURGED_SNAPDIRS)
#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \
CEPH_OSDMAP_REQUIRE_KRAKEN | \
CEPH_OSDMAP_REQUIRE_LUMINOUS)
int64_t unit_to_bytesize(string val, ostream *pss);
+std::string bytes2str(uint64_t count);
+
struct ceph_data_stats
{
uint64_t byte_total;
private:
item *_front, *_back;
- int _size;
+ size_t _size;
public:
xlist(const xlist& other) {
assert(_back == 0);
}
- int size() const {
+ size_t size() const {
assert((bool)_front == (bool)_size);
return _size;
}
: ThreadPool(cct, "librbd::thread_pool", "tp_librbd", 1,
"rbd_op_threads"),
op_work_queue(new ContextWQ("librbd::op_work_queue",
- cct->_conf->rbd_op_thread_timeout,
+ cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
this)) {
start();
}
ThreadPool *thread_pool;
get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
io_work_queue = new io::ImageRequestWQ<>(
- this, "librbd::io_work_queue", cct->_conf->rbd_op_thread_timeout,
+ this, "librbd::io_work_queue",
+ cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
thread_pool);
- if (cct->_conf->rbd_auto_exclusive_lock_until_manual_request) {
+ if (cct->_conf->get_val<bool>("rbd_auto_exclusive_lock_until_manual_request")) {
exclusive_lock_policy = new exclusive_lock::AutomaticPolicy(this);
} else {
exclusive_lock_policy = new exclusive_lock::StandardPolicy(this);
}
}
-#define ASSIGN_OPTION(config) \
+#define ASSIGN_OPTION(config, type) \
do { \
string key = "rbd_"; \
key = key + #config; \
if (configs[key]) \
- config = local_config_t.rbd_##config; \
+ config = local_config_t.get_val<type>("rbd_"#config); \
else \
- config = cct->_conf->rbd_##config; \
+ config = cct->_conf->get_val<type>("rbd_"#config); \
} while (0);
- ASSIGN_OPTION(non_blocking_aio);
- ASSIGN_OPTION(cache);
- ASSIGN_OPTION(cache_writethrough_until_flush);
- ASSIGN_OPTION(cache_size);
- ASSIGN_OPTION(cache_max_dirty);
- ASSIGN_OPTION(cache_target_dirty);
- ASSIGN_OPTION(cache_max_dirty_age);
- ASSIGN_OPTION(cache_max_dirty_object);
- ASSIGN_OPTION(cache_block_writes_upfront);
- ASSIGN_OPTION(concurrent_management_ops);
- ASSIGN_OPTION(balance_snap_reads);
- ASSIGN_OPTION(localize_snap_reads);
- ASSIGN_OPTION(balance_parent_reads);
- ASSIGN_OPTION(localize_parent_reads);
- ASSIGN_OPTION(readahead_trigger_requests);
- ASSIGN_OPTION(readahead_max_bytes);
- ASSIGN_OPTION(readahead_disable_after_bytes);
- ASSIGN_OPTION(clone_copy_on_read);
- ASSIGN_OPTION(blacklist_on_break_lock);
- ASSIGN_OPTION(blacklist_expire_seconds);
- ASSIGN_OPTION(request_timed_out_seconds);
- ASSIGN_OPTION(enable_alloc_hint);
- ASSIGN_OPTION(journal_order);
- ASSIGN_OPTION(journal_splay_width);
- ASSIGN_OPTION(journal_commit_age);
- ASSIGN_OPTION(journal_object_flush_interval);
- ASSIGN_OPTION(journal_object_flush_bytes);
- ASSIGN_OPTION(journal_object_flush_age);
- ASSIGN_OPTION(journal_pool);
- ASSIGN_OPTION(journal_max_payload_bytes);
- ASSIGN_OPTION(journal_max_concurrent_object_sets);
- ASSIGN_OPTION(mirroring_resync_after_disconnect);
- ASSIGN_OPTION(mirroring_replay_delay);
- ASSIGN_OPTION(skip_partial_discard);
+ ASSIGN_OPTION(non_blocking_aio, bool);
+ ASSIGN_OPTION(cache, bool);
+ ASSIGN_OPTION(cache_writethrough_until_flush, bool);
+ ASSIGN_OPTION(cache_size, int64_t);
+ ASSIGN_OPTION(cache_max_dirty, int64_t);
+ ASSIGN_OPTION(cache_target_dirty, int64_t);
+ ASSIGN_OPTION(cache_max_dirty_age, double);
+ ASSIGN_OPTION(cache_max_dirty_object, int64_t);
+ ASSIGN_OPTION(cache_block_writes_upfront, bool);
+ ASSIGN_OPTION(concurrent_management_ops, int64_t);
+ ASSIGN_OPTION(balance_snap_reads, bool);
+ ASSIGN_OPTION(localize_snap_reads, bool);
+ ASSIGN_OPTION(balance_parent_reads, bool);
+ ASSIGN_OPTION(localize_parent_reads, bool);
+ ASSIGN_OPTION(readahead_trigger_requests, int64_t);
+ ASSIGN_OPTION(readahead_max_bytes, int64_t);
+ ASSIGN_OPTION(readahead_disable_after_bytes, int64_t);
+ ASSIGN_OPTION(clone_copy_on_read, bool);
+ ASSIGN_OPTION(blacklist_on_break_lock, bool);
+ ASSIGN_OPTION(blacklist_expire_seconds, int64_t);
+ ASSIGN_OPTION(request_timed_out_seconds, int64_t);
+ ASSIGN_OPTION(enable_alloc_hint, bool);
+ ASSIGN_OPTION(journal_order, uint64_t);
+ ASSIGN_OPTION(journal_splay_width, uint64_t);
+ ASSIGN_OPTION(journal_commit_age, double);
+ ASSIGN_OPTION(journal_object_flush_interval, int64_t);
+ ASSIGN_OPTION(journal_object_flush_bytes, int64_t);
+ ASSIGN_OPTION(journal_object_flush_age, double);
+ ASSIGN_OPTION(journal_pool, std::string);
+ ASSIGN_OPTION(journal_max_payload_bytes, uint64_t);
+ ASSIGN_OPTION(journal_max_concurrent_object_sets, int64_t);
+ ASSIGN_OPTION(mirroring_resync_after_disconnect, bool);
+ ASSIGN_OPTION(mirroring_replay_delay, int64_t);
+ ASSIGN_OPTION(skip_partial_discard, bool);
+ ASSIGN_OPTION(blkin_trace_all, bool);
}
ExclusiveLock<ImageCtx> *ImageCtx::create_exclusive_lock() {
bool mirroring_resync_after_disconnect;
int mirroring_replay_delay;
bool skip_partial_discard;
+ bool blkin_trace_all;
LibrbdAdminSocketHook *asok_hook;
m_cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
thread_pool_singleton, "librbd::ImageUpdateWatchers::thread_pool");
m_work_queue = new ContextWQ("librbd::ImageUpdateWatchers::op_work_queue",
- m_cct->_conf->rbd_op_thread_timeout,
+ m_cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
thread_pool_singleton);
}
schedule_request_lock(true);
} else {
// lock owner acked -- but resend if we don't see them release the lock
- int retry_timeout = m_image_ctx.cct->_conf->client_notify_timeout;
+ int retry_timeout = m_image_ctx.cct->_conf->template get_val<int64_t>(
+ "client_notify_timeout");
ldout(m_image_ctx.cct, 15) << this << " will retry in " << retry_timeout
<< " seconds" << dendl;
schedule_request_lock(true, retry_timeout);
cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
thread_pool_singleton, "librbd::journal::thread_pool");
m_work_queue = new ContextWQ("librbd::journal::work_queue",
- cct->_conf->rbd_op_thread_timeout,
+ cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
thread_pool_singleton);
ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
}
if (image_options.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &m_stripe_unit) != 0 ||
m_stripe_unit == 0) {
- m_stripe_unit = m_cct->_conf->rbd_default_stripe_unit;
+ m_stripe_unit = m_cct->_conf->get_val<uint64_t>("rbd_default_stripe_unit");
}
if (image_options.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &m_stripe_count) != 0 ||
m_stripe_count == 0) {
- m_stripe_count = m_cct->_conf->rbd_default_stripe_count;
+ m_stripe_count = m_cct->_conf->get_val<uint64_t>("rbd_default_stripe_count");
}
if (get_image_option(image_options, RBD_IMAGE_OPTION_ORDER, &m_order) != 0 ||
m_order == 0) {
- m_order = m_cct->_conf->rbd_default_order;
+ m_order = m_cct->_conf->get_val<int64_t>("rbd_default_order");
}
if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_ORDER,
&m_journal_order) != 0) {
- m_journal_order = m_cct->_conf->rbd_journal_order;
+ m_journal_order = m_cct->_conf->get_val<uint64_t>("rbd_journal_order");
}
if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH,
&m_journal_splay_width) != 0) {
- m_journal_splay_width = m_cct->_conf->rbd_journal_splay_width;
+ m_journal_splay_width = m_cct->_conf->get_val<uint64_t>("rbd_journal_splay_width");
}
if (image_options.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &m_journal_pool) != 0) {
- m_journal_pool = m_cct->_conf->rbd_journal_pool;
+ m_journal_pool = m_cct->_conf->get_val<std::string>("rbd_journal_pool");
}
if (image_options.get(RBD_IMAGE_OPTION_DATA_POOL, &m_data_pool) != 0) {
- m_data_pool = m_cct->_conf->rbd_default_data_pool;
+ m_data_pool = m_cct->_conf->get_val<std::string>("rbd_default_data_pool");
}
m_layout.object_size = 1ull << m_order;
template<typename I>
void CreateRequest<I>::validate_pool() {
- if (!m_cct->_conf->rbd_validate_pool) {
+ if (!m_cct->_conf->get_val<bool>("rbd_validate_pool")) {
create_id_object();
return;
}
namespace {
int validate_pool(IoCtx &io_ctx, CephContext *cct) {
- if (!cct->_conf->rbd_validate_pool) {
+ if (!cct->_conf->get_val<bool>("rbd_validate_pool")) {
return 0;
}
uint64_t format;
if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
- format = cct->_conf->rbd_default_format;
+ format = cct->_conf->get_val<int64_t>("rbd_default_format");
bool old_format = format == 1;
// make sure it doesn't already exist, in either format
uint64_t order = 0;
if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
- order = cct->_conf->rbd_default_order;
+ order = cct->_conf->get_val<int64_t>("rbd_default_order");
}
r = image::CreateRequest<>::validate_order(cct, order);
if (r < 0) {
ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
int r = ictx->state->open(false);
if (r < 0) {
- lderr(ictx->cct) << "error opening source image: " << cpp_strerror(r)
- << dendl;
+ lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
return r;
}
BOOST_SCOPE_EXIT((ictx)) {
}
ZTracer::Trace trace;
- if (cct->_conf->rbd_blkin_trace_all) {
+ if (src->blkin_trace_all) {
trace.init("copy", &src->trace_endpoint);
}
uint64_t left = mylen;
ZTracer::Trace trace;
- if (ictx->cct->_conf->rbd_blkin_trace_all) {
+ if (ictx->blkin_trace_all) {
trace.init("read_iterate", &ictx->trace_endpoint);
}
bool native_async) {
CephContext *cct = m_image_ctx.cct;
ZTracer::Trace trace;
- if (cct->_conf->rbd_blkin_trace_all) {
+ if (m_image_ctx.blkin_trace_all) {
trace.init("wq: read", &m_image_ctx.trace_endpoint);
trace.event("start");
}
bool native_async) {
CephContext *cct = m_image_ctx.cct;
ZTracer::Trace trace;
- if (cct->_conf->rbd_blkin_trace_all) {
+ if (m_image_ctx.blkin_trace_all) {
trace.init("wq: write", &m_image_ctx.trace_endpoint);
trace.event("init");
}
bool native_async) {
CephContext *cct = m_image_ctx.cct;
ZTracer::Trace trace;
- if (cct->_conf->rbd_blkin_trace_all) {
+ if (m_image_ctx.blkin_trace_all) {
trace.init("wq: discard", &m_image_ctx.trace_endpoint);
trace.event("init");
}
void ImageRequestWQ<I>::aio_flush(AioCompletion *c, bool native_async) {
CephContext *cct = m_image_ctx.cct;
ZTracer::Trace trace;
- if (cct->_conf->rbd_blkin_trace_all) {
+ if (m_image_ctx.blkin_trace_all) {
trace.init("wq: flush", &m_image_ctx.trace_endpoint);
trace.event("init");
}
int op_flags, bool native_async) {
CephContext *cct = m_image_ctx.cct;
ZTracer::Trace trace;
- if (cct->_conf->rbd_blkin_trace_all) {
+ if (m_image_ctx.blkin_trace_all) {
trace.init("wq: writesame", &m_image_ctx.trace_endpoint);
trace.event("init");
}
int op_flags, bool native_async) {
CephContext *cct = m_image_ctx.cct;
ZTracer::Trace trace;
- if (cct->_conf->rbd_blkin_trace_all) {
+ if (m_image_ctx.blkin_trace_all) {
trace.init("wq: compare_and_write", &m_image_ctx.trace_endpoint);
trace.event("init");
}
AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
this, m_image_ctx, context_factory, this->create_callback_context(),
&m_prog_ctx, 0, num_objects);
- throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
+ throttle->start_ops(m_image_ctx.concurrent_management_ops);
}
template <typename I>
#include "common/dout.h"
#include "common/HeartbeatMap.h"
+
#include "include/stringify.h"
#include "include/util.h"
}
// Report if we have significantly exceeded our cache size limit
- if (mds->mdcache->get_cache_size() >
- g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+ if (mds->mdcache->cache_overfull()) {
std::ostringstream oss;
- oss << "Too many inodes in cache (" << mds->mdcache->get_cache_size()
- << "/" << g_conf->mds_cache_size << "), "
+ oss << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
+ << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
<< mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
<< mds->mdcache->get_num_strays() << " stray files";
default: ceph_abort(); return "";
}
}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDentry, co_dentry, mds_co);
// dentry
class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry> {
public:
+ MEMPOOL_CLASS_HELPERS();
friend class CDir;
struct linkage_t {
::encode(version, bl);
::encode(projected_version, bl);
::encode(lock, bl);
- ::encode(replica_map, bl);
+ ::encode(get_replicas(), bl);
get(PIN_TEMPEXPORTING);
}
void finish_export() {
::decode(version, blp);
::decode(projected_version, blp);
::decode(lock, blp);
- ::decode(replica_map, blp);
+ ::decode(get_replicas(), blp);
// twiddle
state &= MASK_STATE_IMPORT_KEPT;
state_set(CDentry::STATE_AUTH);
if (nstate & STATE_DIRTY)
_mark_dirty(ls);
- if (!replica_map.empty())
+ if (is_replicated())
get(PIN_REPLICATED);
replica_nonce = 0;
}
void CDir::init_fragment_pins()
{
- if (!replica_map.empty())
+ if (is_replicated())
get(PIN_REPLICATED);
if (state_test(STATE_DIRTY))
get(PIN_DIRTY);
for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
CDir *f = new CDir(inode, *p, cache, is_auth());
f->state_set(state & (MASK_STATE_FRAGMENT_KEPT | STATE_COMPLETE));
- f->replica_map = replica_map;
+ f->get_replicas() = get_replicas();
f->dir_auth = dir_auth;
f->init_fragment_pins();
f->set_version(get_version());
steal_dentry(dir->items.begin()->second);
// merge replica map
- for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
- p != dir->replicas_end();
- ++p) {
- unsigned cur = replica_map[p->first];
- if (p->second > cur)
- replica_map[p->first] = p->second;
+ for (const auto &p : dir->get_replicas()) {
+ unsigned cur = get_replicas()[p.first];
+ if (p.second > cur)
+ get_replicas()[p.first] = p.second;
}
// merge version
::encode(pop_auth_subtree, bl);
::encode(dir_rep_by, bl);
- ::encode(replica_map, bl);
+ ::encode(get_replicas(), bl);
get(PIN_TEMPEXPORTING);
}
pop_auth_subtree_nested.add(now, cache->decayrate, pop_auth_subtree);
::decode(dir_rep_by, blp);
- ::decode(replica_map, blp);
- if (!replica_map.empty()) get(PIN_REPLICATED);
+ ::decode(get_replicas(), blp);
+ if (is_replicated()) get(PIN_REPLICATED);
replica_nonce = 0; // no longer defined
return effective_size > fast_limit;
}
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
friend ostream& operator<<(ostream& out, const class CDir& dir);
public:
+ MEMPOOL_CLASS_HELPERS();
// -- pins --
static const int PIN_DNWAITER = 1;
static const int PIN_INOWAITER = 2;
::encode(pop, bl);
- ::encode(replica_map, bl);
+ ::encode(get_replicas(), bl);
// include scatterlock info for any bounding CDirs
bufferlist bounding;
::decode(pop, ceph_clock_now(), p);
- ::decode(replica_map, p);
- if (!replica_map.empty())
+ ::decode(get_replicas(), p);
+ if (is_replicated())
get(PIN_REPLICATED);
replica_nonce = 0;
return true;
}
}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);
// cached inode wrapper
class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> {
public:
+ MEMPOOL_CLASS_HELPERS();
// -- pins --
static const int PIN_DIRFRAG = -1;
static const int PIN_CAPS = 2; // client caps
if (!stuck_failed.empty()) {
health_check_t& fscheck = checks->get_or_add(
"FS_WITH_FAILED_MDS", HEALTH_WARN,
- "%num% filesystem%plurals% %isorare% have a failed mds daemon");
+ "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
ostringstream ss;
ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
<< " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
void Locker::send_lock_message(SimpleLock *lock, int msg)
{
- for (compact_map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
- it != lock->get_parent()->replicas_end();
- ++it) {
+ for (const auto &it : lock->get_parent()->get_replicas()) {
if (mds->is_cluster_degraded() &&
- mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN)
+ mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN)
continue;
MLock *m = new MLock(lock, msg, mds->get_nodeid());
- mds->send_message_mds(m, it->first);
+ mds->send_message_mds(m, it.first);
}
}
void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data)
{
- for (compact_map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
- it != lock->get_parent()->replicas_end();
- ++it) {
+ for (const auto &it : lock->get_parent()->get_replicas()) {
if (mds->is_cluster_degraded() &&
- mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN)
+ mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN)
continue;
MLock *m = new MLock(lock, msg, mds->get_nodeid());
m->set_data(data);
- mds->send_message_mds(m, it->first);
+ mds->send_message_mds(m, it.first);
}
}
#include "include/ceph_fs.h"
#include "include/filepath.h"
+#include "include/util.h"
#include "msg/Message.h"
#include "msg/Messenger.h"
+#include "common/MemoryModel.h"
#include "common/errno.h"
-#include "common/safe_io.h"
#include "common/perf_counters.h"
-#include "common/MemoryModel.h"
+#include "common/safe_io.h"
+
#include "osdc/Journaler.h"
#include "osdc/Filer.h"
cap_imports_num_opening = 0;
opening_root = open = false;
- lru.lru_set_max(g_conf->mds_cache_size);
- lru.lru_set_midpoint(g_conf->mds_cache_mid);
+ lru.lru_set_midpoint(cache_mid());
- bottom_lru.lru_set_max(0);
bottom_lru.lru_set_midpoint(0);
decayrate.set_halflife(g_conf->mds_decay_halflife);
void MDCache::log_stat()
{
- mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
+ mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
mds->logger->set(l_mds_inodes, lru.lru_get_size());
mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
base_inodes.insert(in);
}
- if (CInode::count() >
- g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+ if (cache_toofull()) {
exceeded_size_limit = true;
}
}
if (to_eval && dir->get_inode()->is_auth())
to_eval->insert(dir->get_inode());
- }
- show_subtrees(15);
+ show_subtrees(15);
+ }
}
void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
msg->quota = i->quota;
mds->send_message_client_counted(msg, session->connection);
}
- for (compact_map<mds_rank_t, unsigned>::iterator it = in->replicas_begin();
- it != in->replicas_end();
- ++it) {
+ for (const auto &it : in->get_replicas()) {
MGatherCaps *msg = new MGatherCaps;
msg->ino = in->ino();
- mds->send_message_mds(msg, it->first);
+ mds->send_message_mds(msg, it.first);
}
}
p != dfs.end();
++p) {
CDir *dir = *p;
+ if (!dir->is_auth())
+ continue;
- if (dir->is_auth() &&
- dir->is_replica(from) &&
+ if (dir->is_replica(from) &&
(ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
dir->remove_replica(from);
dout(10) << " rem " << *dir << dendl;
dq.pop_front();
// dir
- for (compact_map<mds_rank_t,unsigned>::iterator r = dir->replicas_begin();
- r != dir->replicas_end();
- ++r) {
- auto it = acks.find(r->first);
+ for (auto &r : dir->get_replicas()) {
+ auto it = acks.find(r.first);
if (it == acks.end())
continue;
- it->second->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
+ it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
it->second->add_dirfrag_base(dir);
}
in = dnl->get_inode();
// dentry
- for (compact_map<mds_rank_t,unsigned>::iterator r = dn->replicas_begin();
- r != dn->replicas_end();
- ++r) {
- auto it = acks.find(r->first);
+ for (auto &r : dn->get_replicas()) {
+ auto it = acks.find(r.first);
if (it == acks.end())
continue;
it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
dnl->is_remote() ? dnl->get_remote_d_type():0,
- ++r->second,
+ ++r.second,
dn->lock.get_replica_state());
// peer missed MDentrylink message ?
- if (in && !in->is_replica(r->first))
- in->add_replica(r->first);
+ if (in && !in->is_replica(r.first))
+ in->add_replica(r.first);
}
if (!in)
continue;
- for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
- r != in->replicas_end();
- ++r) {
- auto it = acks.find(r->first);
+ for (auto &r : in->get_replicas()) {
+ auto it = acks.find(r.first);
if (it == acks.end())
continue;
it->second->add_inode_base(in, mds->mdsmap->get_up_features());
bufferlist bl;
- in->_encode_locks_state_for_rejoin(bl, r->first);
- it->second->add_inode_locks(in, ++r->second, bl);
+ in->_encode_locks_state_for_rejoin(bl, r.first);
+ it->second->add_inode_locks(in, ++r.second, bl);
}
// subdirs in this subtree?
// base inodes too
if (root && root->is_auth())
- for (compact_map<mds_rank_t,unsigned>::iterator r = root->replicas_begin();
- r != root->replicas_end();
- ++r) {
- auto it = acks.find(r->first);
+ for (auto &r : root->get_replicas()) {
+ auto it = acks.find(r.first);
if (it == acks.end())
continue;
it->second->add_inode_base(root, mds->mdsmap->get_up_features());
bufferlist bl;
- root->_encode_locks_state_for_rejoin(bl, r->first);
- it->second->add_inode_locks(root, ++r->second, bl);
+ root->_encode_locks_state_for_rejoin(bl, r.first);
+ it->second->add_inode_locks(root, ++r.second, bl);
}
if (myin)
- for (compact_map<mds_rank_t,unsigned>::iterator r = myin->replicas_begin();
- r != myin->replicas_end();
- ++r) {
- auto it = acks.find(r->first);
+ for (auto &r : myin->get_replicas()) {
+ auto it = acks.find(r.first);
if (it == acks.end())
continue;
it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
bufferlist bl;
- myin->_encode_locks_state_for_rejoin(bl, r->first);
- it->second->add_inode_locks(myin, ++r->second, bl);
+ myin->_encode_locks_state_for_rejoin(bl, r.first);
+ it->second->add_inode_locks(myin, ++r.second, bl);
}
// include inode base for any inodes whose scatterlocks may have updated
p != rejoin_potential_updated_scatterlocks.end();
++p) {
CInode *in = *p;
- for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
- r != in->replicas_end();
- ++r) {
- auto it = acks.find(r->first);
+ for (const auto &r : in->get_replicas()) {
+ auto it = acks.find(r.first);
if (it == acks.end())
continue;
it->second->add_inode_base(in, mds->mdsmap->get_up_features());
// ================================================================================
// cache trimming
-
-/*
- * note: only called while MDS is active or stopping... NOT during recovery.
- * however, we may expire a replica whose authority is recovering.
- *
- */
-bool MDCache::trim(int max, int count)
-{
- // trim LRU
- if (count > 0) {
- max = lru.lru_get_size() - count;
- if (max <= 0)
- max = 1;
- } else if (max < 0) {
- max = g_conf->mds_cache_size;
- if (max <= 0)
- return false;
- }
- dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size()
- << "/" << bottom_lru.lru_get_size() << dendl;
-
- // process delayed eval_stray()
- stray_manager.advance_delayed();
-
- map<mds_rank_t, MCacheExpire*> expiremap;
+void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
+{
bool is_standby_replay = mds->is_standby_replay();
- int unexpirable = 0;
- list<CDentry*> unexpirables;
+ std::vector<CDentry *> unexpirables;
+ uint64_t trimmed = 0;
+
+ dout(7) << "trim_lru trimming " << count
+ << " items from LRU"
+ << " size=" << lru.lru_get_size()
+ << " mid=" << lru.lru_get_top()
+ << " pintail=" << lru.lru_get_pintail()
+ << " pinned=" << lru.lru_get_num_pinned()
+ << dendl;
for (;;) {
CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
break;
if (trim_dentry(dn, expiremap)) {
unexpirables.push_back(dn);
- ++unexpirable;
+ } else {
+ trimmed++;
}
}
- for(auto dn : unexpirables)
+ for (auto &dn : unexpirables) {
bottom_lru.lru_insert_mid(dn);
+ }
unexpirables.clear();
- // trim dentries from the LRU: only enough to satisfy `max`,
- while (lru.lru_get_size() + unexpirable > (unsigned)max) {
+ // trim dentries from the LRU until count is reached
+ while (cache_toofull() || count > 0) {
CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
if (!dn) {
break;
}
if ((is_standby_replay && dn->get_linkage()->inode &&
- dn->get_linkage()->inode->item_open_file.is_on_list()) ||
- trim_dentry(dn, expiremap)) {
+ dn->get_linkage()->inode->item_open_file.is_on_list())) {
unexpirables.push_back(dn);
- ++unexpirable;
+ } else if (trim_dentry(dn, expiremap)) {
+ unexpirables.push_back(dn);
+ } else {
+ trimmed++;
}
+ count--;
}
- for(auto dn : unexpirables)
+
+ for (auto &dn : unexpirables) {
lru.lru_insert_mid(dn);
+ }
unexpirables.clear();
+ dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
+}
+
+/*
+ * note: only called while MDS is active or stopping... NOT during recovery.
+ * however, we may expire a replica whose authority is recovering.
+ *
+ * @param count is number of dentries to try to expire
+ */
+bool MDCache::trim(uint64_t count)
+{
+ uint64_t used = cache_size();
+ uint64_t limit = cache_limit_memory();
+ map<mds_rank_t, MCacheExpire*> expiremap;
+
+ dout(7) << "trim bytes_used=" << bytes2str(used)
+ << " limit=" << bytes2str(limit)
+ << " reservation=" << cache_reservation()
+ << "% count=" << count << dendl;
+
+ // process delayed eval_stray()
+ stray_manager.advance_delayed();
+
+ trim_lru(count, expiremap);
+
// trim non-auth, non-bound subtrees
- for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();) {
+ for (auto p = subtrees.begin(); p != subtrees.end();) {
CDir *dir = p->first;
++p;
CInode *diri = dir->get_inode();
if (!diri->is_auth() && !diri->is_base() &&
dir->get_num_head_items() == 0) {
if (dir->state_test(CDir::STATE_EXPORTING) ||
+ !(mds->is_active() || mds->is_stopping()) ||
dir->is_freezing() || dir->is_frozen())
continue;
}
// trim root?
- if (max == 0 && root) {
+ if (mds->is_stopping() && root) {
list<CDir*> ls;
root->get_dirfrags(ls);
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
}
// Other rank's base inodes (when I'm stopping)
- if (max == 0) {
+ if (mds->is_stopping()) {
for (set<CInode*>::iterator p = base_inodes.begin();
p != base_inodes.end(); ++p) {
if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
if (nonce == dir->get_replica_nonce(from)) {
// remove from our cached_by
dout(7) << " dir expire on " << *dir << " from mds." << from
- << " replicas was " << dir->replica_map << dendl;
+ << " replicas was " << dir->get_replicas() << dendl;
dir->remove_replica(from);
}
else {
// check client caps
assert(CInode::count() == inode_map.size());
- float caps_per_inode = 0.0;
+ double caps_per_inode = 0.0;
if (CInode::count())
- caps_per_inode = (float)Capability::count() / (float)CInode::count();
+ caps_per_inode = (double)Capability::count() / (double)CInode::count();
dout(2) << "check_memory_usage"
<< " total " << last.get_total()
mds->mlogger->set(l_mdm_rss, last.get_rss());
mds->mlogger->set(l_mdm_heap, last.get_heap());
- if (num_inodes_with_caps > g_conf->mds_cache_size) {
- float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
- if (ratio < 1.0) {
- last_recall_state = ceph_clock_now();
- mds->server->recall_client_state(ratio);
- }
+ if (cache_toofull()) {
+ last_recall_state = ceph_clock_now();
+ mds->server->recall_client_state();
}
// If the cache size had exceeded its limit, but we're back in bounds
// now, free any unused pool memory so that our memory usage isn't
// permanently bloated.
- if (exceeded_size_limit
- && CInode::count() <=
- g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+ if (exceeded_size_limit && !cache_toofull()) {
// Only do this once we are back in bounds: otherwise the releases would
// slow down whatever process caused us to exceed bounds to begin with
if (ceph_using_tcmalloc()) {
}
// trim cache
- trim(0);
+ trim(UINT64_MAX);
dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
// SUBTREES
assert(!migrator->is_exporting());
assert(!migrator->is_importing());
- if ((myin && myin->is_auth_pinned()) ||
- (mydir && mydir->is_auth_pinned())) {
- dout(7) << "still have auth pinned objects" << dendl;
- return false;
- }
-
// flush what we can from the log
mds->mdlog->trim(0);
if (mds->mdlog->get_num_segments() > 1) {
return false;
}
+ if ((myin && myin->is_auth_pinned()) ||
+ (mydir && mydir->is_auth_pinned())) {
+ dout(7) << "still have auth pinned objects" << dendl;
+ return false;
+ }
+
// (only do this once!)
if (!mds->mdlog->is_capped()) {
dout(7) << "capping the log" << dendl;
if (bcast) {
mds->get_mds_map()->get_active_mds_set(who);
} else {
- for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
- p != dir->replicas_end();
- ++p)
- who.insert(p->first);
+ for (const auto &p : dir->get_replicas()) {
+ who.insert(p.first);
+ }
}
dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
dout(7) << "send_dentry_link " << *dn << dendl;
CDir *subtree = get_subtree_root(dn->get_dir());
- for (compact_map<mds_rank_t,unsigned>::iterator p = dn->replicas_begin();
- p != dn->replicas_end();
- ++p) {
+ for (const auto &p : dn->get_replicas()) {
// don't tell (rename) witnesses; they already know
- if (mdr.get() && mdr->more()->witnessed.count(p->first))
+ if (mdr.get() && mdr->more()->witnessed.count(p.first))
continue;
- if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
- (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
- rejoin_gather.count(p->first)))
+ if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
+ (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
+ rejoin_gather.count(p.first)))
continue;
CDentry::linkage_t *dnl = dn->get_linkage();
MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
dn->name, dnl->is_primary());
if (dnl->is_primary()) {
dout(10) << " primary " << *dnl->get_inode() << dendl;
- replicate_inode(dnl->get_inode(), p->first, m->bl,
+ replicate_inode(dnl->get_inode(), p.first, m->bl,
mds->mdsmap->get_up_features());
} else if (dnl->is_remote()) {
inodeno_t ino = dnl->get_remote_ino();
::encode(d_type, m->bl);
} else
ceph_abort(); // aie, bad caller!
- mds->send_message_mds(m, p->first);
+ mds->send_message_mds(m, p.first);
}
}
// tell peers
CDir *first = *info.resultfrags.begin();
- for (compact_map<mds_rank_t,unsigned>::iterator p = first->replicas_begin();
- p != first->replicas_end();
- ++p) {
- if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
- (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
- rejoin_gather.count(p->first)))
+ for (const auto &p : first->get_replicas()) {
+ if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
+ (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
+ rejoin_gather.count(p.first)))
continue;
MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
for (list<CDir*>::iterator q = info.resultfrags.begin();
q != info.resultfrags.end();
++q)
- replicate_dir(*q, p->first, notify->basebl);
+ replicate_dir(*q, p.first, notify->basebl);
- mds->send_message_mds(notify, p->first);
+ mds->send_message_mds(notify, p.first);
}
// journal commit
}
}
+// Emit a summary of MDS cache memory usage for the "cache status"
+// admin socket command: dumps the mds_co mempool allocation stats
+// into the supplied Formatter.  Always returns 0.
+int MDCache::cache_status(Formatter *f)
+{
+  f->open_object_section("cache");
+
+  // per-pool allocation statistics for the MDS cache mempool
+  f->open_object_section("pool");
+  mempool::get_pool(mempool::mds_co::id).dump(f);
+  f->close_section();
+
+  f->close_section();
+  return 0;
+}
+
int MDCache::dump_cache(std::string const &file_name)
{
return dump_cache(file_name.c_str(), NULL);
bool exceeded_size_limit;
public:
+  // Configured inode-count limit (mds_cache_size). 0 (the default)
+  // disables the inode limit; only the memory limit applies then.
+  static uint64_t cache_limit_inodes(void) {
+    return g_conf->get_val<int64_t>("mds_cache_size");
+  }
+  // Configured memory limit in bytes for the MDS cache
+  // (mds_cache_memory_limit).
+  static uint64_t cache_limit_memory(void) {
+    return g_conf->get_val<uint64_t>("mds_cache_memory_limit");
+  }
+  // Fraction of the cache limit kept free as headroom
+  // (mds_cache_reservation); used via (1.0 - reservation) below.
+  static double cache_reservation(void) {
+    return g_conf->get_val<double>("mds_cache_reservation");
+  }
+  // LRU midpoint setting (mds_cache_mid); consumed by the cache LRU,
+  // not used in the limit calculations here.
+  static double cache_mid(void) {
+    return g_conf->get_val<double>("mds_cache_mid");
+  }
+  // Multiplier over the configured limits beyond which the cache is
+  // considered overfull (see cache_overfull()) for health reporting.
+  static double cache_health_threshold(void) {
+    return g_conf->get_val<double>("mds_health_cache_threshold");
+  }
+  // How far cache usage exceeds the reserved headroom, as a ratio of
+  // the reserve; 0.0 when within bounds.  Takes the worse of the
+  // memory-based ratio and (when mds_cache_size > 0) the inode-count
+  // ratio.
+  // NOTE(review): assumes memory_reserve > 0, i.e. mds_cache_memory_limit
+  // is non-zero and mds_cache_reservation < 1.0 — otherwise this divides
+  // by zero.  TODO confirm config validation guarantees this.
+  double cache_toofull_ratio(void) const {
+    uint64_t inode_limit = cache_limit_inodes();
+    double inode_reserve = inode_limit*(1.0-cache_reservation());
+    double memory_reserve = cache_limit_memory()*(1.0-cache_reservation());
+    return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
+  }
+  // True when usage has eaten into the configured reservation headroom
+  // (memory, or inode count when limited); triggers trimming/recall.
+  bool cache_toofull(void) const {
+    return cache_toofull_ratio() > 0.0;
+  }
+  // Current cache memory footprint: bytes allocated in the mds_co
+  // mempool.
+  uint64_t cache_size(void) const {
+    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
+  }
+  // True when usage exceeds a limit scaled by the health threshold:
+  // inode count over its limit (only if an inode limit is set), or
+  // memory over its limit.  Used for health warnings.
+  bool cache_overfull(void) const {
+    uint64_t inode_limit = cache_limit_inodes();
+    return (inode_limit > 0 && CInode::count() > inode_limit*cache_health_threshold()) || (cache_size() > cache_limit_memory()*cache_health_threshold());
+  }
+
void advance_stray() {
stray_index = (stray_index+1)%NUM_STRAY;
}
CInode *get_root() { return root; }
CInode *get_myin() { return myin; }
- // cache
- void set_cache_size(size_t max) { lru.lru_set_max(max); }
size_t get_cache_size() { return lru.lru_get_size(); }
// trimming
- bool trim(int max=-1, int count=-1); // trim cache
+ bool trim(uint64_t count=0);
+private:
+ void trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*>& expiremap);
bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap);
void trim_dirfrag(CDir *dir, CDir *con,
map<mds_rank_t, MCacheExpire*>& expiremap);
map<mds_rank_t,class MCacheExpire*>& expiremap);
void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap);
void trim_non_auth(); // trim out trimmable non-auth items
+public:
bool trim_non_auth_subtree(CDir *directory);
void standby_trim_segment(LogSegment *ls);
void try_trim_non_auth_subtree(CDir *dir);
int dump_cache(Formatter *f);
int dump_cache(const std::string& dump_root, int depth, Formatter *f);
+ int cache_status(Formatter *f);
+
void dump_resolve_status(Formatter *f) const;
void dump_rejoin_status(Formatter *f) const;
if (removed_segment) {
dout(20) << " calling mdcache->trim!" << dendl;
- mds->mdcache->trim(-1);
+ mds->mdcache->trim();
} else {
dout(20) << " removed no segments!" << dendl;
}
uint64_t MDSCacheObject::last_wait_seq = 0;
void MDSCacheObject::finish_waiting(uint64_t mask, int result) {
- list<MDSInternalContextBase*> finished;
+ std::list<MDSInternalContextBase*> finished;
take_waiting(mask, finished);
finish_contexts(g_ceph_context, finished, result);
}
f->open_object_section("auth_state");
{
f->open_object_section("replicas");
- const compact_map<mds_rank_t,unsigned>& replicas = get_replicas();
- for (compact_map<mds_rank_t,unsigned>::const_iterator i = replicas.begin();
- i != replicas.end(); ++i) {
+ for (const auto &it : get_replicas()) {
std::ostringstream rank_str;
- rank_str << i->first;
- f->dump_int(rank_str.str().c_str(), i->second);
+ rank_str << it.first;
+ f->dump_int(rank_str.str().c_str(), it.second);
}
f->close_section();
}
#ifndef CEPH_MDSCACHEOBJECT_H
#define CEPH_MDSCACHEOBJECT_H
-#include <set>
-#include <map>
#include <ostream>
-using namespace std;
-
#include "common/config.h"
+
+#include "include/Context.h"
+#include "include/alloc_ptr.h"
#include "include/assert.h"
+#include "include/mempool.h"
#include "include/types.h"
#include "include/xlist.h"
-#include "include/Context.h"
+
#include "mdstypes.h"
#define MDS_REF_SET // define me for improved debug output, sanity checking
protected:
__s32 ref; // reference count
#ifdef MDS_REF_SET
- std::map<int,int> ref_map;
+ mempool::mds_co::map<int,int> ref_map;
#endif
public:
int auth_pins;
int nested_auth_pins;
#ifdef MDS_AUTHPIN_SET
- multiset<void*> auth_pin_set;
+ mempool::mds_co::multiset<void*> auth_pin_set;
#endif
public:
// replication (across mds cluster)
protected:
unsigned replica_nonce; // [replica] defined on replica
- compact_map<mds_rank_t,unsigned> replica_map; // [auth] mds -> nonce
+ typedef compact_map<mds_rank_t,unsigned> replica_map_type;
+ replica_map_type replica_map; // [auth] mds -> nonce
public:
- bool is_replicated() const { return !replica_map.empty(); }
- bool is_replica(mds_rank_t mds) const { return replica_map.count(mds); }
- int num_replicas() const { return replica_map.size(); }
+ bool is_replicated() const { return !get_replicas().empty(); }
+ bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); }
+ int num_replicas() const { return get_replicas().size(); }
unsigned add_replica(mds_rank_t mds) {
- if (replica_map.count(mds))
- return ++replica_map[mds]; // inc nonce
- if (replica_map.empty())
+ if (get_replicas().count(mds))
+ return ++get_replicas()[mds]; // inc nonce
+ if (get_replicas().empty())
get(PIN_REPLICATED);
- return replica_map[mds] = 1;
+ return get_replicas()[mds] = 1;
}
void add_replica(mds_rank_t mds, unsigned nonce) {
- if (replica_map.empty())
+ if (get_replicas().empty())
get(PIN_REPLICATED);
- replica_map[mds] = nonce;
+ get_replicas()[mds] = nonce;
}
unsigned get_replica_nonce(mds_rank_t mds) {
- assert(replica_map.count(mds));
- return replica_map[mds];
+ assert(get_replicas().count(mds));
+ return get_replicas()[mds];
}
void remove_replica(mds_rank_t mds) {
- assert(replica_map.count(mds));
- replica_map.erase(mds);
- if (replica_map.empty())
+ assert(get_replicas().count(mds));
+ get_replicas().erase(mds);
+ if (get_replicas().empty()) {
put(PIN_REPLICATED);
+ }
}
void clear_replica_map() {
- if (!replica_map.empty())
+ if (!get_replicas().empty())
put(PIN_REPLICATED);
replica_map.clear();
}
- compact_map<mds_rank_t,unsigned>::iterator replicas_begin() { return replica_map.begin(); }
- compact_map<mds_rank_t,unsigned>::iterator replicas_end() { return replica_map.end(); }
- const compact_map<mds_rank_t,unsigned>& get_replicas() const { return replica_map; }
+ replica_map_type& get_replicas() { return replica_map; }
+ const replica_map_type& get_replicas() const { return replica_map; }
void list_replicas(std::set<mds_rank_t>& ls) const {
- for (compact_map<mds_rank_t,unsigned>::const_iterator p = replica_map.begin();
- p != replica_map.end();
- ++p)
- ls.insert(p->first);
+ for (const auto &p : get_replicas()) {
+ ls.insert(p.first);
+ }
}
unsigned get_replica_nonce() const { return replica_nonce; }
// ---------------------------------------------
// waiting
- protected:
- compact_multimap<uint64_t, pair<uint64_t, MDSInternalContextBase*> > waiting;
+ private:
+ alloc_ptr<mempool::mds_co::multimap<uint64_t, std::pair<uint64_t, MDSInternalContextBase*>>> waiting;
static uint64_t last_wait_seq;
public:
if (!min) {
min = mask;
while (min & (min-1)) // if more than one bit is set
- min &= min-1; // clear LSB
+ min &= min-1; // clear LSB
}
- for (auto p = waiting.lower_bound(min);
- p != waiting.end();
- ++p) {
- if (p->first & mask) return true;
- if (p->first > mask) return false;
+ if (waiting) {
+ for (auto p = waiting->lower_bound(min); p != waiting->end(); ++p) {
+ if (p->first & mask) return true;
+ if (p->first > mask) return false;
+ }
}
return false;
}
virtual void add_waiter(uint64_t mask, MDSInternalContextBase *c) {
- if (waiting.empty())
+ if (waiting->empty())
get(PIN_WAITER);
uint64_t seq = 0;
seq = ++last_wait_seq;
mask &= ~WAIT_ORDERED;
}
- waiting.insert(pair<uint64_t, pair<uint64_t, MDSInternalContextBase*> >(
+ waiting->insert(pair<uint64_t, pair<uint64_t, MDSInternalContextBase*> >(
mask,
pair<uint64_t, MDSInternalContextBase*>(seq, c)));
// pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this))
// << dendl;
}
- virtual void take_waiting(uint64_t mask, list<MDSInternalContextBase*>& ls) {
- if (waiting.empty()) return;
+ virtual void take_waiting(uint64_t mask, std::list<MDSInternalContextBase*>& ls) {
+ if (!waiting || waiting->empty()) return;
// process ordered waiters in the same order that they were added.
std::map<uint64_t, MDSInternalContextBase*> ordered_waiters;
- for (auto it = waiting.begin();
- it != waiting.end(); ) {
+ for (auto it = waiting->begin(); it != waiting->end(); ) {
if (it->first & mask) {
-
- if (it->second.first > 0)
- ordered_waiters.insert(it->second);
- else
- ls.push_back(it->second.second);
+ if (it->second.first > 0) {
+ ordered_waiters.insert(it->second);
+ } else {
+ ls.push_back(it->second.second);
+ }
// pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this))
// << "take_waiting mask " << hex << mask << dec << " took " << it->second
// << " tag " << hex << it->first << dec
// << " on " << *this
// << dendl;
- waiting.erase(it++);
+ waiting->erase(it++);
} else {
// pdout(10,g_conf->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second
// << " tag " << hex << it->first << dec
// << " on " << *this
// << dendl;
- ++it;
+ ++it;
}
}
- for (auto it = ordered_waiters.begin();
- it != ordered_waiters.end();
- ++it) {
+ for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) {
ls.push_back(it->second);
}
- if (waiting.empty())
+ if (waiting->empty()) {
put(PIN_WAITER);
+ waiting.release();
+ }
}
void finish_waiting(uint64_t mask, int result = 0);
asok_hook,
"dump metadata cache (optionally to a file)");
assert(r == 0);
+ r = admin_socket->register_command("cache status",
+ "cache status",
+ asok_hook,
+ "show cache status");
+ assert(r == 0);
r = admin_socket->register_command("dump tree",
"dump tree "
"name=root,type=CephString,req=true "
admin_socket->unregister_command("flush_path");
admin_socket->unregister_command("export dir");
admin_socket->unregister_command("dump cache");
+ admin_socket->unregister_command("cache status");
admin_socket->unregister_command("dump tree");
admin_socket->unregister_command("session evict");
admin_socket->unregister_command("osdmap barrier");
if (r != 0) {
ss << "Failed to dump cache: " << cpp_strerror(r);
+ f->reset();
+ }
+ } else if (command == "cache status") {
+ Mutex::Locker l(mds_lock);
+ int r = mdcache->cache_status(f);
+ if (r != 0) {
+ ss << "Failed to get cache status: " << cpp_strerror(r);
}
} else if (command == "dump tree") {
string root;
int r = mdcache->dump_cache(root, depth, f);
if (r != 0) {
ss << "Failed to dump tree: " << cpp_strerror(r);
+ f->reset();
}
}
} else if (command == "force_readonly") {
assert(dir->is_auth());
assert(dest != mds->get_nodeid());
+ if (!(mds->is_active() || mds->is_stopping())) {
+ dout(7) << "i'm not active, no exports for now" << dendl;
+ return;
+ }
if (mds->mdcache->is_readonly()) {
dout(7) << "read-only FS, no exports for now" << dendl;
return;
MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);
// include list of bystanders
- for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
- p != dir->replicas_end();
- ++p) {
- if (p->first != it->second.peer) {
- dout(10) << "bystander mds." << p->first << dendl;
- prep->add_bystander(p->first);
+ for (const auto &p : dir->get_replicas()) {
+ if (p.first != it->second.peer) {
+ dout(10) << "bystander mds." << p.first << dendl;
+ prep->add_bystander(p.first);
}
}
it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
assert(it->second.notify_ack_waiting.empty());
- for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
- p != dir->replicas_end();
- ++p) {
- if (p->first == it->second.peer) continue;
+ for (const auto &p : dir->get_replicas()) {
+ if (p.first == it->second.peer) continue;
if (mds->is_cluster_degraded() &&
- !mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first))
+ !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
continue; // only if active
- it->second.warning_ack_waiting.insert(p->first);
- it->second.notify_ack_waiting.insert(p->first); // we'll eventually get a notifyack, too!
+ it->second.warning_ack_waiting.insert(p.first);
+ it->second.notify_ack_waiting.insert(p.first); // we'll eventually get a notifyack, too!
MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
mds_authority_t(mds->get_nodeid(),it->second.peer));
for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
notify->get_bounds().push_back((*q)->dirfrag());
- mds->send_message_mds(notify, p->first);
+ mds->send_message_mds(notify, p.first);
}
cache->show_subtrees();
audit();
- cache->trim(-1, num_dentries); // try trimming exported dentries
+ cache->trim(num_dentries); // try trimming exported dentries
// send pending import_maps?
mds->mdcache->maybe_send_pending_resolves();
// log our failure
mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
- cache->trim(-1, num_dentries); // try trimming dentries
+ cache->trim(num_dentries); // try trimming dentries
// notify bystanders; wait in aborting state
import_notify_abort(dir, bounds);
assert(in->is_auth());
// FIXME
- if (in->is_frozen())
+ if (!in->can_auth_pin())
return;
+ in->auth_pin(this);
C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
this, in, mds_rank_t(ex->get_source().num()));
// clients will release caps from the exporter when they receive the cap import message.
finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+ in->auth_unpin(this);
}
* to trim some caps, and consequently unpin some inodes in the MDCache so
* that it can trim too.
*/
-void Server::recall_client_state(float ratio)
+void Server::recall_client_state(void)
{
- int max_caps_per_client = (int)(g_conf->mds_cache_size * .8);
- int min_caps_per_client = 100;
+ /* try to recall at least 80% of all caps */
+ uint64_t max_caps_per_client = (Capability::count() * .8);
+ uint64_t min_caps_per_client = 100;
+ /* unless this ratio is smaller: */
+ /* ratio: determine the amount of caps to recall from each client. Use
+ * percentage full over the cache reservation. Cap the ratio at 80% of client
+ * caps. */
+ double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());
dout(10) << "recall_client_state " << ratio
<< ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
- for (set<Session*>::const_iterator p = sessions.begin();
- p != sessions.end();
- ++p) {
- Session *session = *p;
+ for (auto &session : sessions) {
if (!session->is_open() ||
!session->info.inst.name.is_client())
continue;
<< dendl;
if (session->caps.size() > min_caps_per_client) {
- int newlim = MIN((int)(session->caps.size() * ratio), max_caps_per_client);
+ uint64_t newlim = MIN((session->caps.size() * ratio), max_caps_per_client);
if (session->caps.size() > newlim) {
MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
m->head.max_caps = newlim;
return;
}
- bool need_auth = !file_mode_is_readonly(cmode) || (flags & CEPH_O_TRUNC);
+ bool need_auth = !file_mode_is_readonly(cmode) ||
+ (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
dout(7) << "read-only FS" << dendl;
bufferlist dnbl;
__u32 numfiles = 0;
bool start = !offset_hash && offset_str.empty();
- bool end = (dir->begin() == dir->end());
// skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
- for (CDir::map_t::iterator it = start ? dir->begin() : dir->lower_bound(skip_key);
- !end && numfiles < max;
- end = (it == dir->end())) {
+ auto it = start ? dir->begin() : dir->lower_bound(skip_key);
+ bool end = (it == dir->end());
+ for (; !end && numfiles < max; end = (it == dir->end())) {
CDentry *dn = it->second;
++it;
if (mask & CEPH_SETATTR_MODE)
pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
- S_ISREG(pi->mode)) {
- pi->mode &= ~S_ISUID;
- if ((pi->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
- pi->mode &= ~S_ISGID;
+ S_ISREG(pi->mode) &&
+ (pi->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
+ pi->mode &= ~(S_ISUID|S_ISGID);
}
if (mask & CEPH_SETATTR_MTIME)
dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
dout(7) << "target is " << *targeti << dendl;
if (targeti->is_dir()) {
- dout(7) << "target is a dir, failing..." << dendl;
- respond_to_request(mdr, -EINVAL);
- return;
+ // if srcdn is replica, need to make sure its linkage is correct
+ vector<CDentry*>& trace = mdr->dn[1];
+ if (trace.empty() ||
+ trace.back()->is_auth() ||
+ trace.back()->lock.can_read(mdr->get_client())) {
+ dout(7) << "target is a dir, failing..." << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
}
xlocks.insert(&targeti->linklock);
oldin = mdcache->get_dentry_inode(destdn, mdr, true);
if (!oldin) return;
dout(10) << " oldin " << *oldin << dendl;
-
- // mv /some/thing /to/some/existing_other_thing
- if (oldin->is_dir() && !srci->is_dir()) {
- respond_to_request(mdr, -EISDIR);
- return;
- }
- if (!oldin->is_dir() && srci->is_dir()) {
- respond_to_request(mdr, -ENOTDIR);
- return;
- }
// non-empty dir? do trivial fast unlocked check, do another check later with read locks
if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
respond_to_request(mdr, -ENOTEMPTY);
return;
}
- if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
- respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
- return;
+
+ // if srcdn is replica, need to make sure its linkage is correct
+ if (srcdn->is_auth() ||
+ srcdn->lock.can_read(mdr->get_client()) ||
+ (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
+ // mv /some/thing /to/some/existing_other_thing
+ if (oldin->is_dir() && !srci->is_dir()) {
+ respond_to_request(mdr, -EISDIR);
+ return;
+ }
+ if (!oldin->is_dir() && srci->is_dir()) {
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+ if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
+ respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
+ return;
+ }
}
}
void reconnect_tick();
void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
- void recall_client_state(float ratio);
+ void recall_client_state(void);
void force_clients_readonly();
// -- requests --
* in order to generate health metrics if the session doesn't see
* a commensurate number of calls to ::notify_cap_release
*/
-void Session::notify_recall_sent(const int new_limit)
+void Session::notify_recall_sent(const size_t new_limit)
{
if (recalled_at.is_zero()) {
// Entering recall phase, set up counters so we can later
interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
void notify_cap_release(size_t n_caps);
- void notify_recall_sent(const int new_limit);
+ void notify_recall_sent(const size_t new_limit);
void clear_recalled_at();
inodeno_t next_ino() const {
}
void init_gather() {
-    for (compact_map<mds_rank_t,unsigned>::iterator p = parent->replicas_begin();
-	 p != parent->replicas_end();
-	 ++p)
-      more()->gather_set.insert(p->first);
+    // Seed the gather set with every rank holding a replica of parent.
+    // Bind by const reference: every other replica loop converted in
+    // this patch uses `const auto &`; `const auto p` would copy the
+    // map's value_type each iteration for no benefit.
+    for (const auto &p : parent->get_replicas()) {
+      more()->gather_set.insert(p.first);
+    }
  }
bool is_gathering() const {
return have_more() && !more()->gather_set.empty();
#define CEPH_MOSDPGRECOVERYDELETE_H
#include "MOSDFastDispatchOp.h"
+#include "include/ceph_features.h"
/*
* instruct non-primary to remove some objects during recovery
struct MOSDPGRecoveryDelete : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
pg_shard_t from;
::encode(from, payload);
::encode(pgid, payload);
::encode(map_epoch, payload);
- ::encode(min_epoch, payload);
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ ::encode(min_epoch, payload);
+ }
::encode(cost, payload);
::encode(objects, payload);
}
::decode(from, p);
::decode(pgid, p);
::decode(map_epoch, p);
- ::decode(min_epoch, p);
+ if (header.version == 1 &&
+ !HAVE_FEATURE(get_connection()->get_features(), SERVER_LUMINOUS)) {
+ min_epoch = map_epoch;
+ } else {
+ ::decode(min_epoch, p);
+ }
::decode(cost, p);
::decode(objects, p);
}
#define MOSDRECOVERYDELETEREPLY_H
#include "MOSDFastDispatchOp.h"
+#include "include/ceph_features.h"
struct MOSDPGRecoveryDeleteReply : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
pg_shard_t from;
bufferlist::iterator p = payload.begin();
::decode(pgid.pgid, p);
::decode(map_epoch, p);
- ::decode(min_epoch, p);
+ if (header.version == 1 &&
+ !HAVE_FEATURE(get_connection()->get_features(), SERVER_LUMINOUS)) {
+ min_epoch = map_epoch;
+ } else {
+ ::decode(min_epoch, p);
+ }
::decode(objects, p);
::decode(pgid.shard, p);
::decode(from, p);
void encode_payload(uint64_t features) override {
::encode(pgid.pgid, payload);
::encode(map_epoch, payload);
- ::encode(min_epoch, payload);
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ ::encode(min_epoch, payload);
+ }
::encode(objects, payload);
::encode(pgid.shard, payload);
::encode(from, payload);
void set_status(int status, const char* status_name) override {}
void output_header() override {};
void output_footer() override {};
-
+ void enable_line_break() override {};
void open_array_section(const char *name) override;
void open_object_section(const char *name) override;
dout(10) << "MonCommandCompletion::finish()" << dendl;
{
// Scoped so the Gil is released before calling notify_all()
- Gil gil(pThreadState);
+ // Create new thread state because this is called via the MonClient
+ // Finisher, not the PyModules finisher.
+ Gil gil(pThreadState, true);
auto set_fn = PyObject_GetAttrString(python_completion, "complete");
assert(set_fn != nullptr);
auto last = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool + 1});
pgs.erase(first, last);
created_pools.erase(removed_pool);
+ queue.erase(removed_pool);
return total - pgs.size();
}
void encode(bufferlist& bl) const {
p.second.summary,
boost::regex("%isorare%"),
p.second.detail.size() > 1 ? "are" : "is");
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%hasorhave%"),
+ p.second.detail.size() > 1 ? "have" : "has");
}
encode_health(new_checks, t);
}
if (state == MDSMap::STATE_STOPPED) {
const auto fscid = pending_fsmap.mds_roles.at(gid);
auto fs = pending_fsmap.get_filesystem(fscid);
+
mon->clog->info() << info.human_name() << " finished "
<< "deactivating rank " << info.rank << " in filesystem "
<< fs->mds_map.fs_name << " (now has "
- << fs->mds_map.get_num_in_mds() << " ranks)";
+ << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
auto erased = pending_fsmap.stop(gid);
erased.push_back(gid);
mon->clog->info() << new_info.human_name() << " assigned to "
"filesystem " << fs->mds_map.fs_name << " as rank "
- << mds << " (now has " << fs->mds_map.get_num_in_mds()
+ << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
<< " ranks)";
pending_fsmap.promote(newgid, fs, mds);
do_propose = true;
} else if (pending_map.active_gid == 0) {
// There is no currently active daemon, select this one.
if (pending_map.standbys.count(m->get_gid())) {
- drop_standby(m->get_gid());
+ drop_standby(m->get_gid(), false);
}
dout(4) << "selecting new active " << m->get_gid()
<< " " << m->get_name()
pending_map.available = false;
pending_map.active_addr = entity_addr_t();
- drop_standby(replacement_gid);
+ drop_standby(replacement_gid, false);
+
return true;
} else {
return false;
cancel_timer();
}
-void MgrMonitor::drop_standby(uint64_t gid)
+void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
{
- pending_metadata_rm.insert(pending_map.standbys[gid].name);
- pending_metadata.erase(pending_map.standbys[gid].name);
+ if (drop_meta) {
+ pending_metadata_rm.insert(pending_map.standbys[gid].name);
+ pending_metadata.erase(pending_map.standbys[gid].name);
+ }
pending_map.standbys.erase(gid);
if (last_beacon.count(gid) > 0) {
last_beacon.erase(gid);
*/
bool promote_standby();
void drop_active();
- void drop_standby(uint64_t gid);
+
+ /**
+ * Remove this gid from the list of standbys. By default,
+ * also remove metadata (i.e. forget the daemon entirely).
+ *
+ * Set `drop_meta` to false if you would like to keep
+ * the daemon's metadata, for example if you're dropping
+ * it as a standby before reinstating it as the active daemon.
+ */
+ void drop_standby(uint64_t gid, bool drop_meta=true);
Context *digest_event = nullptr;
void cancel_timer();
void encode_pending(MonitorDBStore::TransactionRef t) override;
version_t get_trim_to() override;
+ bool definitely_converted_snapsets() const {
+ return digest.definitely_converted_snapsets();
+ }
+
bool preprocess_query(MonOpRequestRef op) override;
bool prepare_update(MonOpRequestRef op) override;
"name=key,type=CephString",
"removes application <app> metadata key <key> on pool <poolname>",
"osd", "rw", "cli,rest")
+COMMAND("osd pool application get " \
+ "name=pool,type=CephPoolname,req=false " \
+ "name=app,type=CephString,req=false " \
+ "name=key,type=CephString,req=false",
+ "get value of key <key> of application <app> on pool <poolname>",
+ "osd", "r", "cli,rest")
COMMAND("osd utilization",
"get basic pg distribution stats",
"osd", "r", "cli,rest")
MonOpRequest(Message *req, OpTracker *tracker) :
TrackedOp(tracker,
req->get_recv_stamp().is_zero() ?
- req->get_recv_stamp() : ceph_clock_now()),
+ ceph_clock_now() : req->get_recv_stamp()),
request(req),
session(NULL),
con(NULL),
compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2);
compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3);
compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN);
+ compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS);
return compat;
}
cancel_probe_timeout();
timecheck_finish();
health_events_cleanup();
+ health_check_log_times.clear();
scrub_event_cancel();
leader_since = utime_t();
assert(HAVE_FEATURE(quorum_con_features, SERVER_KRAKEN));
new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN);
}
+ if (monmap_features.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) {
+ assert(ceph::features::mon::get_persistent().contains_all(
+ ceph::features::mon::FEATURE_LUMINOUS));
+ // this feature should only ever be set if the quorum supports it.
+ assert(HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS));
+ new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS);
+ }
dout(5) << __func__ << dendl;
_apply_compatset_features(new_features);
if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_KRAKEN)) {
required_features |= CEPH_FEATUREMASK_SERVER_KRAKEN;
}
+ if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS)) {
+ required_features |= CEPH_FEATUREMASK_SERVER_LUMINOUS;
+ }
// monmap
if (monmap->get_required_features().contains_all(
summary == health_status_cache.summary &&
level == health_status_cache.overall)
return;
- if (level == HEALTH_OK)
- clog->info() << "overall " << summary;
- else if (level == HEALTH_WARN)
- clog->warn() << "overall " << summary;
- else if (level == HEALTH_ERR)
- clog->error() << "overall " << summary;
- else
- ceph_abort();
+ clog->health(level) << "overall " << summary;
health_status_cache.summary = summary;
health_status_cache.overall = level;
} else {
if (!g_conf->mon_health_to_clog) {
return;
}
+
+ const utime_t now = ceph_clock_now();
+
// FIXME: log atomically as part of @t instead of using clog.
dout(10) << __func__ << " updated " << updated.checks.size()
<< " previous " << previous.checks.size()
<< dendl;
+ const auto min_log_period = g_conf->get_val<int64_t>(
+ "mon_health_log_update_period");
for (auto& p : updated.checks) {
auto q = previous.checks.find(p.first);
+ bool logged = false;
if (q == previous.checks.end()) {
// new
ostringstream ss;
ss << "Health check failed: " << p.second.summary << " ("
<< p.first << ")";
- if (p.second.severity == HEALTH_WARN)
- clog->warn() << ss.str();
- else
- clog->error() << ss.str();
+ clog->health(p.second.severity) << ss.str();
+
+ logged = true;
} else {
if (p.second.summary != q->second.summary ||
p.second.severity != q->second.severity) {
- // summary or severity changed (ignore detail changes at this level)
- ostringstream ss;
+
+ auto status_iter = health_check_log_times.find(p.first);
+ if (status_iter != health_check_log_times.end()) {
+ if (p.second.severity == q->second.severity &&
+ now - status_iter->second.updated_at < min_log_period) {
+ // We already logged this recently and the severity is unchanged,
+ // so skip emitting an update of the summary string.
+ // We'll get an update out of tick() later if the check
+ // is still failing.
+ continue;
+ }
+ }
+
+ // summary or severity changed (ignore detail changes at this level)
+ ostringstream ss;
ss << "Health check update: " << p.second.summary << " (" << p.first << ")";
- if (p.second.severity == HEALTH_WARN)
- clog->warn() << ss.str();
- else
- clog->error() << ss.str();
+ clog->health(p.second.severity) << ss.str();
+
+ logged = true;
+ }
+ }
+ // Record the time at which we last logged, so that we can check this
+ // when considering whether/when to print update messages.
+ if (logged) {
+ auto iter = health_check_log_times.find(p.first);
+ if (iter == health_check_log_times.end()) {
+ health_check_log_times.emplace(p.first, HealthCheckLogStatus(
+ p.second.severity, p.second.summary, now));
+ } else {
+ iter->second = HealthCheckLogStatus(
+ p.second.severity, p.second.summary, now);
}
}
}
clog->info() << "Health check cleared: " << p.first << " (was: "
<< p.second.summary << ")";
}
+
+ if (health_check_log_times.count(p.first)) {
+ health_check_log_times.erase(p.first);
+ }
}
}
tagstr = tagstr.substr(0, tagstr.find_last_of(' '));
f->dump_string("tag", tagstr);
- list<string> hs;
- get_health(hs, NULL, f.get());
+ if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+ get_health_status(true, f.get(), nullptr);
+ } else {
+ list<string> health_str;
+ get_health(health_str, nullptr, f.get());
+ }
monmon()->dump_info(f.get());
osdmon()->dump_info(f.get());
boost::scoped_ptr<Formatter> f(new JSONFormatter(true));
f->open_object_section("pong");
- list<string> health_str;
- get_health(health_str, NULL, f.get());
+ if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+ get_health_status(false, f.get(), nullptr);
+ } else {
+ list<string> health_str;
+ get_health(health_str, nullptr, f.get());
+ }
+
{
stringstream ss;
get_mon_status(f.get(), ss);
ostringstream ss;
health_status_t status = timecheck_status(ss, skew_bound, latency);
- if (status == HEALTH_ERR)
- clog->error() << other << " " << ss.str();
- else if (status == HEALTH_WARN)
- clog->warn() << other << " " << ss.str();
+ clog->health(status) << other << " " << ss.str();
dout(10) << __func__ << " from " << other << " ts " << m->timestamp
<< " delta " << delta << " skew_bound " << skew_bound
{
// ok go.
dout(11) << "tick" << dendl;
+ const utime_t now = ceph_clock_now();
+ // Check if we need to emit any delayed health check updated messages
+ if (is_leader()) {
+ const auto min_period = g_conf->get_val<int64_t>(
+ "mon_health_log_update_period");
+ for (auto& svc : paxos_service) {
+ auto health = svc->get_health_checks();
+
+ for (const auto &i : health.checks) {
+ const std::string &code = i.first;
+ const std::string &summary = i.second.summary;
+ const health_status_t severity = i.second.severity;
+
+ auto status_iter = health_check_log_times.find(code);
+ if (status_iter == health_check_log_times.end()) {
+ continue;
+ }
+
+ auto &log_status = status_iter->second;
+ bool const changed = log_status.last_message != summary
+ || log_status.severity != severity;
+
+ if (changed && now - log_status.updated_at > min_period) {
+ log_status.last_message = summary;
+ log_status.updated_at = now;
+ log_status.severity = severity;
+
+ ostringstream ss;
+ ss << "Health check update: " << summary << " (" << code << ")";
+ clog->health(severity) << ss.str();
+ }
+ }
+ }
+ }
+
+
for (vector<PaxosService*>::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) {
(*p)->tick();
(*p)->maybe_trim();
}
// trim sessions
- utime_t now = ceph_clock_now();
{
Mutex::Locker l(session_map_lock);
auto p = session_map.sessions.begin();
const health_check_map_t& previous,
MonitorDBStore::TransactionRef t);
+protected:
+
+ class HealthCheckLogStatus {
+ public:
+ health_status_t severity;
+ std::string last_message;
+ utime_t updated_at = 0;
+ HealthCheckLogStatus(health_status_t severity_,
+ const std::string &last_message_,
+ utime_t updated_at_)
+ : severity(severity_),
+ last_message(last_message_),
+ updated_at(updated_at_)
+ {}
+ };
+ std::map<std::string, HealthCheckLogStatus> health_check_log_times;
+
+public:
+
void get_cluster_status(stringstream &ss, Formatter *f);
void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
+#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
// make sure you add your feature to Monitor::get_supported_features
derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
} else {
newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
- newmap.flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+ newmap.flags |=
+ CEPH_OSDMAP_RECOVERY_DELETES |
+ CEPH_OSDMAP_PURGED_SNAPDIRS;
newmap.full_ratio = g_conf->mon_osd_full_ratio;
if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
do_propose = true;
}
}
+ if (!osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) &&
+ osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+ mon->mgrstatmon()->is_readable() &&
+ mon->mgrstatmon()->definitely_converted_snapsets()) {
+ dout(1) << __func__ << " all snapsets converted, setting purged_snapdirs"
+ << dendl;
+ add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS);
+ do_propose = true;
+ }
// mark osds down?
if (check_failures(now))
rs << "\n";
rdata.append(rs.str());
}
+ } else if (prefix == "osd pool application get") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+ "json-pretty"));
+ string pool_name;
+ cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+ string app;
+ cmd_getval(g_ceph_context, cmdmap, "app", app);
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+
+ if (pool_name.empty()) {
+ // all
+ f->open_object_section("pools");
+ for (const auto &pool : osdmap.pools) {
+ std::string name("<unknown>");
+ const auto &pni = osdmap.pool_name.find(pool.first);
+ if (pni != osdmap.pool_name.end())
+ name = pni->second;
+ f->open_object_section(name.c_str());
+ for (auto &app_pair : pool.second.application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section();
+ }
+ f->close_section(); // name
+ }
+ f->close_section(); // pools
+ f->flush(rdata);
+ } else {
+ int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ auto p = osdmap.get_pg_pool(pool);
+ // filter by pool
+ if (app.empty()) {
+ f->open_object_section(pool_name.c_str());
+ for (auto &app_pair : p->application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ }
+ f->close_section(); // pool_name
+ f->flush(rdata);
+ goto reply;
+ }
+
+ auto app_it = p->application_metadata.find(app);
+ if (app_it == p->application_metadata.end()) {
+ ss << "pool '" << pool_name << "' has no application '" << app << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ // filter by pool + app
+ if (key.empty()) {
+ f->open_object_section(app_it->first.c_str());
+ for (auto &kv_pair : app_it->second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ f->flush(rdata);
+ goto reply;
+ }
+ // filter by pool + app + key
+ auto key_it = app_it->second.find(key);
+ if (key_it == app_it->second.end()) {
+ ss << "application '" << app << "' on pool '" << pool_name
+ << "' does not have key '" << key << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ ss << key_it->second << "\n";
+ rdata.append(ss.str());
+ ss.str("");
+ }
} else {
// try prepare update
return false;
*ss << "crush test failed with " << r << ": " << err.str();
return r;
}
- dout(10) << __func__ << " crush somke test duration: "
+ dout(10) << __func__ << " crush smoke test duration: "
<< duration << dendl;
}
unsigned size, min_size;
CrushWrapper newcrush;
_get_pending_crush(newcrush);
-
- if (!newcrush.class_exists(srcname)) {
- err = -ENOENT;
- ss << "class '" << srcname << "' does not exist";
- goto reply;
- }
-
- if (newcrush.class_exists(dstname)) {
- err = -EEXIST;
- ss << "class '" << dstname << "' already exists";
+ if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
+ // suppose this is a replay and return success
+ // so command is idempotent
+ ss << "already renamed to '" << dstname << "'";
+ err = 0;
goto reply;
}
CrushWrapper newcrush;
_get_pending_crush(newcrush);
+ if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
+ // srcname does not exist and dstname already exists
+ // suppose this is a replay and return success
+ // (so this command is idempotent)
+ ss << "already renamed to '" << dstname << "'";
+ err = 0;
+ goto reply;
+ }
+
err = newcrush.rename_rule(srcname, dstname, &ss);
if (err < 0) {
// ss has reason for failure
f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
f->dump_int("bytes_used", sum.num_bytes);
f->dump_format_unquoted("percent_used", "%.2f", (used*100));
- f->dump_unsigned("max_avail", avail);
+ f->dump_unsigned("max_avail", avail / raw_used_rate);
f->dump_int("objects", sum.num_objects);
if (verbose) {
f->dump_int("quota_objects", pool->quota_max_objects);
} else {
tbl << stringify(si_t(sum.num_bytes));
tbl << percentify(used*100);
- tbl << si_t(avail);
+ tbl << si_t(avail / raw_used_rate);
tbl << sum.num_objects;
if (verbose) {
tbl << stringify(si_t(sum.num_objects_dirty))
return 0;
}
+ // kill me post-mimic or -nautilus
+ bool definitely_converted_snapsets() const {
+ // false negative is okay; false positive is not!
+ return
+ num_pg &&
+ num_pg_unknown == 0 &&
+ pg_sum.stats.sum.num_legacy_snapsets == 0;
+ }
+
// kill me post-luminous:
virtual float get_fallback_full_ratio() const {
return .95;
assert(!new_log);
assert(!new_log_writer);
+ // create a new log [writer] so that we know compaction is in progress
+ // (see _should_compact_log)
+ new_log = new File;
+ new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
+
// 1. allocate new log space and jump to it.
old_log_jump_to = log_file->fnode.get_allocated();
uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
<< std::dec << dendl;
- // create a new log [writer]
- new_log = new File;
- new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
+ // allocate
int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
&new_log->fnode.extents);
assert(r == 0);
<< needs_reshard_end << ")" << std::dec << dendl;
}
- fault_range(db, needs_reshard_begin, needs_reshard_end);
+ fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
// we may need to fault in a larger interval later must have all
// referring extents for spanning blobs loaded in order to have
throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
cct->_conf->bluestore_throttle_bytes +
cct->_conf->bluestore_throttle_deferred_bytes),
+ deferred_finisher(cct, "defered_finisher", "dfin"),
kv_sync_thread(this),
kv_finalize_thread(this),
mempool_thread(this)
throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
cct->_conf->bluestore_throttle_bytes +
cct->_conf->bluestore_throttle_deferred_bytes),
+ deferred_finisher(cct, "defered_finisher", "dfin"),
kv_sync_thread(this),
kv_finalize_thread(this),
min_alloc_size(_min_alloc_size),
"bluestore_compression_required_ratio",
"bluestore_max_alloc_size",
"bluestore_prefer_deferred_size",
+ "bluestore_prefer_deferred_size_hdd",
+ "bluestore_prefer_deferred_size_ssd",
"bluestore_deferred_batch_ops",
"bluestore_deferred_batch_ops_hdd",
"bluestore_deferred_batch_ops_ssd",
}
}
if (changed.count("bluestore_prefer_deferred_size") ||
+ changed.count("bluestore_prefer_deferred_size_hdd") ||
+ changed.count("bluestore_prefer_deferred_size_ssd") ||
changed.count("bluestore_max_alloc_size") ||
changed.count("bluestore_deferred_batch_ops") ||
changed.count("bluestore_deferred_batch_ops_hdd") ||
finishers.push_back(f);
}
+ deferred_finisher.start();
for (auto f : finishers) {
f->start();
}
kv_finalize_stop = false;
}
dout(10) << __func__ << " stopping finishers" << dendl;
+ deferred_finisher.wait_for_empty();
+ deferred_finisher.stop();
for (auto f : finishers) {
f->wait_for_empty();
f->stop();
osrs.push_back(&osr);
}
for (auto& osr : osrs) {
- if (osr->deferred_pending && !osr->deferred_running) {
- _deferred_submit_unlock(osr.get());
- deferred_lock.lock();
+ if (osr->deferred_pending) {
+ if (!osr->deferred_running) {
+ _deferred_submit_unlock(osr.get());
+ deferred_lock.lock();
+ } else {
+ dout(20) << __func__ << " osr " << osr << " already has running"
+ << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
}
}
}
++i;
}
- // demote to deferred_submit_lock, then drop that too
- std::lock_guard<std::mutex> l(deferred_submit_lock);
deferred_lock.unlock();
bdev->aio_submit(&b->ioc);
}
assert(osr->deferred_running == b);
osr->deferred_running = nullptr;
if (!osr->deferred_pending) {
+ dout(20) << __func__ << " dequeueing" << dendl;
auto q = deferred_queue.iterator_to(*osr);
deferred_queue.erase(q);
} else if (deferred_aggressive) {
dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
- finishers[0]->queue(new FunctionContext([&](int) {
+ deferred_finisher.queue(new FunctionContext([&](int) {
deferred_try_submit();
}));
+ } else {
+ dout(20) << __func__ << " leaving queued, more pending" << dendl;
}
}
if ((suggested_boff % (1 << csum_order)) == 0 &&
suggested_boff + final_length <= max_bsize &&
suggested_boff > b_off) {
- dout(20) << __func__ << " forcing blob_offset to "
+ dout(20) << __func__ << " forcing blob_offset to 0x"
<< std::hex << suggested_boff << std::dec << dendl;
assert(suggested_boff >= b_off);
csum_length += suggested_boff - b_off;
b_off = suggested_boff;
}
+ if (csum != Checksummer::CSUM_NONE) {
+ dout(20) << __func__ << " initialize csum setting for new blob " << *b
+ << " csum_type " << Checksummer::get_csum_type_string(csum)
+ << " csum_order " << csum_order
+ << " csum_length 0x" << std::hex << csum_length << std::dec
+ << dendl;
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
}
AllocExtentVector extents;
}
dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
- dout(20) << __func__ << " blob " << *b
- << " csum_type " << Checksummer::get_csum_type_string(csum)
- << " csum_order " << csum_order
- << " csum_length 0x" << std::hex << csum_length << std::dec
- << dendl;
-
- if (csum != Checksummer::CSUM_NONE) {
- if (!dblob.has_csum()) {
- dblob.init_csum(csum, csum_order, csum_length);
- }
+ dout(20) << __func__ << " blob " << *b << dendl;
+ if (dblob.has_csum()) {
dblob.calc_csum(b_off, *l);
}
+
if (wi.mark_unused) {
auto b_end = b_off + wi.bl.length();
if (b_off) {
interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
- std::mutex deferred_lock, deferred_submit_lock;
+ std::mutex deferred_lock;
std::atomic<uint64_t> deferred_seq = {0};
deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
int deferred_queue_size = 0; ///< num txc's queued across all osrs
atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
+ Finisher deferred_finisher;
int m_finisher_num = 1;
vector<Finisher*> finishers;
}
/// return the effective length of the extent if we align to alloc_unit
-static uint64_t aligned_len(btree_interval_set<uint64_t>::iterator p,
- uint64_t alloc_unit)
+uint64_t StupidAllocator::_aligned_len(
+ btree_interval_set<uint64_t,allocator>::iterator p,
+ uint64_t alloc_unit)
{
uint64_t skew = p.get_start() % alloc_unit;
if (skew)
for (bin = orig_bin; bin < (int)free.size(); ++bin) {
p = free[bin].lower_bound(hint);
while (p != free[bin].end()) {
- if (aligned_len(p, alloc_unit) >= want_size) {
+ if (_aligned_len(p, alloc_unit) >= want_size) {
goto found;
}
++p;
p = free[bin].begin();
auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
while (p != end) {
- if (aligned_len(p, alloc_unit) >= want_size) {
+ if (_aligned_len(p, alloc_unit) >= want_size) {
goto found;
}
++p;
for (bin = orig_bin; bin >= 0; --bin) {
p = free[bin].lower_bound(hint);
while (p != free[bin].end()) {
- if (aligned_len(p, alloc_unit) >= alloc_unit) {
+ if (_aligned_len(p, alloc_unit) >= alloc_unit) {
goto found;
}
++p;
p = free[bin].begin();
auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
while (p != end) {
- if (aligned_len(p, alloc_unit) >= alloc_unit) {
+ if (_aligned_len(p, alloc_unit) >= alloc_unit) {
goto found;
}
++p;
std::lock_guard<std::mutex> l(lock);
dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
<< std::dec << dendl;
- btree_interval_set<uint64_t> rm;
+ btree_interval_set<uint64_t,allocator> rm;
rm.insert(offset, length);
for (unsigned i = 0; i < free.size() && !rm.empty(); ++i) {
- btree_interval_set<uint64_t> overlap;
+ btree_interval_set<uint64_t,allocator> overlap;
overlap.intersection_of(rm, free[i]);
if (!overlap.empty()) {
dout(20) << __func__ << " bin " << i << " rm 0x" << std::hex << overlap
#include "Allocator.h"
#include "include/btree_interval_set.h"
#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
class StupidAllocator : public Allocator {
CephContext* cct;
int64_t num_free; ///< total bytes in freelist
int64_t num_reserved; ///< reserved bytes
- std::vector<btree_interval_set<uint64_t> > free; ///< leading-edge copy
+ typedef mempool::bluestore_alloc::pool_allocator<
+ pair<const uint64_t,uint64_t>> allocator;
+ std::vector<btree_interval_set<uint64_t,allocator>> free; ///< leading-edge copy
uint64_t last_alloc;
unsigned _choose_bin(uint64_t len);
void _insert_free(uint64_t offset, uint64_t len);
+ uint64_t _aligned_len(
+ btree_interval_set<uint64_t,allocator>::iterator p,
+ uint64_t alloc_unit);
+
public:
StupidAllocator(CephContext* cct);
~StupidAllocator() override;
aio_iter cur = begin;
struct iocb *piocb[aios_size];
- int r, pos = 0;
+ int left = 0;
while (cur != end) {
cur->priv = priv;
- *(piocb+pos) = &cur->iocb;
- ++pos;
+ *(piocb+left) = &cur->iocb;
+ ++left;
++cur;
}
- while (true) {
- r = io_submit(ctx, pos, piocb);
+ int done = 0;
+ while (left > 0) {
+ int r = io_submit(ctx, left, piocb + done);
if (r < 0) {
if (r == -EAGAIN && attempts-- > 0) {
usleep(delay);
(*retries)++;
continue;
}
+ return r;
}
- break;
+ assert(r > 0);
+ done += r;
+ left -= r;
}
- return r;
+ return done;
}
int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
if (o.flags) {
out << " " << o.get_flags_string();
}
- if (o.csum_type) {
+ if (o.has_csum()) {
out << " " << Checksummer::get_csum_type_string(o.csum_type)
<< "/0x" << std::hex << (1ull << o.csum_chunk_order) << std::dec;
}
agent_lock.Unlock();
}
+void OSDService::request_osdmap_update(epoch_t e)
+{
+ osd->osdmap_subscribe(e, false);
+}
+
class AgentTimeoutCB : public Context {
PGRef pg;
public:
disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
session_waiting_lock("OSD::session_waiting_lock"),
+ osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
heartbeat_lock("OSD::heartbeat_lock"),
heartbeat_stop(false),
heartbeat_need_update(true),
&debug);
if (new_interval) {
h->same_interval_since = e;
- }
- if (up != new_up) {
- h->same_up_since = e;
- }
- if (acting_primary != new_acting_primary) {
- h->same_primary_since = e;
- }
- if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
- osdmap->get_pg_num(pgid.pgid.pool()),
- nullptr)) {
- h->last_epoch_split = e;
+ if (up != new_up) {
+ h->same_up_since = e;
+ }
+ if (acting_primary != new_acting_primary) {
+ h->same_primary_since = e;
+ }
+ if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
+ osdmap->get_pg_num(pgid.pgid.pool()),
+ nullptr)) {
+ h->last_epoch_split = e;
+ }
+ up = new_up;
+ acting = new_acting;
+ up_primary = new_up_primary;
+ acting_primary = new_acting_primary;
}
lastmap = osdmap;
}
dout(1) << "start_waiting_for_healthy" << dendl;
set_state(STATE_WAITING_FOR_HEALTHY);
last_heartbeat_resample = utime_t();
+
+ // subscribe to osdmap updates, in case our peers really are known to be dead
+ osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
bool OSD::_is_healthy()
void OSD::osdmap_subscribe(version_t epoch, bool force_request)
{
- OSDMapRef osdmap = service.get_osdmap();
- if (osdmap->get_epoch() >= epoch)
+ Mutex::Locker l(osdmap_subscribe_lock);
+ if (latest_subscribed_epoch >= epoch && !force_request)
return;
+ latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
+
if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
force_request) {
monc->renew_subs();
return ret;
}
+ void request_osdmap_update(epoch_t e);
+
// -- stopping --
Mutex is_stopping_lock;
Cond is_stopping_cond;
void osdmap_subscribe(version_t epoch, bool force_request);
/** @} monc helpers */
+ Mutex osdmap_subscribe_lock;
+ epoch_t latest_subscribed_epoch{0};
+
// -- heartbeat --
/// information about a heartbeat peer
struct HeartbeatInfo {
auto q = pg_upmap_items.find(pg);
if (q != pg_upmap_items.end()) {
- for (auto& i : *raw) {
- for (auto& r : q->second) {
- if (r.first != i) {
- continue;
- }
- if (!(r.second != CRUSH_ITEM_NONE &&
- r.second < max_osd &&
- osd_weight[r.second] == 0)) {
- i = r.second;
- }
- break;
+ // NOTE: this approach does not allow a bidirectional swap,
+ // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+ for (auto& r : q->second) {
+ // make sure the replacement value doesn't already appear
+ bool exists = false;
+ ssize_t pos = -1;
+ for (unsigned i = 0; i < raw->size(); ++i) {
+ int osd = (*raw)[i];
+ if (osd == r.second) {
+ exists = true;
+ break;
+ }
+ // ignore mapping if target is marked out (or invalid osd id)
+ if (osd == r.first &&
+ pos < 0 &&
+ !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
+ osd_weight[r.second] == 0)) {
+ pos = i;
+ }
+ }
+ if (!exists && pos >= 0) {
+ (*raw)[pos] = r.second;
}
}
}
s += ",require_luminous_osds";
if (f & CEPH_OSDMAP_RECOVERY_DELETES)
s += ",recovery_deletes";
+ if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
+ s += ",purged_snapdirs";
if (s.length())
s.erase(0, 1);
return s;
const T *m = static_cast<const T *>(op->get_req());
assert(m->get_type() == MSGTYPE);
+ int from = m->get_source().num();
+
+ // if a repop is replied after a replica goes down in a new osdmap, and
+ // before the pg advances to this new osdmap, the repop replies before this
+ // repop can be discarded by that replica OSD, because the primary resets the
+ // connection to it when handling the new osdmap marking it down, and also
+ // resets the messenger session when the replica reconnects. to avoid the
+ // out-of-order replies, the messages from that replica should be discarded.
+ if (osd->get_osdmap()->is_down(from))
+ return true;
/* Mostly, this overlaps with the old_peering_msg
* condition. An important exception is pushes
* sent by replicas not in the acting set, since
* if such a replica goes down it does not cause
* a new interval. */
- int from = m->get_source().num();
if (get_osdmap()->get_down_at(from) >= m->map_epoch)
return true;
// legacy filestore didn't store collection bit width; fix.
int bits = osd->store->collection_bits(coll);
if (bits < 0) {
- if (coll.is_meta())
- bits = 0;
- else
- bits = info.pgid.get_split_bits(pool.info.get_pg_num());
+ assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
+ bits = info.pgid.get_split_bits(pool.info.get_pg_num());
lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
ObjectStore::Transaction t;
t.collection_set_bits(coll, bits);
eversion_t s,
set<eversion_t> *trimmed,
set<string>* trimmed_dups,
- bool* dirty_dups)
+ eversion_t *write_from_dups)
{
if (complete_to != log.end() &&
complete_to->version <= s) {
unindex(e); // remove from index,
// add to dup list
+ generic_dout(20) << "earliest_dup_version = " << earliest_dup_version << dendl;
if (e.version.version >= earliest_dup_version) {
- if (dirty_dups) *dirty_dups = true;
+ if (write_from_dups != nullptr && *write_from_dups > e.version) {
+ generic_dout(20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
+ *write_from_dups = e.version;
+ }
dups.push_back(pg_log_dup_t(e));
index(dups.back());
for (const auto& extra : e.extra_reqids) {
assert(trim_to <= info.last_complete);
dout(10) << "trim " << log << " to " << trim_to << dendl;
- log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups);
+ log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
info.log_tail = log.tail;
}
}
// now handle dups
if (merge_log_dups(olog)) {
- dirty_dups = true;
changed = true;
}
olog.dups.front().version << " to " <<
olog.dups.back().version << dendl;
changed = true;
+ dirty_from_dups = eversion_t();
+ dirty_to_dups = eversion_t::max();
// since our log.dups is empty just copy them
for (const auto& i : olog.dups) {
log.dups.push_back(i);
auto log_tail_version = log.dups.back().version;
auto insert_cursor = log.dups.end();
+ eversion_t last_shared = eversion_t::max();
for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
if (i->version <= log_tail_version) break;
log.dups.insert(insert_cursor, *i);
+ last_shared = i->version;
auto prev = insert_cursor;
--prev;
--insert_cursor; // make sure we insert in reverse order
}
+ mark_dirty_from_dups(last_shared);
}
if (olog.dups.front().version < log.dups.front().version) {
olog.dups.front().version << dendl;
changed = true;
+ eversion_t last;
auto insert_cursor = log.dups.begin();
for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
if (i->version >= insert_cursor->version) break;
log.dups.insert(insert_cursor, *i);
+ last = i->version;
auto prev = insert_cursor;
--prev;
// be sure to pass address of copy in log.dups
log.index(*prev);
}
+ mark_dirty_to_dups(last);
}
}
}
while (!log.dups.empty() && log.dups.back().version >= log.tail) {
log.unindex(log.dups.back());
+ mark_dirty_from_dups(log.dups.back().version);
log.dups.pop_back();
}
}
!touched_log,
require_rollback,
clear_divergent_priors,
- dirty_dups,
+ dirty_to_dups,
+ dirty_from_dups,
+ write_from_dups,
&rebuilt_missing_with_deletes,
(pg_log_debug ? &log_keys_debug : nullptr));
undirty();
pg_log_t &log,
const coll_t& coll, const ghobject_t &log_oid,
map<eversion_t, hobject_t> &divergent_priors,
- bool require_rollback,
- bool dirty_dups)
+ bool require_rollback
+ )
{
_write_log_and_missing_wo_missing(
t, km, log, coll, log_oid,
divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
set<eversion_t>(),
set<string>(),
- true, true, require_rollback, dirty_dups, nullptr);
+ true, true, require_rollback,
+ eversion_t::max(), eversion_t(), eversion_t(), nullptr);
}
// static
const ghobject_t &log_oid,
const pg_missing_tracker_t &missing,
bool require_rollback,
- bool dirty_dups,
bool *rebuilt_missing_with_deletes)
{
_write_log_and_missing(
set<eversion_t>(),
set<string>(),
missing,
- true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr);
+ true, require_rollback, false,
+ eversion_t::max(),
+ eversion_t(),
+ eversion_t(),
+ rebuilt_missing_with_deletes, nullptr);
}
// static
bool dirty_divergent_priors,
bool touch_log,
bool require_rollback,
- bool dirty_dups,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
set<string> *log_keys_debug
)
{
}
}
- // process dirty_dups after log_keys_debug is filled, so dups do not
+ // process dups after log_keys_debug is filled, so dups do not
// end up in that set
- if (dirty_dups) {
- pg_log_dup_t min;
+ if (dirty_to_dups != eversion_t()) {
+ pg_log_dup_t min, dirty_to_dup;
+ dirty_to_dup.version = dirty_to_dups;
t.omap_rmkeyrange(
coll, log_oid,
- min.get_key_name(), log.dups.begin()->get_key_name());
- for (const auto& entry : log.dups) {
- bufferlist bl;
- ::encode(entry, bl);
- (*km)[entry.get_key_name()].claim(bl);
- }
+ min.get_key_name(), dirty_to_dup.get_key_name());
+ }
+ if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+ pg_log_dup_t max, dirty_from_dup;
+ max.version = eversion_t::max();
+ dirty_from_dup.version = dirty_from_dups;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from_dup.get_key_name(), max.get_key_name());
+ }
+
+ for (const auto& entry : log.dups) {
+ if (entry.version > dirty_to_dups)
+ break;
+ bufferlist bl;
+ ::encode(entry, bl);
+ (*km)[entry.get_key_name()].claim(bl);
+ }
+
+ for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
+ p != log.dups.rend() &&
+ (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+ p->version >= dirty_to_dups;
+ ++p) {
+ bufferlist bl;
+ ::encode(*p, bl);
+ (*km)[p->get_key_name()].claim(bl);
}
if (dirty_divergent_priors) {
bool touch_log,
bool require_rollback,
bool clear_divergent_priors,
- bool dirty_dups,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
bool *rebuilt_missing_with_deletes, // in/out param
set<string> *log_keys_debug
) {
}
}
- // process dirty_dups after log_keys_debug is filled, so dups do not
+ // process dups after log_keys_debug is filled, so dups do not
// end up in that set
- if (dirty_dups) {
- pg_log_dup_t min;
+ if (dirty_to_dups != eversion_t()) {
+ pg_log_dup_t min, dirty_to_dup;
+ dirty_to_dup.version = dirty_to_dups;
t.omap_rmkeyrange(
coll, log_oid,
- min.get_key_name(), log.dups.begin()->get_key_name());
- for (const auto& entry : log.dups) {
- bufferlist bl;
- ::encode(entry, bl);
- (*km)[entry.get_key_name()].claim(bl);
- }
+ min.get_key_name(), dirty_to_dup.get_key_name());
+ }
+ if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+ pg_log_dup_t max, dirty_from_dup;
+ max.version = eversion_t::max();
+ dirty_from_dup.version = dirty_from_dups;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from_dup.get_key_name(), max.get_key_name());
+ }
+
+ for (const auto& entry : log.dups) {
+ if (entry.version > dirty_to_dups)
+ break;
+ bufferlist bl;
+ ::encode(entry, bl);
+ (*km)[entry.get_key_name()].claim(bl);
+ }
+
+ for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
+ p != log.dups.rend() &&
+ (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+ p->version >= dirty_to_dups;
+ ++p) {
+ bufferlist bl;
+ ::encode(*p, bl);
+ (*km)[p->get_key_name()].claim(bl);
}
if (clear_divergent_priors) {
eversion_t s,
set<eversion_t> *trimmed,
set<string>* trimmed_dups,
- bool* dirty_dups);
+ eversion_t *write_from_dups);
ostream& print(ostream& out) const;
}; // IndexedLog
eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
eversion_t writeout_from; ///< must writout keys >= writeout_from
set<eversion_t> trimmed; ///< must clear keys in trimmed
+ eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups
+ eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups
+ eversion_t write_from_dups; ///< must write keys >= write_from_dups
set<string> trimmed_dups; ///< must clear keys in trimmed_dups
CephContext *cct;
bool pg_log_debug;
/// Log is clean on [dirty_to, dirty_from)
bool touched_log;
bool clear_divergent_priors;
- bool dirty_dups; /// log.dups is updated
bool rebuilt_missing_with_deletes = false;
void mark_dirty_to(eversion_t to) {
if (from < writeout_from)
writeout_from = from;
}
+ void mark_dirty_to_dups(eversion_t to) {
+ if (to > dirty_to_dups)
+ dirty_to_dups = to;
+ }
+ void mark_dirty_from_dups(eversion_t from) {
+ if (from < dirty_from_dups)
+ dirty_from_dups = from;
+ }
public:
bool is_dirty() const {
return !touched_log ||
!(trimmed.empty()) ||
!missing.is_clean() ||
!(trimmed_dups.empty()) ||
- dirty_dups ||
+ (dirty_to_dups != eversion_t()) ||
+ (dirty_from_dups != eversion_t::max()) ||
+ (write_from_dups != eversion_t::max()) ||
rebuilt_missing_with_deletes;
}
void mark_log_for_rewrite() {
mark_dirty_to(eversion_t::max());
mark_dirty_from(eversion_t());
+ mark_dirty_to_dups(eversion_t::max());
+ mark_dirty_from_dups(eversion_t());
touched_log = false;
}
bool get_rebuilt_missing_with_deletes() const {
writeout_from = eversion_t::max();
check();
missing.flush();
- dirty_dups = false;
+ dirty_to_dups = eversion_t();
+ dirty_from_dups = eversion_t::max();
+ write_from_dups = eversion_t::max();
}
public:
prefix_provider(dpp),
dirty_from(eversion_t::max()),
writeout_from(eversion_t::max()),
+ dirty_from_dups(eversion_t::max()),
+ write_from_dups(eversion_t::max()),
cct(cct),
pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
touched_log(false),
- clear_divergent_priors(false),
- dirty_dups(false)
+ clear_divergent_priors(false)
{ }
void reset_backfill();
log.claim_log_and_clear_rollback_info(o);
missing.clear();
mark_dirty_to(eversion_t::max());
+ mark_dirty_to_dups(eversion_t::max());
}
void split_into(
log.split_out_child(child_pgid, split_bits, &opg_log->log);
missing.split_into(child_pgid, split_bits, &(opg_log->missing));
opg_log->mark_dirty_to(eversion_t::max());
+ opg_log->mark_dirty_to_dups(eversion_t::max());
mark_dirty_to(eversion_t::max());
+ mark_dirty_to_dups(eversion_t::max());
if (missing.may_include_deletes)
opg_log->rebuilt_missing_with_deletes = true;
}
pg_log_t &log,
const coll_t& coll,
const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
- bool require_rollback,
- bool dirty_dups);
+ bool require_rollback);
static void write_log_and_missing(
ObjectStore::Transaction& t,
const ghobject_t &log_oid,
const pg_missing_tracker_t &missing,
bool require_rollback,
- bool dirty_dups,
bool *rebuilt_missing_set_with_deletes);
static void _write_log_and_missing_wo_missing(
bool dirty_divergent_priors,
bool touch_log,
bool require_rollback,
- bool dirty_dups,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
set<string> *log_keys_debug
);
bool touch_log,
bool require_rollback,
bool clear_divergent_priors,
- bool dirty_dups,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
bool *rebuilt_missing_with_deletes,
set<string> *log_keys_debug
);
<< ", queue on waiting_for_map " << op->get_source() << dendl;
waiting_for_map[op->get_source()].push_back(op);
op->mark_delayed("op must wait for map");
+ osd->request_osdmap_update(op->min_epoch);
return;
}
bufferlist t;
uint64_t len = miter->first - last;
r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
- if (r == -EIO) {
- r = rep_repair_primary_object(soid, ctx->op);
- }
if (r < 0) {
osd->clog->error() << coll << " " << soid
<< " sparse-read failed to read: "
bufferlist tmpbl;
r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
op.flags, &tmpbl);
+ if (r == -EIO) {
+ r = rep_repair_primary_object(soid, ctx->op);
+ }
if (r < 0) {
return r;
}
void dump(Formatter *f) const;
static void generate_test_instances(list<pg_log_dup_t*>& o);
+ bool operator==(const pg_log_dup_t &rhs) const {
+ return reqid == rhs.reqid &&
+ version == rhs.version &&
+ user_version == rhs.user_version &&
+ return_code == rhs.return_code;
+ }
+ bool operator!=(const pg_log_dup_t &rhs) const {
+ return !(*this == rhs);
+ }
+
friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
};
WRITE_CLASS_ENCODER(pg_log_dup_t)
font-weight: bold;
}
+ .chart-container {
+ width: 100%;
+ height: 100%;
+ }
+
.dataTables_wrapper .dataTables_filter {
color: #ddd
}
<li rv-each-pool="rbd_pools">
<a rv-href="pool.url"><i class="fa fa-circle-o"></i> {pool.name}</a>
</li>
+ <li class="ceph-none-found" rv-hide="rbd_pools | length">None found</li>
</ul>
</li>
- <li class="ceph-none-found" rv-hide="rbd_pools | length">None found</li>
</ul>
</li>
<li class="treeview{%if path_info.startswith(('/filesystem/', '/clients/'))%} active{%endif%}">
},
options: {
center_text: raw_usage_text,
- responsive: false,
+ responsive: true,
legend: {display: false},
animation: {duration: 0}
}
]
},
options: {
- responsive: false,
+ responsive: true,
legend: {display: false},
animation: {duration: 0}
}
<td>
<span style="font-size: 45px;">{df.stats.total_objects | dimless}</span>
</td>
- <td>
- <canvas id="raw_usage_chart"
- style="height:120px; width:120px;"></canvas>
+ <td >
+ <div style="height:120px; width: 120px;">
+ <canvas id="raw_usage_chart"></canvas>
+ </div>
</td>
<td>
- <canvas id="pool_usage_chart"
- style="height:120px; width: 120px;"></canvas>
+ <div style="height:120px; width: 120px;">
+ <canvas id="pool_usage_chart"></canvas>
+ </div>
</td>
</tr>
<tr>
) + "/s"
metadata = self.get_metadata('mds', info['name'])
- mds_versions[metadata['ceph_version']].append(info['name'])
+ mds_versions[metadata.get('ceph_version', 'unknown')].append(info['name'])
rank_table.append(
{
"rank": rank,
standby_table = []
for standby in fsmap['standbys']:
metadata = self.get_metadata('mds', standby['name'])
- mds_versions[metadata['ceph_version']].append(standby['name'])
+ mds_versions[metadata.get('ceph_version', 'unknown')].append(standby['name'])
standby_table.append({
'name': standby['name']
string start_marker;
string end_marker;
int max_entries = -1;
+ bool max_entries_specified = false;
int admin = false;
bool admin_specified = false;
int system = false;
max_buckets_specified = true;
} else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) {
max_entries = (int)strict_strtol(val.c_str(), 10, &err);
+ max_entries_specified = true;
if (!err.empty()) {
cerr << "ERROR: failed to parse max entries: " << err << std::endl;
return EINVAL;
if (inconsistent_index == false) {
RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, true);
} else {
+ if (!yes_i_really_mean_it) {
+ cerr << "using --inconsistent_index can corrupt the bucket index " << std::endl
+ << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+ return 1;
+ }
RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, false);
}
}
}
void *handle;
int max = 1000;
- int ret = store->meta_mgr->list_keys_init(metadata_key, &handle);
+ int ret = store->meta_mgr->list_keys_init(metadata_key, marker, &handle);
if (ret < 0) {
cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
return -ret;
}
bool truncated;
+ uint64_t count = 0;
+ if (max_entries_specified) {
+ formatter->open_object_section("result");
+ }
formatter->open_array_section("keys");
+ uint64_t left;
do {
list<string> keys;
- ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
+ left = (max_entries_specified ? max_entries - count : max);
+ ret = store->meta_mgr->list_keys_next(handle, left, keys, &truncated);
if (ret < 0 && ret != -ENOENT) {
cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
return -ret;
} if (ret != -ENOENT) {
for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
formatter->dump_string("key", *iter);
+ ++count;
}
formatter->flush(cout);
}
- } while (truncated);
+ } while (truncated && left > 0);
formatter->close_section();
+
+ if (max_entries_specified) {
+ encode_json("truncated", truncated, formatter);
+ encode_json("count", count, formatter);
+ if (truncated) {
+ encode_json("marker", store->meta_mgr->get_marker(handle), formatter);
+ }
+ formatter->close_section();
+ }
formatter->flush(cout);
store->meta_mgr->list_keys_complete(handle);
rgw::asio::ClientIO real_client{socket, parser, buffer};
auto real_client_io = rgw::io::add_reordering(
- rgw::io::add_buffering(
+ rgw::io::add_buffering(cct,
rgw::io::add_chunking(
rgw::io::add_conlen_controlling(
&real_client))));
- RGWRestfulIO client(&real_client_io);
+ RGWRestfulIO client(cct, &real_client_io);
process_request(env.store, env.rest, &req, env.uri_prefix,
*env.auth_registry, &client, env.olog);
#include "common/errno.h"
#include "common/ceph_json.h"
+#include "common/backport14.h"
#include "rgw_rados.h"
#include "rgw_acl.h"
#include "rgw_acl_s3.h"
pool = store->get_zone_params().domain_root;
}
- int list_keys_init(RGWRados *store, void **phandle) override
- {
- list_keys_info *info = new list_keys_info;
+ int list_keys_init(RGWRados *store, const string& marker, void **phandle) override {
+ auto info = ceph::make_unique<list_keys_info>();
info->store = store;
- *phandle = (void *)info;
+ int ret = store->list_raw_objects_init(store->get_zone_params().domain_root, marker,
+ &info->ctx);
+ if (ret < 0) {
+ return ret;
+ }
+ *phandle = (void *)info.release();
return 0;
}
list<string> unfiltered_keys;
- int ret = store->list_raw_objects(store->get_zone_params().domain_root, no_filter,
- max, info->ctx, unfiltered_keys, truncated);
+ int ret = store->list_raw_objects_next(no_filter, max, info->ctx,
+ unfiltered_keys, truncated);
if (ret < 0 && ret != -ENOENT)
return ret;
if (ret == -ENOENT) {
list_keys_info *info = static_cast<list_keys_info *>(handle);
delete info;
}
+
+ string get_marker(void *handle) {
+ list_keys_info *info = static_cast<list_keys_info *>(handle);
+ return info->store->list_raw_objs_get_cursor(info->ctx);
+ }
};
class RGWBucketInstanceMetadataHandler : public RGWMetadataHandler {
pool = store->get_zone_params().domain_root;
}
- int list_keys_init(RGWRados *store, void **phandle) override
- {
- list_keys_info *info = new list_keys_info;
+ int list_keys_init(RGWRados *store, const string& marker, void **phandle) override {
+ auto info = ceph::make_unique<list_keys_info>();
info->store = store;
- *phandle = (void *)info;
+ int ret = store->list_raw_objects_init(store->get_zone_params().domain_root, marker,
+ &info->ctx);
+ if (ret < 0) {
+ return ret;
+ }
+ *phandle = (void *)info.release();
return 0;
}
list<string> unfiltered_keys;
- int ret = store->list_raw_objects(store->get_zone_params().domain_root, no_filter,
- max, info->ctx, unfiltered_keys, truncated);
+ int ret = store->list_raw_objects_next(no_filter, max, info->ctx,
+ unfiltered_keys, truncated);
if (ret < 0 && ret != -ENOENT)
return ret;
if (ret == -ENOENT) {
delete info;
}
+ string get_marker(void *handle) {
+ list_keys_info *info = static_cast<list_keys_info *>(handle);
+ return info->store->list_raw_objs_get_cursor(info->ctx);
+ }
+
/*
* hash entry for mdlog placement. Use the same hash key we'd have for the bucket entry
* point, so that the log entries end up at the same log shard, so that we process them
RGWCivetWeb cw_client(conn);
auto real_client_io = rgw::io::add_reordering(
- rgw::io::add_buffering(
+ rgw::io::add_buffering(dout_context,
rgw::io::add_chunking(
rgw::io::add_conlen_controlling(
&cw_client))));
- RGWRestfulIO client_io(&real_client_io);
+ RGWRestfulIO client_io(dout_context, &real_client_io);
RGWRequest req(env.store->get_new_req_id());
int ret = process_request(env.store, env.rest, &req, env.uri_prefix,
public:
~RGWRestfulIO() override = default;
- RGWRestfulIO(rgw::io::RestfulClient* engine)
- : AccountingFilter<rgw::io::RestfulClient*>(std::move(engine)) {
+ RGWRestfulIO(CephContext *_cx, rgw::io::RestfulClient* engine)
+ : AccountingFilter<rgw::io::RestfulClient*>(_cx, std::move(engine)) {
}
void add_filter(std::shared_ptr<DecoratedRestfulClient> new_filter) {
bool enabled;
uint64_t total_sent;
uint64_t total_received;
+ CephContext *cct;
public:
template <typename U>
- AccountingFilter(U&& decoratee)
+ AccountingFilter(CephContext *cct, U&& decoratee)
: DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
enabled(false),
total_sent(0),
- total_received(0) {
+ total_received(0), cct(cct) {
}
size_t send_status(const int status,
const char* const status_name) override {
const auto sent = DecoratedRestfulClient<T>::send_status(status,
status_name);
+ lsubdout(cct, rgw, 30) << "AccountingFilter::send_status: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
if (enabled) {
total_sent += sent;
}
size_t send_100_continue() override {
const auto sent = DecoratedRestfulClient<T>::send_100_continue();
+ lsubdout(cct, rgw, 30) << "AccountingFilter::send_100_continue: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
if (enabled) {
total_sent += sent;
}
size_t send_header(const boost::string_ref& name,
const boost::string_ref& value) override {
const auto sent = DecoratedRestfulClient<T>::send_header(name, value);
+ lsubdout(cct, rgw, 30) << "AccountingFilter::send_header: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
if (enabled) {
total_sent += sent;
}
size_t send_content_length(const uint64_t len) override {
const auto sent = DecoratedRestfulClient<T>::send_content_length(len);
+ lsubdout(cct, rgw, 30) << "AccountingFilter::send_content_length: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
if (enabled) {
total_sent += sent;
}
size_t send_chunked_transfer_encoding() override {
const auto sent = DecoratedRestfulClient<T>::send_chunked_transfer_encoding();
+ lsubdout(cct, rgw, 30) << "AccountingFilter::send_chunked_transfer_encoding: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
if (enabled) {
total_sent += sent;
}
size_t complete_header() override {
const auto sent = DecoratedRestfulClient<T>::complete_header();
+ lsubdout(cct, rgw, 30) << "AccountingFilter::complete_header: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
if (enabled) {
total_sent += sent;
}
size_t recv_body(char* buf, size_t max) override {
const auto received = DecoratedRestfulClient<T>::recv_body(buf, max);
+ lsubdout(cct, rgw, 30) << "AccountingFilter::recv_body: e="
+ << (enabled ? "1" : "0") << ", received=" << received << dendl;
if (enabled) {
total_received += received;
}
size_t send_body(const char* const buf,
const size_t len) override {
const auto sent = DecoratedRestfulClient<T>::send_body(buf, len);
+ lsubdout(cct, rgw, 30) << "AccountingFilter::send_body: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
+ if (enabled) {
+ total_sent += sent;
+ }
+ return sent;
+ }
+
+ size_t complete_request() override {
+ const auto sent = DecoratedRestfulClient<T>::complete_request();
+ lsubdout(cct, rgw, 30) << "AccountingFilter::complete_request: e="
+ << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+ << total_sent << dendl;
if (enabled) {
total_sent += sent;
}
void set_account(bool enabled) override {
this->enabled = enabled;
+ lsubdout(cct, rgw, 30) << "AccountingFilter::set_account: e="
+ << (enabled ? "1" : "0") << dendl;
}
};
bool has_content_length;
bool buffer_data;
+ CephContext *cct;
public:
template <typename U>
- BufferingFilter(U&& decoratee)
+ BufferingFilter(CephContext *cct, U&& decoratee)
: DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
has_content_length(false),
- buffer_data(false) {
+ buffer_data(false), cct(cct) {
}
size_t send_content_length(const uint64_t len) override;
{
if (buffer_data) {
data.append(buf, len);
+
+ lsubdout(cct, rgw, 30) << "BufferingFilter<T>::send_body: defer count = "
+ << len << dendl;
return 0;
}
if (! has_content_length) {
/* We will dump everything in complete_request(). */
buffer_data = true;
+ lsubdout(cct, rgw, 30) << "BufferingFilter<T>::complete_header: has_content_length="
+ << (has_content_length ? "1" : "0") << dendl;
return 0;
}
size_t sent = 0;
if (! has_content_length) {
+ /* It is not correct to count these bytes here,
+ * because they can only be part of the header.
+ * Therefore force count to 0.
+ */
sent += DecoratedRestfulClient<T>::send_content_length(data.length());
sent += DecoratedRestfulClient<T>::complete_header();
+ lsubdout(cct, rgw, 30) <<
+ "BufferingFilter::complete_request: !has_content_length: IGNORE: sent="
+ << sent << dendl;
+ sent = 0;
}
if (buffer_data) {
}
data.clear();
buffer_data = false;
+ lsubdout(cct, rgw, 30) << "BufferingFilter::complete_request: buffer_data: sent="
+ << sent << dendl;
}
return sent + DecoratedRestfulClient<T>::complete_request();
}
template <typename T> static inline
-BufferingFilter<T> add_buffering(T&& t) {
- return BufferingFilter<T>(std::forward<T>(t));
+BufferingFilter<T> add_buffering(
+CephContext *cct,
+T&& t) {
+ return BufferingFilter<T>(cct, std::forward<T>(t));
}
#define RGW_ATTR_EXPIRES RGW_ATTR_PREFIX "expires"
#define RGW_ATTR_DELETE_AT RGW_ATTR_PREFIX "delete_at"
#define RGW_ATTR_ID_TAG RGW_ATTR_PREFIX "idtag"
+#define RGW_ATTR_TAIL_TAG RGW_ATTR_PREFIX "tail_tag"
#define RGW_ATTR_SHADOW_OBJ RGW_ATTR_PREFIX "shadow_name"
#define RGW_ATTR_MANIFEST RGW_ATTR_PREFIX "manifest"
#define RGW_ATTR_USER_MANIFEST RGW_ATTR_PREFIX "user_manifest"
int res = 0;
std::string stored_mode = get_str_attribute(attrs, RGW_ATTR_CRYPT_MODE);
ldout(s->cct, 15) << "Encryption mode: " << stored_mode << dendl;
+
+ const char *req_sse = s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION", NULL);
+ if (nullptr != req_sse && (s->op == OP_GET || s->op == OP_HEAD)) {
+ return -ERR_INVALID_REQUEST;
+ }
+
if (stored_mode == "SSE-C-AES256") {
if (s->cct->_conf->rgw_crypt_require_ssl &&
!s->info.env->exists("SERVER_PORT_SECURE")) {
#define BUCKET_SHARD_SYNC_SPAWN_WINDOW 20
#define DATA_SYNC_MAX_ERR_ENTRIES 10
+enum RemoteDatalogStatus {
+ RemoteNotTrimmed = 0,
+ RemoteTrimmed = 1,
+ RemoteMightTrimmed = 2
+};
+
class RGWDataSyncShardCR : public RGWCoroutine {
RGWDataSyncEnv *sync_env;
RGWDataChangesLogInfo shard_info;
string datalog_marker;
+ RemoteDatalogStatus remote_trimmed;
Mutex inc_lock;
Cond inc_cond;
pool(_pool),
shard_id(_shard_id),
sync_marker(_marker),
- marker_tracker(NULL), truncated(false), inc_lock("RGWDataSyncShardCR::inc_lock"),
+ marker_tracker(NULL), truncated(false), remote_trimmed(RemoteNotTrimmed), inc_lock("RGWDataSyncShardCR::inc_lock"),
total_entries(0), spawn_window(BUCKET_SHARD_SYNC_SPAWN_WINDOW), reset_backoff(NULL),
lease_cr(nullptr), lease_stack(nullptr), error_repo(nullptr), max_error_entries(DATA_SYNC_MAX_ERR_ENTRIES),
retry_backoff_secs(RETRY_BACKOFF_SECS_DEFAULT) {
return set_cr_error(retcode);
}
datalog_marker = shard_info.marker;
+ remote_trimmed = RemoteNotTrimmed;
#define INCREMENTAL_MAX_ENTRIES 100
ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " datalog_marker=" << datalog_marker << " sync_marker.marker=" << sync_marker.marker << dendl;
if (datalog_marker > sync_marker.marker) {
spawned_keys.clear();
+ if (sync_marker.marker.empty())
+ remote_trimmed = RemoteMightTrimmed; //remote data log shard might be trimmed;
yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, &sync_marker.marker, &log_entries, &truncated));
if (retcode < 0) {
ldout(sync_env->cct, 0) << "ERROR: failed to read remote data log info: ret=" << retcode << dendl;
drain_all();
return set_cr_error(retcode);
}
+ if ((remote_trimmed == RemoteMightTrimmed) && sync_marker.marker.empty() && log_entries.empty())
+ remote_trimmed = RemoteTrimmed;
+ else
+ remote_trimmed = RemoteNotTrimmed;
for (log_iter = log_entries.begin(); log_iter != log_entries.end(); ++log_iter) {
ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " log_entry: " << log_iter->log_id << ":" << log_iter->log_timestamp << ":" << log_iter->entry.key << dendl;
if (!marker_tracker->index_key_to_marker(log_iter->entry.key, log_iter->log_id)) {
}
}
ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " datalog_marker=" << datalog_marker << " sync_marker.marker=" << sync_marker.marker << dendl;
- if (datalog_marker == sync_marker.marker) {
+ if (datalog_marker == sync_marker.marker || remote_trimmed == RemoteTrimmed) {
#define INCREMENTAL_INTERVAL 20
yield wait(utime_t(INCREMENTAL_INTERVAL, 0));
}
RGWFCGX fcgxfe(req->fcgx);
auto real_client_io = rgw::io::add_reordering(
- rgw::io::add_buffering(
+ rgw::io::add_buffering(cct,
rgw::io::add_chunking(
&fcgxfe)));
- RGWRestfulIO client_io(&real_client_io);
+ RGWRestfulIO client_io(cct, &real_client_io);
int ret = process_request(store, rest, req, uri_prefix,
void set_status(int status, const char* status_name) override {};
void output_header() override {};
void output_footer() override {};
+ void enable_line_break() override {};
void flush(ostream& os) override;
void reset() override;
op.mp_expiration = rule->get_mp_expiration().get_days();
}
op.dm_expiration = rule->get_dm_expiration();
- auto ret = prefix_map.insert(pair<string, lc_op>(rule->get_prefix(), op));
+
+ std::string prefix;
+ if (rule->get_filter().has_prefix()){
+ prefix = rule->get_filter().get_prefix();
+ } else {
+ prefix = rule->get_prefix();
+ }
+ auto ret = prefix_map.emplace(std::move(prefix), std::move(op));
return ret.second;
}
};
WRITE_CLASS_ENCODER(LCExpiration)
+class LCFilter
+{
+ protected:
+ std::string prefix;
+ // TODO add support for tagging
+ public:
+ const std::string& get_prefix() const{
+ return prefix;
+ }
+
+ void set_prefix(const string& _prefix){
+ prefix = _prefix;
+ }
+
+ void set_prefix(std::string&& _prefix){
+ prefix = std::move(_prefix);
+ }
+
+ bool empty() const {
+ return prefix.empty();
+ }
+
+ bool has_prefix() const {
+ return !prefix.empty();
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(prefix, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::iterator& bl) {
+ DECODE_START(1, bl);
+ ::decode(prefix, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(LCFilter);
+
+
+
class LCRule
{
protected:
LCExpiration expiration;
LCExpiration noncur_expiration;
LCExpiration mp_expiration;
+ LCFilter filter;
bool dm_expiration = false;
public:
string& get_status() {
return status;
}
-
+
string& get_prefix() {
return prefix;
}
+ LCFilter& get_filter() {
+ return filter;
+ }
+
LCExpiration& get_expiration() {
return expiration;
}
bool valid();
void encode(bufferlist& bl) const {
- ENCODE_START(4, 1, bl);
+ ENCODE_START(5, 1, bl);
::encode(id, bl);
::encode(prefix, bl);
::encode(status, bl);
::encode(noncur_expiration, bl);
::encode(mp_expiration, bl);
::encode(dm_expiration, bl);
+ ::encode(filter, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START_LEGACY_COMPAT_LEN(4, 1, 1, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(5, 1, 1, bl);
::decode(id, bl);
::decode(prefix, bl);
::decode(status, bl);
if (struct_v >= 4) {
::decode(dm_expiration, bl);
}
+ if (struct_v >= 5) {
+ ::decode(filter, bl);
+ }
DECODE_FINISH(bl);
}
status.clear();
dm_expiration = false;
+ // S3 generates a 48 bit random ID, maybe we could generate shorter IDs
+ static constexpr auto LC_ID_LENGTH = 48;
+
lc_id = static_cast<LCID_S3 *>(find_first("ID"));
- if (!lc_id)
- return false;
- id = lc_id->get_data();
+ if (lc_id){
+ id = lc_id->get_data();
+ } else {
+ gen_rand_alphanumeric_lower(nullptr, &id, LC_ID_LENGTH);
+ }
+
+
+ XMLObj *obj = find_first("Filter");
+
+ if (obj){
+ string _prefix;
+ RGWXMLDecoder::decode_xml("Prefix", _prefix, obj);
+ filter.set_prefix(std::move(_prefix));
+ } else {
+ // Ideally the following code should be deprecated and we should return
+    // False here. The new S3 LC configuration xml spec makes Filter mandatory
+ // and Prefix optional. However older clients including boto2 still generate
+ // xml according to the older spec, where Prefix existed outside of Filter
+ // and S3 itself seems to be sloppy on enforcing the mandatory Filter
+ // argument. A day will come when S3 enforces their own xml-spec, but it is
+ // not this day
+
+ lc_prefix = static_cast<LCPrefix_S3 *>(find_first("Prefix"));
+
+ if (!lc_prefix){
+ return false;
+ }
+
+ prefix = lc_prefix->get_data();
+ }
- lc_prefix = static_cast<LCPrefix_S3 *>(find_first("Prefix"));
- if (!lc_prefix)
- return false;
- prefix = lc_prefix->get_data();
lc_status = static_cast<LCStatus_S3 *>(find_first("Status"));
if (!lc_status)
void LCRule_S3::to_xml(CephContext *cct, ostream& out) {
out << "<Rule>" ;
out << "<ID>" << id << "</ID>";
- out << "<Prefix>" << prefix << "</Prefix>";
+ if (!filter.empty()) {
+ LCFilter_S3& lc_filter = static_cast<LCFilter_S3&>(filter);
+ lc_filter.to_xml(out);
+ } else {
+ out << "<Prefix>" << prefix << "</Prefix>";
+ }
out << "<Status>" << status << "</Status>";
if (!expiration.empty() || dm_expiration) {
LCExpiration_S3 expir(expiration.get_days_str(), expiration.get_date(), dm_expiration);
string& to_str() { return data; }
};
+// XML (de)serialization adapter for an S3 lifecycle <Filter> element.
+// Inherits the filter state (prefix) from LCFilter and XML-node plumbing
+// (the 'data' member returned by to_str()) from XMLObj.
+class LCFilter_S3 : public LCFilter, public XMLObj
+{
+ public:
+ ~LCFilter_S3() override {}
+ string& to_str() { return data; }
+ // Emit <Filter>[<Prefix>...</Prefix>]</Filter>; an empty prefix yields
+ // an empty <Filter/> element, matching what S3 clients send.
+ void to_xml(ostream& out){
+ out << "<Filter>";
+ if (!prefix.empty())
+ out << "<Prefix>" << prefix << "</Prefix>";
+ out << "</Filter>";
+ }
+ // Formatter-based twin of to_xml(), used by the dump_xml() path.
+ void dump_xml(Formatter *f) const {
+ f->open_object_section("Filter");
+ if (!prefix.empty())
+ encode_xml("Prefix", prefix, f);
+ f->close_section(); // Filter
+ }
+};
+
class LCStatus_S3 : public XMLObj
{
public:
void dump_xml(Formatter *f) const {
f->open_object_section("Rule");
encode_xml("ID", id, f);
- encode_xml("Prefix", prefix, f);
+ // In case of an empty filter and an empty Prefix, we defer to Prefix.
+ if (!filter.empty()) {
+ const LCFilter_S3& lc_filter = static_cast<const LCFilter_S3&>(filter);
+ lc_filter.dump_xml(f);
+ } else {
+ encode_xml("Prefix", prefix, f);
+ }
encode_xml("Status", status, f);
if (!expiration.empty() || dm_expiration) {
LCExpiration_S3 expir(expiration.get_days_str(), expiration.get_date(), dm_expiration);
const LCMPExpiration_S3& mp_expir = static_cast<const LCMPExpiration_S3&>(mp_expiration);
mp_expir.dump_xml(f);
}
+
f->close_section(); // Rule
}
};
env.sign(access_key);
RGWLoadGenIO real_client_io(&env);
- RGWRestfulIO client_io(&real_client_io);
+ RGWRestfulIO client_io(cct, &real_client_io);
int ret = process_request(store, rest, req, uri_prefix,
*auth_registry, &client_io, olog);
if (!s->is_err())
data.successful_ops = 1;
+ ldout(s->cct, 30) << "log_usage: bucket_name=" << bucket_name
+ << " tenant=" << s->bucket_tenant
+ << ", bytes_sent=" << bytes_sent << ", bytes_received="
+ << bytes_received << ", success=" << data.successful_ops << dendl;
+
entry.add(op_name, data);
utime_t ts = ceph_clock_now();
class RGWMetadataTopHandler : public RGWMetadataHandler {
struct iter_data {
- list<string> sections;
- list<string>::iterator iter;
+ set<string> sections;
+ set<string>::iterator iter;
};
public:
int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { return -ENOTSUP; }
- int list_keys_init(RGWRados *store, void **phandle) override {
+ int list_keys_init(RGWRados *store, const string& marker, void **phandle) override {
iter_data *data = new iter_data;
- store->meta_mgr->get_sections(data->sections);
- data->iter = data->sections.begin();
+ list<string> sections;
+ store->meta_mgr->get_sections(sections);
+ for (auto& s : sections) {
+ data->sections.insert(s);
+ }
+ data->iter = data->sections.lower_bound(marker);
*phandle = data;
delete data;
}
+
+ /* Return the resume position (next unlisted section name) for a paged
+  * listing, or an empty string when iteration is complete. Marked
+  * 'override' for consistency with remove()/list_keys_init() above and to
+  * let the compiler verify the signature against RGWMetadataHandler. */
+ string get_marker(void *handle) override {
+ iter_data *data = static_cast<iter_data *>(handle);
+
+ if (data->iter != data->sections.end()) {
+ return *(data->iter);
+ }
+
+ return string();
+ }
};
static RGWMetadataTopHandler md_top_handler;
RGWMetadataHandler *handler;
};
-
int RGWMetadataManager::list_keys_init(string& section, void **handle)
+{
+ return list_keys_init(section, string(), handle);
+}
+
+int RGWMetadataManager::list_keys_init(string& section, const string& marker, void **handle)
{
string entry;
RGWMetadataHandler *handler;
list_keys_handle *h = new list_keys_handle;
h->handler = handler;
- ret = handler->list_keys_init(store, &h->handle);
+ ret = handler->list_keys_init(store, marker, &h->handle);
if (ret < 0) {
delete h;
return ret;
return handler->list_keys_next(h->handle, max, keys, truncated);
}
-
void RGWMetadataManager::list_keys_complete(void *handle)
{
list_keys_handle *h = static_cast<list_keys_handle *>(handle);
delete h;
}
+// Forward to the section handler behind an opaque list_keys handle to get
+// the resume marker for a truncated listing (used by the admin metadata
+// REST API's extended response).
+string RGWMetadataManager::get_marker(void *handle)
+{
+ list_keys_handle *h = static_cast<list_keys_handle *>(handle);
+
+ return h->handler->get_marker(h->handle);
+}
+
void RGWMetadataManager::dump_log_entry(cls_log_entry& entry, Formatter *f)
{
f->open_object_section("entry");
real_time mtime, JSONObj *obj, sync_type_t type) = 0;
virtual int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) = 0;
- virtual int list_keys_init(RGWRados *store, void **phandle) = 0;
+ virtual int list_keys_init(RGWRados *store, const string& marker, void **phandle) = 0;
virtual int list_keys_next(void *handle, int max, list<string>& keys, bool *truncated) = 0;
virtual void list_keys_complete(void *handle) = 0;
+ virtual string get_marker(void *handle) = 0;
+
/* key to use for hashing entries for log shard placement */
virtual void get_hash_key(const string& section, const string& key, string& hash_key) {
hash_key = section + ":" + key;
int remove(string& metadata_key);
int list_keys_init(string& section, void **phandle);
+ int list_keys_init(string& section, const string& marker, void **phandle);
int list_keys_next(void *handle, int max, list<string>& keys, bool *truncated);
void list_keys_complete(void *handle);
+ string get_marker(void *handle);
+
void dump_log_entry(cls_log_entry& entry, Formatter *f);
void get_sections(list<string>& sections);
head_obj_op.meta.owner = s->owner.get_id();
head_obj_op.meta.delete_at = delete_at;
head_obj_op.meta.zones_trace = zones_trace;
+ head_obj_op.meta.modify_tail = true;
int r = head_obj_op.write_meta(obj_len, accounted_size, attrs);
if (r < 0)
obj_op.meta.ptag = &s->req_id; /* use req_id as operation tag */
obj_op.meta.owner = s->owner.get_id();
obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.modify_tail = true;
op_ret = obj_op.write_meta(ofs, accounted_size, attrs);
if (op_ret < 0)
return;
bool partial_content;
bool range_parsed;
bool skip_manifest;
+ bool skip_decrypt{false};
rgw_obj obj;
utime_t gc_invalidate_time;
bool is_slo;
const uint64_t rounded_added = rgw_rounded_objsize(added_bytes);
const uint64_t rounded_removed = rgw_rounded_objsize(removed_bytes);
- if ((entry->stats.size + added_bytes - removed_bytes) >= 0) {
+ if (((int64_t)(entry->stats.size + added_bytes - removed_bytes)) >= 0) {
entry->stats.size += added_bytes - removed_bytes;
} else {
entry->stats.size = 0;
}
- if ((entry->stats.size_rounded + rounded_added - rounded_removed) >= 0) {
+ if (((int64_t)(entry->stats.size_rounded + rounded_added - rounded_removed)) >= 0) {
entry->stats.size_rounded += rounded_added - rounded_removed;
} else {
entry->stats.size_rounded = 0;
}
- if ((entry->stats.num_objects + objs_delta) >= 0) {
+ if (((int64_t)(entry->stats.num_objects + objs_delta)) >= 0) {
entry->stats.num_objects += objs_delta;
} else {
entry->stats.num_objects = 0;
obj_op.meta.delete_at = delete_at;
obj_op.meta.user_data = user_data;
obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
r = obj_op.write_meta(obj_len, accounted_size, attrs);
if (r < 0) {
* Returns: 0 on success, -ERR# otherwise.
*/
int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
- map<string, bufferlist>& attrs, bool assume_noent,
+ map<string, bufferlist>& attrs,
+ bool assume_noent, bool modify_tail,
void *_index_op)
{
RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
if (!ptag && !index_op->get_optag()->empty()) {
ptag = index_op->get_optag();
}
- r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false);
+ r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
if (r < 0)
return r;
bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
int r;
if (assume_noent) {
- r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
+ r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
if (r == -EEXIST) {
assume_noent = false;
}
}
if (!assume_noent) {
- r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
+ r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
}
return r;
}
if (!attrs[RGW_ATTR_ETAG].length()) {
attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
}
+ if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
+ auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
+ if (ttiter != src_attrs.end()) {
+ attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
+ }
+ }
break;
case RGWRados::ATTRSMOD_MERGE:
for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
return ret;
attrset.erase(RGW_ATTR_ID_TAG);
+ attrset.erase(RGW_ATTR_TAIL_TAG);
uint64_t max_chunk_size;
obj_time_weight dest_mtime_weight;
+ constexpr bool prepend_meta = true;
+ constexpr bool get_op = true;
+ constexpr bool rgwx_stat = true;
+ constexpr bool sync_manifest = true;
+ constexpr bool skip_decrypt = true;
int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
- true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
- true /* sync manifest */, &cb, &in_stream_req);
+ prepend_meta, get_op, rgwx_stat,
+ sync_manifest, skip_decrypt, &cb, &in_stream_req);
if (ret < 0) {
return ret;
}
}
}
+ static constexpr bool prepend_meta = true;
+ static constexpr bool get_op = true;
+ static constexpr bool rgwx_stat = false;
+ static constexpr bool sync_manifest = true;
+ static constexpr bool skip_decrypt = true;
ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
- true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
- true /* sync manifest */, &cb, &in_stream_req);
+ prepend_meta, get_op, rgwx_stat,
+ sync_manifest, skip_decrypt, &cb, &in_stream_req);
if (ret < 0) {
goto set_err_state;
}
}
if (!copy_itself) {
+ attrs.erase(RGW_ATTR_TAIL_TAG);
manifest = astate->manifest;
const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
if (tail_placement.bucket.name.empty()) {
write_op.meta.category = category;
write_op.meta.olh_epoch = olh_epoch;
write_op.meta.delete_at = delete_at;
+ write_op.meta.modify_tail = !copy_itself;
ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
if (ret < 0) {
return 0;
}
- string tag = state->obj_tag.to_str();
+ string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
return store->gc->send_chain(chain, tag, false); // do it async
}
return -EINVAL;
}
- if (state->obj_tag.length() == 0) {// check for backward compatibility
+ string tag;
+
+ if (state->tail_tag.length() > 0) {
+ tag = state->tail_tag.c_str();
+ } else if (state->obj_tag.length() > 0) {
+ tag = state->obj_tag.c_str();
+ } else {
ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
return -EINVAL;
}
- string tag = state->obj_tag.c_str();
-
ldout(cct, 0) << "defer chain tag=" << tag << dendl;
return gc->defer_chain(tag, false);
return -ENOENT;
}
- r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true);
+ r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
if (r < 0)
return r;
s->shadow_obj[bl.length()] = '\0';
}
s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
+ auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
+ if (ttiter != s->attrset.end()) {
+ s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
+ }
bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
if (manifest_bl.length()) {
}
int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
- const char *if_match, const char *if_nomatch, bool removal_op)
+ const char *if_match, const char *if_nomatch, bool removal_op,
+ bool modify_tail)
{
int r = get_state(&state, false);
if (r < 0)
ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
op.setxattr(RGW_ATTR_ID_TAG, bl);
+ if (modify_tail) {
+ op.setxattr(RGW_ATTR_TAIL_TAG, bl);
+ }
return 0;
}
return 0;
}
+// Overload of pool_iterate_begin() that resumes a pool listing at 'cursor',
+// a string previously produced by pool_iterate_get_cursor().
+int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
+{
+ librados::IoCtx& io_ctx = ctx.io_ctx;
+ librados::NObjectIterator& iter = ctx.iter;
+
+ int r = open_pool_ctx(pool, io_ctx);
+ if (r < 0)
+ return r;
+
+ // a cursor we cannot parse is a caller error, not a transient failure
+ librados::ObjectCursor oc;
+ if (!oc.from_str(cursor)) {
+ ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
+ return -EINVAL;
+ }
+
+ iter = io_ctx.nobjects_begin(oc);
+
+ return 0;
+}
+
+string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
+{
+ return ctx.iter.get_cursor().to_str();
+}
+
int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
bool *is_truncated, RGWAccessListFilter *filter)
{
}
};
-int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
- int max, RGWListRawObjsCtx& ctx, list<string>& oids,
- bool *is_truncated)
+int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
{
- RGWAccessListFilterPrefix filter(prefix_filter);
-
- if (!ctx.initialized) {
- int r = pool_iterate_begin(pool, ctx.iter_ctx);
+ if (!ctx->initialized) {
+ int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
if (r < 0) {
ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
return r;
}
- ctx.initialized = true;
+ ctx->initialized = true;
}
+ return 0;
+}
+int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated)
+{
+ if (!ctx.initialized) {
+ return -EINVAL;
+ }
+ RGWAccessListFilterPrefix filter(prefix_filter);
vector<rgw_bucket_dir_entry> objs;
int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
if (r < 0) {
return oids.size();
}
+// Backward-compatible wrapper kept for existing callers: lazily initializes
+// the context with an empty marker (start of pool) and delegates to the new
+// init/next split so resumable callers and legacy callers share one path.
+int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
+ int max, RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated)
+{
+ if (!ctx.initialized) {
+ int r = list_raw_objects_init(pool, string(), &ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
+}
+
+// Expose the underlying pool-iteration cursor as a string marker, suitable
+// for feeding back into list_raw_objects_init() to resume a listing.
+string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
+{
+ return pool_iterate_get_cursor(ctx.iter_ctx);
+}
+
int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
std::list<rgw_bi_log_entry>& result, bool *truncated)
{
ceph::real_time mtime;
uint64_t epoch;
bufferlist obj_tag;
+ bufferlist tail_tag;
string write_tag;
bool fake_tag;
RGWObjManifest manifest;
if (rhs.obj_tag.length()) {
obj_tag = rhs.obj_tag;
}
+ if (rhs.tail_tag.length()) {
+ tail_tag = rhs.tail_tag;
+ }
write_tag = rhs.write_tag;
fake_tag = rhs.fake_tag;
if (rhs.has_manifest) {
return rgw_shards_max();
}
+
int get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref);
+ int list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx);
+ int list_raw_objects_next(const string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated);
int list_raw_objects(const rgw_pool& pool, const string& prefix_filter, int max,
RGWListRawObjsCtx& ctx, list<string>& oids,
bool *is_truncated);
+ string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
int list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result);
int list_zonegroups(list<string>& zonegroups);
void invalidate_state();
int prepare_atomic_modification(librados::ObjectWriteOperation& op, bool reset_obj, const string *ptag,
- const char *ifmatch, const char *ifnomatch, bool removal_op);
+ const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail);
int complete_atomic_modification();
public:
bool canceled;
const string *user_data;
rgw_zone_set *zones_trace;
+ bool modify_tail;
MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
remove_objs(NULL), category(RGW_OBJ_CATEGORY_MAIN), flags(0),
- if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false), user_data(nullptr), zones_trace(nullptr) {}
+ if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false), user_data(nullptr), zones_trace(nullptr),
+ modify_tail(false) {}
} meta;
explicit Write(RGWRados::Object *_target) : target(_target) {}
int _do_write_meta(uint64_t size, uint64_t accounted_size,
map<std::string, bufferlist>& attrs,
- bool assume_noent,
+ // name order kept in sync with the out-of-line definition and call
+ // sites: assume_noent first, then modify_tail (both bool, so a swap
+ // would compile silently but mislead readers)
+ bool assume_noent, bool modify_tail,
void *index_op);
int write_meta(uint64_t size, uint64_t accounted_size,
map<std::string, bufferlist>& attrs);
*/
int pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx);
+ /**
+ * Init pool iteration
+ * pool: pool to use
+ * cursor: position to start iteration
+ * ctx: context object to use for the iteration
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx);
+
+ /**
+ * Get pool iteration position
+ * ctx: context object to use for the iteration
+ * Returns: string representation of position
+ */
+ string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
+
/**
* Iterate over pool return object names, use optional filter
* ctx: iteration context, initialized with pool_iterate_begin()
static bool grants_by_type_check_perm(map<int, string>& grants_by_type, int perm, ACLGrant& grant, int check_perm)
{
- if ((perm & check_perm) == perm) {
+ if ((perm & check_perm) == check_perm) {
grants_by_type_add_one_grant(grants_by_type, check_perm, grant);
return true;
}
const real_time *mod_ptr, const real_time *unmod_ptr,
uint32_t mod_zone_id, uint64_t mod_pg_ver,
bool prepend_metadata, bool get_op, bool rgwx_stat,
- bool sync_manifest, RGWGetDataCB *cb,
+ bool sync_manifest, bool skip_decrypt, RGWGetDataCB *cb,
RGWRESTStreamRWRequest **req)
{
string url;
if (sync_manifest) {
params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "sync-manifest", ""));
}
+ if (skip_decrypt) {
+ params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "skip-decrypt", ""));
+ }
if (!obj.key.instance.empty()) {
const string& instance = obj.key.instance;
params.push_back(param_pair_t("versionId", instance));
const ceph::real_time *mod_ptr, const ceph::real_time *unmod_ptr,
uint32_t mod_zone_id, uint64_t mod_pg_ver,
bool prepend_metadata, bool get_op, bool rgwx_stat, bool sync_manifest,
- RGWGetDataCB *cb, RGWRESTStreamRWRequest **req);
+ bool skip_decrypt, RGWGetDataCB *cb, RGWRESTStreamRWRequest **req);
int complete_request(RGWRESTStreamRWRequest *req, string& etag, ceph::real_time *mtime, uint64_t *psize, map<string, string>& attrs);
int get_resource(const string& resource,
}
void RGWOp_Metadata_List::execute() {
+ string marker = s->info.args.get("marker");
+ bool max_entries_specified;
+ string max_entries_str = s->info.args.get("max-entries", &max_entries_specified);
+
+ bool extended_response = (max_entries_specified); /* for backward compatibility, if max-entries is not specified
+ we will send the old response format */
+ uint64_t max_entries = 0;
+
+ if (max_entries_specified) {
+ string err;
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ dout(5) << "Error parsing max-entries " << max_entries_str << dendl;
+ http_ret = -EINVAL;
+ return;
+ }
+ }
+
string metadata_key;
frame_metadata_key(s, metadata_key);
void *handle;
int max = 1000;
- http_ret = store->meta_mgr->list_keys_init(metadata_key, &handle);
+ http_ret = store->meta_mgr->list_keys_init(metadata_key, marker, &handle);
if (http_ret < 0) {
dout(5) << "ERROR: can't get key: " << cpp_strerror(http_ret) << dendl;
return;
}
bool truncated;
+ uint64_t count = 0;
+
+ if (extended_response) {
+ s->formatter->open_object_section("result");
+ }
s->formatter->open_array_section("keys");
+ uint64_t left;
do {
list<string> keys;
- http_ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
+ left = (max_entries_specified ? max_entries - count : max);
+ http_ret = store->meta_mgr->list_keys_next(handle, left, keys, &truncated);
if (http_ret < 0) {
dout(5) << "ERROR: lists_keys_next(): " << cpp_strerror(http_ret)
<< dendl;
for (list<string>::iterator iter = keys.begin(); iter != keys.end();
++iter) {
s->formatter->dump_string("key", *iter);
+ ++count;
}
- } while (truncated);
+ } while (truncated && left > 0);
s->formatter->close_section();
+ if (extended_response) {
+ encode_json("truncated", truncated, s->formatter);
+ encode_json("count", count, s->formatter);
+ if (truncated) {
+ encode_json("marker", store->meta_mgr->get_marker(handle), s->formatter);
+ }
+ s->formatter->close_section();
+ }
store->meta_mgr->list_keys_complete(handle);
http_ret = 0;
// all of the data from its parts. the parts will sync as separate objects
skip_manifest = s->info.args.exists(RGW_SYS_PARAM_PREFIX "sync-manifest");
+ // multisite sync requests should fetch encrypted data, along with the
+ // attributes needed to support decryption on the other zone
+ if (s->system_request) {
+ skip_decrypt = s->info.args.exists(RGW_SYS_PARAM_PREFIX "skip-decrypt");
+ }
+
return RGWGetObj_ObjStore::get_params();
}
int RGWGetObj_ObjStore_S3::get_decrypt_filter(std::unique_ptr<RGWGetDataCB> *filter, RGWGetDataCB* cb, bufferlist* manifest_bl)
{
+ if (skip_decrypt) { // bypass decryption for multisite sync requests
+ return 0;
+ }
+
int res = 0;
std::unique_ptr<BlockCrypt> block_crypt;
res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses);
list<string> sections;
list<string>::iterator sections_iter;
- list<string> result;
+
+ // decoded form of the admin metadata-list extended JSON response:
+ // the page of keys plus pagination state (marker/count/truncated)
+ struct meta_list_result {
+ list<string> keys;
+ string marker;
+ uint64_t count{0};
+ bool truncated{false};
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("keys", keys, obj);
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("count", count, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ }
+ } result;
list<string>::iterator iter;
std::unique_ptr<RGWShardedOmapCRManager> entries_index;
bool lost_lock;
bool failed;
+ string marker;
+
map<uint32_t, rgw_meta_sync_marker>& markers;
public:
rearrange_sections();
sections_iter = sections.begin();
for (; sections_iter != sections.end(); ++sections_iter) {
- yield {
- string entrypoint = string("/admin/metadata/") + *sections_iter;
- /* FIXME: need a better scaling solution here, requires streaming output */
- call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
- entrypoint, NULL, &result));
- }
- if (get_ret_status() < 0) {
- ldout(cct, 0) << "ERROR: failed to fetch metadata section: " << *sections_iter << dendl;
- yield entries_index->finish();
- yield lease_cr->go_down();
- drain_all();
- return set_cr_error(get_ret_status());
- }
- iter = result.begin();
- for (; iter != result.end(); ++iter) {
- if (!lease_cr->is_locked()) {
- lost_lock = true;
- break;
+ do {
+ yield {
+#define META_FULL_SYNC_CHUNK_SIZE "1000"
+ string entrypoint = string("/admin/metadata/") + *sections_iter;
+ rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE },
+ { "marker", result.marker.c_str() },
+ { NULL, NULL } };
+ result.keys.clear();
+ call(new RGWReadRESTResourceCR<meta_list_result >(cct, conn, sync_env->http_manager,
+ entrypoint, pairs, &result));
}
- yield; // allow entries_index consumer to make progress
-
- ldout(cct, 20) << "list metadata: section=" << *sections_iter << " key=" << *iter << dendl;
- string s = *sections_iter + ":" + *iter;
- int shard_id;
- RGWRados *store = sync_env->store;
- int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id);
- if (ret < 0) {
- ldout(cct, 0) << "ERROR: could not determine shard id for " << *sections_iter << ":" << *iter << dendl;
- ret_status = ret;
- break;
+ if (get_ret_status() < 0) {
+ ldout(cct, 0) << "ERROR: failed to fetch metadata section: " << *sections_iter << dendl;
+ yield entries_index->finish();
+ yield lease_cr->go_down();
+ drain_all();
+ return set_cr_error(get_ret_status());
}
- if (!entries_index->append(s, shard_id)) {
- break;
+ iter = result.keys.begin();
+ for (; iter != result.keys.end(); ++iter) {
+ if (!lease_cr->is_locked()) {
+ lost_lock = true;
+ break;
+ }
+ yield; // allow entries_index consumer to make progress
+
+ ldout(cct, 20) << "list metadata: section=" << *sections_iter << " key=" << *iter << dendl;
+ string s = *sections_iter + ":" + *iter;
+ int shard_id;
+ RGWRados *store = sync_env->store;
+ int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id);
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: could not determine shard id for " << *sections_iter << ":" << *iter << dendl;
+ ret_status = ret;
+ break;
+ }
+ if (!entries_index->append(s, shard_id)) {
+ break;
+ }
}
- }
+ } while (result.truncated);
}
yield {
if (!entries_index->finish()) {
#include "common/Formatter.h"
#include "common/ceph_json.h"
#include "common/RWLock.h"
+#include "common/backport14.h"
#include "rgw_rados.h"
#include "rgw_acl.h"
pool = store->get_zone_params().user_uid_pool;
}
- int list_keys_init(RGWRados *store, void **phandle) override
+ int list_keys_init(RGWRados *store, const string& marker, void **phandle) override
{
- list_keys_info *info = new list_keys_info;
+ auto info = ceph::make_unique<list_keys_info>();
info->store = store;
- *phandle = (void *)info;
+ int ret = store->list_raw_objects_init(store->get_zone_params().user_uid_pool, marker,
+ &info->ctx);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *phandle = (void *)info.release();
return 0;
}
list<string> unfiltered_keys;
- int ret = store->list_raw_objects(store->get_zone_params().user_uid_pool, no_filter,
- max, info->ctx, unfiltered_keys, truncated);
+ int ret = store->list_raw_objects_next(no_filter, max, info->ctx,
+ unfiltered_keys, truncated);
if (ret < 0 && ret != -ENOENT)
return ret;
if (ret == -ENOENT) {
list_keys_info *info = static_cast<list_keys_info *>(handle);
delete info;
}
+
+ /* Resume position for a truncated user listing, backed by the raw-object
+  * cursor of the uid pool. Marked 'override' to match list_keys_init()
+  * above and have the compiler check the signature against the pure
+  * virtual in RGWMetadataHandler. */
+ string get_marker(void *handle) override {
+ list_keys_info *info = static_cast<list_keys_info *>(handle);
+ return info->store->list_raw_objs_get_cursor(info->ctx);
+ }
};
void rgw_user_init(RGWRados *store)
endif()
endif()
else()
- option(WITH_SSE42 "build with SSE4.2" ON)
+ option(WITH_SSE42 "build with SSE4.2" OFF)
if(WITH_SSE42)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
endif()
table0_[c >> 24];
}
+#if defined(HAVE_SSE42) && defined(__GNUC__)
+#if defined(__clang__)
+#if __has_cpp_attribute(gnu::target)
+__attribute__ ((target ("sse4.2")))
+#endif
+#else // gcc supports this since 4.4
+__attribute__ ((target ("sse4.2")))
+#endif
+#endif
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
#ifdef __SSE4_2__
#ifdef __LP64__
static bool isSSE42() {
#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
uint32_t c_;
- uint32_t d_;
- __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
+ __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
return c_ & (1U << 20); // copied from CpuId.h in Folly.
#elif defined(_WIN64)
int info[4];
#endif
}
-Function ChosenExtend = Choose_Extend();
+static Function ChosenExtend = Choose_Extend();
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
return ChosenExtend(crc, buf, size);
$ rados -p rbd rm rbd_directory >/dev/null 2>&1 || true
$ rbd ls
$ rbd ls --format json
- [] (no-eol)
+ []
$ rbd ls --format xml
- <images></images> (no-eol)
+ <images></images>
create
=======
-p [ --pool ] arg pool name
--image arg image name
--snap arg snapshot name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help clone
--snap arg snapshot name
--from-snap arg snapshot starting point
--whole-object compare whole object
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help disk-usage
-p [ --pool ] arg pool name
--image arg image name
--snap arg snapshot name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
--from-snap arg snapshot starting point
Optional arguments
-p [ --pool ] arg pool name
--image arg image name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help image-meta remove
--image arg image name
--snap arg snapshot name
--image-id arg image id
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help journal client disconnect
-p [ --pool ] arg pool name
--image arg image name
--journal arg journal name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help journal inspect
-p [ --pool ] arg pool name
--image arg image name
--journal arg journal name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help list
Optional arguments
-l [ --long ] long listing format
-p [ --pool ] arg pool name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help lock add
Optional arguments
-p [ --pool ] arg pool name
--image arg image name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help lock remove
Optional arguments
-p [ --pool ] arg pool name
--image arg image name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help mirror pool demote
Optional arguments
-p [ --pool ] arg pool name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help mirror pool peer add
Optional arguments
-p [ --pool ] arg pool name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
--verbose be verbose
Show the rbd images mapped by the kernel.
Optional arguments
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help snap create
-p [ --pool ] arg pool name
--image arg image name
--image-id arg image id
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help snap protect
Optional arguments
-p [ --pool ] arg pool name
--image arg image name
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help trash list
-p [ --pool ] arg pool name
-a [ --all ] list images from all sources
-l [ --long ] long listing format
- --format arg output format [plain, json, or xml]
+ --format arg output format (plain, json, or xml) [default: plain]
--pretty-format pretty formatting (json and xml)
rbd help trash move
uint64_t all_features = 0;
ASSERT_EQ(0, get_all_features(&ioctx, oid, &all_features));
- ASSERT_EQ(RBD_FEATURES_ALL, all_features);
+ ASSERT_EQ(static_cast<uint64_t>(RBD_FEATURES_ALL),
+ static_cast<uint64_t>(all_features & RBD_FEATURES_ALL));
ioctx.close();
}
class Item : public LRUObject {
public:
int id;
- explicit Item(int v) : id(v) {}
+ Item() : id(0) {}
+ Item(int i) : id(i) {}
+ void set(int i) {id = i;}
};
TEST(lru, InsertTop) {
- LRU lru = LRU(10);
-
- lru.lru_set_midpoint(.5); // 50% of 10 elements.
- for (int i=0; i<100; i++) {
- lru.lru_insert_top(new Item(i));
+ LRU lru;
+ static const int n = 100;
+ Item items[n];
+
+ lru.lru_set_midpoint(.5); // 50% of elements.
+ for (int i=0; i<n; i++) {
+ items[i].set(i);
+ lru.lru_insert_top(&items[i]);
}
- ASSERT_EQ(5U, lru.lru_get_top());
- ASSERT_EQ(95U, lru.lru_get_bot());
+ ASSERT_EQ(50U, lru.lru_get_top());
+ ASSERT_EQ(50U, lru.lru_get_bot());
ASSERT_EQ(100U, lru.lru_get_size());
ASSERT_EQ(0, (static_cast<Item*>(lru.lru_expire()))->id);
}
TEST(lru, InsertMid) {
- LRU lru = LRU(10);
-
- for (int i=0; i<100; i++) {
- lru.lru_insert_mid(new Item(i));
+ LRU lru;
+ static const int n = 102;
+ Item items[n];
+
+ lru.lru_set_midpoint(.7); // 70% of elements.
+ for (int i=0; i<n; i++) {
+ items[i].set(i);
+ lru.lru_insert_mid(&items[i]);
}
- ASSERT_EQ(0U, lru.lru_get_top());
- ASSERT_EQ(100U, lru.lru_get_bot());
- ASSERT_EQ(100U, lru.lru_get_size());
+ ASSERT_EQ(71U, lru.lru_get_top());
+ ASSERT_EQ(31U, lru.lru_get_bot());
+ ASSERT_EQ(102U, lru.lru_get_size());
ASSERT_EQ(0, (static_cast<Item*>(lru.lru_expire()))->id);
}
TEST(lru, InsertBot) {
- LRU lru = LRU(10);
-
- for (int i=0; i<100; i++) {
- lru.lru_insert_bot(new Item(i));
+ LRU lru;
+ static const int n = 100;
+ Item items[n];
+
+ lru.lru_set_midpoint(.7); // 70% of elements.
+ for (int i=0; i<n; i++) {
+ items[i].set(i);
+ lru.lru_insert_bot(&items[i]);
}
- ASSERT_EQ(0U, lru.lru_get_top());
- ASSERT_EQ(100U, lru.lru_get_bot());
+ ASSERT_EQ(70U, lru.lru_get_top());
+ ASSERT_EQ(30U, lru.lru_get_bot());
ASSERT_EQ(100U, lru.lru_get_size());
ASSERT_EQ(99, (static_cast<Item*>(lru.lru_expire()))->id);
}
TEST(lru, Adjust) {
- LRU lru = LRU(10);
-
- lru.lru_set_midpoint(.6); // 60% of 10 elements.
- for (int i=0; i<100; i++) {
- lru.lru_touch(new Item(i));
+ LRU lru;
+ static const int n = 100;
+ Item items[n];
+
+ lru.lru_set_midpoint(.6); // 60% of elements.
+ for (int i=0; i<n; i++) {
+ items[i].set(i);
+ lru.lru_insert_top(&items[i]);
+ if (i % 5 == 0)
+ items[i].lru_pin();
}
- ASSERT_EQ(6U, lru.lru_get_top());
- ASSERT_EQ(94U, lru.lru_get_bot());
+ ASSERT_EQ(48U, lru.lru_get_top()); /* 60% of unpinned */
+ ASSERT_EQ(52U, lru.lru_get_bot());
ASSERT_EQ(100U, lru.lru_get_size());
- lru.lru_clear();
-
- lru.lru_set_midpoint(1.2); // 120% of 10 elements.
- for (int i=0; i<100; i++) {
- lru.lru_touch(new Item(i));
- }
- ASSERT_EQ(12U, lru.lru_get_top());
- ASSERT_EQ(88U, lru.lru_get_bot());
- ASSERT_EQ(100U, lru.lru_get_size());
+ ASSERT_EQ(1, (static_cast<Item*>(lru.lru_expire()))->id);
+ ASSERT_EQ(1U, lru.lru_get_pintail());
+ ASSERT_EQ(47U, lru.lru_get_top()); /* 60% of unpinned */
+ ASSERT_EQ(51U, lru.lru_get_bot());
+ ASSERT_EQ(99U, lru.lru_get_size());
+ ASSERT_EQ(2, (static_cast<Item*>(lru.lru_expire()))->id);
+ ASSERT_EQ(1U, lru.lru_get_pintail());
+ ASSERT_EQ(46U, lru.lru_get_top()); /* 60% of unpinned */
+ ASSERT_EQ(51U, lru.lru_get_bot());
+ ASSERT_EQ(98U, lru.lru_get_size());
+ ASSERT_EQ(3, (static_cast<Item*>(lru.lru_expire()))->id);
+ ASSERT_EQ(4, (static_cast<Item*>(lru.lru_expire()))->id);
+ ASSERT_EQ(6, (static_cast<Item*>(lru.lru_expire()))->id);
+ ASSERT_EQ(2U, lru.lru_get_pintail());
+ ASSERT_EQ(45U, lru.lru_get_top()); /* 60% of unpinned */
+ ASSERT_EQ(48U, lru.lru_get_bot());
+ ASSERT_EQ(95U, lru.lru_get_size());
}
TEST(lru, Pinning) {
- LRU lru = LRU();
+ LRU lru;
- Item *ob0 = new Item(0);
- Item *ob1 = new Item(1);
+ Item ob0(0), ob1(1);
// test before ob1 are in a LRU
- ob1->lru_pin();
- ASSERT_FALSE(ob1->lru_is_expireable());
+ ob1.lru_pin();
+ ASSERT_FALSE(ob1.lru_is_expireable());
- ob1->lru_unpin();
- ASSERT_TRUE(ob1->lru_is_expireable());
+ ob1.lru_unpin();
+ ASSERT_TRUE(ob1.lru_is_expireable());
// test when ob1 are in a LRU
- lru.lru_touch(ob0);
- lru.lru_touch(ob1);
+ lru.lru_insert_top(&ob0);
+ lru.lru_insert_top(&ob1);
- ob1->lru_pin();
- ob1->lru_pin(); // Verify that, one incr.
+ ob1.lru_pin();
+ ob1.lru_pin(); // Verify that, one incr.
ASSERT_EQ(1U, lru.lru_get_num_pinned());
- ASSERT_FALSE(ob1->lru_is_expireable());
+ ASSERT_FALSE(ob1.lru_is_expireable());
- ob1->lru_unpin();
- ob1->lru_unpin(); // Verify that, one decr.
+ ob1.lru_unpin();
+ ob1.lru_unpin(); // Verify that, one decr.
ASSERT_EQ(0U, lru.lru_get_num_pinned());
- ASSERT_TRUE(ob1->lru_is_expireable());
+ ASSERT_TRUE(ob1.lru_is_expireable());
ASSERT_EQ(0, (static_cast<Item*>(lru.lru_expire()))->id);
- ob0->lru_pin();
+ ob0.lru_pin();
ASSERT_EQ(1, (static_cast<Item*>(lru.lru_expire()))->id);
}
Fh *fh;
Inode *in;
struct ceph_statx stx;
- const mode_t after_mode = S_IRWXU | S_IRWXG;
- const mode_t before_mode = S_IRWXU | S_IRWXG | S_ISUID | S_ISGID;
+ const mode_t after_mode = S_IRWXU;
+ const mode_t before_mode = S_IRWXU | S_ISUID | S_ISGID;
const unsigned want = CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_MODE;
UserPerm *usercred = ceph_mount_perms(cmount);
ASSERT_TRUE(stx.stx_mask & CEPH_STATX_MODE);
ASSERT_EQ(stx.stx_mode & (mode_t)ALLPERMS, after_mode);
+ /* test chown with supplementary groups, and chown with/without exe bit */
+ uid_t u = 65534;
+ gid_t g = 65534;
+ gid_t gids[] = {65533,65532};
+ UserPerm *altcred = ceph_userperm_new(u, g, sizeof gids / sizeof gids[0], gids);
+ stx.stx_uid = u;
+ stx.stx_gid = g;
+ mode_t m = S_ISGID|S_ISUID|S_IRUSR|S_IWUSR;
+ stx.stx_mode = m;
+ ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_STATX_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID, rootcred), 0);
+ ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+ ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m);
+ /* not dropped without exe bit */
+ stx.stx_gid = gids[0];
+ ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_SETATTR_GID, altcred), 0);
+ ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+ ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m);
+ /* now check dropped with exe bit */
+ m = S_ISGID|S_ISUID|S_IRWXU;
+ stx.stx_mode = m;
+ ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_STATX_MODE, altcred), 0);
+ ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+ ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m);
+ stx.stx_gid = gids[1];
+ ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_SETATTR_GID, altcred), 0);
+ ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+ ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m&(S_IRWXU|S_IRWXG|S_IRWXO));
+ ceph_userperm_destroy(altcred);
+
ASSERT_EQ(ceph_ll_close(cmount, fh), 0);
ceph_shutdown(cmount);
}
REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
CephContext* cct = reinterpret_cast<CephContext*>(_rados.cct());
- REQUIRE(!cct->_conf->rbd_skip_partial_discard);
+ REQUIRE(!cct->_conf->get_val<bool>("rbd_skip_partial_discard"));
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
C_SaferCond cond_ctx;
auto c = librbd::io::AioCompletion::create(&cond_ctx);
c->get();
- ictx->io_work_queue->aio_discard(c, 123, 234, cct->_conf->rbd_skip_partial_discard);
+ ictx->io_work_queue->aio_discard(c, 123, 234, ictx->skip_partial_discard);
ASSERT_EQ(0, c->wait_for_complete());
c->put();
// inject a discard operation into the journal
inject_into_journal(ictx,
- librbd::journal::AioDiscardEvent(0, payload.size(), ictx->skip_partial_discard));
+ librbd::journal::AioDiscardEvent(0, payload.size(),
+ ictx->skip_partial_discard));
close_image(ictx);
// re-open the journal so that it replays the new entry
librbd::io::ReadResult{read_result}, 0);
ASSERT_EQ(0, aio_comp->wait_for_complete());
aio_comp->release();
- if (ictx->cct->_conf->rbd_skip_partial_discard) {
+ if (ictx->skip_partial_discard) {
ASSERT_EQ(payload, read_payload);
} else {
ASSERT_EQ(std::string(read_payload.size(), '\0'), read_payload);
// replay several envents and check the commit position
inject_into_journal(ictx,
- librbd::journal::AioDiscardEvent(0, payload.size(), ictx->cct->_conf->rbd_skip_partial_discard));
+ librbd::journal::AioDiscardEvent(0, payload.size(),
+ ictx->skip_partial_discard));
inject_into_journal(ictx,
- librbd::journal::AioDiscardEvent(0, payload.size(), ictx->cct->_conf->rbd_skip_partial_discard));
+ librbd::journal::AioDiscardEvent(0, payload.size(),
+ ictx->skip_partial_discard));
close_image(ictx);
ASSERT_EQ(0, open_image(m_image_name, &ictx));
// verify lock ordering constraints
aio_comp = new librbd::io::AioCompletion();
- ictx->io_work_queue->aio_discard(aio_comp, 0, read_payload.size(), ictx->cct->_conf->rbd_skip_partial_discard);
+ ictx->io_work_queue->aio_discard(aio_comp, 0, read_payload.size(),
+ ictx->skip_partial_discard);
ASSERT_EQ(0, aio_comp->wait_for_complete());
aio_comp->release();
}
uint32_t blacklist_expire_seconds,
bool force_break_lock, Context *on_finish) {
CephContext *cct = reinterpret_cast<CephContext *>(ioctx.cct());
- EXPECT_EQ(cct->_conf->rbd_blacklist_on_break_lock, blacklist_locker);
- EXPECT_EQ(cct->_conf->rbd_blacklist_expire_seconds,
+ EXPECT_EQ(cct->_conf->get_val<bool>("rbd_blacklist_on_break_lock"),
+ blacklist_locker);
+ EXPECT_EQ(cct->_conf->get_val<int64_t>("rbd_blacklist_expire_seconds"),
(int)blacklist_expire_seconds);
EXPECT_FALSE(force_break_lock);
assert(s_instance != nullptr);
mirroring_resync_after_disconnect(
image_ctx.mirroring_resync_after_disconnect),
mirroring_replay_delay(image_ctx.mirroring_replay_delay),
- non_blocking_aio(image_ctx.non_blocking_aio)
+ non_blocking_aio(image_ctx.non_blocking_aio),
+ blkin_trace_all(image_ctx.blkin_trace_all)
{
md_ctx.dup(image_ctx.md_ctx);
data_ctx.dup(image_ctx.data_ctx);
bool mirroring_resync_after_disconnect;
int mirroring_replay_delay;
bool non_blocking_aio;
+ bool blkin_trace_all;
};
} // namespace librbd
} // anonymous namespace
using ::testing::_;
+using ::testing::AtLeast;
using ::testing::Invoke;
using ::testing::StrEq;
using ::testing::WithArg;
};
TEST_F(TestMirroringWatcher, ModeUpdated) {
- EXPECT_CALL(*m_image_watcher, handle_mode_updated(cls::rbd::MIRROR_MODE_DISABLED));
+ EXPECT_CALL(*m_image_watcher,
+ handle_mode_updated(cls::rbd::MIRROR_MODE_DISABLED))
+ .Times(AtLeast(1));
C_SaferCond ctx;
- MockMirroringWatcher::notify_mode_updated(m_ioctx, cls::rbd::MIRROR_MODE_DISABLED, &ctx);
+ MockMirroringWatcher::notify_mode_updated(
+ m_ioctx, cls::rbd::MIRROR_MODE_DISABLED, &ctx);
ASSERT_EQ(0, ctx.wait());
}
EXPECT_CALL(*m_image_watcher,
handle_image_updated(cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
StrEq("image id"),
- StrEq("global image id")));
+ StrEq("global image id")))
+ .Times(AtLeast(1));
C_SaferCond ctx;
- MockMirroringWatcher::notify_image_updated(m_ioctx,
- cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
- "image id", "global image id", &ctx);
+ MockMirroringWatcher::notify_image_updated(
+ m_ioctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED, "image id",
+ "global image id", &ctx);
ASSERT_EQ(0, ctx.wait());
}
REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
CephContext* cct = reinterpret_cast<CephContext*>(_rados.cct());
- REQUIRE(!cct->_conf->rbd_skip_partial_discard);
+ REQUIRE(!cct->_conf->get_val<bool>("rbd_skip_partial_discard"));
m_image_name = get_temp_image_name();
m_image_size = 1 << 14;
unsigned col = 0;
ASSERT_EQ(stringify(si_t(sum.num_bytes)), tbl.get(0, col++));
ASSERT_EQ(percentify(used_percent), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(avail)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
ASSERT_EQ(stringify(sum.num_objects), tbl.get(0, col++));
ASSERT_EQ(stringify(si_t(sum.num_objects_dirty)), tbl.get(0, col++));
ASSERT_EQ(stringify(si_t(sum.num_rd)), tbl.get(0, col++));
unsigned col = 0;
ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
ASSERT_EQ(percentify(0), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(avail)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
ASSERT_EQ(stringify(0), tbl.get(0, col++));
ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
unsigned col = 0;
ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
ASSERT_EQ(percentify(0), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(avail)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
ASSERT_EQ(stringify(0), tbl.get(0, col++));
}
}
-class PGLogMergeDupsTest : public ::testing::Test, protected PGLog {
+class PGLogMergeDupsTest : protected PGLog, public StoreTestFixture {
public:
- PGLogMergeDupsTest() : PGLog(g_ceph_context) { }
+ PGLogMergeDupsTest() : PGLog(g_ceph_context), StoreTestFixture("memstore") { }
- void SetUp() override { }
+ void SetUp() override {
+ StoreTestFixture::SetUp();
+ ObjectStore::Sequencer osr(__func__);
+ ObjectStore::Transaction t;
+ test_coll = coll_t(spg_t(pg_t(1, 1)));
+ t.create_collection(test_coll, 0);
+ store->apply_transaction(&osr, std::move(t));
+ }
void TearDown() override {
+ test_disk_roundtrip();
clear();
+ StoreTestFixture::TearDown();
}
static pg_log_dup_t create_dup_entry(uint a, uint b) {
void add_dups(uint a, uint b) {
log.dups.push_back(create_dup_entry(a, b));
+ write_from_dups = MIN(write_from_dups, log.dups.back().version);
}
void add_dups(const std::vector<pg_log_dup_t>& l) {
for (auto& i : l) {
log.dups.push_back(i);
+ write_from_dups = MIN(write_from_dups, log.dups.back().version);
}
}
EXPECT_EQ(1u, log.dup_index.count(i.reqid));
}
}
+
+ void test_disk_roundtrip() {
+ ObjectStore::Sequencer osr(__func__);
+ ObjectStore::Transaction t;
+ hobject_t hoid;
+ hoid.pool = 1;
+ hoid.oid = "log";
+ ghobject_t log_oid(hoid);
+ map<string, bufferlist> km;
+ write_log_and_missing(t, &km, test_coll, log_oid, false);
+ if (!km.empty()) {
+ t.omap_setkeys(test_coll, log_oid, km);
+ }
+ ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+
+ auto orig_dups = log.dups;
+ clear();
+ ostringstream err;
+ read_log_and_missing(store.get(), test_coll, test_coll, log_oid,
+ pg_info_t(), false, err, false);
+ ASSERT_EQ(orig_dups.size(), log.dups.size());
+ ASSERT_EQ(orig_dups, log.dups);
+ auto dups_it = log.dups.begin();
+ for (auto orig_dup : orig_dups) {
+ ASSERT_EQ(orig_dup, *dups_it);
+ ++dups_it;
+ }
+ }
+
+ coll_t test_coll;
};
TEST_F(PGLogMergeDupsTest, OtherEmpty) {
public PGLogTestBase,
public PGLog::IndexedLog
{
- std::list<hobject_t*> test_hobjects;
- CephContext *cct;
-
- void SetUp() override {
- cct = (new CephContext(CEPH_ENTITY_TYPE_OSD))->get();
-
- hobject_t::generate_test_instances(test_hobjects);
- }
+ CephContext *cct = g_ceph_context;
void SetUp(unsigned min_entries, unsigned max_entries, unsigned dup_track) {
constexpr size_t size = 10;
cct->_conf->set_val_or_die("osd_min_pg_log_entries", min_entries_s);
cct->_conf->set_val_or_die("osd_max_pg_log_entries", max_entries_s);
cct->_conf->set_val_or_die("osd_pg_log_dups_tracked", dup_track_s);
-}
-
- void TearDown() override {
- while (!test_hobjects.empty()) {
- delete test_hobjects.front();
- test_hobjects.pop_front();
- }
-
- cct->put();
}
}; // struct PGLogTrimTest
-# if 0
-TEST_F(PGLogTest, Trim1) {
- TestCase t;
-
- t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
- t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
- t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 155), mk_evt(15, 150)));
- t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
- t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(26, 160)));
- t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(31, 171)));
-
- t.setup();
-}
-#endif
-
-
TEST_F(PGLogTrimTest, TestMakingCephContext)
{
SetUp(1, 2, 5);
std::set<eversion_t> trimmed;
std::set<std::string> trimmed_dups;
- bool dirty_dups = false;
+ eversion_t write_from_dups = eversion_t::max();
- log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+ log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &write_from_dups);
- EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(eversion_t(15, 150), write_from_dups);
EXPECT_EQ(3u, log.log.size());
EXPECT_EQ(3u, trimmed.size());
EXPECT_EQ(2u, log.dups.size());
std::set<eversion_t> trimmed2;
std::set<std::string> trimmed_dups2;
- bool dirty_dups2 = false;
-
- log.trim(cct, mk_evt(20, 164), &trimmed2, &trimmed_dups2, &dirty_dups2);
+ eversion_t write_from_dups2 = eversion_t::max();
- EXPECT_EQ(true, dirty_dups2);
+ log.trim(cct, mk_evt(20, 164), &trimmed2, &trimmed_dups2, &write_from_dups2);
+
+ EXPECT_EQ(eversion_t(19, 160), write_from_dups2);
EXPECT_EQ(2u, log.log.size());
EXPECT_EQ(1u, trimmed2.size());
EXPECT_EQ(2u, log.dups.size());
log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
- bool dirty_dups = false;
+ eversion_t write_from_dups = eversion_t::max();
- log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+ log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &write_from_dups);
- EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(eversion_t(15, 150), write_from_dups);
EXPECT_EQ(3u, log.log.size());
EXPECT_EQ(2u, log.dups.size());
}
std::set<eversion_t> trimmed;
std::set<std::string> trimmed_dups;
- bool dirty_dups = false;
+ eversion_t write_from_dups = eversion_t::max();
- log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+ log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &write_from_dups);
- EXPECT_FALSE(dirty_dups);
+ EXPECT_EQ(eversion_t::max(), write_from_dups);
EXPECT_EQ(3u, log.log.size());
EXPECT_EQ(3u, trimmed.size());
EXPECT_EQ(0u, log.dups.size());
std::set<eversion_t> trimmed;
std::set<std::string> trimmed_dups;
- bool dirty_dups = false;
+ eversion_t write_from_dups = eversion_t::max();
- log.trim(cct, mk_evt(9, 99), &trimmed, &trimmed_dups, &dirty_dups);
+ log.trim(cct, mk_evt(9, 99), &trimmed, &trimmed_dups, &write_from_dups);
- EXPECT_FALSE(dirty_dups);
+ EXPECT_EQ(eversion_t::max(), write_from_dups);
EXPECT_EQ(6u, log.log.size());
EXPECT_EQ(0u, trimmed.size());
EXPECT_EQ(0u, log.dups.size());
std::set<eversion_t> trimmed;
std::set<std::string> trimmed_dups;
- bool dirty_dups = false;
+ eversion_t write_from_dups = eversion_t::max();
- log.trim(cct, mk_evt(22, 180), &trimmed, &trimmed_dups, &dirty_dups);
+ log.trim(cct, mk_evt(22, 180), &trimmed, &trimmed_dups, &write_from_dups);
- EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(eversion_t(15, 150), write_from_dups);
EXPECT_EQ(0u, log.log.size());
EXPECT_EQ(6u, trimmed.size());
EXPECT_EQ(5u, log.dups.size());
log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166),
osd_reqid_t(client, 8, 6)));
- bool dirty_dups = false;
+ eversion_t write_from_dups = eversion_t::max();
- log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+ log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &write_from_dups);
- EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(eversion_t(15, 150), write_from_dups);
EXPECT_EQ(3u, log.log.size());
EXPECT_EQ(2u, log.dups.size());
}
}
+
// Local Variables:
// compile-command: "cd ../.. ; make unittest_pglog ; ./unittest_pglog --log-to-stderr=true --debug-osd=20 # --gtest_filter=*.* "
// End:
EXPECT_EQ(0, _rados->conf_set("rbd_mirror_leader_max_missed_heartbeats",
"1"));
CephContext *cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
- int max_acquire_attempts =
- cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break;
+ int max_acquire_attempts = cct->_conf->get_val<int64_t>(
+ "rbd_mirror_leader_max_acquire_attempts_before_break");
MockManagedLock mock_managed_lock;
MockMirrorStatusWatcher mock_mirror_status_watcher;
for bucket_name in buckets:
zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name)
+
+def test_encrypted_object_sync():
+ zonegroup = realm.master_zonegroup()
+ zonegroup_conns = ZonegroupConns(zonegroup)
+
+ (zone1, zone2) = zonegroup_conns.rw_zones[0:2]
+
+ # create a bucket on the first zone
+ bucket_name = gen_bucket_name()
+ log.info('create bucket zone=%s name=%s', zone1.name, bucket_name)
+ bucket = zone1.conn.create_bucket(bucket_name)
+
+ # upload an object with sse-c encryption
+ sse_c_headers = {
+ 'x-amz-server-side-encryption-customer-algorithm': 'AES256',
+ 'x-amz-server-side-encryption-customer-key': 'pO3upElrwuEXSoFwCfnZPdSsmt/xWeFa0N9KgDijwVs=',
+ 'x-amz-server-side-encryption-customer-key-md5': 'DWygnHRtgiJ77HCm+1rvHw=='
+ }
+ key = bucket.new_key('testobj-sse-c')
+ data = 'A'*512
+ key.set_contents_from_string(data, headers=sse_c_headers)
+
+ # upload an object with sse-kms encryption
+ sse_kms_headers = {
+ 'x-amz-server-side-encryption': 'aws:kms',
+ # testkey-1 must be present in 'rgw crypt s3 kms encryption keys' (vstart.sh adds this)
+ 'x-amz-server-side-encryption-aws-kms-key-id': 'testkey-1',
+ }
+ key = bucket.new_key('testobj-sse-kms')
+ key.set_contents_from_string(data, headers=sse_kms_headers)
+
+ # wait for the bucket metadata and data to sync
+ zonegroup_meta_checkpoint(zonegroup)
+ zone_bucket_checkpoint(zone2.zone, zone1.zone, bucket_name)
+
+ # read the encrypted objects from the second zone
+ bucket2 = get_bucket(zone2, bucket_name)
+ key = bucket2.get_key('testobj-sse-c', headers=sse_c_headers)
+ eq(data, key.get_contents_as_string(headers=sse_c_headers))
+
+ key = bucket2.get_key('testobj-sse-kms')
+ eq(data, key.get_contents_as_string())
if (!divergent.empty()) {
assert(missing.get_items().empty());
PGLog::write_log_and_missing_wo_missing(
- t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true, true);
+ t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true);
} else {
pg_missing_tracker_t tmissing(missing);
bool rebuilt_missing_set_with_deletes = missing.may_include_deletes;
PGLog::write_log_and_missing(
- t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true, true,
+ t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true,
&rebuilt_missing_set_with_deletes);
}
t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
ObjectStore::Transaction t;
int bits = src->collection_bits(cid);
if (bits < 0) {
- cerr << "cannot get bit count for collection " << cid << ": "
- << cpp_strerror(bits) << std::endl;
- goto out;
+ if (src->get_type() == "filestore" && cid.is_meta()) {
+ bits = 0;
+ } else {
+ cerr << "cannot get bit count for collection " << cid << ": "
+ << cpp_strerror(bits) << std::endl;
+ goto out;
+ }
}
t.create_collection(cid, bits);
dst->apply_transaction(&osr, std::move(t));
void add_format_options(boost::program_options::options_description *opt) {
opt->add_options()
- (FORMAT.c_str(), po::value<Format>(), "output format [plain, json, or xml]")
+ (FORMAT.c_str(), po::value<Format>(), "output format (plain, json, or xml) [default: plain]")
(PRETTY_FORMAT.c_str(), po::bool_switch(),
"pretty formatting (json and xml)");
}
int extract_spec(const std::string &spec, std::string *pool_name,
std::string *image_name, std::string *snap_name,
SpecValidation spec_validation) {
- if (!g_ceph_context->_conf->rbd_validate_names) {
+ if (!g_ceph_context->_conf->get_val<bool>("rbd_validate_names")) {
spec_validation = SPEC_VALIDATION_NONE;
}
}
std::string get_default_pool_name() {
- return g_ceph_context->_conf->rbd_default_pool;
+ return g_ceph_context->_conf->get_val<std::string>("rbd_default_pool");
}
std::string get_pool_name(const po::variables_map &vm, size_t *arg_index) {
std::cerr << "rbd: --pretty-format only works when --format "
<< "is json or xml" << std::endl;
return -EINVAL;
+ } else if (*formatter != nullptr && !pretty) {
+ formatter->get()->enable_line_break();
}
+ } else if (vm[at::PRETTY_FORMAT].as<bool>()) {
+ std::cerr << "rbd: --pretty-format only works when --format "
+ << "is json or xml" << std::endl;
+ return -EINVAL;
}
return 0;
}
from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
formatter.get());
if (r < 0) {
- std::cerr << "du failed: " << cpp_strerror(r) << std::endl;
+ std::cerr << "rbd: du failed: " << cpp_strerror(r) << std::endl;
return r;
}
return 0;
}
}
ExportDiffContext edc(&image, fd, info.size,
- g_conf->rbd_concurrent_management_ops, no_progress,
- export_format);
+ g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+ no_progress, export_format);
r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
&C_ExportDiff::export_diff_cb, (void *)&edc);
if (r < 0) {
fd = STDOUT_FILENO;
max_concurrent_ops = 1;
} else {
- max_concurrent_ops = g_conf->rbd_concurrent_management_ops;
+ max_concurrent_ops = g_conf->get_val<int64_t>("rbd_concurrent_management_ops");
fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
if (fd < 0) {
return -errno;
return 0;
}
+const uint32_t MAX_KEYS = 64;
+
} // anonymous namespace
static int do_metadata_list(librbd::Image& image, Formatter *f)
{
- std::map<std::string, bufferlist> pairs;
int r;
TextTable tbl;
- r = image.metadata_list("", 0, &pairs);
- if (r < 0) {
- std::cerr << "failed to list metadata of image : " << cpp_strerror(r)
- << std::endl;
- return r;
- }
-
- if (f) {
- f->open_object_section("metadatas");
- } else {
- tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
- }
+ size_t count = 0;
+ std::string last_key;
+ bool more_results = true;
+ while (more_results) {
+ std::map<std::string, bufferlist> pairs;
+ r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+ if (r < 0) {
+ std::cerr << "failed to list metadata of image : " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
- if (!pairs.empty()) {
- bool one = (pairs.size() == 1);
+ more_results = (pairs.size() == MAX_KEYS);
+ if (!pairs.empty()) {
+ if (count == 0) {
+ if (f) {
+ f->open_object_section("metadatas");
+ } else {
+ tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+ }
+ }
- if (!f) {
- std::cout << "There " << (one ? "is " : "are ") << pairs.size()
- << " metadata" << (one ? "" : "s") << " on this image.\n";
- }
+ last_key = pairs.rbegin()->first;
+ count += pairs.size();
- for (std::map<std::string, bufferlist>::iterator it = pairs.begin();
- it != pairs.end(); ++it) {
- std::string val(it->second.c_str(), it->second.length());
- if (f) {
- f->dump_string(it->first.c_str(), val.c_str());
- } else {
- tbl << it->first << val.c_str() << TextTable::endrow;
+ for (auto kv : pairs) {
+ std::string val(kv.second.c_str(), kv.second.length());
+ if (f) {
+ f->dump_string(kv.first.c_str(), val.c_str());
+ } else {
+ tbl << kv.first << val << TextTable::endrow;
+ }
}
}
- if (!f)
- std::cout << tbl;
}
- if (f) {
- f->close_section();
- f->flush(std::cout);
+ if (f == nullptr) {
+ bool single = (count == 1);
+ std::cout << "There " << (single ? "is" : "are") << " " << count << " "
+ << (single ? "metadatum" : "metadata") << " on this image"
+ << (count == 0 ? "." : ":") << std::endl;
+ }
+
+ if (count > 0) {
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << std::endl << tbl;
+ }
}
return 0;
}
ImportDiffContext(librbd::Image *image, int fd, size_t size, bool no_progress)
: image(image), fd(fd), size(size), pc("Importing image diff", no_progress),
throttle((fd == STDIN_FILENO) ? 1 :
- g_conf->rbd_concurrent_management_ops, false), last_offset(0) {
+ g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+ false),
+ last_offset(0) {
}
void update_size(size_t new_size)
throttle.reset(new SimpleThrottle(1, false));
} else {
throttle.reset(new SimpleThrottle(
- g_conf->rbd_concurrent_management_ops, false));
+ g_conf->get_val<int64_t>("rbd_concurrent_management_ops"), false));
}
reqlen = min<uint64_t>(reqlen, size);
uint64_t order;
if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
- order = g_conf->rbd_default_order;
+ order = g_conf->get_val<int64_t>("rbd_default_order");
}
// try to fill whole imgblklen blocks for sparsification
}
// parse default options first so they can be overwritten by cli options
- char *default_map_options = strdup(g_conf->rbd_default_map_options.c_str());
+ char *default_map_options = strdup(g_conf->get_val<std::string>(
+ "rbd_default_map_options").c_str());
BOOST_SCOPE_EXIT( (default_map_options) ) {
free(default_map_options);
} BOOST_SCOPE_EXIT_END;
return r;
}
- r = do_list(pool_name, vm["long"].as<bool>(), g_conf->rbd_concurrent_management_ops, formatter.get());
+ r = do_list(pool_name, vm["long"].as<bool>(),
+ g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+ formatter.get());
if (r < 0) {
std::cerr << "rbd: list: " << cpp_strerror(r) << std::endl;
return r;
m_factory(std::bind(ImageRequestAllocator<RequestT>(),
std::ref(m_io_ctx), std::ref(m_throttle),
std::placeholders::_1, std::forward<Args>(args)...)),
- m_throttle(g_conf->rbd_concurrent_management_ops, true) {
+ m_throttle(g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+ true) {
}
int execute() {
m_failed_timer_lock(timer_lock),
m_asok_hook(new ImageDeleterAdminSocketHook<I>(g_ceph_context, this))
{
- set_failed_timer_interval(g_ceph_context->_conf->rbd_mirror_delete_retry_interval);
+ set_failed_timer_interval(g_ceph_context->_conf->get_val<double>(
+ "rbd_mirror_delete_retry_interval"));
m_image_deleter_thread.create("image_deleter");
}
CephContext *cct = static_cast<CephContext *>(m_local->cct());
journal::Settings settings;
- settings.commit_interval = cct->_conf->rbd_mirror_journal_commit_age;
- settings.max_fetch_bytes = cct->_conf->rbd_mirror_journal_max_fetch_bytes;
+ settings.commit_interval = cct->_conf->get_val<double>(
+ "rbd_mirror_journal_commit_age");
+ settings.max_fetch_bytes = cct->_conf->get_val<uint64_t>(
+ "rbd_mirror_journal_max_fetch_bytes");
m_remote_journaler = new Journaler(m_threads->work_queue,
m_threads->timer,
{
CephContext *cct = static_cast<CephContext *>(m_local->cct());
- double poll_seconds = cct->_conf->rbd_mirror_journal_poll_age;
+ double poll_seconds = cct->_conf->get_val<double>(
+ "rbd_mirror_journal_poll_age");
Mutex::Locker locker(m_lock);
m_replay_handler = new ReplayHandler<I>(this);
ImageSyncThrottler<I>::ImageSyncThrottler()
: m_lock(librbd::util::unique_lock_name("rbd::mirror::ImageSyncThrottler",
this)),
- m_max_concurrent_syncs(
- g_ceph_context->_conf->rbd_mirror_concurrent_image_syncs) {
+ m_max_concurrent_syncs(g_ceph_context->_conf->get_val<uint64_t>(
+ "rbd_mirror_concurrent_image_syncs")) {
dout(20) << "max_concurrent_syncs=" << m_max_concurrent_syncs << dendl;
g_ceph_context->_conf->add_observer(this);
}
void ImageSyncThrottler<I>::handle_conf_change(const struct md_config_t *conf,
const set<string> &changed) {
if (changed.count("rbd_mirror_concurrent_image_syncs")) {
- set_max_concurrent_syncs(conf->rbd_mirror_concurrent_image_syncs);
+ set_max_concurrent_syncs(conf->get_val<uint64_t>("rbd_mirror_concurrent_image_syncs"));
}
}
queue_start_image_replayers();
});
- int after = g_ceph_context->_conf->rbd_mirror_image_state_check_interval;
+ int after = g_ceph_context->_conf->get_val<int64_t>(
+ "rbd_mirror_image_state_check_interval");
dout(20) << "scheduling image state check after " << after << " sec (task "
<< m_image_state_check_task << ")" << dendl;
m_lock(unique_lock_name("rbd::mirror::InstanceWatcher::m_lock", this)),
m_instance_lock(librbd::ManagedLock<I>::create(
m_ioctx, m_work_queue, m_oid, this, librbd::managed_lock::EXCLUSIVE, true,
- m_cct->_conf->rbd_blacklist_expire_seconds)) {
+ m_cct->_conf->get_val<int64_t>("rbd_blacklist_expire_seconds"))) {
}
template <typename I>
cancel_remove_task(instance);
- int after = m_cct->_conf->rbd_mirror_leader_heartbeat_interval *
- (1 + m_cct->_conf->rbd_mirror_leader_max_missed_heartbeats +
- m_cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break);
+ int after = m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_heartbeat_interval") *
+ (1 + m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_max_missed_heartbeats") +
+ m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_max_acquire_attempts_before_break"));
instance.timer_task = new FunctionContext(
[this, &instance](int r) {
m_lock("rbd::mirror::LeaderWatcher " + io_ctx.get_pool_name()),
m_notifier_id(librados::Rados(io_ctx).get_instance_id()),
m_leader_lock(new LeaderLock(m_ioctx, m_work_queue, m_oid, this, true,
- m_cct->_conf->rbd_blacklist_expire_seconds)) {
+ m_cct->_conf->get_val<int64_t>(
+ "rbd_blacklist_expire_seconds"))) {
}
template <typename I>
m_timer_gate->timer_callback = timer_callback;
});
- int after = delay_factor * m_cct->_conf->rbd_mirror_leader_heartbeat_interval;
+ int after = delay_factor * m_cct->_conf->get_val<int64_t>(
+ "rbd_mirror_leader_heartbeat_interval");
dout(20) << "scheduling " << name << " after " << after << " sec (task "
<< m_timer_task << ")" << dendl;
}
}
- if (m_acquire_attempts >=
- m_cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break) {
+ if (m_acquire_attempts >= m_cct->_conf->get_val<int64_t>(
+ "rbd_mirror_leader_max_acquire_attempts_before_break")) {
dout(0) << "breaking leader lock after " << m_acquire_attempts << " "
<< "failed attempts to acquire" << dendl;
break_leader_lock();
schedule_timer_task("acquire leader lock",
delay_factor *
- m_cct->_conf->rbd_mirror_leader_max_missed_heartbeats,
+ m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_max_missed_heartbeats"),
false, &LeaderWatcher<I>::acquire_leader_lock, false);
}
}
m_cond.WaitInterval(
m_lock,
- utime_t(m_cct->_conf->rbd_mirror_pool_replayers_refresh_interval, 0));
+ utime_t(m_cct->_conf->get_val<int64_t>("rbd_mirror_pool_replayers_refresh_interval"), 0));
}
// stop all pool replayers in parallel
f->dump_string("local_cluster_admin_socket",
reinterpret_cast<CephContext *>(m_local_io_ctx.cct())->_conf->
- admin_socket);
+ get_val<std::string>("admin_socket"));
f->dump_string("remote_cluster_admin_socket",
reinterpret_cast<CephContext *>(m_remote_io_ctx.cct())->_conf->
- admin_socket);
+ get_val<std::string>("admin_socket"));
f->open_object_section("sync_throttler");
m_instance_watcher->print_sync_status(f, ss);
template <typename I>
Threads<I>::Threads(CephContext *cct) : timer_lock("Threads::timer_lock") {
thread_pool = new ThreadPool(cct, "Journaler::thread_pool", "tp_journal",
- cct->_conf->rbd_op_threads, "rbd_op_threads");
+ cct->_conf->get_val<int64_t>("rbd_op_threads"),
+ "rbd_op_threads");
thread_pool->start();
work_queue = new ContextWQ("Journaler::work_queue",
- cct->_conf->rbd_op_thread_timeout, thread_pool);
+ cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
+ thread_pool);
timer = new SafeTimer(cct, timer_lock, true);
timer->init();
m_progress_ctx(progress_ctx),
m_lock(unique_lock_name("ImageCopyRequest::m_lock", this)),
m_updating_sync_point(false), m_update_sync_ctx(nullptr),
- m_update_sync_point_interval(m_local_image_ctx->cct->_conf->rbd_mirror_sync_point_update_age),
+ m_update_sync_point_interval(m_local_image_ctx->cct->_conf->template get_val<double>(
+ "rbd_mirror_sync_point_update_age")),
m_client_meta_copy(*client_meta) {
assert(!m_client_meta_copy.sync_points.empty());
}
bool complete;
{
Mutex::Locker locker(m_lock);
- for (int i = 0; i < cct->_conf->rbd_concurrent_management_ops; ++i) {
+ for (int i = 0; i < cct->_conf->get_val<int64_t>("rbd_concurrent_management_ops"); ++i) {
send_next_object_copy();
if (m_ret_val < 0 && m_current_ops == 0) {
break;
if (ret == -EINVAL) {
// if shrinking an image, a pagecache writeback might reference
// extents outside of the range of the new image extents
- dout(5) << __func__ << ": masking IO out-of-bounds error" << dendl;
+ dout(0) << __func__ << ": masking IO out-of-bounds error" << dendl;
ctx->data.clear();
ret = 0;
}
static int parse_args(vector<const char*>& args, std::ostream *err_msg, Config *cfg)
{
- std::vector<const char*>::iterator i;
- std::ostringstream err;
+ std::string conf_file_list;
+ std::string cluster;
+ CephInitParameters iparams = ceph_argparse_early_args(
+ args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
md_config_t config;
- config.parse_config_files(nullptr, nullptr, 0);
+ config.name = iparams.name;
+ config.cluster = cluster;
+
+ if (!conf_file_list.empty()) {
+ config.parse_config_files(conf_file_list.c_str(), nullptr, 0);
+ } else {
+ config.parse_config_files(nullptr, nullptr, 0);
+ }
config.parse_env();
config.parse_argv(args);
- cfg->poolname = config.rbd_default_pool;
+ cfg->poolname = config.get_val<std::string>("rbd_default_pool");
+
+ std::vector<const char*>::iterator i;
+ std::ostringstream err;
for (i = args.begin(); i != args.end(); ) {
if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {