git.proxmox.com Git - ceph.git/commitdiff
update sources to v12.2.1
author Fabian Grünbichler <f.gruenbichler@proxmox.com>
Fri, 29 Sep 2017 07:57:31 +0000 (09:57 +0200)
committer Fabian Grünbichler <f.gruenbichler@proxmox.com>
Fri, 29 Sep 2017 07:57:31 +0000 (09:57 +0200)
317 files changed:
ceph/CMakeLists.txt
ceph/PendingReleaseNotes
ceph/alpine/APKBUILD
ceph/ceph.spec
ceph/ceph.spec.in
ceph/debian/changelog
ceph/doc/cephfs/health-messages.rst
ceph/doc/cephfs/mds-config-ref.rst
ceph/doc/images/esx_iscsi_chap.png [new file with mode: 0755]
ceph/doc/images/esx_iscsi_conf.png [new file with mode: 0755]
ceph/doc/images/esx_iscsi_disc.png [new file with mode: 0755]
ceph/doc/images/esx_iscsi_general.png [new file with mode: 0755]
ceph/doc/images/esx_iscsi_rescan.png [new file with mode: 0755]
ceph/doc/images/esx_iscsi_select_device.png [new file with mode: 0755]
ceph/doc/images/esx_iscsi_select_mru.png [new file with mode: 0755]
ceph/doc/images/win2016_iscsi_advanced_window.png [new file with mode: 0755]
ceph/doc/images/win2016_iscsi_connect_to_target.png [new file with mode: 0755]
ceph/doc/images/win2016_iscsi_devices_mpio.png [new file with mode: 0755]
ceph/doc/images/win2016_iscsi_discovery_tab.png [new file with mode: 0755]
ceph/doc/images/win2016_iscsi_target_tab.png [new file with mode: 0755]
ceph/doc/images/win2016_iscsi_target_tab2.png [new file with mode: 0755]
ceph/doc/images/win2016_mpclaim_output.png [new file with mode: 0644]
ceph/doc/images/win2016_mpio_set_failover_only.png [new file with mode: 0755]
ceph/doc/man/8/ceph.rst
ceph/doc/rbd/index.rst
ceph/doc/rbd/iscsi-initiator-esx.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-initiator-rhel.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-initiator-win.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-initiators.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-monitoring.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-overview.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-requirements.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-target-ansible.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-target-cli.rst [new file with mode: 0644]
ceph/doc/rbd/iscsi-targets.rst [new file with mode: 0644]
ceph/qa/objectstore/bluestore-bitmap.yaml [new file with mode: 0644]
ceph/qa/objectstore/bluestore-comp.yaml
ceph/qa/objectstore/bluestore.yaml
ceph/qa/run_xfstests.sh [changed mode: 0644->0755]
ceph/qa/run_xfstests_krbd.sh [deleted file]
ceph/qa/standalone/crush/crush-classes.sh
ceph/qa/standalone/mon/osd-pool-df.sh [new file with mode: 0755]
ceph/qa/standalone/mon/test_pool_quota.sh
ceph/qa/suites/fs/basic_functional/tasks/alternate-pool.yaml [new file with mode: 0644]
ceph/qa/suites/fs/basic_functional/tasks/client-limits.yaml
ceph/qa/suites/fs/basic_functional/tasks/data-scan.yaml
ceph/qa/suites/fs/multiclient/tasks/cephfs_misc_tests.yaml
ceph/qa/suites/kcephfs/recovery/tasks/auto-repair.yaml
ceph/qa/suites/kcephfs/recovery/tasks/client-limits.yaml
ceph/qa/suites/kcephfs/recovery/tasks/client-recovery.yaml
ceph/qa/suites/kcephfs/recovery/tasks/data-scan.yaml
ceph/qa/suites/kcephfs/recovery/tasks/failover.yaml
ceph/qa/suites/kcephfs/recovery/whitelist_health.yaml [new symlink]
ceph/qa/suites/kcephfs/thrash/thrashosds-health.yaml [new symlink]
ceph/qa/suites/kcephfs/thrash/whitelist_health.yaml [new symlink]
ceph/qa/suites/krbd/singleton/tasks/rbd_xfstests.yaml
ceph/qa/suites/rados/thrash/d-require-luminous/at-end.yaml
ceph/qa/suites/rbd/cli/pool/ec-data-pool.yaml
ceph/qa/suites/rbd/librbd/pool/ec-data-pool.yaml
ceph/qa/suites/rbd/maintenance/workloads/dynamic_features_no_cache.yaml
ceph/qa/suites/rbd/mirror/cluster/2-node.yaml
ceph/qa/suites/rbd/qemu/pool/ec-data-pool.yaml
ceph/qa/suites/rgw/multisite/overrides.yaml
ceph/qa/suites/upgrade/luminous-x/parallel/% [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/openstack.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/1-ceph-install/luminous.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/blogbench.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/ec-rados-default.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/rados_api.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/rados_loadgenbig.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/test_rbd_api.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/test_rbd_python.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/3-upgrade-sequence/upgrade-all.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/blogbench.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados-snaps-few-objects.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados_loadgenmix.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados_mon_thrash.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_cls.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rgw_swift.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/distros [new symlink]
ceph/qa/suites/upgrade/luminous-x/parallel/objectstore [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/% [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/0-cluster [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/2-partial-upgrade [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/3-thrash/default.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/4-ec-workload.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/5-finish-upgrade.yaml [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/7-final-workload.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/distros [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/objectstore [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/thrashosds-health.yaml [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split/% [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/openstack.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/start.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/1-ceph-install/luminous.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/2-partial-upgrade/firsthalf.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/3-thrash/default.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/radosbench.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd-cls.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd-import-export.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd_api.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/readwrite.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/snaps-few-objects.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/5-finish-upgrade.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/rbd-python.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/rgw-swift.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/snaps-many-objects.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split/distros [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split/objectstore/bluestore.yaml [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split/objectstore/filestore-xfs.yaml [new symlink]
ceph/qa/suites/upgrade/luminous-x/stress-split/thrashosds-health.yaml [new symlink]
ceph/qa/tasks/ceph.py
ceph/qa/tasks/ceph_manager.py
ceph/qa/tasks/ceph_test_case.py
ceph/qa/tasks/cephfs/cephfs_test_case.py
ceph/qa/tasks/cephfs/filesystem.py
ceph/qa/tasks/cephfs/test_client_limits.py
ceph/qa/tasks/cephfs/test_client_recovery.py
ceph/qa/tasks/cephfs/test_data_scan.py
ceph/qa/tasks/cephfs/test_misc.py
ceph/qa/tasks/cephfs/test_recovery_pool.py [new file with mode: 0644]
ceph/qa/tasks/mgr/test_failover.py
ceph/qa/tasks/radosgw_admin.py
ceph/qa/tasks/rbd.py
ceph/qa/tasks/rgw.py
ceph/qa/tasks/util/rados.py
ceph/qa/tasks/vstart_runner.py
ceph/qa/workunits/cephtool/test.sh
ceph/qa/workunits/mon/crush_ops.sh
ceph/qa/workunits/rbd/import_export.sh
ceph/selinux/ceph.te
ceph/src/.git_version
ceph/src/CMakeLists.txt
ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/api.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/prepare.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/trigger.py
ceph/src/ceph-volume/ceph_volume/exceptions.py
ceph/src/ceph-volume/ceph_volume/tests/conftest.py
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_api.py
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_trigger.py
ceph/src/ceph-volume/ceph_volume/tests/functional/Vagrantfile
ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/tox.ini
ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/group_vars/all
ceph/src/ceph-volume/ceph_volume/util/disk.py [new file with mode: 0644]
ceph/src/ceph.in
ceph/src/ceph_release
ceph/src/client/Client.cc
ceph/src/client/Client.h
ceph/src/client/fuse_ll.cc
ceph/src/cls/rbd/cls_rbd.cc
ceph/src/common/Formatter.cc
ceph/src/common/Formatter.h
ceph/src/common/LogClient.h
ceph/src/common/admin_socket.cc
ceph/src/common/cohort_lru.h
ceph/src/common/legacy_config_opts.h
ceph/src/common/options.cc
ceph/src/common/util.cc
ceph/src/crush/CrushWrapper.h
ceph/src/include/alloc_ptr.h [new file with mode: 0644]
ceph/src/include/btree_interval_set.h
ceph/src/include/compact_map.h
ceph/src/include/counter.h
ceph/src/include/encoding.h
ceph/src/include/interval_set.h
ceph/src/include/lru.h
ceph/src/include/mempool.h
ceph/src/include/rados.h
ceph/src/include/util.h
ceph/src/include/xlist.h
ceph/src/librbd/ImageCtx.cc
ceph/src/librbd/ImageCtx.h
ceph/src/librbd/ImageState.cc
ceph/src/librbd/ImageWatcher.cc
ceph/src/librbd/Journal.cc
ceph/src/librbd/image/CreateRequest.cc
ceph/src/librbd/internal.cc
ceph/src/librbd/io/ImageRequestWQ.cc
ceph/src/librbd/operation/ObjectMapIterate.cc
ceph/src/mds/Beacon.cc
ceph/src/mds/CDentry.cc
ceph/src/mds/CDentry.h
ceph/src/mds/CDir.cc
ceph/src/mds/CDir.h
ceph/src/mds/CInode.cc
ceph/src/mds/CInode.h
ceph/src/mds/FSMap.cc
ceph/src/mds/Locker.cc
ceph/src/mds/MDCache.cc
ceph/src/mds/MDCache.h
ceph/src/mds/MDLog.cc
ceph/src/mds/MDSCacheObject.cc
ceph/src/mds/MDSCacheObject.h
ceph/src/mds/MDSDaemon.cc
ceph/src/mds/MDSRank.cc
ceph/src/mds/Migrator.cc
ceph/src/mds/Server.cc
ceph/src/mds/Server.h
ceph/src/mds/SessionMap.cc
ceph/src/mds/SessionMap.h
ceph/src/mds/SimpleLock.h
ceph/src/messages/MOSDPGRecoveryDelete.h
ceph/src/messages/MOSDPGRecoveryDeleteReply.h
ceph/src/mgr/PyFormatter.h
ceph/src/mgr/PyState.cc
ceph/src/mon/CreatingPGs.h
ceph/src/mon/MDSMonitor.cc
ceph/src/mon/MgrMonitor.cc
ceph/src/mon/MgrMonitor.h
ceph/src/mon/MgrStatMonitor.h
ceph/src/mon/MonCommands.h
ceph/src/mon/MonOpRequest.h
ceph/src/mon/Monitor.cc
ceph/src/mon/Monitor.h
ceph/src/mon/OSDMonitor.cc
ceph/src/mon/PGMap.cc
ceph/src/mon/PGMap.h
ceph/src/os/bluestore/BlueFS.cc
ceph/src/os/bluestore/BlueStore.cc
ceph/src/os/bluestore/BlueStore.h
ceph/src/os/bluestore/StupidAllocator.cc
ceph/src/os/bluestore/StupidAllocator.h
ceph/src/os/bluestore/aio.cc
ceph/src/os/bluestore/bluestore_types.cc
ceph/src/osd/OSD.cc
ceph/src/osd/OSD.h
ceph/src/osd/OSDMap.cc
ceph/src/osd/PG.cc
ceph/src/osd/PGLog.cc
ceph/src/osd/PGLog.h
ceph/src/osd/PrimaryLogPG.cc
ceph/src/osd/osd_types.h
ceph/src/pybind/mgr/dashboard/base.html
ceph/src/pybind/mgr/dashboard/health.html
ceph/src/pybind/mgr/dashboard/module.py
ceph/src/rgw/rgw_admin.cc
ceph/src/rgw/rgw_asio_frontend.cc
ceph/src/rgw/rgw_bucket.cc
ceph/src/rgw/rgw_civetweb_frontend.cc
ceph/src/rgw/rgw_client_io.h
ceph/src/rgw/rgw_client_io_filters.h
ceph/src/rgw/rgw_common.h
ceph/src/rgw/rgw_crypt.cc
ceph/src/rgw/rgw_data_sync.cc
ceph/src/rgw/rgw_fcgi_process.cc
ceph/src/rgw/rgw_formats.h
ceph/src/rgw/rgw_lc.cc
ceph/src/rgw/rgw_lc.h
ceph/src/rgw/rgw_lc_s3.cc
ceph/src/rgw/rgw_lc_s3.h
ceph/src/rgw/rgw_loadgen_process.cc
ceph/src/rgw/rgw_log.cc
ceph/src/rgw/rgw_metadata.cc
ceph/src/rgw/rgw_metadata.h
ceph/src/rgw/rgw_op.cc
ceph/src/rgw/rgw_op.h
ceph/src/rgw/rgw_quota.cc
ceph/src/rgw/rgw_rados.cc
ceph/src/rgw/rgw_rados.h
ceph/src/rgw/rgw_rest_client.cc
ceph/src/rgw/rgw_rest_conn.cc
ceph/src/rgw/rgw_rest_conn.h
ceph/src/rgw/rgw_rest_metadata.cc
ceph/src/rgw/rgw_rest_s3.cc
ceph/src/rgw/rgw_sync.cc
ceph/src/rgw/rgw_user.cc
ceph/src/rocksdb/CMakeLists.txt
ceph/src/rocksdb/util/crc32c.cc
ceph/src/test/cli-integration/rbd/formatted-output.t
ceph/src/test/cli/rbd/help.t
ceph/src/test/cls_rbd/test_cls_rbd.cc
ceph/src/test/common/test_lru.cc
ceph/src/test/libcephfs/test.cc
ceph/src/test/librbd/journal/test_Entries.cc
ceph/src/test/librbd/journal/test_Replay.cc
ceph/src/test/librbd/managed_lock/test_mock_AcquireRequest.cc
ceph/src/test/librbd/mock/MockImageCtx.h
ceph/src/test/librbd/test_MirroringWatcher.cc
ceph/src/test/librbd/test_internal.cc
ceph/src/test/mon/PGMap.cc
ceph/src/test/osd/TestPGLog.cc
ceph/src/test/rbd_mirror/test_mock_LeaderWatcher.cc
ceph/src/test/rgw/rgw_multi/tests.py
ceph/src/tools/ceph_objectstore_tool.cc
ceph/src/tools/rbd/ArgumentTypes.cc
ceph/src/tools/rbd/Utils.cc
ceph/src/tools/rbd/action/DiskUsage.cc
ceph/src/tools/rbd/action/Export.cc
ceph/src/tools/rbd/action/ImageMeta.cc
ceph/src/tools/rbd/action/Import.cc
ceph/src/tools/rbd/action/Kernel.cc
ceph/src/tools/rbd/action/List.cc
ceph/src/tools/rbd/action/MirrorPool.cc
ceph/src/tools/rbd_mirror/ImageDeleter.cc
ceph/src/tools/rbd_mirror/ImageReplayer.cc
ceph/src/tools/rbd_mirror/ImageSyncThrottler.cc
ceph/src/tools/rbd_mirror/InstanceReplayer.cc
ceph/src/tools/rbd_mirror/InstanceWatcher.cc
ceph/src/tools/rbd_mirror/Instances.cc
ceph/src/tools/rbd_mirror/LeaderWatcher.cc
ceph/src/tools/rbd_mirror/Mirror.cc
ceph/src/tools/rbd_mirror/PoolReplayer.cc
ceph/src/tools/rbd_mirror/Threads.cc
ceph/src/tools/rbd_mirror/image_sync/ImageCopyRequest.cc
ceph/src/tools/rbd_nbd/rbd-nbd.cc

index 17468f2558f04ad634449ea3bbb814e4e2143e1d..c358e3e97482aecbf4cee7f2474aef43c1349911 100644 (file)
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 2.8.11)
 
 project(ceph)
-set(VERSION 12.2.0)
+set(VERSION 12.2.1)
 
 if(POLICY CMP0046)
   # Tweak policies (this one disables "missing" dependency warning)
index ac1a79f6d468d6e839d3b4d3de68656276387178..9ca48cdabe968843468d1573adedd42518dd854e 100644 (file)
   New commands "pg cancel-force-recovery" and "pg cancel-force-backfill"
   restore default recovery/backfill priority of previously forced pgs.
 
+
+12.2.1
+------
+
+* Clusters will need to upgrade to 12.2.1 before upgrading to any
+  Mimic 13.y.z version (either a development release or an eventual
+  stable Mimic release).
+
+- *CephFS*:
+
+  * Limiting MDS cache via a memory limit is now supported using the new
+    mds_cache_memory_limit config option (1GB by default).  A cache reservation
+    can also be specified using mds_cache_reservation as a percentage of the
+    limit (5% by default). Limits by inode count are still supported using
+    mds_cache_size. Setting mds_cache_size to 0 (the default) disables the
+    inode limit.
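
The release note above names the new option; as a quick, hedged illustration
(the 2 GiB value of ``2147483648`` bytes and the ``mds.a`` daemon name are
hypothetical, not part of this commit), the limit could be raised on a running
MDS with ``injectargs``::

    ceph tell mds.a injectargs '--mds_cache_memory_limit 2147483648'
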
index ac9156f6deca58e1d0f8d67ff456d9d1ccd82d21..5fd0a9a24c369f870237a6e065fff965e1410a4b 100644 (file)
@@ -1,7 +1,7 @@
 # Contributor: John Coyle <dx9err@gmail.com>
 # Maintainer: John Coyle <dx9err@gmail.com>
 pkgname=ceph
-pkgver=12.2.0
+pkgver=12.2.1
 pkgrel=0
 pkgdesc="Ceph is a distributed object store and file system"
 pkgusers="ceph"
@@ -63,7 +63,7 @@ makedepends="
        xmlstarlet
        yasm
 "
-source="ceph-12.2.0.tar.bz2"
+source="ceph-12.2.1.tar.bz2"
 subpackages="
        $pkgname-base
        $pkgname-common
@@ -116,7 +116,7 @@ _sysconfdir=/etc
 _udevrulesdir=/etc/udev/rules.d
 _python_sitelib=/usr/lib/python2.7/site-packages
 
-builddir=$srcdir/ceph-12.2.0
+builddir=$srcdir/ceph-12.2.1
 
 build() {
        export CEPH_BUILD_VIRTUALENV=$builddir
index 458191e583bcdfe7f6132b8a3b27374f7e2dc888..00d09ee21fcbc09de287f174eac63ace50ba8848 100644 (file)
@@ -61,7 +61,7 @@
 # main package definition
 #################################################################################
 Name:          ceph
-Version:       12.2.0
+Version:       12.2.1
 Release:       0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch:         2
@@ -76,7 +76,7 @@ License:      LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and
 Group:         System/Filesystems
 %endif
 URL:           http://ceph.com/
-Source0:       http://ceph.com/download/ceph-12.2.0.tar.bz2
+Source0:       http://ceph.com/download/ceph-12.2.1.tar.bz2
 %if 0%{?suse_version}
 %if 0%{?is_opensuse}
 ExclusiveArch:  x86_64 aarch64 ppc64 ppc64le
@@ -341,6 +341,7 @@ Summary:    Ceph fuse-based client
 %if 0%{?suse_version}
 Group:         System/Filesystems
 %endif
+Requires:       fuse
 %description fuse
 FUSE based client for Ceph distributed network file system
 
@@ -772,7 +773,7 @@ python-rbd, python-rgw or python-cephfs instead.
 # common
 #################################################################################
 %prep
-%autosetup -p1 -n ceph-12.2.0
+%autosetup -p1 -n ceph-12.2.1
 
 %build
 %if 0%{with cephfs_java}
@@ -1528,10 +1529,7 @@ fi
 %{_libdir}/librbd_tp.so.*
 %endif
 
-%post -n librbd1
-/sbin/ldconfig
-mkdir -p /usr/lib64/qemu/
-ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
+%post -n librbd1 -p /sbin/ldconfig
 
 %postun -n librbd1 -p /sbin/ldconfig
 
index 4749bc7179bb2fe61d587457d76f4a7b3d7dfc96..b45c9feecfbcf0d2cfaf7678df6e1ee5c3744da8 100644 (file)
@@ -341,6 +341,7 @@ Summary:    Ceph fuse-based client
 %if 0%{?suse_version}
 Group:         System/Filesystems
 %endif
+Requires:       fuse
 %description fuse
 FUSE based client for Ceph distributed network file system
 
@@ -1528,10 +1529,7 @@ fi
 %{_libdir}/librbd_tp.so.*
 %endif
 
-%post -n librbd1
-/sbin/ldconfig
-mkdir -p /usr/lib64/qemu/
-ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
+%post -n librbd1 -p /sbin/ldconfig
 
 %postun -n librbd1 -p /sbin/ldconfig
 
index 3fef34d2b768ac90ef8e57f8dfe49a5f00c5e360..f0524391762ca1b1b71a00e0671327106d7e1ec2 100644 (file)
@@ -1,3 +1,9 @@
+ceph (12.2.1-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com>  Tue, 26 Sep 2017 16:27:06 +0000
+
 ceph (12.2.0-1) stable; urgency=medium
 
   * New upstream release
index 54b4f7144e90f48bffe9140c640570a687865fb0..3e68e937116c8c0ec1f776d56fe88f6082afe39e 100644 (file)
@@ -71,14 +71,14 @@ so at all.  This message appears if a client has taken longer than
 
 Message: "Client *name* failing to respond to cache pressure"
 Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY
-Description: Clients maintain a metadata cache.  Items (such as inodes)
-in the client cache are also pinned in the MDS cache, so when the MDS
-needs to shrink its cache (to stay within ``mds_cache_size``), it
-sends messages to clients to shrink their caches too.  If the client
-is unresponsive or buggy, this can prevent the MDS from properly staying
-within its ``mds_cache_size`` and it may eventually run out of memory
-and crash.  This message appears if a client has taken more than
-``mds_recall_state_timeout`` (default 60s) to comply.
+Description: Clients maintain a metadata cache.  Items (such as inodes) in the
+client cache are also pinned in the MDS cache, so when the MDS needs to shrink
+its cache (to stay within ``mds_cache_size`` or ``mds_cache_memory_limit``), it
+sends messages to clients to shrink their caches too.  If the client is
+unresponsive or buggy, this can prevent the MDS from properly staying within
+its cache limits and it may eventually run out of memory and crash.  This
+message appears if a client has taken more than ``mds_recall_state_timeout``
+(default 60s) to comply.
 
 Message: "Client *name* failing to advance its oldest client/flush tid"
 Code: MDS_HEALTH_CLIENT_OLDEST_TID, MDS_HEALTH_CLIENT_OLDEST_TID_MANY
@@ -119,9 +119,9 @@ This message appears if any client requests have taken longer than
 
 Message: "Too many inodes in cache"
 Code: MDS_HEALTH_CACHE_OVERSIZED
-Description: The MDS is not succeeding in trimming its cache to comply
-with the limit set by the administrator.  If the MDS cache becomes too large,
-the daemon may exhaust available memory and crash.
-This message appears if the actual cache size (in inodes) is at least 50%
-greater than ``mds_cache_size`` (default 100000).
-
+Description: The MDS is not succeeding in trimming its cache to comply with the
+limit set by the administrator.  If the MDS cache becomes too large, the daemon
+may exhaust available memory and crash.  By default, this message appears if
+the actual cache size (in inodes or memory) is at least 50% greater than
+``mds_cache_size`` (default 100000) or ``mds_cache_memory_limit`` (default
+1GB). Modify ``mds_health_cache_threshold`` to set the warning ratio.
index b3446d698dbba146cf6cb4464039e889caebac96..4f7bea3ef8585411bf291e3300d40326592c1b30 100644 (file)
 :Type:  64-bit Integer Unsigned
 :Default:  ``1ULL << 40``
 
+``mds cache memory limit``
+
+:Description: The memory limit the MDS should enforce for its cache.
+              Administrators should use this instead of ``mds cache size``.
+:Type:  64-bit Integer Unsigned
+:Default: ``1073741824``
+
+``mds cache reservation``
+
+:Description: The cache reservation (memory or inodes) for the MDS cache to maintain.
+              Once the MDS begins dipping into its reservation, it will recall
+              client state until its cache size shrinks to restore the
+              reservation.
+:Type:  Float
+:Default: ``0.05``
 
 ``mds cache size``
 
-:Description: The number of inodes to cache.
+:Description: The number of inodes to cache. A value of 0 indicates an
+              unlimited number. It is recommended to use
+              ``mds_cache_memory_limit`` to limit the amount of memory the MDS
+              cache uses.
 :Type:  32-bit Integer
-:Default: ``100000``
-
+:Default: ``0``
 
 ``mds cache mid``
 
diff --git a/ceph/doc/images/esx_iscsi_chap.png b/ceph/doc/images/esx_iscsi_chap.png
new file mode 100755 (executable)
index 0000000..4372222
Binary files /dev/null and b/ceph/doc/images/esx_iscsi_chap.png differ
diff --git a/ceph/doc/images/esx_iscsi_conf.png b/ceph/doc/images/esx_iscsi_conf.png
new file mode 100755 (executable)
index 0000000..b1b9388
Binary files /dev/null and b/ceph/doc/images/esx_iscsi_conf.png differ
diff --git a/ceph/doc/images/esx_iscsi_disc.png b/ceph/doc/images/esx_iscsi_disc.png
new file mode 100755 (executable)
index 0000000..338a1eb
Binary files /dev/null and b/ceph/doc/images/esx_iscsi_disc.png differ
diff --git a/ceph/doc/images/esx_iscsi_general.png b/ceph/doc/images/esx_iscsi_general.png
new file mode 100755 (executable)
index 0000000..75a5764
Binary files /dev/null and b/ceph/doc/images/esx_iscsi_general.png differ
diff --git a/ceph/doc/images/esx_iscsi_rescan.png b/ceph/doc/images/esx_iscsi_rescan.png
new file mode 100755 (executable)
index 0000000..cc50c47
Binary files /dev/null and b/ceph/doc/images/esx_iscsi_rescan.png differ
diff --git a/ceph/doc/images/esx_iscsi_select_device.png b/ceph/doc/images/esx_iscsi_select_device.png
new file mode 100755 (executable)
index 0000000..192ab57
Binary files /dev/null and b/ceph/doc/images/esx_iscsi_select_device.png differ
diff --git a/ceph/doc/images/esx_iscsi_select_mru.png b/ceph/doc/images/esx_iscsi_select_mru.png
new file mode 100755 (executable)
index 0000000..c3c9c47
Binary files /dev/null and b/ceph/doc/images/esx_iscsi_select_mru.png differ
diff --git a/ceph/doc/images/win2016_iscsi_advanced_window.png b/ceph/doc/images/win2016_iscsi_advanced_window.png
new file mode 100755 (executable)
index 0000000..cba70ea
Binary files /dev/null and b/ceph/doc/images/win2016_iscsi_advanced_window.png differ
diff --git a/ceph/doc/images/win2016_iscsi_connect_to_target.png b/ceph/doc/images/win2016_iscsi_connect_to_target.png
new file mode 100755 (executable)
index 0000000..29d9e12
Binary files /dev/null and b/ceph/doc/images/win2016_iscsi_connect_to_target.png differ
diff --git a/ceph/doc/images/win2016_iscsi_devices_mpio.png b/ceph/doc/images/win2016_iscsi_devices_mpio.png
new file mode 100755 (executable)
index 0000000..135d41b
Binary files /dev/null and b/ceph/doc/images/win2016_iscsi_devices_mpio.png differ
diff --git a/ceph/doc/images/win2016_iscsi_discovery_tab.png b/ceph/doc/images/win2016_iscsi_discovery_tab.png
new file mode 100755 (executable)
index 0000000..4ac5f71
Binary files /dev/null and b/ceph/doc/images/win2016_iscsi_discovery_tab.png differ
diff --git a/ceph/doc/images/win2016_iscsi_target_tab.png b/ceph/doc/images/win2016_iscsi_target_tab.png
new file mode 100755 (executable)
index 0000000..7543769
Binary files /dev/null and b/ceph/doc/images/win2016_iscsi_target_tab.png differ
diff --git a/ceph/doc/images/win2016_iscsi_target_tab2.png b/ceph/doc/images/win2016_iscsi_target_tab2.png
new file mode 100755 (executable)
index 0000000..302fef9
Binary files /dev/null and b/ceph/doc/images/win2016_iscsi_target_tab2.png differ
diff --git a/ceph/doc/images/win2016_mpclaim_output.png b/ceph/doc/images/win2016_mpclaim_output.png
new file mode 100644 (file)
index 0000000..73e1e5e
Binary files /dev/null and b/ceph/doc/images/win2016_mpclaim_output.png differ
diff --git a/ceph/doc/images/win2016_mpio_set_failover_only.png b/ceph/doc/images/win2016_mpio_set_failover_only.png
new file mode 100755 (executable)
index 0000000..a988e8a
Binary files /dev/null and b/ceph/doc/images/win2016_mpio_set_failover_only.png differ
index 2ae3806704b20eb9f5d16346874a86525e56bc57..1a1800051fabb2cd225c156f6c2836304ec634b5 100644 (file)
@@ -569,7 +569,7 @@ the accompanying lockbox cephx key.
 
 Usage::
 
-    ceph osd new {<id>} {<uuid>} -i {<secrets.json>}
+    ceph osd new {<uuid>} {<id>} -i {<secrets.json>}
 
 The secrets JSON file is optional but if provided, is expected to maintain
 a form of the following format::
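
The format block itself falls outside this hunk; purely for illustration (the
``cephx_secret`` key and its value are assumptions, not taken from this diff),
such a secrets file might contain::

    {"cephx_secret": "AQBWtwhZdBO5ExAAIDyjK2Bh9Vj9YQoUZ4ENsQ=="}
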
index 5d9d433ce1bbaf39b6613dd5743fa2b1772f66f4..c297d0ded2e8ad42ceda7a568722b37fa06d07d7 100644 (file)
@@ -13,8 +13,8 @@ device an ideal candidate to interact with a mass data storage system like Ceph.
 Ceph block devices are thin-provisioned, resizable and store data striped over
 multiple OSDs in a Ceph cluster.  Ceph block devices leverage
 :abbr:`RADOS (Reliable Autonomic Distributed Object Store)` capabilities
-such as snapshotting, replication and consistency. Ceph's 
-:abbr:`RADOS (Reliable Autonomic Distributed Object Store)` Block Devices (RBD) 
+such as snapshotting, replication and consistency. Ceph's
+:abbr:`RADOS (Reliable Autonomic Distributed Object Store)` Block Devices (RBD)
 interact with OSDs using kernel modules or the ``librbd`` library.
 
 .. ditaa::  +------------------------+ +------------------------+
@@ -25,7 +25,7 @@ interact with OSDs using kernel modules or the ``librbd`` library.
             |          OSDs          | |        Monitors        |
             +------------------------+ +------------------------+
 
-.. note:: Kernel modules can use Linux page caching. For ``librbd``-based 
+.. note:: Kernel modules can use Linux page caching. For ``librbd``-based
    applications, Ceph supports `RBD Caching`_.
 
 Ceph's block devices deliver high performance with infinite scalability to
@@ -35,7 +35,7 @@ libvirt and QEMU to integrate with Ceph block devices. You can use the same clus
 to operate the `Ceph RADOS Gateway`_, the `Ceph FS filesystem`_, and Ceph block
 devices simultaneously.
 
-.. important:: To use Ceph Block Devices, you must have access to a running 
+.. important:: To use Ceph Block Devices, you must have access to a running
    Ceph cluster.
 
 .. toctree::
@@ -44,7 +44,8 @@ devices simultaneously.
        Commands <rados-rbd-cmds>
        Kernel Modules <rbd-ko>
        Snapshots<rbd-snapshot>
-        Mirroring <rbd-mirroring>
+       Mirroring <rbd-mirroring>
+       iSCSI Gateway <iscsi-overview>
        QEMU <qemu-rbd>
        libvirt <libvirt>
        Cache Settings <rbd-config-ref/>
@@ -62,9 +63,6 @@ devices simultaneously.
 
        APIs <api/index>
 
-       
-       
-
 .. _RBD Caching: ../rbd-config-ref/
 .. _kernel modules: ../rbd-ko/
 .. _QEMU: ../qemu-rbd/
diff --git a/ceph/doc/rbd/iscsi-initiator-esx.rst b/ceph/doc/rbd/iscsi-initiator-esx.rst
new file mode 100644 (file)
index 0000000..18dd583
--- /dev/null
@@ -0,0 +1,36 @@
+----------------------------------
+The iSCSI Initiator for VMware ESX
+----------------------------------
+
+**Prerequisite:**
+
+-  VMware ESX 6.0 or later
+
+**iSCSI Discovery and Multipath Device Setup:**
+
+#. From vSphere, open Storage Adapters on the Configuration tab. Right click
+   on the iSCSI Software Adapter and select Properties.
+
+#. In the General tab click the "Advanced" button and in the "Advanced Settings"
+   set RecoveryTimeout to 25.
+
+#. If CHAP was set up on the iSCSI gateway, in the General tab click the "CHAP…"
+   button. If CHAP is not being used, skip to step 4.
+
+#. On the CHAP Credentials window, select “Do not use CHAP unless required by target”,
+   and enter the "Name" and "Secret" values used on the initial setup for the iSCSI
+   gateway, then click on the "OK" button.
+
+#. On the Dynamic Discovery tab, click the "Add…" button, and enter the IP address
+   and port of one of the iSCSI target portals. Click on the "OK" button.
+
+#. Close the iSCSI Initiator Properties window. A prompt will ask to rescan the
+   iSCSI software adapter. Select Yes.
+
+#. In the Details pane, the LUN on the iSCSI target will be displayed. Right click
+   on a device and select "Manage Paths".
+
+#. On the Manage Paths window, select “Most Recently Used (VMware)” for the policy
+   path selection. Close and repeat for the other disks.
+
+Now the disks can be used for datastores.
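
The RecoveryTimeout change from step 2 can also be made from the ESX shell; a
sketch using ``esxcli`` (the adapter name ``vmhba64`` is a placeholder for the
host's iSCSI software adapter)::

    esxcli iscsi adapter param set -A vmhba64 -k RecoveryTimeout -v 25
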
diff --git a/ceph/doc/rbd/iscsi-initiator-rhel.rst b/ceph/doc/rbd/iscsi-initiator-rhel.rst
new file mode 100644 (file)
index 0000000..51248e4
--- /dev/null
@@ -0,0 +1,90 @@
+------------------------------------------------
+The iSCSI Initiator for Red Hat Enterprise Linux
+------------------------------------------------
+
+**Prerequisite:**
+
+-  Package ``iscsi-initiator-utils-6.2.0.873-35`` or newer must be
+   installed
+
+-  Package ``device-mapper-multipath-0.4.9-99`` or newer must be
+   installed
+
+**Installing:**
+
+Install the iSCSI initiator and multipath tools:
+
+   ::
+
+       # yum install iscsi-initiator-utils
+       # yum install device-mapper-multipath
+
+**Configuring:**
+
+#. Create the default ``/etc/multipath.conf`` file and enable the
+   ``multipathd`` service:
+
+   ::
+
+       # mpathconf --enable --with_multipathd y
+
+#. Add the following to ``/etc/multipath.conf`` file:
+
+   ::
+
+       devices {
+               device {
+                       vendor                 "LIO-ORG"
+                       hardware_handler       "1 alua"
+                       path_grouping_policy   "failover"
+                       path_selector          "queue-length 0"
+                       failback               60
+                       path_checker           tur
+                       prio                   alua
+                       prio_args              exclusive_pref_bit
+                       fast_io_fail_tmo       25
+                       no_path_retry          queue
+               }
+       }
+
+#. Restart the ``multipathd`` service:
+
+   ::
+
+       # systemctl reload multipathd
+
+**iSCSI Discovery and Setup:**
+
+#. Discover the target portals:
+
+   ::
+
+       # iscsiadm -m discovery -t st -p 192.168.56.101
+       192.168.56.101:3260,1 iqn.2003-01.org.linux-iscsi.rheln1
+       192.168.56.102:3260,2 iqn.2003-01.org.linux-iscsi.rheln1
+
+#. Log in to the target:
+
+   ::
+
+       # iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -l
+
+**Multipath IO Setup:**
+
+The multipath daemon (``multipathd``) will set up devices automatically
+based on the ``multipath.conf`` settings. Running the ``multipath``
+command shows the devices set up in a failover configuration with a
+priority group for each path.
+
+::
+
+    # multipath -ll
+    mpathbt (360014059ca317516a69465c883a29603) dm-1 LIO-ORG ,IBLOCK
+    size=1.0G features='0' hwhandler='1 alua' wp=rw
+    |-+- policy='queue-length 0' prio=50 status=active
+    | `- 28:0:0:1 sde  8:64  active ready running
+    `-+- policy='queue-length 0' prio=10 status=enabled
+      `- 29:0:0:1 sdc  8:32  active ready running
+
+You should now be able to use the RBD image like you would a normal
+multipathed iSCSI disk.
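
Before checking multipath, it can help to confirm that both portals were
actually logged in; a sketch using standard ``iscsiadm`` options::

    # iscsiadm -m session -P 1

This prints each connected target portal and its session state.
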
diff --git a/ceph/doc/rbd/iscsi-initiator-win.rst b/ceph/doc/rbd/iscsi-initiator-win.rst
new file mode 100644 (file)
index 0000000..08a1cfb
--- /dev/null
@@ -0,0 +1,100 @@
+-----------------------------------------
+The iSCSI Initiator for Microsoft Windows
+-----------------------------------------
+
+**Prerequisite:**
+
+-  Microsoft Windows 2016
+
+**iSCSI Initiator, Discovery and Setup:**
+
+#. Install the iSCSI initiator driver and MPIO tools.
+
+#. Launch the MPIO program, click on the “Discover Multi-Paths” tab, and select
+   “Add support for iSCSI devices”.
+
+#. On the iSCSI Initiator Properties window, on the "Discovery" tab, add a target
+   portal. Enter the IP address or DNS name and Port of the Ceph iSCSI gateway.
+
+#. On the “Targets” tab, select the target and click on “Connect”.
+
+#. On the “Connect To Target” window, select the “Enable multi-path” option, and
+   click the “Advanced” button.
+
+#. Under the "Connet using" section, select a “Target portal IP” . Select the
+   “Enable CHAP login on” and enter the "Name" and "Target secret" values from the
+   Ceph iSCSI Ansible client credentials section, and click OK.
+
+#. Repeat steps 5 and 6 for each target portal defined when setting up
+   the iSCSI gateway.
+
+**Multipath IO Setup:**
+
+Configuring the MPIO load balancing policy and setting the timeout and
+retry options is done using PowerShell with the ``mpclaim`` command. The
+rest is done in the MPIO tool.
+
+.. note::
+  It is recommended to increase the ``PDORemovePeriod`` option to 120
+  seconds from PowerShell. This value might need to be adjusted based
+  on the application. When all paths are down, and 120 seconds
+  expires, the operating system will start failing IO requests.
+
+::
+
+    Set-MPIOSetting -NewPDORemovePeriod 120
+
+::
+
+    mpclaim.exe -l -m 1
+
+::
+
+    mpclaim -s -m
+    MSDSM-wide Load Balance Policy: Fail Over Only
+
+#. Using the MPIO tool, from the “Targets” tab, click on the
+   “Devices...” button.
+
+#. From the Devices window, select a disk and click the
+   “MPIO...” button.
+
+#. On the "Device Details" window the paths to each target portal is
+   displayed. If using the ``ceph-ansible`` setup method, the
+   iSCSI gateway will use ALUA to tell the iSCSI initiator which path
+   and iSCSI gateway should be used as the primary path. The Load
+   Balancing Policy “Fail Over Only” must be selected
+
+::
+
+    mpclaim -s -d $MPIO_DISK_ID
+
+.. note::
+  For the ``ceph-ansible`` setup method, there will be one
+  Active/Optimized path which is the path to the iSCSI gateway node
+  that owns the LUN, and there will be an Active/Unoptimized path for
+  each other iSCSI gateway node.
+
+**Tuning:**
+
+Consider using the following registry settings:
+
+-  Windows Disk Timeout
+
+   ::
+
+       HKEY_LOCAL_MACHINE\System\CurrentControlSet\Services\Disk
+
+   ::
+
+       TimeOutValue = 65
+
+-  Microsoft iSCSI Initiator Driver
+
+   ::
+
+       HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\Class\{4D36E97B-E325-11CE-BFC1-08002BE10318}\<Instance_Number>\Parameters
+
+   ::
+
+       LinkDownTime = 25
+       SRBTimeoutDelta = 15
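
The registry values above can also be applied from an elevated command prompt;
a sketch using the stock ``reg`` tool (the ``REG_DWORD`` type is an assumption
for both values)::

    reg add "HKLM\SYSTEM\CurrentControlSet\Services\Disk" /v TimeOutValue /t REG_DWORD /d 65 /f
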
diff --git a/ceph/doc/rbd/iscsi-initiators.rst b/ceph/doc/rbd/iscsi-initiators.rst
new file mode 100644 (file)
index 0000000..d3ad633
--- /dev/null
@@ -0,0 +1,10 @@
+--------------------------------
+Configuring the iSCSI Initiators
+--------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  The iSCSI Initiator for Red Hat Enterprise Linux <iscsi-initiator-rhel>
+  The iSCSI Initiator for Microsoft Windows <iscsi-initiator-win>
+  The iSCSI Initiator for VMware ESX <iscsi-initiator-esx>
diff --git a/ceph/doc/rbd/iscsi-monitoring.rst b/ceph/doc/rbd/iscsi-monitoring.rst
new file mode 100644 (file)
index 0000000..d425232
--- /dev/null
@@ -0,0 +1,103 @@
+-----------------------------
+Monitoring the iSCSI gateways
+-----------------------------
+
+Ceph provides an additional tool for iSCSI gateway environments
+to monitor performance of exported RADOS Block Device (RBD) images.
+
+The ``gwtop`` tool is a ``top``-like tool that displays aggregated
+performance metrics of RBD images that are exported to clients over
+iSCSI. The metrics are sourced from a Performance Metrics Domain Agent
+(PMDA). Information from the Linux-IO target (LIO) PMDA is used to list
+each exported RBD image with the connected client and its associated I/O
+metrics.
+
+**Requirements:**
+
+-  A running Ceph iSCSI gateway
+
+**Installing:**
+
+#. As ``root``, install the ``ceph-iscsi-tools`` package on each iSCSI
+   gateway node:
+
+   ::
+
+       # yum install ceph-iscsi-tools
+
+#. As ``root``, install the performance co-pilot package on each iSCSI
+   gateway node:
+
+   ::
+
+       # yum install pcp
+
+#. As ``root``, install the LIO PMDA package on each iSCSI gateway node:
+
+   ::
+
+       # yum install pcp-pmda-lio
+
+#. As ``root``, enable and start the performance co-pilot service on
+   each iSCSI gateway node:
+
+   ::
+
+       # systemctl enable pmcd
+       # systemctl start pmcd
+
+#. As ``root``, register the ``pcp-pmda-lio`` agent:
+
+   ::
+
+       # cd /var/lib/pcp/pmdas/lio
+       # ./Install
+
+By default, ``gwtop`` assumes the iSCSI gateway configuration object is
+stored in a RADOS object called ``gateway.conf`` in the ``rbd`` pool.
+This configuration defines the iSCSI gateways to contact for gathering
+the performance statistics. This can be overridden by using either the
+``-g`` or ``-c`` flags. See ``gwtop --help`` for more details.
+
+The LIO configuration determines which type of performance statistics to
+extract from performance co-pilot. When ``gwtop`` starts, it looks at the
+LIO configuration, and if it finds user-space disks, then ``gwtop``
+selects the LIO collector automatically.
+
+**Example ``gwtop`` Outputs**
+
+For kernel RBD-based devices:
+
+::
+
+    gwtop  2/2 Gateways   CPU% MIN:  4 MAX:  5    Network Total In:    2M  Out:    3M   10:20:09
+    Capacity:   8G    Disks:   8   IOPS:  500   Clients:  1   Ceph: HEALTH_OK          OSDs:   3
+    Pool.Image     Src  Device   Size     r/s     w/s    rMB/s     wMB/s    await  r_await  w_await  Client
+    iscsi.t1703          rbd0    500M       0       0     0.00      0.00     0.00     0.00     0.00
+    iscsi.testme1        rbd5    500M       0       0     0.00      0.00     0.00     0.00     0.00
+    iscsi.testme2        rbd2    500M       0       0     0.00      0.00     0.00     0.00     0.00
+    iscsi.testme3        rbd3    500M       0       0     0.00      0.00     0.00     0.00     0.00
+    iscsi.testme5        rbd1    500M       0       0     0.00      0.00     0.00     0.00     0.00
+    rbd.myhost_1    T    rbd4      4G     500       0     1.95      0.00     2.37     2.37     0.00  rh460p(CON)
+    rbd.test_2           rbd6      1G       0       0     0.00      0.00     0.00     0.00     0.00
+    rbd.testme           rbd7    500M       0       0     0.00      0.00     0.00     0.00     0.00
+
+For user backed storage (TCMU) devices:
+
+::
+
+    gwtop  2/2 Gateways   CPU% MIN:  4 MAX:  5    Network Total In:    2M  Out:    3M   10:20:00
+    Capacity:   8G    Disks:   8   IOPS:  503   Clients:  1   Ceph: HEALTH_OK          OSDs:   3
+    Pool.Image       Src    Size     iops     rMB/s     wMB/s   Client
+    iscsi.t1703             500M        0      0.00      0.00
+    iscsi.testme1           500M        0      0.00      0.00
+    iscsi.testme2           500M        0      0.00      0.00
+    iscsi.testme3           500M        0      0.00      0.00
+    iscsi.testme5           500M        0      0.00      0.00
+    rbd.myhost_1      T       4G      504      1.95      0.00   rh460p(CON)
+    rbd.test_2                1G        0      0.00      0.00
+    rbd.testme              500M        0      0.00      0.00
+
+In the *Client* column, ``(CON)`` means the iSCSI initiator (client) is
+currently logged into the iSCSI gateway. If ``-multi-`` is displayed,
+then multiple clients are mapped to the single RBD image.
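
Per the note above about the ``-g`` flag, the gateways can also be named
directly instead of being read from ``gateway.conf``; a hypothetical
invocation (host names invented, exact syntax per ``gwtop --help``)::

    # gwtop -g ceph-igw-1,ceph-igw-2
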
diff --git a/ceph/doc/rbd/iscsi-overview.rst b/ceph/doc/rbd/iscsi-overview.rst
new file mode 100644 (file)
index 0000000..a8c64e2
--- /dev/null
@@ -0,0 +1,50 @@
+==================
+Ceph iSCSI Gateway
+==================
+
+The iSCSI gateway integrates Ceph Storage with the iSCSI standard to provide
+a Highly Available (HA) iSCSI target that exports RADOS Block Device (RBD) images
+as SCSI disks. The iSCSI protocol allows clients (initiators) to send SCSI commands
+to SCSI storage devices (targets) over a TCP/IP network. This allows for heterogeneous
+clients, such as Microsoft Windows, to access the Ceph Storage cluster.
+
+Each iSCSI gateway runs the Linux IO target kernel subsystem (LIO) to provide the
+iSCSI protocol support. LIO utilizes a userspace passthrough (TCMU) to interact
+with Ceph's librbd library and expose RBD images to iSCSI clients. With Ceph’s
+iSCSI gateway you can effectively run a fully integrated block-storage
+infrastructure with all the features and benefits of a conventional Storage Area
+Network (SAN).
+
+.. ditaa::
+                  Cluster Network
+                 +-------------------------------------------+
+                 |             |               |             |
+             +-------+     +-------+       +-------+     +-------+
+             |       |     |       |       |       |     |       |
+             | OSD 1 |     | OSD 2 |       | OSD 3 |     | OSD N |
+             |    {s}|     |    {s}|       |    {s}|     |    {s}|
+             +-------+     +-------+       +-------+     +-------+
+                 |             |               |             |
+      +--------->|             |  +---------+  |             |<---------+
+      :          |             |  |   RBD   |  |             |          :
+      |          +----------------|  Image  |----------------+          |
+      |           Public Network  |    {d}  |                           |
+      |                           +---------+                           |
+      |                                                                 |
+      |                      +-------------------+                      |
+      |   +--------------+   | iSCSI Initiators  |   +--------------+   |
+      |   | iSCSI GW     |   |   +-----------+   |   | iSCSI GW     |   |
+      +-->|  RBD Module  |<--+   | Various   |   +-->|  RBD Module  |<--+
+          |              |   |   | Operating |   |   |              |
+          +--------------+   |   | Systems   |   |   +--------------+
+                             |   +-----------+   |
+                             +-------------------+
+
+
+.. toctree::
+  :maxdepth: 1
+
+  Requirements <iscsi-requirements>
+  Configuring the iSCSI Target <iscsi-targets>
+  Configuring the iSCSI Initiator <iscsi-initiators>
+  Monitoring the iSCSI Gateways <iscsi-monitoring>
diff --git a/ceph/doc/rbd/iscsi-requirements.rst b/ceph/doc/rbd/iscsi-requirements.rst
new file mode 100644 (file)
index 0000000..1ae19e0
--- /dev/null
@@ -0,0 +1,49 @@
+==========================
+iSCSI Gateway Requirements
+==========================
+
+To implement the Ceph iSCSI gateway there are a few requirements. It is recommended
+to use two to four iSCSI gateway nodes for a highly available Ceph iSCSI gateway
+solution.
+
+For hardware recommendations, see the `Hardware Recommendation page <http://docs.ceph.com/docs/master/start/hardware-recommendations/>`_
+for more details.
+
+.. note::
+    On the iSCSI gateway nodes, the memory footprint of the RBD images
+    can grow to a large size. Plan memory requirements accordingly, based
+    on the number of RBD images mapped.
+
+There are no specific iSCSI gateway options for the Ceph Monitors or
+OSDs, but it is important to lower the default timers for detecting
+down OSDs to reduce the possibility of initiator timeouts. The following
+configuration options are suggested for each OSD node in the storage
+cluster::
+
+        [osd]
+        osd heartbeat grace = 20
+        osd heartbeat interval = 5
+
+-  Online Updating Using the Ceph Monitor
+
+   ::
+
+       ceph tell <daemon_type>.<id> injectargs '--<parameter_name> <new_value>'
+
+   ::
+
+       ceph tell osd.0 injectargs '--osd_heartbeat_grace 20'
+       ceph tell osd.0 injectargs '--osd_heartbeat_interval 5'
+
+-  Online Updating on the OSD Node
+
+   ::
+
+       ceph daemon <daemon_type>.<id> config set <parameter_name> <new_value>
+
+   ::
+
+       ceph daemon osd.0 config set osd_heartbeat_grace 20
+       ceph daemon osd.0 config set osd_heartbeat_interval 5
+
+For more details on setting Ceph's configuration options, see the `Configuration page <http://docs.ceph.com/docs/master/rados/configuration/>`_.
diff --git a/ceph/doc/rbd/iscsi-target-ansible.rst b/ceph/doc/rbd/iscsi-target-ansible.rst
new file mode 100644 (file)
index 0000000..4169a9f
--- /dev/null
@@ -0,0 +1,343 @@
+==========================================
+Configuring the iSCSI Target using Ansible
+==========================================
+
+The Ceph iSCSI gateway is the iSCSI target node and also a Ceph client
+node. The Ceph iSCSI gateway can be a standalone node or be colocated on
+a Ceph Object Store Disk (OSD) node. Completing the following steps will
+install and configure the Ceph iSCSI gateway for basic operation.
+
+**Requirements:**
+
+-  A running Ceph Luminous (12.2.x) cluster or newer
+
+-  RHEL/CentOS 7.4, or Linux kernel v4.14 or newer
+
+-  The ``ceph-iscsi-config`` package installed on all the iSCSI gateway nodes
+
+**Installing:**
+
+#. On the Ansible installer node, which could be either the administration node
+   or a dedicated deployment node, perform the following steps:
+
+   #. As ``root``, install the ``ceph-ansible`` package:
+
+      ::
+
+          # yum install ceph-ansible
+
+   #. Add an entry in ``/etc/ansible/hosts`` file for the gateway group:
+
+      ::
+
+          [ceph-iscsi-gw]
+          ceph-igw-1
+          ceph-igw-2
+
+.. note::
+  If co-locating the iSCSI gateway with an OSD node, then add the OSD node to the
+  ``[ceph-iscsi-gw]`` section.
+
+**Configuring:**
+
+The ``ceph-ansible`` package places a file in the ``/usr/share/ceph-ansible/group_vars/``
+directory called ``ceph-iscsi-gw.sample``. Create a copy of this sample file named
+``ceph-iscsi-gw.yml``. Review the following Ansible variables and descriptions,
+and update accordingly.
+
++--------------------------------------+--------------------------------------+
+| Variable                             | Meaning/Purpose                      |
++======================================+======================================+
+| ``seed_monitor``                     | Each gateway needs access to the     |
+|                                      | ceph cluster for rados and rbd       |
+|                                      | calls. This means the iSCSI gateway  |
+|                                      | must have an appropriate             |
+|                                      | ``/etc/ceph/`` directory defined.    |
+|                                      | The ``seed_monitor`` host is used to |
+|                                      | populate the iSCSI gateway’s         |
+|                                      | ``/etc/ceph/`` directory.            |
++--------------------------------------+--------------------------------------+
+| ``cluster_name``                     | Define a custom storage cluster      |
+|                                      | name.                                |
++--------------------------------------+--------------------------------------+
+| ``gateway_keyring``                  | Define a custom keyring name.        |
++--------------------------------------+--------------------------------------+
+| ``deploy_settings``                  | If set to ``true``, then deploy the  |
+|                                      | settings when the playbook is run.   |
++--------------------------------------+--------------------------------------+
+| ``perform_system_checks``            | This is a boolean value that checks  |
+|                                      | for multipath and lvm configuration  |
+|                                      | settings on each gateway. It must be |
+|                                      | set to true for at least the first   |
+|                                      | run to ensure multipathd and lvm are |
+|                                      | configured properly.                 |
++--------------------------------------+--------------------------------------+
+| ``gateway_iqn``                      | This is the iSCSI IQN that all the   |
+|                                      | gateways will expose to clients.     |
+|                                      | This means each client will see the  |
+|                                      | gateway group as a single subsystem. |
++--------------------------------------+--------------------------------------+
+| ``gateway_ip_list``                  | The ip list defines the IP addresses |
+|                                      | that will be used on the front end   |
+|                                      | network for iSCSI traffic. This IP   |
+|                                      | will be bound to the active target   |
+|                                      | portal group on each node, and is    |
+|                                      | the access point for iSCSI traffic.  |
+|                                      | Each IP should correspond to an IP   |
+|                                      | available on the hosts defined in    |
+|                                      | the ``ceph-iscsi-gw`` host group in  |
+|                                      | ``/etc/ansible/hosts``.              |
++--------------------------------------+--------------------------------------+
+| ``rbd_devices``                      | This section defines the RBD images  |
+|                                      | that will be controlled and managed  |
+|                                      | within the iSCSI gateway             |
+|                                      | configuration. Parameters like       |
+|                                      | ``pool`` and ``image`` are self      |
+|                                      | explanatory. Here are the other      |
+|                                      | parameters: ``size`` = This defines  |
+|                                      | the size of the RBD. You may         |
+|                                      | increase the size later, by simply   |
+|                                      | changing this value, but shrinking   |
+|                                      | the size of an RBD is not supported  |
+|                                      | and is ignored. ``host`` = This is   |
+|                                      | the iSCSI gateway host name that     |
+|                                      | will be responsible for the rbd      |
+|                                      | allocation/resize. Every defined     |
+|                                      | ``rbd_device`` entry must have a     |
+|                                      | host assigned. ``state`` = This is   |
+|                                      | typical Ansible syntax for whether   |
+|                                      | the resource should be defined or    |
+|                                      | removed. A request with a state of   |
+|                                      | absent will first be checked to      |
+|                                      | ensure the rbd is not mapped to any  |
+|                                      | client. If the RBD is unallocated,   |
+|                                      | it will be removed from the iSCSI    |
+|                                      | gateway and deleted from the         |
+|                                      | configuration.                       |
++--------------------------------------+--------------------------------------+
+| ``client_connections``               | This section defines the iSCSI       |
+|                                      | client connection details together   |
+|                                      | with the LUN (RBD image) masking.    |
+|                                      | Currently only CHAP is supported as  |
+|                                      | an authentication mechanism. Each    |
+|                                      | connection defines an ``image_list`` |
+|                                      | which is a comma separated list of   |
+|                                      | the form                             |
+|                                      | ``pool.rbd_image[,pool.rbd_image]``. |
+|                                      | RBD images can be added and removed  |
+|                                      | from this list, to change the client |
+|                                      | masking. Note that there are no      |
+|                                      | checks done to limit RBD sharing     |
+|                                      | across client connections.           |
++--------------------------------------+--------------------------------------+
+
+.. note::
+  When using the ``gateway_iqn`` variable, and for Red Hat Enterprise Linux
+  clients, installing the ``iscsi-initiator-utils`` package is required for
+  retrieving the gateway’s IQN name. The iSCSI initiator name is located in the
+  ``/etc/iscsi/initiatorname.iscsi`` file.
+
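
Tying the variables in the table together, a minimal ``ceph-iscsi-gw.yml``
might look as follows. This is a sketch assembled from the descriptions above;
every host name, IQN, IP address, image name and CHAP credential is invented
for illustration::

    seed_monitor: mon1
    cluster_name: ceph
    gateway_keyring: ceph.client.admin.keyring
    deploy_settings: true
    perform_system_checks: true
    gateway_iqn: "iqn.2003-01.com.example.iscsi-gw:ceph-igw"
    gateway_ip_list: 192.168.122.101,192.168.122.102
    rbd_devices:
      - { pool: 'rbd', image: 'disk_1', size: '50G', host: 'ceph-igw-1', state: 'present' }
    client_connections:
      - { client: 'iqn.1994-05.com.example:client1', image_list: 'rbd.disk_1',
          chap: 'client1user/password123456', status: 'present' }
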
+**Deploying:**
+
+On the Ansible installer node, perform the following steps.
+
+#. As ``root``, execute the Ansible playbook:
+
+   ::
+
+       # cd /usr/share/ceph-ansible
+       # ansible-playbook ceph-iscsi-gw.yml
+
+   .. note::
+    The Ansible playbook will handle RPM dependencies, RBD creation
+    and Linux IO configuration.
+
+#. Verify the configuration from an iSCSI gateway node:
+
+   ::
+
+       # gwcli ls
+
+   .. note::
+    For more information on using the ``gwcli`` command to install and configure
+    a Ceph iSCSI gateway, see the `Configuring the iSCSI Target using the Command Line Interface`_
+    section.
+
+   .. important::
+    Attempting to use the ``targetcli`` tool to change the configuration will
+    result in issues such as ALUA misconfiguration and path failover
+    problems. There is the potential to corrupt data, to have mismatched
+    configuration across iSCSI gateways, and to have mismatched WWN information,
+    which will lead to client multipath problems.
+
+**Service Management:**
+
+The ``ceph-iscsi-config`` package installs the configuration management
+logic and a Systemd service called ``rbd-target-gw``. When the Systemd
+service is enabled, the ``rbd-target-gw`` will start at boot time and
+will restore the Linux IO state. The Ansible playbook disables the
+target service during the deployment. Below are the outcomes of
+interacting with the ``rbd-target-gw`` Systemd service.
+
+::
+
+    # systemctl <start|stop|restart|reload> rbd-target-gw
+
+-  ``reload``
+
+   A reload request will force ``rbd-target-gw`` to reread the
+   configuration and apply it to the current running environment. This
+   is normally not required, since changes are deployed in parallel from
+   Ansible to all iSCSI gateway nodes.
+
+-  ``stop``
+
+   A stop request will close the gateway’s portal interfaces, drop
+   connections to clients, and wipe the current LIO configuration from
+   the kernel. This returns the iSCSI gateway to a clean state. When
+   clients are disconnected, active I/O is rescheduled to the other
+   iSCSI gateways by the client side multipathing layer.
+
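+To confirm the current state of the service on a gateway node, a
+standard unit status query can be used, for example:
+
+::
+
+    # systemctl status rbd-target-gw
+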
+**Administration:**
+
+The Ansible playbook supports a number of operational workflows, each
+driven by an update to the
+``/usr/share/ceph-ansible/group_vars/ceph-iscsi-gw`` file.
+
+.. warning::
+  Before removing RBD images from the iSCSI gateway configuration,
+  follow the standard procedures for removing a storage device from
+  the operating system.
+
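+As a sketch of those client-side steps on a Linux initiator (the mount
+point, multipath map, and device names below are hypothetical), the disk
+is unmounted and flushed from the multipath layer before it is removed
+from the gateway configuration:
+
+::
+
+    # umount /mnt/iscsi_disk
+    # multipath -f mpatha
+    # echo 1 > /sys/block/sdb/device/delete
+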
++--------------------------------------+--------------------------------------+
+| I want to…                           | Update the ``ceph-iscsi-gw`` file    |
+|                                      | by…                                  |
++======================================+======================================+
+| Add more RBD images                  | Adding another entry to the          |
+|                                      | ``rbd_devices`` section with the new |
+|                                      | image.                               |
++--------------------------------------+--------------------------------------+
+| Resize an existing RBD image         | Updating the size parameter within   |
+|                                      | the ``rbd_devices`` section. Client  |
+|                                      | side actions are required to pick up |
+|                                      | the new size of the disk.            |
++--------------------------------------+--------------------------------------+
+| Add a client                         | Adding an entry to the               |
+|                                      | ``client_connections`` section.      |
++--------------------------------------+--------------------------------------+
+| Add another RBD to a client          | Adding the relevant RBD              |
+|                                      | ``pool.image`` name to the           |
+|                                      | ``image_list`` variable for the      |
+|                                      | client.                              |
++--------------------------------------+--------------------------------------+
+| Remove an RBD from a client          | Removing the RBD ``pool.image`` name |
+|                                      | from the client's ``image_list``     |
+|                                      | variable.                            |
++--------------------------------------+--------------------------------------+
+| Remove an RBD from the system        | Changing the RBD entry state         |
+|                                      | variable to ``absent``. The RBD      |
+|                                      | image must be unallocated from the   |
+|                                      | operating system first for this to   |
+|                                      | succeed.                             |
++--------------------------------------+--------------------------------------+
+| Change the client's CHAP credentials | Updating the relevant CHAP details   |
+|                                      | in ``client_connections``. This will |
+|                                      | need to be coordinated with the      |
+|                                      | clients. For example, the client     |
+|                                      | issues an iSCSI logout, the          |
+|                                      | credentials are changed by the       |
+|                                      | Ansible playbook, the credentials    |
+|                                      | are changed at the client, then the  |
+|                                      | client performs an iSCSI login.      |
++--------------------------------------+--------------------------------------+
+| Remove a client                      | Updating the relevant                |
+|                                      | ``client_connections`` item with a   |
+|                                      | state of ``absent``. Once the        |
+|                                      | Ansible playbook is run, the client  |
+|                                      | will be purged from the system, but  |
+|                                      | the disks will remain defined to     |
+|                                      | Linux IO for potential reuse.        |
++--------------------------------------+--------------------------------------+
+
+Once a change has been made, rerun the Ansible playbook to apply the
+change across the iSCSI gateway nodes.
+
+::
+
+    # ansible-playbook ceph-iscsi-gw.yml
+
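+For the CHAP workflow above, the client-side logout, credential update,
+and login can be driven with ``iscsiadm``; the target IQN and credentials
+below are hypothetical:
+
+::
+
+    # iscsiadm -m node -T iqn.2003-01.com.redhat.iscsi-gw:ceph-igw -u
+    # iscsiadm -m node -T iqn.2003-01.com.redhat.iscsi-gw:ceph-igw \
+          -o update -n node.session.auth.username -v newuser
+    # iscsiadm -m node -T iqn.2003-01.com.redhat.iscsi-gw:ceph-igw \
+          -o update -n node.session.auth.password -v newpassword12
+    # iscsiadm -m node -T iqn.2003-01.com.redhat.iscsi-gw:ceph-igw -l
+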
+**Removing the Configuration:**
+
+The ``ceph-ansible`` package provides an Ansible playbook to
+remove the iSCSI gateway configuration and related RBD images. The
+Ansible playbook is ``/usr/share/ceph-ansible/purge_gateways.yml``. When
+this Ansible playbook is run, it prompts for the type of purge to
+perform:
+
+*lio* :
+
+In this mode the LIO configuration is purged on all iSCSI gateways that
+are defined. Disks that were created are left untouched within the Ceph
+storage cluster.
+
+*all* :
+
+When ``all`` is chosen, the LIO configuration is removed together with
+**all** RBD images that were defined within the iSCSI gateway
+environment; other unrelated RBD images will not be removed. Ensure the
+correct mode is chosen, because this operation will delete data.
+
+.. warning::
+  A purge operation is a destructive action against your iSCSI gateway
+  environment.
+
+.. warning::
+  A purge operation will fail if RBD images have snapshots or clones
+  and are exported through the Ceph iSCSI gateway.
+
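+Whether an exported image still has snapshots or clones can be checked
+beforehand with the standard ``rbd`` commands; the pool, image, and
+snapshot names here are hypothetical:
+
+::
+
+    # rbd snap ls rbd/disk_1
+    # rbd children rbd/disk_1@snap1
+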
+::
+
+    [root@rh7-iscsi-client ceph-ansible]# ansible-playbook purge_gateways.yml
+    Which configuration elements should be purged? (all, lio or abort) [abort]: all
+
+
+    PLAY [Confirm removal of the iSCSI gateway configuration] *********************
+
+
+    GATHERING FACTS ***************************************************************
+    ok: [localhost]
+
+
+    TASK: [Exit playbook if user aborted the purge] *******************************
+    skipping: [localhost]
+
+
+    TASK: [set_fact ] *************************************************************
+    ok: [localhost]
+
+
+    PLAY [Removing the gateway configuration] *************************************
+
+
+    GATHERING FACTS ***************************************************************
+    ok: [ceph-igw-1]
+    ok: [ceph-igw-2]
+
+
+    TASK: [igw_purge | purging the gateway configuration] *************************
+    changed: [ceph-igw-1]
+    changed: [ceph-igw-2]
+
+
+    TASK: [igw_purge | deleting configured rbd devices] ***************************
+    changed: [ceph-igw-1]
+    changed: [ceph-igw-2]
+
+
+    PLAY RECAP ********************************************************************
+    ceph-igw-1                 : ok=3    changed=2    unreachable=0    failed=0
+    ceph-igw-2                 : ok=3    changed=2    unreachable=0    failed=0
+    localhost                  : ok=2    changed=0    unreachable=0    failed=0
+
+
+.. _Configuring the iSCSI Target using the Command Line Interface: ../iscsi-target-cli
diff --git a/ceph/doc/rbd/iscsi-target-cli.rst b/ceph/doc/rbd/iscsi-target-cli.rst
new file mode 100644 (file)
index 0000000..6da6f10
--- /dev/null
@@ -0,0 +1,163 @@
+=============================================================
+Configuring the iSCSI Target using the Command Line Interface
+=============================================================
+
+The Ceph iSCSI gateway is the iSCSI target node and also a Ceph client
+node. The Ceph iSCSI gateway can be a standalone node or be colocated on
+a Ceph Object Storage Daemon (OSD) node. Completing the following steps
+will install and configure the Ceph iSCSI gateway for basic operation.
+
+**Requirements:**
+
+-  A running Ceph Luminous or later storage cluster
+
+-  RHEL/CentOS 7.4; or Linux kernel v4.14 or newer
+
+-  The following packages must be installed from your Linux distribution's software repository:
+
+   -  ``targetcli-2.1.fb47`` or newer package
+
+   -  ``python-rtslib-2.1.fb64`` or newer package
+
+   -  ``tcmu-runner-1.3.0`` or newer package
+
+   -  ``ceph-iscsi-config-2.3`` or newer package
+
+   -  ``ceph-iscsi-cli-2.5`` or newer package
+
+     .. important::
+        If previous versions of these packages exist, then they must
+        be removed first before installing the newer versions.
+
+Do the following steps on the Ceph iSCSI gateway node before proceeding
+to the *Installing* section:
+
+#. If the Ceph iSCSI gateway is not colocated on an OSD node, then copy
+   the Ceph configuration files, located in ``/etc/ceph/``, from a
+   running Ceph node in the storage cluster to the iSCSI Gateway node.
+   The Ceph configuration files must exist on the iSCSI gateway node
+   under ``/etc/ceph/``.
+
+#. Install and configure the `Ceph Command-line
+   Interface <http://docs.ceph.com/docs/master/start/quick-rbd/#install-ceph>`_
+
+#. If needed, open TCP ports 3260 and 5000 on the firewall.
+
+#. Create a new RADOS Block Device (RBD) image, or use an existing one
+   (see the example below).
+
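+For example, a new image can be created with the ``rbd`` command-line
+tool; the pool and image names are hypothetical:
+
+::
+
+    # rbd create disk_1 --size 50G --pool rbd
+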
+**Installing:**
+
+#. As ``root``, on all iSCSI gateway nodes, install the
+   ``ceph-iscsi-cli`` package:
+
+   ::
+
+       # yum install ceph-iscsi-cli
+
+#. As ``root``, on all iSCSI gateway nodes, install the ``tcmu-runner``
+   package:
+
+   ::
+
+       # yum install tcmu-runner
+
+#. As ``root``, on an iSCSI gateway node, create a file named
+   ``iscsi-gateway.cfg`` in the ``/etc/ceph/`` directory:
+
+   ::
+
+       # touch /etc/ceph/iscsi-gateway.cfg
+
+   #. Edit the ``iscsi-gateway.cfg`` file and add the following lines:
+
+      ::
+
+          [config]
+          # Name of the Ceph storage cluster. A suitable Ceph configuration file allowing
+          # access to the Ceph storage cluster from the gateway node is required, if not
+          # colocated on an OSD node.
+          cluster_name = ceph
+
+          # Place a copy of the ceph cluster's admin keyring in the gateway's /etc/ceph
+          # directory and reference the filename here
+          gateway_keyring = ceph.client.admin.keyring
+
+
+          # API settings.
+          # The API supports a number of options that allow you to tailor it to your
+          # local environment. If you want to run the API under https, you will need to
+          # create cert/key files that are compatible for each iSCSI gateway node, that is
+          # not locked to a specific node. SSL cert and key files *must* be called
+          # 'iscsi-gateway.crt' and 'iscsi-gateway.key' and placed in the '/etc/ceph/' directory
+          # on *each* gateway node. With the SSL files in place, you can use 'api_secure = true'
+          # to switch to https mode.
+
+          # To support the API, the bare minimum settings are:
+          api_secure = false
+
+          # Additional API configuration options are as follows, defaults shown.
+          # api_user = admin
+          # api_password = admin
+          # api_port = 5001
+          # trusted_ip_list = 192.168.0.10,192.168.0.11
+
+      .. important::
+        The ``iscsi-gateway.cfg`` file must be identical on all iSCSI gateway nodes.
+
+   #. As ``root``, copy the ``iscsi-gateway.cfg`` file to all iSCSI
+      gateway nodes.
+
+#. As ``root``, on all iSCSI gateway nodes, enable and start the API
+   service:
+
+   ::
+
+       # systemctl enable rbd-target-api
+       # systemctl start rbd-target-api
+
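+A quick way to confirm that the API service came up is to query its unit
+status on each gateway node, for example:
+
+::
+
+    # systemctl status rbd-target-api
+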
+**Configuring:**
+
+#. As ``root``, on an iSCSI gateway node, start the iSCSI gateway
+   command-line interface:
+
+   ::
+
+       # gwcli
+
+#. Creating the iSCSI gateways:
+
+   ::
+
+       >/iscsi-target create iqn.2003-01.com.redhat.iscsi-gw:<target_name>
+       > goto gateways
+       > create <iscsi_gw_name> <IP_addr_of_gw>
+       > create <iscsi_gw_name> <IP_addr_of_gw>
+
+#. Adding a RADOS Block Device (RBD):
+
+   ::
+
+       > cd /iscsi-target/iqn.2003-01.com.redhat.iscsi-gw:<target_name>/disks/
+       >/disks/ create pool=<pool_name> image=<image_name> size=<image_size>m|g|t
+
+#. Creating a client:
+
+   ::
+
+       > goto hosts
+       > create iqn.1994-05.com.redhat:<client_name>
+       > auth chap=<user_name>/<password> | nochap
+
+   .. warning::
+       CHAP must always be configured. Without CHAP, the target will
+       reject any login requests.
+
+#. Adding disks to a client:
+
+   ::
+
+       >/iscsi-target..eph-igw/hosts> cd iqn.1994-05.com.redhat:<client_name>
+       > disk add <pool_name>.<image_name>
+
+The next step is to configure the iSCSI initiators.
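+
+Before moving on, a quick sanity check can be run from a Linux client
+with the ``iscsi-initiator-utils`` package installed; the portal IP
+below is hypothetical:
+
+::
+
+    # iscsiadm -m discovery -t st -p 192.168.0.10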
diff --git a/ceph/doc/rbd/iscsi-targets.rst b/ceph/doc/rbd/iscsi-targets.rst
new file mode 100644 (file)
index 0000000..b7dcac7
--- /dev/null
@@ -0,0 +1,27 @@
+=============
+iSCSI Targets
+=============
+
+Traditionally, block-level access to a Ceph storage cluster has been
+limited to QEMU and ``librbd``, which is a key enabler for adoption
+within OpenStack environments. Starting with the Ceph Luminous release,
+block-level access is expanding to offer standard iSCSI support allowing
+wider platform usage, and potentially opening new use cases.
+
+-  RHEL/CentOS 7.4; or Linux kernel v4.14 or newer
+
+-  A working Ceph Storage cluster, deployed with ``ceph-ansible`` or using the command-line interface
+
+-  iSCSI gateway nodes, either colocated with OSD nodes or on dedicated nodes
+
+-  Separate network subnets for iSCSI front-end traffic and Ceph back-end traffic
+
+The Ceph iSCSI gateway can be installed and configured using either
+Ansible or the command-line interface:
+
+.. toctree::
+  :maxdepth: 1
+
+  Using Ansible <iscsi-target-ansible>
+  Using the Command Line Interface <iscsi-target-cli>
diff --git a/ceph/qa/objectstore/bluestore-bitmap.yaml b/ceph/qa/objectstore/bluestore-bitmap.yaml
new file mode 100644 (file)
index 0000000..88dca3a
--- /dev/null
@@ -0,0 +1,39 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        bluestore allocator: bitmap
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+# this doesn't work with failures bc the log writes are not atomic across the two backends
+#        bluestore bluefs env mirror: true
+  ceph-deploy:
+    fs: xfs
+    bluestore: yes
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+
index 16767efac762961aef75f159e2266413990bd072..b408032fdefdc8cc1d44a987fcbecf69deb2e81c 100644 (file)
@@ -8,8 +8,7 @@ overrides:
       osd:
         osd objectstore: bluestore
         bluestore block size: 96636764160
-        debug bluestore: 30
-        debug bdev: 20
+        debug bluestore: 20
         debug bluefs: 20
         debug rocksdb: 10
         bluestore compression mode: aggressive
index 838f3715b2c011b009bdbfc78f0983e98bb7e242..19dfeb036ff31dca3d2c69bb8c798bd10e92b3a3 100644 (file)
@@ -8,8 +8,7 @@ overrides:
       osd:
         osd objectstore: bluestore
         bluestore block size: 96636764160
-        debug bluestore: 30
-        debug bdev: 20
+        debug bluestore: 20
         debug bluefs: 20
         debug rocksdb: 10
         bluestore fsck on mount: true
@@ -27,8 +26,7 @@ overrides:
       osd:
         osd objectstore: bluestore
         bluestore block size: 96636764160
-        debug bluestore: 30
-        debug bdev: 20
+        debug bluestore: 20
         debug bluefs: 20
         debug rocksdb: 10
         bluestore fsck on mount: true
old mode 100644 (file)
new mode 100755 (executable)
index 4a782ba..858100a
@@ -31,27 +31,15 @@ set -e
 
 PROGNAME=$(basename $0)
 
-# xfstests is downloaded from this git repository and then built.
-# XFSTESTS_REPO="git://oss.sgi.com/xfs/cmds/xfstests.git"
-XFSTESTS_REPO="git://git.ceph.com/xfstests.git"
-XFSTESTS_VERSION="facff609afd6a2ca557c2b679e088982026aa188"
-XFSPROGS_REPO="git://oss.sgi.com/xfs/cmds/xfsprogs"
-XFSPROGS_VERSION="v3.2.2"
-XFSDUMP_REPO="git://oss.sgi.com/xfs/cmds/xfsdump"
-XFSDUMP_VERSION="v3.1.4"
-
 # Default command line option values
 COUNT="1"
 EXPUNGE_FILE=""
 DO_RANDOMIZE=""        # false
-FS_TYPE="xfs"
+FSTYP="xfs"
 SCRATCH_DEV="" # MUST BE SPECIFIED
 TEST_DEV=""    # MUST BE SPECIFIED
 TESTS="-g auto"        # The "auto" group is supposed to be "known good"
 
-# We no longer need to set the stripe unit in XFS_MKFS_OPTIONS because recent
-# versions of mkfs.xfs autodetect it.
-
 # print an error message and quit with non-zero status
 function err() {
        if [ $# -gt 0 ]; then
@@ -197,7 +185,7 @@ function parseargs() {
                        -f|--fs-type)
                                fs_type_valid "$2" ||
                                        usage "invalid fs_type '$2'"
-                               FS_TYPE="$2"
+                               FSTYP="$2"
                                shift
                                ;;
                        -r|--randomize)
@@ -237,102 +225,6 @@ function parseargs() {
 
 ################################################################
 
-[ -n "${TESTDIR}" ] || usage "TESTDIR env variable must be set"
-
-# Set up some environment for normal teuthology test setup.
-# This really should not be necessary but I found it was.
-export CEPH_ARGS="--conf ${TESTDIR}/ceph.conf"
-export CEPH_ARGS="${CEPH_ARGS} --keyring ${TESTDIR}/data/client.0.keyring"
-export CEPH_ARGS="${CEPH_ARGS} --name client.0"
-
-export LD_LIBRARY_PATH="${TESTDIR}/binary/usr/local/lib:${LD_LIBRARY_PATH}"
-export PATH="${TESTDIR}/binary/usr/local/bin:${PATH}"
-export PATH="${TESTDIR}/binary/usr/local/sbin:${PATH}"
-
-################################################################
-
-# Filesystem-specific mkfs options--set if not supplied
-#export XFS_MKFS_OPTIONS="${XFS_MKFS_OPTIONS:--f -l su=65536}"
-export EXT4_MKFS_OPTIONS="${EXT4_MKFS_OPTIONS:--F}"
-export BTRFS_MKFS_OPTION       # No defaults
-
-XFSTESTS_DIR="/var/lib/xfstests"       # Where the tests live
-XFSPROGS_DIR="/tmp/cephtest/xfsprogs-install"
-XFSDUMP_DIR="/tmp/cephtest/xfsdump-install"
-export PATH="${XFSPROGS_DIR}/sbin:${XFSDUMP_DIR}/sbin:${PATH}"
-
-# download, build, and install xfstests
-function install_xfstests() {
-       arg_count 0 $#
-
-       local multiple=""
-       local ncpu
-
-       pushd "${TESTDIR}"
-
-       git clone "${XFSTESTS_REPO}"
-
-       cd xfstests
-       git checkout "${XFSTESTS_VERSION}"
-
-       ncpu=$(getconf _NPROCESSORS_ONLN 2>&1)
-       [ -n "${ncpu}" -a "${ncpu}" -gt 1 ] && multiple="-j ${ncpu}"
-
-       make realclean
-       make ${multiple}
-       make -k install
-
-       popd
-}
-
-# remove previously-installed xfstests files
-function remove_xfstests() {
-       arg_count 0 $#
-
-       rm -rf "${TESTDIR}/xfstests"
-       rm -rf "${XFSTESTS_DIR}"
-}
-
-# create a host options file that uses the specified devices
-function setup_host_options() {
-       arg_count 0 $#
-       export MNTDIR="/tmp/cephtest"
-
-       # Create mount points for the test and scratch filesystems
-       mkdir -p ${MNTDIR}
-       local test_dir="$(mktemp -d ${MNTDIR}/test_dir.XXXXXXXXXX)"
-       local scratch_dir="$(mktemp -d ${MNTDIR}/scratch_mnt.XXXXXXXXXX)"
-
-       # Write a host options file that uses these devices.
-       # xfstests uses the file defined by HOST_OPTIONS as the
-       # place to get configuration variables for its run, and
-       # all (or most) of the variables set here are required.
-       export HOST_OPTIONS="$(mktemp ${TESTDIR}/host_options.XXXXXXXXXX)"
-       cat > "${HOST_OPTIONS}" <<-!
-               # Created by ${PROGNAME} on $(date)
-               # HOST_OPTIONS="${HOST_OPTIONS}"
-               TEST_DEV="${TEST_DEV}"
-               SCRATCH_DEV="${SCRATCH_DEV}"
-               TEST_DIR="${test_dir}"
-               SCRATCH_MNT="${scratch_dir}"
-               FSTYP="${FS_TYPE}"
-               export TEST_DEV SCRATCH_DEV TEST_DIR SCRATCH_MNT FSTYP
-               #
-               export XFS_MKFS_OPTIONS="${XFS_MKFS_OPTIONS}"
-       !
-
-       # Now ensure we are using the same values
-       . "${HOST_OPTIONS}"
-}
-
-# remove the host options file, plus the directories it refers to
-function cleanup_host_options() {
-       arg_count 0 $#
-
-       rm -rf "${TEST_DIR}" "${SCRATCH_MNT}"
-       rm -f "${HOST_OPTIONS}"
-}
-
 # run mkfs on the given device using the specified filesystem type
 function do_mkfs() {
        arg_count 1 $#
@@ -341,161 +233,74 @@ function do_mkfs() {
        local options
 
        case "${FSTYP}" in
-               xfs)    options="${XFS_MKFS_OPTIONS}" ;;
-               ext4)   options="${EXT4_MKFS_OPTIONS}" ;;
-               btrfs)  options="${BTRFS_MKFS_OPTIONS}" ;;
+               xfs)    options="-f" ;;
+               ext4)   options="-F" ;;
+               btrfs)  options="-f" ;;
        esac
 
        "mkfs.${FSTYP}" ${options} "${dev}" ||
                err "unable to make ${FSTYP} file system on device \"${dev}\""
 }
 
-# mount the given device on the given mount point
-function do_mount() {
-       arg_count 2 $#
-
-       local dev="${1}"
-       local dir="${2}"
-
-       mount "${dev}" "${dir}" ||
-               err "unable to mount file system \"${dev}\" on \"${dir}\""
-}
-
-# unmount a previously-mounted device
-function do_umount() {
-       arg_count 1 $#
-
-       local dev="${1}"
-
-       if mount | grep "${dev}" > /dev/null; then
-               if ! umount "${dev}"; then
-                       err "unable to unmount device \"${dev}\""
-               fi
-       else
-               # Report it but don't error out
-               echo "device \"${dev}\" was not mounted" >&2
-       fi
-}
-
-# do basic xfstests setup--make and mount the test and scratch filesystems
-function setup_xfstests() {
-       arg_count 0 $#
-
-       # TEST_DEV can persist across test runs, but for now we
-       # don't bother.   I believe xfstests prefers its devices to
-       # have been already been formatted for the desired
-       # filesystem type--it uses blkid to identify things or
-       # something.  So we mkfs both here for a fresh start.
-       do_mkfs "${TEST_DEV}"
-       do_mkfs "${SCRATCH_DEV}"
-
-       # I believe the test device is expected to be mounted; the
-       # scratch doesn't need to be (but it doesn't hurt).
-       do_mount "${TEST_DEV}" "${TEST_DIR}"
-       do_mount "${SCRATCH_DEV}" "${SCRATCH_MNT}"
-}
-
-# clean up changes made by setup_xfstests
-function cleanup_xfstests() {
-       arg_count 0 $#
-
-       # Unmount these in case a test left them mounted (plus
-       # the corresponding setup function mounted them...)
-       do_umount "${TEST_DEV}"
-       do_umount "${SCRATCH_DEV}"
-       rmdir "${TEST_DIR}"
-       rmdir "${SCRATCH_MNT}"
-       rmdir "${MNTDIR}"
-}
-
-function install_xfsprogs() {
-       arg_count 0 $#
-
-       pushd "${TESTDIR}"
-       git clone ${XFSPROGS_REPO}
-       cd xfsprogs
-       git checkout ${XFSPROGS_VERSION}
-       libtoolize -c `libtoolize -n -i >/dev/null 2>/dev/null && echo -i` -f
-       cp include/install-sh .
-       aclocal -I m4
-       autoconf
-       ./configure --prefix=${XFSPROGS_DIR}
-       make install
-       popd
-}
-
-function install_xfsdump() {
-       arg_count 0 $#
-
-       pushd "${TESTDIR}"
-       git clone ${XFSDUMP_REPO}
-       cd xfsdump
-       git checkout ${XFSDUMP_VERSION}
-
-       # somebody took #define min and #define max out, which breaks the build on
-       # ubuntu. we back out this commit here, though that may cause problems with
-       # this script down the line.
-       git revert -n 5a2985233c390d59d2a9757b119cb0e001c87a96
-       libtoolize -c `libtoolize -n -i >/dev/null 2>/dev/null && echo -i` -f
-       cp include/install-sh .
-       aclocal -I m4
-       autoconf
-       ./configure --prefix=${XFSDUMP_DIR}
-       (make -k install || true) # that's right, the install process is broken too
-       popd
-}
-
-function remove_xfsprogs() {
-       arg_count 0 $#
-
-       rm -rf ${TESTDIR}/xfsprogs
-       rm -rf ${XFSPROGS_DIR}
-}      
-
-function remove_xfsdump() {
-       arg_count 0 $#
-
-       rm -rf ${TESTDIR}/xfsdump
-       rm -rf ${XFSDUMP_DIR}
-}
-
-
 # top-level setup routine
 function setup() {
        arg_count 0 $#
 
-       setup_host_options
-       install_xfsprogs
-       install_xfsdump
-       install_xfstests
-       setup_xfstests
+       wget -P "${TESTDIR}" http://download.ceph.com/qa/xfstests.tar.gz
+       tar zxf "${TESTDIR}/xfstests.tar.gz" -C "$(dirname "${XFSTESTS_DIR}")"
+       mkdir "${TEST_DIR}"
+       mkdir "${SCRATCH_MNT}"
+       do_mkfs "${TEST_DEV}"
 }
 
 # top-level (final) cleanup routine
 function cleanup() {
        arg_count 0 $#
 
-       cd /
-       remove_xfsprogs
-       remove_xfsdump
-       cleanup_xfstests
-       remove_xfstests
-       cleanup_host_options
+       # ensure teuthology can clean up the logs
+       chmod -R a+rw "${TESTDIR}/archive"
+
+       findmnt "${TEST_DEV}" && umount "${TEST_DEV}"
+       [ -d "${SCRATCH_MNT}" ] && rmdir "${SCRATCH_MNT}"
+       [ -d "${TEST_DIR}" ] && rmdir "${TEST_DIR}"
+       rm -rf "${XFSTESTS_DIR}"
+       rm -f "${TESTDIR}/xfstests.tar.gz"
 }
-trap cleanup EXIT ERR HUP INT QUIT
 
 # ################################################################
 
 start_date="$(date)"
-
 parseargs "$@"
+[ -n "${TESTDIR}" ] || usage "TESTDIR env variable must be set"
+[ -d "${TESTDIR}/archive" ] || usage "\$TESTDIR/archive directory must exist"
+TESTDIR="$(readlink -e "${TESTDIR}")"
+[ -n "${EXPUNGE_FILE}" ] && EXPUNGE_FILE="$(readlink -e "${EXPUNGE_FILE}")"
 
+XFSTESTS_DIR="/var/lib/xfstests"  # hardcoded into dbench binary
+TEST_DIR="/mnt/test_dir"
+SCRATCH_MNT="/mnt/scratch_mnt"
+MKFS_OPTIONS=""
+EXT_MOUNT_OPTIONS="-o block_validity"
+
+trap cleanup EXIT ERR HUP INT QUIT
 setup
 
+export TEST_DEV
+export TEST_DIR
+export SCRATCH_DEV
+export SCRATCH_MNT
+export FSTYP
+export MKFS_OPTIONS
+export EXT_MOUNT_OPTIONS
+
 pushd "${XFSTESTS_DIR}"
 for (( i = 1 ; i <= "${COUNT}" ; i++ )); do
        [ "${COUNT}" -gt 1 ] && echo "=== Iteration "$i" starting at:  $(date)"
 
+       RESULT_BASE="${TESTDIR}/archive/results-${i}"
+       mkdir "${RESULT_BASE}"
+       export RESULT_BASE
+
        EXPUNGE=""
        [ -n "${EXPUNGE_FILE}" ] && EXPUNGE="-E ${EXPUNGE_FILE}"
 
@@ -503,8 +308,8 @@ for (( i = 1 ; i <= "${COUNT}" ; i++ )); do
        [ -n "${DO_RANDOMIZE}" ] && RANDOMIZE="-r"
 
        # -T output timestamps
-       ./check -T ${RANDOMIZE} ${EXPUNGE} ${TESTS}
-       status=$?
+       PATH="${PWD}/bin:${PATH}" ./check -T ${RANDOMIZE} ${EXPUNGE} ${TESTS}
+       findmnt "${TEST_DEV}" && umount "${TEST_DEV}"
 
        [ "${COUNT}" -gt 1 ] && echo "=== Iteration "$i" complete at:  $(date)"
 done
@@ -515,5 +320,4 @@ popd
 echo "This xfstests run started at:  ${start_date}"
 echo "xfstests run completed at:     $(date)"
 [ "${COUNT}" -gt 1 ] && echo "xfstests run consisted of ${COUNT} iterations"
-
-exit "${status}"
+echo OK
diff --git a/ceph/qa/run_xfstests_krbd.sh b/ceph/qa/run_xfstests_krbd.sh
deleted file mode 100644 (file)
index aafc0f1..0000000
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/bin/bash
-#
-# This is a wrapper around run_xfstests.sh to provide an expunge file
-# suitable for krbd xfstests runs.
-
-set -x
-
-[ -n "${TESTDIR}" ] || export TESTDIR="/tmp/cephtest"
-[ -d "${TESTDIR}" ] || mkdir "${TESTDIR}"
-
-SCRIPT="run_xfstests.sh"
-
-if [ -z "${URL_BASE}" ]; then
-       URL_BASE="https://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa"
-fi
-
-cd "${TESTDIR}"
-
-wget -O "${SCRIPT}" "${URL_BASE}/${SCRIPT}"
-chmod +x "${SCRIPT}"
-
-EXPUNGE="$(mktemp expunge.XXXXXXXXXX)"
-cat > "${EXPUNGE}" <<-!
-       # mv - moved here from the old version of run_xfstests.sh
-       #      and rbd_xfstests.yaml
-       # wasn't run - like 'mv', but wasn't specifically excluded
-       # new test - didn't exist in the xfstests version that was
-       #            used by the old version of this script
-       
-       generic/038
-       generic/042     # zeroes out only the last 4k of test file, but expects
-                       #  only zeros in the entire file. bug in test?
-       generic/046     # _count_extents in common/rc assumes backticks do not
-                       #  remove newlines. This breaks parsing on some
-                       #  platforms.
-       generic/050     # blockdev --setro right after mkfs returns EBUSY
-       generic/078 # RENAME_WHITEOUT was enabled in kernel commit 7dcf5c, but causes
-                       # a BUG for now
-       generic/081     # ubuntu lvm2 doesn't suport --yes argument
-       generic/083     # mkfs.xfs -dxize=104857600,agcount=6 fails
-                       #  when sunit=swidth=8192
-       generic/093     # not for Linux
-       generic/097     # not for Linux
-       generic/099     # not for Linux
-       generic/204     # stripe size throws off test's math for when to
-                       #  expect ENOSPC
-       generic/231 # broken for disk and rbd by xfs kernel commit 4162bb
-       generic/247 # race between DIO and mmap writes
-               # see (https://lists.01.org/pipermail/lkp/2015-March/002459.html)
-
-       shared/272      # not for xfs
-       shared/289      # not for xfs
-
-       xfs/007         # sector size math
-       xfs/030         # mkfs.xfs -dsize=100m,agcount=6 fails
-                       #  when sunit=swidth=8192
-       xfs/032         # xfs_copy cleans up with pthread_kill (RHBA-2015-0537)
-       xfs/042         # stripe size throws off test's math when filling FS
-       xfs/051
-       xfs/057         # test for IRIX
-       xfs/058         # test for IRIX
-       xfs/069         # _filter_bmap in common/punch parses incorrectly if
-                       #  blocks are not stripe-aligned
-       xfs/070         # extra output from xfs_repair
-       xfs/071         # xfs_repair issue on large offsets (RHBA-2015-0537)
-       xfs/073
-       xfs/081         # very small mkfs breaks test with sunit=swidth-8192
-       xfs/095         # not for Linux
-       xfs/096         # checks various mkfs options and chokes on sunit/swidth
-       xfs/104         # can't suppress sunit/swidth warnings on mkfs
-       xfs/109         # can't suppress sunit/swidth warnings on mkfs
-       xfs/167
-       xfs/178         # test explicitly checks for stripe width of 0
-       xfs/191         # tests NFSv4
-       xfs/197         # tests 32-bit machines
-       xfs/205         # very small mkfs breaks tests with sunit=swidth=8192
-       xfs/242         # _filter_bmap in common/punch parses incorrectly if
-                       #  blocks are not stripe-aligned
-       xfs/261         # bug in mount_xfs involving creation of new quota files
-       xfs/279         # sector size math (logical v. physical: BZ836433?)
-       xfs/297         # XXX: temporarily expunged due to length
-       xfs/300         # SELinux
-!
-
-./"${SCRIPT}" -x "$(readlink -f "${EXPUNGE}")" "$@"
-STATUS=$?
-
-rm -f "${EXPUNGE}"
-rm -f "${SCRIPT}"
-
-exit "${STATUS}"
index 160f9d8c1a12d8f1844b489b479a1e867c0b6471..bcaab3fa0e8898093f137bad3e3e77a3ff0e478d 100755 (executable)
@@ -209,6 +209,7 @@ function TEST_mon_classes() {
     ceph osd crush tree --show-shadow | grep 'class_1' || return 1
     ceph osd crush rule create-replicated class_1_rule default host class_1 || return 1
     ceph osd crush class rename class_1 class_2
+    ceph osd crush class rename class_1 class_2 # idempotent
     ceph osd crush class ls | grep 'class_1' && return 1
     ceph osd crush tree --show-shadow | grep 'class_1' && return 1
     ceph osd crush class ls | grep 'class_2' || return 1
diff --git a/ceph/qa/standalone/mon/osd-pool-df.sh b/ceph/qa/standalone/mon/osd-pool-df.sh
new file mode 100755 (executable)
index 0000000..3ed169d
--- /dev/null
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2017 Tencent <contact@tencent.com>
+#
+# Author: Chang Liu <liuchang0812@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7113" # git grep '\<7113\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_ceph_df() {
+    local dir=$1
+    setup $dir || return 1
+
+    run_mon $dir a || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+    run_osd $dir 3 || return 1
+    run_osd $dir 4 || return 1
+    run_osd $dir 5 || return 1
+    run_mgr $dir x || return 1
+
+    profile+=" plugin=jerasure"
+    profile+=" technique=reed_sol_van"
+    profile+=" k=4"
+    profile+=" m=2"
+    profile+=" crush-failure-domain=osd"
+
+    ceph osd erasure-code-profile set ec42profile ${profile}
+    local rep_poolname=testcephdf_replicate
+    local ec_poolname=testcephdf_erasurecode
+    create_pool $rep_poolname 6 6 replicated
+    create_pool $ec_poolname 6 6 erasure ec42profile
+
+    local global_avail=`ceph df -f json | jq '.stats.total_avail_bytes'`
+    # pass the pool names into jq via --arg; shell variables do not expand
+    # inside a single-quoted jq program
+    local rep_avail=`ceph df -f json | jq --arg pool $rep_poolname '.pools | map(select(.name == $pool))[0].stats.max_avail'`
+    local ec_avail=`ceph df -f json | jq --arg pool $ec_poolname '.pools | map(select(.name == $pool))[0].stats.max_avail'`
+
+    # bc prints 1 (true) or 0 (false); test its output, not its exit status
+    test "$(echo "${global_avail} >= ${rep_avail}*3" | bc)" = 1 || return 1
+    test "$(echo "${global_avail} >= ${ec_avail}*1.5" | bc)" = 1 || return 1
+
+    ceph osd pool delete  $rep_poolname $rep_poolname  --yes-i-really-really-mean-it
+    ceph osd pool delete  $ec_poolname $ec_poolname  --yes-i-really-really-mean-it
+    ceph osd erasure-code-profile rm ec42profile
+    teardown $dir || return 1
+}
+
+main osd-pool-df "$@"
index 85941a895f631870062953393b457c10a1be8031..7ea6ae0dddc0fb55fa642ba1f5f792212242ea2f 100755 (executable)
@@ -33,7 +33,7 @@ function TEST_pool_quota() {
     run_osd $dir 1 || return 1
     run_osd $dir 2 || return 1
 
-    local poolname=testquoa
+    local poolname=testquota
     create_pool $poolname 20
     local objects=`ceph df detail | grep -w $poolname|awk '{print $3}'`
     local bytes=`ceph df detail | grep -w $poolname|awk '{print $4}'`
diff --git a/ceph/qa/suites/fs/basic_functional/tasks/alternate-pool.yaml b/ceph/qa/suites/fs/basic_functional/tasks/alternate-pool.yaml
new file mode 100644 (file)
index 0000000..94d5cc6
--- /dev/null
@@ -0,0 +1,20 @@
+
+overrides:
+  ceph:
+    log-whitelist:
+      - bad backtrace
+      - object missing on disk
+      - error reading table object
+      - error reading sessionmap
+      - unmatched fragstat
+      - unmatched rstat
+      - was unreadable, recreating it now
+      - Scrub error on inode
+      - Metadata damage detected
+      - MDS_FAILED
+      - MDS_DAMAGE
+
+tasks:
+  - cephfs_test_runner:
+      modules:
+        - tasks.cephfs.test_recovery_pool
index d22bc353c77dd325e5621c30ff12954808ecd699..635d0b6d8256544e405321c4df4e4fe0ea2e7268 100644 (file)
@@ -9,7 +9,9 @@ overrides:
       - failing to respond to cache pressure
       - slow requests are blocked
       - failing to respond to capability release
+      - MDS cache is too large
       - \(MDS_CLIENT_OLDEST_TID\)
+      - \(MDS_CACHE_OVERSIZED\)
 
 tasks:
   - cephfs_test_runner:
index 26ccd2bed624ed16858034f320ca950c29d04900..64c8a23b0ad286f019df4d880871af07f7a4298c 100644 (file)
@@ -11,6 +11,7 @@ overrides:
       - was unreadable, recreating it now
       - Scrub error on inode
       - Metadata damage detected
+      - inconsistent rstat on inode
 
 tasks:
   - cephfs_test_runner:
index 0f1572377a7a228634633a8390d77e2e2dc1c8c8..cb84e648380c2171508e2fef8dd338f987f6fd6d 100644 (file)
@@ -7,3 +7,4 @@ overrides:
   ceph:
     log-whitelist:
       - evicting unresponsive client
+      - POOL_APP_NOT_ENABLED
index 629804ec752e3cc170e00de243b41b2f352905c0..90d0e7bcb04561f3137a552e619da8c93adfa8fa 100644 (file)
@@ -4,6 +4,7 @@ overrides:
       - force file system read-only
       - bad backtrace
       - MDS in read-only mode
+      - \(MDS_READ_ONLY\)
 
 
 tasks:
index d297e5c3a669bad8fb9b8412b0799aac42130745..f816cee9bec03fed508d2bedc86ba213cf88abd7 100644 (file)
@@ -9,6 +9,9 @@ overrides:
       - failing to respond to cache pressure
       - slow requests are blocked
       - failing to respond to capability release
+      - MDS cache is too large
+      - \(MDS_CLIENT_OLDEST_TID\)
+      - \(MDS_CACHE_OVERSIZED\)
 
 tasks:
   - cephfs_test_runner:
index ebf9e63bee15bfd1257a1e4cbc9231f25765b833..72ce013fabfc9c4c71692c8189b5b20a02630980 100644 (file)
@@ -6,6 +6,7 @@ overrides:
     log-whitelist:
       - but it is still running
       - slow request
+      - evicting unresponsive client
 
 tasks:
   - cephfs_test_runner:
index 9428624e094e98c212b9172d959377a6b1ac411d..b2cd7395012b3bf98aa239cce16cf61767ef688b 100644 (file)
@@ -10,6 +10,7 @@ overrides:
       - was unreadable, recreating it now
       - Scrub error on inode
       - Metadata damage detected
+      - inconsistent rstat on inode
 
 tasks:
   - cephfs_test_runner:
index b06985336c93555d11caf06ee430bcb86caf4873..2e4655be0434a9ad779523a52cd369a54f276a88 100644 (file)
@@ -1,7 +1,10 @@
-
+overrides:
+  ceph:
+    log-whitelist:
+      - not responding, replacing
+      - \(MDS_INSUFFICIENT_STANDBY\)
 tasks:
   - cephfs_test_runner:
       fail_on_skip: false
       modules:
         - tasks.cephfs.test_failover
-
diff --git a/ceph/qa/suites/kcephfs/recovery/whitelist_health.yaml b/ceph/qa/suites/kcephfs/recovery/whitelist_health.yaml
new file mode 120000 (symlink)
index 0000000..90ca7b6
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/thrash/thrashosds-health.yaml b/ceph/qa/suites/kcephfs/thrash/thrashosds-health.yaml
new file mode 120000 (symlink)
index 0000000..ebf7f34
--- /dev/null
@@ -0,0 +1 @@
+../../../tasks/thrashosds-health.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/thrash/whitelist_health.yaml b/ceph/qa/suites/kcephfs/thrash/whitelist_health.yaml
new file mode 120000 (symlink)
index 0000000..90ca7b6
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
index d27054a848227e8985d741d8f9cabd2168dc1411..443aa0efac855dff8e796e4e1c36196e4a4ab139 100644 (file)
@@ -11,13 +11,28 @@ tasks:
 - install:
 - ceph:
 - rbd.xfstests:
-    client.0:
+    client.0: &ref
         test_image: 'test_image-0'
+        test_size: 5120  # MB
         scratch_image: 'scratch_image-0'
-        tests: '-g auto'
+        scratch_size: 5120  # MB
+        fs_type: ext4
+        tests: '-g auto -x clone'
+        exclude:
+        - generic/042
+        - generic/392
+        - generic/044
+        - generic/045
+        - generic/046
+        - generic/223
+        - ext4/304
+        - generic/050  # krbd BLKROSET bug
+        - generic/388
+        - generic/405
+        - generic/422
+        - generic/448
         randomize: true
     client.1:
+        <<: *ref
         test_image: 'test_image-1'
         scratch_image: 'scratch_image-1'
-        tests: '-g auto'
-        randomize: true
index 4e60cd6560a30ce5cda406483f9499003da80511..ef998cc89143d7b453f04fb160ddcd932e6bc18f 100644 (file)
@@ -14,6 +14,8 @@ tasks:
       cluster: ceph
   - exec:
       mon.a:
+        - sleep 15
+        - ceph osd dump | grep purged_snapdirs
         - ceph pg dump -f json-pretty
         - "ceph pg dump sum -f json-pretty | grep num_legacy_snapsets | head -1 | grep ': 0'"
 overrides:
index 7fdb1f1a41553eb07833b9dcfaaf4125f6ba4ace..376bf08eddb4dee943af65450fb1a19259d12830 100644 (file)
@@ -21,10 +21,6 @@ overrides:
       osd: # force bluestore since it's required for ec overwrites
         osd objectstore: bluestore
         bluestore block size: 96636764160
-        debug bluestore: 30
-        debug bdev: 20
-        debug bluefs: 20
-        debug rocksdb: 10
         enable experimental unrecoverable data corrupting features: "*"
         osd debug randomize hobject sort order: false
 # this doesn't work with failures bc the log writes are not atomic across the two backends
index a63ab270340745cb2b10872c2091fb9a5d2be64b..f39a5bb4ca622798ff73783cc5e3221e4b85319b 100644 (file)
@@ -18,10 +18,6 @@ overrides:
       osd: # force bluestore since it's required for ec overwrites
         osd objectstore: bluestore
         bluestore block size: 96636764160
-        debug bluestore: 30
-        debug bdev: 20
-        debug bluefs: 20
-        debug rocksdb: 10
         enable experimental unrecoverable data corrupting features: "*"
         osd debug randomize hobject sort order: false
 # this doesn't work with failures bc the log writes are not atomic across the two backends
index 3f31b18e183cb970e7f1eb3b2746f58774debdfd..dc8671b7d9ddef654efb476b3110fa4e0221bd12 100644 (file)
@@ -1,5 +1,5 @@
-tasks:
-- ceph:
+overrides:
+  ceph:
     conf:
       client:
         rbd cache: false
index 8eb30c45e6f2c2ae86c32d086e11493db38557f1..fbc76bda77364a7d35e6d25b4b3f1e639e7bebe6 100644 (file)
@@ -1,18 +1,14 @@
 meta:
-- desc: 2 ceph clusters with 3 mons and 3 osds each
+- desc: 2 ceph clusters with 1 mon and 3 osds each
 roles:
 - - cluster1.mon.a
-  - cluster1.mon.b
   - cluster1.mgr.x
   - cluster1.osd.0
   - cluster1.osd.1
   - cluster1.osd.2
-  - cluster2.mon.c
   - cluster1.client.0
   - cluster2.client.0
-- - cluster1.mon.c
-  - cluster2.mon.a
-  - cluster2.mon.b
+- - cluster2.mon.a
   - cluster2.mgr.x
   - cluster2.osd.0
   - cluster2.osd.1
index a63ab270340745cb2b10872c2091fb9a5d2be64b..f39a5bb4ca622798ff73783cc5e3221e4b85319b 100644 (file)
@@ -18,10 +18,6 @@ overrides:
       osd: # force bluestore since it's required for ec overwrites
         osd objectstore: bluestore
         bluestore block size: 96636764160
-        debug bluestore: 30
-        debug bdev: 20
-        debug bluefs: 20
-        debug rocksdb: 10
         enable experimental unrecoverable data corrupting features: "*"
         osd debug randomize hobject sort order: false
 # this doesn't work with failures bc the log writes are not atomic across the two backends
index c41470d521b52a42ff471514786dbbed00d14d82..1ac9181ae358d4b22da62dbd2d6ff918a65587fb 100644 (file)
@@ -4,5 +4,7 @@ overrides:
     conf:
       client:
         debug rgw: 20
+        rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo=
+        rgw crypt require ssl: false
   rgw:
     compression type: random
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/% b/ceph/qa/suites/upgrade/luminous-x/parallel/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/+ b/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/openstack.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/openstack.yaml
new file mode 100644 (file)
index 0000000..f4d1349
--- /dev/null
@@ -0,0 +1,4 @@
+openstack:
+  - volumes: # attached to each instance
+      count: 3
+      size: 30 # GB
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml
new file mode 100644 (file)
index 0000000..1e8d5a5
--- /dev/null
@@ -0,0 +1,39 @@
+meta:
+- desc: |
+   Run ceph on two nodes,
+   with clients 0-3 on a separate third node.
+   Use xfs beneath the osds.
+   CephFS tests run on clients 2 and 3.
+roles:
+- - mon.a
+  - mgr.x
+  - mds.a
+  - osd.0
+  - osd.1
+- - mon.b
+  - mon.c
+  - osd.2
+  - osd.3
+- - client.0
+  - client.1
+  - client.2
+  - client.3
+overrides:
+  ceph:
+    log-whitelist:
+    - scrub mismatch
+    - ScrubResult
+    - wrongly marked
+    - (POOL_APP_NOT_ENABLED)
+    - overall HEALTH_
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: "*"
+      mon:
+        mon warn on osd down out interval zero: false
+      osd:
+        osd_class_load_list: "cephfs hello journal lock log numops rbd refcount 
+                              replica_log rgw sdk statelog timeindex user version"
+        osd_class_default_list: "cephfs hello journal lock log numops rbd refcount 
+                                 replica_log rgw sdk statelog timeindex user version"
+    fs: xfs
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/1-ceph-install/luminous.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/1-ceph-install/luminous.yaml
new file mode 100644 (file)
index 0000000..3d57f79
--- /dev/null
@@ -0,0 +1,43 @@
+meta:
+- desc: |
+   install ceph/luminous latest
+   run workload and upgrade-sequence in parallel
+   upgrade the client node
+tasks:
+- install:
+    branch: luminous
+- print: "**** done installing luminous"
+- ceph:
+    log-whitelist:
+      - overall HEALTH_
+      - \(FS_
+      - \(MDS_
+      - \(OSD_
+      - \(MON_DOWN\)
+      - \(CACHE_POOL_
+      - \(POOL_
+      - \(MGR_DOWN\)
+      - \(PG_
+      - \(SMALLER_PGP_NUM\)
+      - Monitor daemon marked osd
+      - Behind on trimming
+      - Manager daemon
+    conf:
+      global:
+        mon warn on pool no app: false
+- exec:
+    osd.0:
+      - ceph osd require-osd-release luminous
+      - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph"
+- install.upgrade:
+    mon.a:
+    mon.b:
+- print: "**** done install.upgrade both hosts"
+- parallel:
+    - workload
+    - upgrade-sequence
+- print: "**** done parallel"
+- install.upgrade:
+    client.0:
+- print: "**** done install.upgrade on client.0"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/+ b/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/blogbench.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/blogbench.yaml
new file mode 100644 (file)
index 0000000..021fcc6
--- /dev/null
@@ -0,0 +1,14 @@
+meta:
+- desc: |
+   run a cephfs stress test
+   mount ceph-fuse on client.2 before running workunit
+workload:
+  full_sequential:
+  - sequential:
+    - ceph-fuse:
+    - print: "**** done ceph-fuse 2-workload"
+    - workunit:
+        clients:
+           client.2:
+            - suites/blogbench.sh
+    - print: "**** done suites/blogbench.sh 2-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/ec-rados-default.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/ec-rados-default.yaml
new file mode 100644 (file)
index 0000000..5c5a958
--- /dev/null
@@ -0,0 +1,24 @@
+meta:
+- desc: |
+   run randomized correctness test for rados operations
+   on an erasure-coded pool
+workload:
+  full_sequential:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 50
+      ec_pool: true
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 0
+        append: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+        copy_from: 50
+        setattr: 25
+        rmattr: 25
+  - print: "**** done rados ec task"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/rados_api.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/rados_api.yaml
new file mode 100644 (file)
index 0000000..e4cc9f9
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   object class functional tests
+workload:
+  full_sequential:
+    - workunit:
+        branch: luminous
+        clients:
+          client.0:
+            - cls
+    - print: "**** done cls 2-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/rados_loadgenbig.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/rados_loadgenbig.yaml
new file mode 100644 (file)
index 0000000..874a8c5
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   generate read/write load with rados objects ranging from 1MB to 25MB
+workload:
+  full_sequential:
+    - workunit:
+        branch: luminous
+        clients:
+          client.0:
+            - rados/load-gen-big.sh
+    - print: "**** done rados/load-gen-big.sh 2-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/test_rbd_api.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/test_rbd_api.yaml
new file mode 100644 (file)
index 0000000..81563c9
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   librbd C and C++ api tests
+workload:
+  full_sequential:
+    - workunit:
+        branch: luminous
+        clients:
+          client.0:
+            - rbd/test_librbd.sh
+    - print: "**** done rbd/test_librbd.sh 2-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/test_rbd_python.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/2-workload/test_rbd_python.yaml
new file mode 100644 (file)
index 0000000..e17207d
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   librbd python api tests
+workload:
+  full_sequential:
+    - workunit:
+        branch: luminous
+        clients:
+          client.0:
+            - rbd/test_librbd_python.sh
+    - print: "**** done rbd/test_librbd_python.sh 2-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/3-upgrade-sequence/upgrade-all.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/3-upgrade-sequence/upgrade-all.yaml
new file mode 100644 (file)
index 0000000..cff3a68
--- /dev/null
@@ -0,0 +1,16 @@
+meta:
+- desc: |
+   upgrade the ceph cluster
+upgrade-sequence:
+   sequential:
+   - ceph.restart:
+       daemons: [mon.a, mon.b, mon.c, mgr.x]
+   - ceph.restart:
+       daemons: [osd.0, osd.1, osd.2, osd.3]
+       wait-for-healthy: false
+       wait-for-osds-up: true
+   - ceph.restart:
+       daemons: [mds.a]
+       wait-for-healthy: false
+       wait-for-osds-up: true
+   - print: "**** done ceph.restart all"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml
new file mode 100644 (file)
index 0000000..f197de6
--- /dev/null
@@ -0,0 +1,35 @@
+meta:
+- desc: |
+   upgrade the ceph cluster,
+   upgrade in two steps
+   step one ordering: mon.a, osd.0, osd.1, mds.a
+   step two ordering: mon.b, mon.c, osd.2, osd.3
+   ceph is expected to be in a healthy state after each step
+upgrade-sequence:
+   sequential:
+   - ceph.restart:
+       daemons: [mon.a]
+       wait-for-healthy: true
+   - sleep:
+       duration: 60
+   - ceph.restart:
+       daemons: [mon.b, mon.c, mgr.x]
+       wait-for-healthy: true
+   - sleep:
+       duration: 60
+   - ceph.restart:
+       daemons: [osd.0, osd.1]
+       wait-for-healthy: true
+   - sleep:
+       duration: 60
+   - ceph.restart: [mds.a]
+   - sleep:
+       duration: 60
+   - sleep:
+       duration: 60
+   - ceph.restart:
+       daemons: [osd.2, osd.3]
+       wait-for-healthy: false
+       wait-for-osds-up: true
+   - sleep:
+       duration: 60
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/+ b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/blogbench.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/blogbench.yaml
new file mode 100644 (file)
index 0000000..d2629c0
--- /dev/null
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   run a cephfs stress test
+   mount ceph-fuse on client.3 before running workunit
+tasks:
+- sequential:
+  - ceph-fuse:
+  - print: "**** done ceph-fuse 5-final-workload"
+  - workunit:
+      clients:
+         client.3:
+          - suites/blogbench.sh
+  - print: "**** done suites/blogbench.sh 5-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados-snaps-few-objects.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados-snaps-few-objects.yaml
new file mode 100644 (file)
index 0000000..d8b3dcb
--- /dev/null
@@ -0,0 +1,17 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshots
+tasks:
+  - rados:
+      clients: [client.1]
+      ops: 4000
+      objects: 50
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+  - print: "**** done rados 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados_loadgenmix.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados_loadgenmix.yaml
new file mode 100644 (file)
index 0000000..922a9da
--- /dev/null
@@ -0,0 +1,9 @@
+meta:
+- desc: |
+   generate read/write load with rados objects ranging from 1 byte to 1MB
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - rados/load-gen-mix.sh
+  - print: "**** done rados/load-gen-mix.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados_mon_thrash.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rados_mon_thrash.yaml
new file mode 100644 (file)
index 0000000..a42b7d2
--- /dev/null
@@ -0,0 +1,18 @@
+meta:
+- desc: |
+   librados C and C++ api tests
+overrides:
+  ceph:
+    log-whitelist:
+      - reached quota
+tasks:
+  - mon_thrash:
+      revive_delay: 20
+      thrash_delay: 1
+  - print: "**** done mon_thrash 4-final-workload"
+  - workunit:
+      branch: luminous
+      clients:
+        client.1:
+          - rados/test.sh
+  - print: "**** done rados/test.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_cls.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_cls.yaml
new file mode 100644 (file)
index 0000000..aaf0a37
--- /dev/null
@@ -0,0 +1,9 @@
+meta:
+- desc: |
+   rbd object class functional tests
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - cls/test_cls_rbd.sh
+  - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml
new file mode 100644 (file)
index 0000000..46e1355
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   run basic import/export cli tests for rbd
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - rbd/import_export.sh
+      env:
+        RBD_CREATE_ARGS: --new-format
+  - print: "**** done rbd/import_export.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rgw_swift.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rgw_swift.yaml
new file mode 100644 (file)
index 0000000..7a7659f
--- /dev/null
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   swift api tests for rgw
+overrides:
+  rgw:
+    frontend: civetweb
+tasks:
+  - rgw: [client.1]
+  - print: "**** done rgw 4-final-workload"
+  - swift:
+      client.1:
+        rgw_server: client.1
+  - print: "**** done swift 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/distros b/ceph/qa/suites/upgrade/luminous-x/parallel/distros
new file mode 120000 (symlink)
index 0000000..ca99fee
--- /dev/null
@@ -0,0 +1 @@
+../../../../distros/supported/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/objectstore b/ceph/qa/suites/upgrade/luminous-x/parallel/objectstore
new file mode 120000 (symlink)
index 0000000..016cbf9
--- /dev/null
@@ -0,0 +1 @@
+../stress-split/objectstore/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/% b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/0-cluster b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/0-cluster
new file mode 120000 (symlink)
index 0000000..3580937
--- /dev/null
@@ -0,0 +1 @@
+../stress-split/0-cluster/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/2-partial-upgrade b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/2-partial-upgrade
new file mode 120000 (symlink)
index 0000000..ab35fc1
--- /dev/null
@@ -0,0 +1 @@
+../stress-split/2-partial-upgrade/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/3-thrash/default.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/3-thrash/default.yaml
new file mode 100644 (file)
index 0000000..edae7b3
--- /dev/null
@@ -0,0 +1,25 @@
+meta:
+- desc: |
+   randomly kill and revive osd
+   small chance to increase the number of pgs
+overrides:
+  ceph:
+    log-whitelist:
+    - but it is still running
+    - wrongly marked me down
+    - objects unfound and apparently lost
+    - log bound mismatch
+tasks:
+- parallel:
+  - stress-tasks
+stress-tasks:
+- thrashosds:
+    timeout: 1200
+    chance_pgnum_grow: 1
+    chance_pgpnum_fix: 1
+    min_in: 4
+    chance_thrash_cluster_full: 0
+    chance_thrash_pg_upmap: 0
+    chance_thrash_pg_upmap_items: 0
+    chance_force_recovery: 0
+- print: "**** done thrashosds 3-thrash"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/4-ec-workload.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/4-ec-workload.yaml
new file mode 100644 (file)
index 0000000..c89551e
--- /dev/null
@@ -0,0 +1,22 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on an erasure coded pool
+stress-tasks:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 50
+      ec_pool: true
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 0
+        append: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+        copy_from: 50
+        setattr: 25
+        rmattr: 25
+  - print: "**** done rados ec task"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/5-finish-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/5-finish-upgrade.yaml
new file mode 120000 (symlink)
index 0000000..a66a7dc
--- /dev/null
@@ -0,0 +1 @@
+../stress-split/5-finish-upgrade.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/7-final-workload.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/7-final-workload.yaml
new file mode 100644 (file)
index 0000000..50a1465
--- /dev/null
@@ -0,0 +1,35 @@
+#
+# k=3 implies a stripe_width of 1376*3 = 4128, which is different from
+# the default value of 4096. It is also not a multiple of 1024*1024 and
+# creates situations where rounding rules during recovery become
+# necessary.
+#
+meta:
+- desc: |
+   randomized correctness test for rados operations on an erasure coded pool
+   using the jerasure plugin with k=3 and m=1
+tasks:
+- rados:
+    clients: [client.0]
+    ops: 4000
+    objects: 50
+    ec_pool: true
+    write_append_excl: false
+    erasure_code_profile:
+      name: jerasure31profile
+      plugin: jerasure
+      k: 3
+      m: 1
+      technique: reed_sol_van
+      crush-failure-domain: osd
+    op_weights:
+      read: 100
+      write: 0
+      append: 100
+      delete: 50
+      snap_create: 50
+      snap_remove: 50
+      rollback: 50
+      copy_from: 50
+      setattr: 25
+      rmattr: 25
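
The arithmetic from the header comment, spelled out (all values come from the comment itself):

    k = 3
    stripe_unit = 1376                  # bytes per chunk, per the comment
    stripe_width = k * stripe_unit      # 4128, not the 4096 default
    assert stripe_width == 4128
    assert stripe_width != 4096
    # 1 MiB objects do not divide evenly into 4128-byte stripes, so
    # recovery has to apply rounding rules at the tail.
    assert (1024 * 1024) % stripe_width != 0
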
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/distros b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/distros
new file mode 120000 (symlink)
index 0000000..ca99fee
--- /dev/null
@@ -0,0 +1 @@
+../../../../distros/supported/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/objectstore b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/objectstore
new file mode 120000 (symlink)
index 0000000..016cbf9
--- /dev/null
@@ -0,0 +1 @@
+../stress-split/objectstore/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/thrashosds-health.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/thrashosds-health.yaml
new file mode 120000 (symlink)
index 0000000..e0426db
--- /dev/null
@@ -0,0 +1 @@
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/% b/ceph/qa/suites/upgrade/luminous-x/stress-split/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/+ b/ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/openstack.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/openstack.yaml
new file mode 100644 (file)
index 0000000..a0d5c20
--- /dev/null
@@ -0,0 +1,6 @@
+openstack:
+  - machine:
+      disk: 100 # GB
+  - volumes: # attached to each instance
+      count: 3
+      size: 30 # GB
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/start.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/0-cluster/start.yaml
new file mode 100644 (file)
index 0000000..e3ad918
--- /dev/null
@@ -0,0 +1,29 @@
+meta:
+- desc: |
+   Run ceph on two nodes,
+   with a separate client-only node.
+   Use xfs beneath the osds.
+overrides:
+  ceph:
+    fs: xfs
+    log-whitelist:
+      - overall HEALTH_
+      - \(MON_DOWN\)
+      - \(MGR_DOWN\)
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: "*"
+      mon:
+        mon warn on osd down out interval zero: false
+roles:
+- - mon.a
+  - mon.b
+  - mon.c
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+- - osd.3
+  - osd.4
+  - osd.5
+- - client.0
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/1-ceph-install/luminous.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/1-ceph-install/luminous.yaml
new file mode 100644 (file)
index 0000000..2230525
--- /dev/null
@@ -0,0 +1,17 @@
+meta:
+- desc: install ceph/luminous latest
+tasks:
+- install:
+    branch: luminous
+- print: "**** done install luminous"
+- ceph:
+- exec:
+    osd.0:
+      - ceph osd require-osd-release luminous
+      - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph "
+overrides:
+  ceph:
+    conf:
+      mon:
+        mon warn on osd down out interval zero: false
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/2-partial-upgrade/firsthalf.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/2-partial-upgrade/firsthalf.yaml
new file mode 100644 (file)
index 0000000..87fa1d5
--- /dev/null
@@ -0,0 +1,12 @@
+meta:
+- desc: |
+   install upgrade to ceph/-x on one node only
+   1st half
+   restart: osd.0,1,2
+tasks:
+- install.upgrade:
+    osd.0:
+- print: "**** done install.upgrade osd.0"
+- ceph.restart:
+    daemons: [mon.a,mon.b,mon.c,mgr.x,osd.0,osd.1,osd.2]
+- print: "**** done ceph.restart 1st half"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/3-thrash/default.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/3-thrash/default.yaml
new file mode 100644 (file)
index 0000000..b3fddef
--- /dev/null
@@ -0,0 +1,25 @@
+meta:
+- desc: |
+   randomly kill and revive osd
+   small chance to increase the number of pgs
+overrides:
+  ceph:
+    log-whitelist:
+    - but it is still running
+    - wrongly marked me down
+    - objects unfound and apparently lost
+    - log bound mismatch
+tasks:
+- parallel:
+  - stress-tasks
+stress-tasks:
+- thrashosds:
+    timeout: 1200
+    chance_pgnum_grow: 1
+    chance_pgpnum_fix: 1
+    chance_thrash_cluster_full: 0
+    chance_thrash_pg_upmap: 0
+    chance_thrash_pg_upmap_items: 0
+    disable_objectstore_tool_tests: true
+    chance_force_recovery: 0
+- print: "**** done thrashosds 3-thrash"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/+ b/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/radosbench.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/radosbench.yaml
new file mode 100644 (file)
index 0000000..626ae8e
--- /dev/null
@@ -0,0 +1,40 @@
+meta:
+- desc: |
+   run randomized correctness test for rados operations
+   generate write load with rados bench
+stress-tasks:
+- full_sequential:
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+- print: "**** done radosbench 7-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd-cls.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd-cls.yaml
new file mode 100644 (file)
index 0000000..f8cc4d8
--- /dev/null
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   run basic cls tests for rbd
+stress-tasks:
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - cls/test_cls_rbd.sh
+- print: "**** done cls/test_cls_rbd.sh 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd-import-export.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd-import-export.yaml
new file mode 100644 (file)
index 0000000..30a677a
--- /dev/null
@@ -0,0 +1,12 @@
+meta:
+- desc: |
+   run basic import/export cli tests for rbd
+stress-tasks:
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - rbd/import_export.sh
+    env:
+      RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd_api.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/rbd_api.yaml
new file mode 100644 (file)
index 0000000..9079aa3
--- /dev/null
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   librbd C and C++ api tests
+stress-tasks:
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - rbd/test_librbd.sh
+- print: "**** done rbd/test_librbd.sh 4-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/readwrite.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/readwrite.yaml
new file mode 100644 (file)
index 0000000..41e34d6
--- /dev/null
@@ -0,0 +1,16 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool,
+   using only reads, writes, and deletes
+stress-tasks:
+- full_sequential:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 500
+      write_append_excl: false
+      op_weights:
+        read: 45
+        write: 45
+        delete: 10
+- print: "**** done rados/readwrite 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/snaps-few-objects.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/4-workload/snaps-few-objects.yaml
new file mode 100644 (file)
index 0000000..f56d0de
--- /dev/null
@@ -0,0 +1,18 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshot operations
+stress-tasks:
+- full_sequential:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 50
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+- print: "**** done rados/snaps-few-objects 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/5-finish-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/5-finish-upgrade.yaml
new file mode 100644 (file)
index 0000000..1d528cd
--- /dev/null
@@ -0,0 +1,9 @@
+tasks:
+- install.upgrade:
+    osd.3:
+    client.0:
+- ceph.restart:
+    daemons: [osd.3, osd.4, osd.5]
+    wait-for-healthy: false
+    wait-for-osds-up: true
+
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/+ b/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/rbd-python.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/rbd-python.yaml
new file mode 100644 (file)
index 0000000..92fe658
--- /dev/null
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   librbd python api tests
+tasks:
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - rbd/test_librbd_python.sh
+- print: "**** done rbd/test_librbd_python.sh 9-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/rgw-swift.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/rgw-swift.yaml
new file mode 100644 (file)
index 0000000..76e5d6f
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   swift api tests for rgw
+tasks:
+- rgw:
+    client.0:
+- print: "**** done rgw 9-workload"
+- swift:
+    client.0:
+      rgw_server: client.0
+- print: "**** done swift 9-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/snaps-many-objects.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/7-final-workload/snaps-many-objects.yaml
new file mode 100644 (file)
index 0000000..805bf97
--- /dev/null
@@ -0,0 +1,16 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- rados:
+    clients: [client.0]
+    ops: 4000
+    objects: 500
+    write_append_excl: false
+    op_weights:
+      read: 100
+      write: 100
+      delete: 50
+      snap_create: 50
+      snap_remove: 50
+      rollback: 50
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/distros b/ceph/qa/suites/upgrade/luminous-x/stress-split/distros
new file mode 120000 (symlink)
index 0000000..ca99fee
--- /dev/null
@@ -0,0 +1 @@
+../../../../distros/supported/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/objectstore/bluestore.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/objectstore/bluestore.yaml
new file mode 120000 (symlink)
index 0000000..d644598
--- /dev/null
@@ -0,0 +1 @@
+../../../../../objectstore/bluestore.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/objectstore/filestore-xfs.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/objectstore/filestore-xfs.yaml
new file mode 120000 (symlink)
index 0000000..03750e5
--- /dev/null
@@ -0,0 +1 @@
+../../../../../objectstore/filestore-xfs.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split/thrashosds-health.yaml b/ceph/qa/suites/upgrade/luminous-x/stress-split/thrashosds-health.yaml
new file mode 120000 (symlink)
index 0000000..e0426db
--- /dev/null
@@ -0,0 +1 @@
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
index 358ec8d648e99555adf05ce5290ab418408e910a..a37fec1c070374e88612306dd9797a88219637f7 100644 (file)
@@ -365,7 +365,7 @@ def cephfs_setup(ctx, config):
     if mdss.remotes:
         log.info('Setting up CephFS filesystem...')
 
-        fs = Filesystem(ctx, create='cephfs')
+        fs = Filesystem(ctx, name='cephfs', create=True)
 
         is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
         all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
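
This call-site change tracks the reworked Filesystem constructor below, which splits the old overloaded create argument into explicit name and create parameters. Usage after the change, as a sketch (names from this diff; ctx is the usual teuthology context):

    fs = Filesystem(ctx, name='cephfs', create=True)  # create (or adopt legacy fs) by name
    fs = Filesystem(ctx, fscid=0)                     # bind to an existing fs by id
    fs = Filesystem(ctx, name='recovery_fs')          # reference only; create defaults to False
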
index b2f687e49bb2a89030d30a8286248f871402e3a3..9da03bdd9082324689e018ebadd48aea84c2e423 100644 (file)
@@ -1031,6 +1031,7 @@ class Thrasher:
                         Scrubber(self.ceph_manager, self.config)
             self.choose_action()()
             time.sleep(delay)
+        self.all_up()
         if self.random_eio > 0:
             self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
                           'injectargs', '--', '--filestore_debug_random_read_err=0.0')
index 0f66106c9f1f0ef0b87b5402867af105e28ece76..5767df4611de9727763f44a30b004c8d6c20fe4b 100644 (file)
@@ -18,6 +18,7 @@ class CephTestCase(unittest.TestCase):
     # Environment references
     mounts = None
     fs = None
+    recovery_fs = None
     ceph_cluster = None
     mds_cluster = None
     mgr_cluster = None
index 1181c80048fb21e30ce49cbaa1d1908e62512d43..801d0d3114c4adb6971cee81cabce98d0921eeda 100644 (file)
@@ -43,6 +43,7 @@ class CephFSTestCase(CephTestCase):
     # FIXME weird explicit naming
     mount_a = None
     mount_b = None
+    recovery_mount = None
 
     # Declarative test requirements: subclasses should override these to indicate
     # their special needs.  If not met, tests will be skipped.
@@ -55,6 +56,9 @@ class CephFSTestCase(CephTestCase):
     # Whether to create the default filesystem during setUp
     REQUIRE_FILESYSTEM = True
 
+    # requires REQUIRE_FILESYSTEM = True
+    REQUIRE_RECOVERY_FILESYSTEM = False
+
     LOAD_SETTINGS = []
 
     def setUp(self):
@@ -105,6 +109,7 @@ class CephFSTestCase(CephTestCase):
         self.mds_cluster.mds_fail()
         self.mds_cluster.delete_all_filesystems()
         self.fs = None # is now invalid!
+        self.recovery_fs = None
 
         # In case the previous filesystem had filled up the RADOS cluster, wait for that
         # flag to pass.
@@ -138,7 +143,7 @@ class CephFSTestCase(CephTestCase):
                 self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
 
         if self.REQUIRE_FILESYSTEM:
-            self.fs = self.mds_cluster.newfs(True)
+            self.fs = self.mds_cluster.newfs(create=True)
             self.fs.mds_restart()
 
             # In case some test messed with auth caps, reset them
@@ -157,6 +162,20 @@ class CephFSTestCase(CephTestCase):
                 self.mounts[i].mount()
                 self.mounts[i].wait_until_mounted()
 
+        if self.REQUIRE_RECOVERY_FILESYSTEM:
+            if not self.REQUIRE_FILESYSTEM:
+                raise case.SkipTest("Recovery filesystem requires a primary filesystem as well")
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
+                                                'enable_multiple', 'true',
+                                                '--yes-i-really-mean-it')
+            self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
+            self.recovery_fs.set_metadata_overlay(True)
+            self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
+            self.recovery_fs.create()
+            self.recovery_fs.getinfo(refresh=True)
+            self.recovery_fs.mds_restart()
+            self.recovery_fs.wait_for_daemons()
+
         # Load any config settings of interest
         for setting in self.LOAD_SETTINGS:
             setattr(self, setting, float(self.fs.mds_asok(
index dad3bd87b2654979a2b22a6664e39401c7fc16e2..44f6cbaf16dbf0e9e9e2418dfed9f5dd9ca4acbd 100644 (file)
@@ -8,6 +8,7 @@ import time
 import datetime
 import re
 import errno
+import random
 
 from teuthology.exceptions import CommandFailedError
 from teuthology import misc
@@ -225,6 +226,17 @@ class MDSCluster(CephCluster):
         else:
             cb(mds_id)
 
+    def get_config(self, key, service_type=None):
+        """
+        get_config specialization of service_type="mds"
+        """
+        if service_type != "mds":
+            return super(MDSCluster, self).get_config(key, service_type)
+
+        # Some tests stop MDS daemons, don't send commands to a dead one:
+        service_id = random.sample(filter(lambda i: self.mds_daemons[i].running(), self.mds_daemons), 1)[0]
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
     def mds_stop(self, mds_id=None):
         """
         Stop the MDS daemon process(es).  If it held a rank, that rank
@@ -256,8 +268,8 @@ class MDSCluster(CephCluster):
 
         self._one_or_all(mds_id, _fail_restart)
 
-    def newfs(self, name):
-        return Filesystem(self._ctx, create=name)
+    def newfs(self, name='cephfs', create=True):
+        return Filesystem(self._ctx, name=name, create=create)
 
     def status(self):
         return FSStatus(self.mon_manager)
@@ -362,30 +374,29 @@ class Filesystem(MDSCluster):
     This object is for driving a CephFS filesystem.  The MDS daemons driven by
     MDSCluster may be shared with other Filesystems.
     """
-    def __init__(self, ctx, fscid=None, create=None):
+    def __init__(self, ctx, fscid=None, name=None, create=False):
         super(Filesystem, self).__init__(ctx)
 
+        self.name = name
         self.id = None
-        self.name = None
         self.metadata_pool_name = None
+        self.metadata_overlay = False
+        self.data_pool_name = None
         self.data_pools = None
 
         client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
         self.client_id = client_list[0]
         self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1]
 
-        if create is not None:
+        if name is not None:
             if fscid is not None:
                 raise RuntimeError("cannot specify fscid when creating fs")
-            if create is True:
-                self.name = 'cephfs'
-            else:
-                self.name = create
-            if not self.legacy_configured():
+            if create and not self.legacy_configured():
                 self.create()
-        elif fscid is not None:
-            self.id = fscid
-        self.getinfo(refresh = True)
+        else:
+            if fscid is not None:
+                self.id = fscid
+                self.getinfo(refresh = True)
 
         # Stash a reference to the first created filesystem on ctx, so
         # that if someone drops to the interactive shell they can easily
@@ -412,6 +423,11 @@ class Filesystem(MDSCluster):
         self.get_pool_names(status = status, refresh = refresh)
         return status
 
+    def set_metadata_overlay(self, overlay):
+        if self.id is not None:
+            raise RuntimeError("cannot specify fscid when configuring overlay")
+        self.metadata_overlay = overlay
+
     def deactivate(self, rank):
         if rank < 0:
             raise RuntimeError("invalid rank")
@@ -441,7 +457,10 @@ class Filesystem(MDSCluster):
             self.name = "cephfs"
         if self.metadata_pool_name is None:
             self.metadata_pool_name = "{0}_metadata".format(self.name)
-        data_pool_name = "{0}_data".format(self.name)
+        if self.data_pool_name is None:
+            data_pool_name = "{0}_data".format(self.name)
+        else:
+            data_pool_name = self.data_pool_name
 
         log.info("Creating filesystem '{0}'".format(self.name))
 
@@ -449,10 +468,15 @@ class Filesystem(MDSCluster):
 
         self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
                                          self.metadata_pool_name, pgs_per_fs_pool.__str__())
-        self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
-                                         data_pool_name, pgs_per_fs_pool.__str__())
-        self.mon_manager.raw_cluster_cmd('fs', 'new',
-                                         self.name, self.metadata_pool_name, data_pool_name)
+        if self.metadata_overlay:
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name, self.metadata_pool_name, data_pool_name,
+                                             '--allow-dangerous-metadata-overlay')
+        else:
+            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                             data_pool_name, pgs_per_fs_pool.__str__())
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name, self.metadata_pool_name, data_pool_name)
         self.check_pool_application(self.metadata_pool_name)
         self.check_pool_application(data_pool_name)
         # Turn off spurious standby count warnings from modifying max_mds in tests.
@@ -565,6 +589,11 @@ class Filesystem(MDSCluster):
     def get_metadata_pool_name(self):
         return self.metadata_pool_name
 
+    def set_data_pool_name(self, name):
+        if self.id is not None:
+            raise RuntimeError("can't set filesystem name if its fscid is set")
+        self.data_pool_name = name
+
     def get_namespace_id(self):
         return self.id
 
index 3f8ffa8758f79c221bf4b2eb4a31e3d357708e9e..b06d2a1d233fa0dc5fcecb18fe134942f8171750 100644 (file)
@@ -81,12 +81,12 @@ class TestClientLimits(CephFSTestCase):
             pass
 
         # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
-        # which depend on the cache size and overall ratio
+        # which depend on the caps outstanding, cache size and overall ratio
         self.wait_until_equal(
             lambda: self.get_session(mount_a_client_id)['num_caps'],
-            int(cache_size * 0.8),
-            timeout=600,
-            reject_fn=lambda x: x < int(cache_size*.8))
+            int(open_files * 0.2),
+            timeout=30,
+            reject_fn=lambda x: x < int(open_files*0.2))
 
     @needs_trimming
     def test_client_pin_root(self):
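
The updated expectation above scales with the number of files the test opened (open_files) rather than with the MDS cache size. Worked with an illustrative value:

    open_files = 250                        # illustrative; the test derives its own value
    expected_caps = int(open_files * 0.2)   # -> 50 caps retained after SESSION_RECALL
    # previously the test waited for int(cache_size * 0.8) instead
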
index 4940ab89b7cb2185a603b06f26d5b61315ecd050..fd58c1427338e1cf1f234ec55361e33b56782730 100644 (file)
@@ -395,14 +395,21 @@ class TestClientRecovery(CephFSTestCase):
         self.assertFalse(lock_holder.finished)
         self.assertFalse(lock_taker.finished)
 
-        mount_a_client_id = self.mount_a.get_global_id()
-        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        try:
+            mount_a_client_id = self.mount_a.get_global_id()
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
 
-        # Evicting mount_a should let mount_b's attepmt to take the lock
-        # suceed
-        self.wait_until_true(
-            lambda: lock_taker.finished,
-            timeout=10)
+            # Evicting mount_a should let mount_b's attempt to take the lock
+            # succeed
+            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+        # Bring the client back
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
 
     def test_dir_fsync(self):
        self._test_fsync(True);
index c30f5fa53efc6da26e49e8699e72f2ec7fb2a3e1..a2d315768aa03b6863c38c7bcedf6bb595ae0b6e 100644 (file)
@@ -309,7 +309,7 @@ class TestDataScan(CephFSTestCase):
         mds_map = self.fs.get_mds_map()
         return rank in mds_map['damaged']
 
-    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+    def _rebuild_metadata(self, workload, workers=1):
         """
         That when all objects in metadata pool are removed, we can rebuild a metadata pool
         based on the contents of a data pool, and a client can see and read our files.
@@ -317,7 +317,6 @@ class TestDataScan(CephFSTestCase):
 
         # First, inject some files
 
-        other_fs = other_pool + '-fs' if other_pool else None
         workload.write()
 
         # Unmount the client and flush the journal: the tool should also cope with
@@ -325,23 +324,6 @@ class TestDataScan(CephFSTestCase):
         self.mount_a.umount_wait()
         workload.flush()
 
-        # Create the alternate pool if requested
-        if other_pool:
-            self.fs.rados(['mkpool', other_pool])
-            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
-                                                'enable_multiple', 'true',
-                                                '--yes-i-really-mean-it')
-            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', other_fs,
-                                                other_pool,
-                                                self.fs.get_data_pool_name(),
-                                                '--allow-dangerous-metadata-overlay')
-            self.fs.data_scan(['init', '--force-init', '--filesystem',
-                               other_fs, '--alternate-pool', other_pool])
-            self.fs.mon_manager.raw_cluster_cmd('-s')
-            self.fs.table_tool([other_fs + ":0", "reset", "session"])
-            self.fs.table_tool([other_fs + ":0", "reset", "snap"])
-            self.fs.table_tool([other_fs + ":0", "reset", "inode"])
-
         # Stop the MDS
         self.fs.mds_stop()
         self.fs.mds_fail()
@@ -359,20 +341,18 @@ class TestDataScan(CephFSTestCase):
         self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                 '--yes-i-really-mean-it')
 
-        if other_pool is None:
-            self.fs.mds_restart()
+        self.fs.mds_restart()
 
         def get_state(mds_id):
             info = self.mds_cluster.get_mds_info(mds_id)
             return info['state'] if info is not None else None
 
-        if other_pool is None:
-            self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
-            for mds_id in self.fs.mds_ids:
-                self.wait_until_equal(
-                        lambda: get_state(mds_id),
-                        "up:standby",
-                        timeout=60)
+        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+        for mds_id in self.fs.mds_ids:
+            self.wait_until_equal(
+                    lambda: get_state(mds_id),
+                    "up:standby",
+                    timeout=60)
 
         self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
         self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
@@ -384,38 +364,10 @@ class TestDataScan(CephFSTestCase):
                 # Normal reset should fail when no objects are present, we'll use --force instead
                 self.fs.journal_tool(["journal", "reset"])
 
-        if other_pool:
-            self.fs.mds_stop()
-            self.fs.data_scan(['scan_extents', '--alternate-pool',
-                               other_pool, '--filesystem', self.fs.name,
-                               self.fs.get_data_pool_name()])
-            self.fs.data_scan(['scan_inodes', '--alternate-pool',
-                               other_pool, '--filesystem', self.fs.name,
-                               '--force-corrupt', '--force-init',
-                               self.fs.get_data_pool_name()])
-            self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
-                                  'recover_dentries', 'list',
-                                  '--alternate-pool', other_pool])
-
-            self.fs.data_scan(['init', '--force-init', '--filesystem',
-                               self.fs.name])
-            self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
-                               '--force-corrupt', '--force-init',
-                               self.fs.get_data_pool_name()])
-            self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
-                                  'recover_dentries', 'list'])
-
-            self.fs.journal_tool(['--rank=' + other_fs + ":0", 'journal',
-                                  'reset', '--force'])
-            self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
-                                  'reset', '--force'])
-            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
-                                                other_fs + ":0")
-        else:
-            self.fs.journal_tool(["journal", "reset", "--force"])
-            self.fs.data_scan(["init"])
-            self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
-            self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
+        self.fs.journal_tool(["journal", "reset", "--force"])
+        self.fs.data_scan(["init"])
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
+        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
 
         # Mark the MDS repaired
         self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
@@ -423,17 +375,10 @@ class TestDataScan(CephFSTestCase):
         # Start the MDS
         self.fs.mds_restart()
         self.fs.wait_for_daemons()
-        if other_pool:
-            for mds_id in self.fs.mds_ids:
-                self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
-                                                    'injectargs', '--debug-mds=20')
-                self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
-                                                    'scrub_path', '/',
-                                                    'recursive', 'repair')
         log.info(str(self.mds_cluster.status()))
 
         # Mount a client
-        self.mount_a.mount(mount_fs_name=other_fs)
+        self.mount_a.mount()
         self.mount_a.wait_until_mounted()
 
         # See that the files are present and correct
@@ -468,12 +413,7 @@ class TestDataScan(CephFSTestCase):
     def test_stashed_layout(self):
         self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
 
-    def test_rebuild_simple_altpool(self):
-        self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a), other_pool="recovery")
-
     def _dirfrag_keys(self, object_id):
-        self.other_pool = 'recovery'
-        self.other_fs = self.other_pool + '-fs'
         keys_str = self.fs.rados(["listomapkeys", object_id])
         if keys_str:
             return keys_str.split("\n")
index 77ca07a194a1824bb663be2461088b219f62ced5..d857cfddf01a47f972957c963318ebf4c5105c1f 100644 (file)
@@ -124,9 +124,7 @@ class TestMisc(CephFSTestCase):
         self.assert_session_count(2, ls_data)
 
         self.mount_a.kill()
-        self.mount_a.kill()
-        self.mount_b.kill_cleanup()
-        self.mount_b.kill_cleanup()
+        self.mount_a.kill_cleanup()
 
         time.sleep(self.mds_session_autoclose * 1.5)
         ls_data = self.fs.mds_asok(['session', 'ls'])
@@ -147,5 +145,5 @@ class TestMisc(CephFSTestCase):
         fs_avail = output.split('\n')[1].split()[3]
         fs_avail = float(fs_avail) * 1024
 
-        ratio = (raw_avail / pool_size) / fs_avail
+        ratio = raw_avail / fs_avail
         assert 0.9 < ratio < 1.1
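
With this fix, the available space reported through the CephFS mount is compared directly against the cluster's raw available bytes, instead of first being divided by the pool size. Illustrative numbers (hypothetical):

    raw_avail = 3.00e12           # bytes free reported by the cluster
    fs_avail = 2.95e12            # bytes free reported through the mount
    ratio = raw_avail / fs_avail  # ~1.017
    assert 0.9 < ratio < 1.1
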
diff --git a/ceph/qa/tasks/cephfs/test_recovery_pool.py b/ceph/qa/tasks/cephfs/test_recovery_pool.py
new file mode 100644 (file)
index 0000000..097342a
--- /dev/null
@@ -0,0 +1,220 @@
+
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+import json
+
+import logging
+import os
+from textwrap import dedent
+import traceback
+from collections import namedtuple, defaultdict
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
+        self._orig_fs = orig_fs
+        self._recovery_fs = recovery_fs
+        self._orig_mount = orig_mount
+        self._recovery_mount = recovery_mount
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        objects = self._orig_fs.rados(["ls"]).split("\n")
+        for o in objects:
+            self._orig_fs.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._orig_fs.mds_asok(["flush", "journal"])
+        self._recovery_fs.mds_asok(["flush", "journal"])
+
+
+class SimpleOverlayWorkload(OverlayWorkload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self):
+        self._orig_mount.run_shell(["mkdir", "subdir"])
+        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._orig_mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        self._recovery_mount.run_shell(["ls", "subdir"])
+        st = self._recovery_mount.stat("subdir/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+class TestRecoveryPool(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+    REQUIRE_RECOVERY_FILESYSTEM = True
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+        workload.flush()
+
+        # Create the alternate pool if requested
+        recovery_fs = self.recovery_fs.name
+        recovery_pool = self.recovery_fs.get_metadata_pool_name()
+        self.recovery_fs.data_scan(['init', '--force-init',
+                                    '--filesystem', recovery_fs,
+                                    '--alternate-pool', recovery_pool])
+        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        def get_state(mds_id):
+            info = self.mds_cluster.get_mds_info(mds_id)
+            return info['state'] if info is not None else None
+
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"])
+
+        self.fs.mds_stop()
+        self.fs.data_scan(['scan_extents', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           self.fs.get_data_pool_name()])
+        self.fs.data_scan(['scan_inodes', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+                              'recover_dentries', 'list',
+                              '--alternate-pool', recovery_pool])
+
+        self.fs.data_scan(['init', '--force-init', '--filesystem',
+                           self.fs.name])
+        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+                              'recover_dentries', 'list'])
+
+        self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal',
+                              'reset', '--force'])
+        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
+                              'reset', '--force'])
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
+                                            recovery_fs + ":0")
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.recovery_fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.recovery_fs.wait_for_daemons()
+        for mds_id in self.recovery_fs.mds_ids:
+            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
+                                                'injectargs', '--debug-mds=20')
+            self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
+                                                'scrub_path', '/',
+                                                'recursive', 'repair')
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount()
+        self.mount_b.mount(mount_fs_name=recovery_fs)
+        self.mount_a.wait_until_mounted()
+        self.mount_b.wait_until_mounted()
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
+                                                     self.mount_a, self.mount_b))
index c3ac7fb78be6e5baa19a9c2df5a1bc25e531dfa4..0dd9cb7e8bacb02e9c522020468ce927401bd716 100644 (file)
@@ -1,5 +1,6 @@
 
 import logging
+import json
 
 from tasks.mgr.mgr_test_case import MgrTestCase
 
@@ -92,6 +93,15 @@ class TestFailover(MgrTestCase):
             timeout=10
         )
 
+        # Both daemons should have fully populated metadata
+        # (regression test for http://tracker.ceph.com/issues/21260)
+        meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "metadata"))
+        id_to_meta = dict([(i['id'], i) for i in meta])
+        for i in [original_active] + original_standbys:
+            self.assertIn(i, id_to_meta)
+            self.assertIn('ceph_version', id_to_meta[i])
+
         # We should be able to fail back over again: the exercises
         # our re-initialization of the python runtime within
         # a single process lifetime.
index 86903476434fcd72c2724a961d5d624f099605c8..8e744e3b6628e614cc306418b46d98018c7e254c 100644 (file)
@@ -6,18 +6,26 @@ Rgw admin testing against a running instance
 #
 #   grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
 #
+# to run this standalone:
+#      python qa/tasks/radosgw_admin.py [USER] HOSTNAME
+#
 
 import copy
 import json
 import logging
 import time
 import datetime
+import Queue
+import bunch
+
+import sys
 
 from cStringIO import StringIO
 
 import boto.exception
 import boto.s3.connection
 import boto.s3.acl
+from boto.utils import RequestHook
 
 import httplib2
 
@@ -27,6 +35,197 @@ from util.rgw import rgwadmin, get_user_summary, get_user_successful_ops
 
 log = logging.getLogger(__name__)
 
+def usage_acc_findentry2(entries, user, add=True):
+    for e in entries:
+        if e['user'] == user:
+            return e
+    if not add:
+        return None
+    e = {'user': user, 'buckets': []}
+    entries.append(e)
+    return e
+def usage_acc_findsum2(summaries, user, add=True):
+    for e in summaries:
+        if e['user'] == user:
+            return e
+    if not add:
+        return None
+    e = {'user': user, 'categories': [],
+        'total': {'bytes_received': 0,
+            'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 }}
+    summaries.append(e)
+    return e
+def usage_acc_update2(x, out, b_in, err):
+    x['bytes_sent'] += b_in
+    x['bytes_received'] += out
+    x['ops'] += 1
+    if not err:
+        x['successful_ops'] += 1
+def usage_acc_validate_fields(r, x, x2, what):
+    q=[]
+    for field in ['bytes_sent', 'bytes_received', 'ops', 'successful_ops']:
+        try:
+            if x2[field] < x[field]:
+                q.append("field %s: %d < %d" % (field, x2[field], x[field]))
+        except Exception as ex:
+            r.append( "missing/bad field " + field + " in " + what + " " + str(ex))
+            return
+    if len(q) > 0:
+        r.append("incomplete counts in " + what + ": " + ", ".join(q))
+class usage_acc:
+    def __init__(self):
+        self.results = {'entries': [], 'summary': []}
+    def findentry(self, user):
+        return usage_acc_findentry2(self.results['entries'], user)
+    def findsum(self, user):
+        return usage_acc_findsum2(self.results['summary'], user)
+    def e2b(self, e, bucket, add=True):
+        for b in e['buckets']:
+            if b['bucket'] == bucket:
+                return b
+        if not add:
+            return None
+        b = {'bucket': bucket, 'categories': []}
+        e['buckets'].append(b)
+        return b
+    def c2x(self, c, cat, add=True):
+        for x in c:
+            if x['category'] == cat:
+                return x
+        if not add:
+            return None
+        x = {'bytes_received': 0, 'category': cat,
+            'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 }
+        c.append(x)
+        return x
+    def update(self, c, cat, user, out, b_in, err):
+        x = self.c2x(c, cat)
+        usage_acc_update2(x, out, b_in, err)
+        if not err and cat == 'create_bucket' and not x.has_key('owner'):
+            x['owner'] = user
+    def make_entry(self, cat, bucket, user, out, b_in, err):
+        if cat == 'create_bucket' and err:
+            return
+        e = self.findentry(user)
+        b = self.e2b(e, bucket)
+        self.update(b['categories'], cat, user, out, b_in, err)
+        s = self.findsum(user)
+        x = self.c2x(s['categories'], cat)
+        usage_acc_update2(x, out, b_in, err)
+        x = s['total']
+        usage_acc_update2(x, out, b_in, err)
+    def generate_make_entry(self):
+        return lambda cat,bucket,user,out,b_in,err: self.make_entry(cat, bucket, user, out, b_in, err)
+    def get_usage(self):
+        return self.results
+    def compare_results(self, results):
+        if not results.has_key('entries') or not results.has_key('summary'):
+            return ['Missing entries or summary']
+        r = []
+        for e in self.results['entries']:
+            try:
+                e2 = usage_acc_findentry2(results['entries'], e['user'], False)
+            except Exception as ex:
+                r.append("malformed entry looking for user "
+                   + e['user'] + " " + str(ex))
+                break
+            if e2 == None:
+                r.append("missing entry for user " + e['user'])
+                continue
+            for b in e['buckets']:
+                c = b['categories']
+                if b['bucket'] == 'nosuchbucket':
+                    print "got here"
+                try:
+                    b2 = self.e2b(e2, b['bucket'], False)
+                    if b2 != None:
+                            c2 = b2['categories']
+                except Exception as ex:
+                    r.append("malformed entry looking for bucket "
+                       + b['bucket'] + " in user " + e['user'] + " " + str(ex))
+                    break
+                if b2 == None:
+                    r.append("can't find bucket " + b['bucket']
+                       + " in user " + e['user'])
+                    continue
+                for x in c:
+                    try:
+                        x2 = self.c2x(c2, x['category'], False)
+                    except Exception as ex:
+                        r.append("malformed entry looking for "
+                           + x['category'] + " in bucket " + b['bucket']
+                           + " user " + e['user'] + " " + str(ex))
+                        break
+                    usage_acc_validate_fields(r, x, x2, "entry: category "
+                       + x['category'] + " bucket " + b['bucket']
+                       + " in user " + e['user'])
+        for s in self.results['summary']:
+            c = s['categories']
+            try:
+                s2 = usage_acc_findsum2(results['summary'], s['user'], False)
+            except Exception as ex:
+                r.append("malformed summary looking for user " + s['user']
+                   + " " + str(ex))
+                break
+            if s2 == None:
+                r.append("missing summary for user " + s['user'])
+                continue
+            try:
+                c2 = s2['categories']
+            except Exception as ex:
+                r.append("malformed summary missing categories for user "
+                   + s['user'] + " " + str(ex))
+                break
+            for x in c:
+                try:
+                    x2 = self.c2x(c2, x['category'], False)
+                except Exception as ex:
+                    r.append("malformed summary looking for "
+                       + x['category'] + " user " + s['user'] + " " + str(ex))
+                    break
+                usage_acc_validate_fields(r, x, x2, "summary: category "
+                   + x['category'] + " in user " + s['user'])
+            x = s['total']
+            try:
+                x2 = s2['total']
+            except Exception as ex:
+                r.append("malformed summary looking for totals for user "
+                   + s['user'] + " " + str(ex))
+                break
+            usage_acc_validate_fields(r, x, x2, "summary: totals for user " + s['user'])
+        return r
+
+def ignore_this_entry(cat, bucket, user, out, b_in, err):
+    pass
+class requestlog_queue():
+    def __init__(self, add):
+        self.q = Queue.Queue(1000)
+        self.adder = add
+    def handle_request_data(self, request, response, error=False):
+        now = datetime.datetime.now()
+        if error:
+            pass
+        elif response.status < 200 or response.status >= 400:
+            error = True
+        self.q.put(bunch.Bunch({'t': now, 'o': request, 'i': response, 'e': error}))
+    def clear(self):
+        with self.q.mutex:
+            self.q.queue.clear()
+    def log_and_clear(self, cat, bucket, user, add_entry = None):
+        while not self.q.empty():
+            j = self.q.get()
+            bytes_out = 0
+            if 'Content-Length' in j.o.headers:
+                bytes_out = int(j.o.headers['Content-Length'])
+            bytes_in = 0
+            if 'content-length' in j.i.msg.dict:
+                bytes_in = int(j.i.msg.dict['content-length'])
+            log.info('RL: %s %s %s bytes_out=%d bytes_in=%d failed=%r'
+                % (cat, bucket, user, bytes_out, bytes_in, j.e))
+            if add_entry == None:
+                add_entry = self.adder
+            add_entry(cat, bucket, user, bytes_out, bytes_in, j.e)
+
 def create_presigned_url(conn, method, bucket_name, key_name, expiration):
     return conn.generate_url(expires_in=expiration,
         method=method,
@@ -119,8 +318,17 @@ def task(ctx, config):
         calling_format=boto.s3.connection.OrdinaryCallingFormat(),
         )
 
+    acc = usage_acc()
+    rl = requestlog_queue(acc.generate_make_entry())
+    connection.set_request_hook(rl)
+    connection2.set_request_hook(rl)
+
     # legend (test cases can be easily grep-ed out)
     # TESTCASE 'testname','object','method','operation','assertion'
+
+    # TESTCASE 'usage-show0' 'usage' 'show' 'all usage' 'succeeds'
+    (err, summary0) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+
     # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
     (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
     assert err
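
The accounting flow added here: a boto request hook queues every request/response pair, log_and_clear() drains the queue into the usage_acc under an operation category, and at the end compare_results() diffs our own tallies against what `radosgw-admin usage show` reports. Condensed sketch (the bucket name and the usage_show_output variable are illustrative):

    acc = usage_acc()
    rl = requestlog_queue(acc.generate_make_entry())
    connection.set_request_hook(rl)          # boto invokes rl.handle_request_data

    bucket = connection.create_bucket('mybucket')
    rl.log_and_clear('create_bucket', 'mybucket', user1)

    # usage_show_output: parsed result of 'radosgw-admin usage show'
    errors = acc.compare_results(usage_show_output)
    assert not errors, errors
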
@@ -266,11 +474,19 @@ def task(ctx, config):
     # create a first bucket
     bucket = connection.create_bucket(bucket_name)
 
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+
     # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
     (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
     assert len(out) == 1
     assert out[0] == bucket_name
 
+    bucket_list = connection.get_all_buckets()
+    assert len(bucket_list) == 1
+    assert bucket_list[0].name == bucket_name
+
+    rl.log_and_clear("list_buckets", '', user1)
+
     # TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list'
     (err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True)
     assert len(out) >= 1
@@ -278,8 +494,11 @@ def task(ctx, config):
 
     # TESTCASE 'max-bucket-limit','bucket','create','4 buckets','5th bucket fails due to max buckets == 4'
     bucket2 = connection.create_bucket(bucket_name + '2')
+    rl.log_and_clear("create_bucket", bucket_name + '2', user1)
     bucket3 = connection.create_bucket(bucket_name + '3')
+    rl.log_and_clear("create_bucket", bucket_name + '3', user1)
     bucket4 = connection.create_bucket(bucket_name + '4')
+    rl.log_and_clear("create_bucket", bucket_name + '4', user1)
     # the 5th should fail.
     failed = False
     try:
@@ -287,11 +506,15 @@ def task(ctx, config):
     except Exception:
         failed = True
     assert failed
+    rl.log_and_clear("create_bucket", bucket_name + '5', user1)
 
     # delete the buckets
     bucket2.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '2', user1)
     bucket3.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '3', user1)
     bucket4.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '4', user1)
 
     # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
     (err, out) = rgwadmin(ctx, client, [
@@ -307,6 +530,7 @@ def task(ctx, config):
     # use some space
     key = boto.s3.key.Key(bucket)
     key.set_contents_from_string('one')
+    rl.log_and_clear("put_obj", bucket_name, user1)
 
     # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
     (err, out) = rgwadmin(ctx, client, [
@@ -317,6 +541,7 @@ def task(ctx, config):
 
     # reclaim it
     key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
 
     # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error'
     (err, out) = rgwadmin(ctx, client,
@@ -344,9 +569,11 @@ def task(ctx, config):
         denied = True
 
     assert not denied
+    rl.log_and_clear("put_obj", bucket_name, user1)
 
     # delete the object
     key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
 
     # link the bucket to another user
     (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)],
@@ -383,6 +610,17 @@ def task(ctx, config):
     object_name = 'four'
     key = boto.s3.key.Key(bucket, object_name)
     key.set_contents_from_string(object_name)
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # fetch it too (for usage stats presently)
+    s = key.get_contents_as_string()
+    rl.log_and_clear("get_obj", bucket_name, user1)
+    assert s == object_name
+    # list bucket too (for usage stats presently)
+    keys = list(bucket.list())
+    rl.log_and_clear("list_bucket", bucket_name, user1)
+    assert len(keys) == 1
+    assert keys[0].name == object_name
 
     # now delete it
     (err, out) = rgwadmin(ctx, client,
@@ -428,72 +666,22 @@ def task(ctx, config):
 
     # TODO: show log by bucket+date
 
-    # need to wait for all usage data to get flushed, should take up to 30 seconds
-    timestamp = time.time()
-    while time.time() - timestamp <= (20 * 60):      # wait up to 20 minutes
-        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj'])  # last operation we did is delete obj, wait for it to flush
-        if get_user_successful_ops(out, user1) > 0:
-            break
-        time.sleep(1)
-
-    assert time.time() - timestamp <= (20 * 60)
-
-    # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
-    (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
-    assert len(out['entries']) > 0
-    assert len(out['summary']) > 0
-
-    user_summary = get_user_summary(out, user1)
-
-    total = user_summary['total']
-    assert total['successful_ops'] > 0
-
-    # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
-    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
-        check_status=True)
-    assert len(out['entries']) > 0
-    assert len(out['summary']) > 0
-    user_summary = out['summary'][0]
-    for entry in user_summary['categories']:
-        assert entry['successful_ops'] > 0
-    assert user_summary['user'] == user1
-
-    # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
-    test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
-    for cat in test_categories:
-        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
-            check_status=True)
-        assert len(out['summary']) > 0
-        user_summary = out['summary'][0]
-        assert user_summary['user'] == user1
-        assert len(user_summary['categories']) == 1
-        entry = user_summary['categories'][0]
-        assert entry['category'] == cat
-        assert entry['successful_ops'] > 0
-
-    # the usage flush interval is 30 seconds, wait that much an then some
-    # to make sure everything has been flushed
-    time.sleep(35)
-
-    # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
-    (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
-        check_status=True)
-    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
-        check_status=True)
-    assert len(out['entries']) == 0
-    assert len(out['summary']) == 0
-
     # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
     (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
         check_status=True)
 
     # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+    denied = False
     try:
         key = boto.s3.key.Key(bucket)
         key.set_contents_from_string('five')
     except boto.exception.S3ResponseError as e:
+        denied = True
         assert e.status == 403
 
+    assert denied
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
     # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
     (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1],
         check_status=True)
@@ -501,6 +689,7 @@ def task(ctx, config):
     # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
     key = boto.s3.key.Key(bucket)
     key.set_contents_from_string('six')
+    rl.log_and_clear("put_obj", bucket_name, user1)
 
     # TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection'
 
@@ -509,9 +698,11 @@ def task(ctx, config):
 
     big_key = boto.s3.key.Key(bucket)
     big_key.set_contents_from_string(test_string)
+    rl.log_and_clear("put_obj", bucket_name, user1)
 
     # now delete the head
     big_key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
 
     # wait a bit to give the garbage collector time to cycle
     time.sleep(15)
@@ -537,45 +728,56 @@ def task(ctx, config):
         bucket.delete()
     except boto.exception.S3ResponseError as e:
         assert e.status == 409
+    rl.log_and_clear("delete_bucket", bucket_name, user1)
 
     key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
     bucket.delete()
+    rl.log_and_clear("delete_bucket", bucket_name, user1)
 
     # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
     bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
 
     # create an object
     key = boto.s3.key.Key(bucket)
     key.set_contents_from_string('seven')
+    rl.log_and_clear("put_obj", bucket_name, user1)
 
     # should be private already but guarantee it
     key.set_acl('private')
+    rl.log_and_clear("put_acls", bucket_name, user1)
 
     (err, out) = rgwadmin(ctx, client,
         ['policy', '--bucket', bucket.name, '--object', key.key],
         check_status=True, format='xml')
 
     acl = get_acl(key)
+    rl.log_and_clear("get_acls", bucket_name, user1)
 
     assert acl == out.strip('\n')
 
     # add another grantee by making the object public read
     key.set_acl('public-read')
+    rl.log_and_clear("put_acls", bucket_name, user1)
 
     (err, out) = rgwadmin(ctx, client,
         ['policy', '--bucket', bucket.name, '--object', key.key],
         check_status=True, format='xml')
 
     acl = get_acl(key)
+    rl.log_and_clear("get_acls", bucket_name, user1)
 
     assert acl == out.strip('\n')
 
     # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
     bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
     key_name = ['eight', 'nine', 'ten', 'eleven']
     for i in range(4):
         key = boto.s3.key.Key(bucket)
         key.set_contents_from_string(key_name[i])
+    rl.log_and_clear("put_obj", bucket_name, user1)
 
     (err, out) = rgwadmin(ctx, client,
         ['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'],
@@ -594,6 +796,7 @@ def task(ctx, config):
 
     # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
     bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
     key = boto.s3.key.Key(bucket)
 
     (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
@@ -601,8 +804,78 @@ def task(ctx, config):
 
     # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
     bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
     key = boto.s3.key.Key(bucket)
     key.set_contents_from_string('twelve')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    time.sleep(35)
+
+    # need to wait for all usage data to get flushed; should take up to 30 seconds
+    timestamp = time.time()
+    while time.time() - timestamp <= (2 * 60):      # wait up to 2 minutes
+        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj'])  # delete_obj is one of the operations we did; it should show up once flushed
+        if get_user_successful_ops(out, user1) > 0:
+            break
+        time.sleep(1)
+
+    # fail if the delete_obj usage never showed up within the timeout
+    assert get_user_successful_ops(out, user1) > 0
+
+    # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+
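+    # compare_results() returns a list of human-readable mismatch messages;
+    # an empty list means the locally accumulated usage matches the server's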
+    r = acc.compare_results(out)
+    if len(r) != 0:
+        sys.stderr.write(("\n".join(r)) + "\n")
+        assert len(r) == 0
+
+    user_summary = get_user_summary(out, user1)
+
+    total = user_summary['total']
+    assert total['successful_ops'] > 0
+
+    # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+        check_status=True)
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+    user_summary = out['summary'][0]
+    for entry in user_summary['categories']:
+        assert entry['successful_ops'] > 0
+    assert user_summary['user'] == user1
+
+    # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+    test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+    for cat in test_categories:
+        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
+            check_status=True)
+        assert len(out['summary']) > 0
+        user_summary = out['summary'][0]
+        assert user_summary['user'] == user1
+        assert len(user_summary['categories']) == 1
+        entry = user_summary['categories'][0]
+        assert entry['category'] == cat
+        assert entry['successful_ops'] > 0
+
+    # we should be done with 'connection' at this point; anything that uses
+    # it must come BEFORE the usage checks above.
+    rl.log_and_clear("(before-close)", '-', '-', ignore_this_entry)
+    connection.close()
+    connection = None
+
+    # the usage flush interval is 30 seconds; wait that long and then some
+    # to make sure everything has been flushed
+    time.sleep(35)
+
+    # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
+        check_status=True)
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+        check_status=True)
+    assert len(out['entries']) == 0
+    assert len(out['summary']) == 0
 
     (err, out) = rgwadmin(ctx, client,
         ['user', 'rm', '--uid', user1, '--purge-data' ],
@@ -638,3 +911,45 @@ def task(ctx, config):
     (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
     assert len(out) > 0
     assert len(out['placement_pools']) == orig_placement_pools + 1
+
+    zonecmd = ['zone', 'placement', 'rm',
+               '--rgw-zone', 'default',
+               '--placement-id', 'new-placement']
+
+    (err, out) = rgwadmin(ctx, client, zonecmd, check_status=True)
+
+import sys
+from tasks.radosgw_admin import task
+from teuthology.config import config
+from teuthology.orchestra import cluster, remote
+import argparse
+
+def main():
+    if len(sys.argv) == 3:
+        user = sys.argv[1] + "@"
+        host = sys.argv[2]
+    elif len(sys.argv) == 2:
+        user = ""
+        host = sys.argv[1]
+    else:
+        sys.stderr.write("usage: radosgw_admin.py [user] host\n")
+        exit(1)
+    client0 = remote.Remote(user + host)
+    ctx = config
+    ctx.cluster = cluster.Cluster(remotes=[(client0,
+        ['ceph.client.rgw.%s' % (host), ]), ])
+
+    ctx.rgw = argparse.Namespace()
+    endpoints = {}
+    endpoints['ceph.client.rgw.%s' % host] = (host, 80)
+    ctx.rgw.role_endpoints = endpoints
+    ctx.rgw.realm = None
+    ctx.rgw.regions = {'region0': { 'api name': 'api1',
+           'is master': True, 'master zone': 'r0z0',
+           'zones': ['r0z0', 'r0z1'] }}
+    ctx.rgw.config = {'ceph.client.rgw.%s' % host: {'system user': {'name': '%s-system-user' % host}}}
+    task(config, None)
+    exit()
+
+if __name__ == '__main__':
+    main()
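Run standalone, this drives the same task() against a live host: per the usage string above, something like python radosgw_admin.py ubuntu rgw0.example.net (the user argument is optional, and the hostname is only illustrative) builds a one-remote teuthology cluster around that host and executes the full test.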
index 7b3a8ef049c6b80f5069145e663de251429bbb6f..d45636a5692d74010d2cb3b0750b280043a89ccb 100644 (file)
@@ -4,6 +4,7 @@ Rbd testing task
 import contextlib
 import logging
 import os
+import tempfile
 
 from cStringIO import StringIO
 from teuthology.orchestra import run
@@ -334,6 +335,8 @@ def run_xfstests(ctx, config):
                 scratch_dev: 'scratch_dev'
                 fs_type: 'xfs'
                 tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+                exclude:
+                - generic/42
                 randomize: true
     """
     with parallel() as p:
@@ -360,14 +363,14 @@ def run_xfstests_one_client(ctx, role, properties):
 
         fs_type = properties.get('fs_type')
         tests = properties.get('tests')
+        exclude_list = properties.get('exclude', [])
         randomize = properties.get('randomize')
 
-
         (remote,) = ctx.cluster.only(role).remotes.keys()
 
         # Fetch the test script
         test_root = teuthology.get_testdir(ctx)
-        test_script = 'run_xfstests_krbd.sh'
+        test_script = 'run_xfstests.sh'
         test_path = os.path.join(test_root, test_script)
 
         xfstests_url = properties.get('xfstests_url')
@@ -390,15 +393,21 @@ def run_xfstests_one_client(ctx, role, properties):
         log.info('    scratch device: {dev}'.format(dev=scratch_dev))
         log.info('     using fs_type: {fs_type}'.format(fs_type=fs_type))
         log.info('      tests to run: {tests}'.format(tests=tests))
+        log.info('      exclude list: {}'.format(' '.join(exclude_list)))
         log.info('         randomize: {randomize}'.format(randomize=randomize))
 
+        if exclude_list:
+            with tempfile.NamedTemporaryFile(bufsize=0, prefix='exclude') as exclude_file:
+                for test in exclude_list:
+                    exclude_file.write("{}\n".format(test))
+                remote.put_file(exclude_file.name, exclude_file.name)
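+                # the local tempfile disappears when this 'with' block exits,
+                # but the copy pushed to the remote at the same path persists,
+                # and exclude_file.name remains usable for the '-x' option below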
+
         # Note that the device paths are interpreted using
         # readlink -f <path> in order to get their canonical
         # pathname (so it matches what the kernel remembers).
         args = [
             '/usr/bin/sudo',
             'TESTDIR={tdir}'.format(tdir=testdir),
-            'URL_BASE={url}'.format(url=xfstests_url),
             'adjust-ulimits',
             'ceph-coverage',
             '{tdir}/archive/coverage'.format(tdir=testdir),
@@ -409,6 +418,8 @@ def run_xfstests_one_client(ctx, role, properties):
             '-t', test_dev,
             '-s', scratch_dev,
             ]
+        if exclude_list:
+            args.extend(['-x', exclude_file.name])
         if randomize:
             args.append('-r')
         if tests:
@@ -445,6 +456,8 @@ def xfstests(ctx, config):
                 scratch_format: 1
                 fs_type: 'xfs'
                 tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+                exclude:
+                - generic/42
                 randomize: true
                 xfstests_branch: master
                 xfstests_url: 'https://raw.github.com/ceph/branch/master/qa'
@@ -508,6 +521,7 @@ def xfstests(ctx, config):
             fs_type=properties.get('fs_type', 'xfs'),
             randomize=properties.get('randomize', False),
             tests=properties.get('tests'),
+            exclude=properties.get('exclude', []),
             xfstests_url=xfstests_url,
             )
 
index b6050c87ad14dcfc5b4ea57d247ba18157596e42..cec0b648bc71b4c9a50574287fad0275ced6e9ae 100644 (file)
@@ -147,7 +147,7 @@ def create_pools(ctx, clients):
             create_replicated_pool(remote, data_pool, 64, cluster_name, 'rgw')
         if ctx.rgw.cache_pools:
             create_cache_pool(remote, data_pool, data_pool + '.cache', 64,
-                              64*1024*1024, cluster_name, 'rgw')
+                              64*1024*1024, cluster_name)
     log.debug('Pools created')
     yield
 
index 09388ab38cd52f986d3291d1d1a7de5f30395526..86c4b53892135e53d4b02fef8c06a965a29a4b4f 100644 (file)
@@ -45,7 +45,7 @@ def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application
             'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
         ])
 
-def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph", application=None):
+def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"):
     remote.run(args=[
         'sudo', 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum), '--cluster', cluster_name
     ])
@@ -53,10 +53,6 @@ def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="
         'sudo', 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name,
         str(size), '--cluster', cluster_name
     ])
-    if application:
-        remote.run(args=[
-            'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
-        ])
 
 def cmd_erasure_code_profile(profile_name, profile):
     """
index ce729c96b0894d05863d76175c13d5ce1465fc62..842e80d4d4e857c9421a50e7e28548e5c1d720ea 100644 (file)
@@ -678,8 +678,8 @@ class LocalMDSCluster(LocalCephCluster, MDSCluster):
         # FIXME: unimplemented
         pass
 
-    def newfs(self, name):
-        return LocalFilesystem(self._ctx, create=name)
+    def newfs(self, name='cephfs', create=True):
+        return LocalFilesystem(self._ctx, name=name, create=create)
 
 
 class LocalMgrCluster(LocalCephCluster, MgrCluster):
@@ -691,13 +691,15 @@ class LocalMgrCluster(LocalCephCluster, MgrCluster):
 
 
 class LocalFilesystem(Filesystem, LocalMDSCluster):
-    def __init__(self, ctx, fscid=None, create=None):
+    def __init__(self, ctx, fscid=None, name='cephfs', create=False):
         # Deliberately skip calling parent constructor
         self._ctx = ctx
 
         self.id = None
         self.name = None
         self.metadata_pool_name = None
+        self.metadata_overlay = False
+        self.data_pool_name = None
         self.data_pools = None
 
         # Hack: cheeky inspection of ceph.conf to see what MDSs exist
@@ -722,17 +724,15 @@ class LocalFilesystem(Filesystem, LocalMDSCluster):
 
         self._conf = defaultdict(dict)
 
-        if create is not None:
+        if name is not None:
             if fscid is not None:
                 raise RuntimeError("cannot specify fscid when creating fs")
-            if create is True:
-                self.name = 'cephfs'
-            else:
-                self.name = create
-            self.create()
-        elif fscid is not None:
-            self.id = fscid
-        self.getinfo(refresh=True)
+            if create and not self.legacy_configured():
+                self.create()
+        else:
+            if fscid is not None:
+                self.id = fscid
+                self.getinfo(refresh=True)
 
         # Stash a reference to the first created filesystem on ctx, so
         # that if someone drops to the interactive shell they can easily
index 7c43ada2885c09e64f0b3a60291174343cacb8ab..f5a313ea2a5aff846efba6baaecd69ede8a9df59 100755 (executable)
@@ -2385,6 +2385,9 @@ function test_mon_pool_application()
   ceph osd pool application set app_for_test rbd key1 value1
   ceph osd pool application set app_for_test rbd key2 value2
   ceph osd pool application set app_for_test rgw key1 value1
+  ceph osd pool application get app_for_test rbd key1 | grep 'value1'
+  ceph osd pool application get app_for_test rbd key2 | grep 'value2'
+  ceph osd pool application get app_for_test rgw key1 | grep 'value1'
 
   ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1","key2":"value2"},"rgw":{"key1":"value1"}}'
 
index 44bc70f4e923b983d1ba19362d0f65ae77f55e6b..348811e7c4b24e9118292ed9ab7e756e2d3bb30e 100755 (executable)
@@ -38,12 +38,14 @@ ceph osd pool rm ec-foo ec-foo --yes-i-really-really-mean-it
 ceph osd crush rule ls | grep foo
 
 ceph osd crush rule rename foo foo-asdf
+ceph osd crush rule rename foo foo-asdf # idempotent
 ceph osd crush rule rename bar bar-asdf
 ceph osd crush rule ls | grep 'foo-asdf'
 ceph osd crush rule ls | grep 'bar-asdf'
 ceph osd crush rule rm foo 2>&1 | grep 'does not exist'
 ceph osd crush rule rm bar 2>&1 | grep 'does not exist'
 ceph osd crush rule rename foo-asdf foo
+ceph osd crush rule rename foo-asdf foo # idempotent
 ceph osd crush rule rename bar-asdf bar
 ceph osd crush rule ls | expect_false grep 'foo-asdf'
 ceph osd crush rule ls | expect_false grep 'bar-asdf'
index 41585f2843b15b495e7fd3f444bc9c01b62e3c89..c9ecb8b62523b6d609dba3efef305613af989ab3 100755 (executable)
@@ -158,7 +158,7 @@ dd if=/dev/urandom bs=1M count=1 of=${TMPDIR}/sparse2; truncate ${TMPDIR}/sparse
 # 1M sparse, 1M data
 rbd rm sparse1 || true
 rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -i '2048k'
+rbd ls -l | grep sparse1 | grep -Ei '(2M|2048k)'
 [ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
 
 # export, compare contents and on-disk size
@@ -170,7 +170,7 @@ rbd rm sparse1
 # 1M data, 1M sparse
 rbd rm sparse2 || true
 rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -i '2048k'
+rbd ls -l | grep sparse2 | grep -Ei '(2M|2048k)'
 [ $tiered -eq 1 -o "$(objects sparse2)" = '0' ]
 rbd export sparse2 ${TMPDIR}/sparse2.out
 compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
@@ -181,7 +181,7 @@ rbd rm sparse2
 truncate ${TMPDIR}/sparse1 -s 10M
 # import from stdin just for fun, verify still sparse
 rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -i '10240k'
+rbd ls -l | grep sparse1 | grep -Ei '(10M|10240k)'
 [ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
 rbd export sparse1 ${TMPDIR}/sparse1.out
 compare_files_and_ondisk_sizes ${TMPDIR}/sparse1 ${TMPDIR}/sparse1.out
@@ -192,7 +192,7 @@ rbd rm sparse1
 dd if=/dev/urandom bs=2M count=1 of=${TMPDIR}/sparse2 oflag=append conv=notrunc
 # again from stdin
 rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -i '4096k'
+rbd ls -l | grep sparse2 | grep -Ei '(4M|4096k)'
 [ $tiered -eq 1 -o "$(objects sparse2)" = '0 2 3' ]
 rbd export sparse2 ${TMPDIR}/sparse2.out
 compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
index 272947d1e44d59538edde5a8cc26051a1c9ec561..552f73601cd9262e83c59bfc9c6876e6f5c8d085 100644 (file)
@@ -5,11 +5,13 @@ require {
        type var_run_t;
        type random_device_t;
        type urandom_device_t;
-        type setfiles_t;
+       type setfiles_t;
+       type nvme_device_t;
        class sock_file unlink;
        class lnk_file read;
        class dir read;
        class file { getattr read open };
+       class blk_file { getattr ioctl open read write };
 }
 
 ########################################
@@ -86,6 +88,8 @@ logging_send_syslog_msg(ceph_t)
 
 sysnet_dns_name_resolve(ceph_t)
 
+allow ceph_t nvme_device_t:blk_file { getattr ioctl open read write };
+
 # basis for future security review
 allow ceph_t ceph_var_run_t:sock_file { create unlink write setattr };
 allow ceph_t self:capability { sys_rawio chown };
index c345fdc666a2a7c6f5f733de4b1f0c6f422fb3f7..4af7022267304529383fce7d4ac048f1105c73b4 100644 (file)
@@ -1,2 +1,2 @@
-32ce2a3ae5239ee33d6150705cdb24d43bab910c
-v12.2.0
+3e7492b9ada8bdc9a5cd0feafd42fbca27f9c38e
+v12.2.1
index 915e10aa66493da8afc072f9f206f3caeda8ad98..3cdcb95be4b9b78daace47edbba3c3eab1bd449e 100644 (file)
@@ -811,13 +811,8 @@ if (NOT WITH_SYSTEM_ROCKSDB)
     list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})
   endif(WITH_CCACHE AND CCACHE_FOUND)
 
-  # We really want to have the CRC32 calculation in RocksDB accelerated
-  # with SSE 4.2. For details refer to rocksdb/util/crc32c.cc.
-  if (HAVE_INTEL_SSE4_2)
-    list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_CXX_FLAGS=${SIMD_COMPILE_FLAGS})
-  else()
-    list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_SSE42=OFF)
-  endif()
+  # SSE 4.2 is enabled by default in rocksdb's crc32c. For details refer to
+  # rocksdb/util/crc32c.cc.
   list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_AR=${CMAKE_AR})
   list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
 
index 7787aabd18bfd741d0d8282a0d18df7d1486f0e9..5a755672a95dc243e9d7522586c3da00cb13b1e1 100644 (file)
@@ -1,9 +1,8 @@
 from __future__ import print_function
 import argparse
-import os
 from textwrap import dedent
 from ceph_volume import process, conf, decorators
-from ceph_volume.util import system
+from ceph_volume.util import system, disk
 from ceph_volume.systemd import systemctl
 from . import api
 
@@ -18,9 +17,11 @@ def activate_filestore(lvs):
     # blow up with a KeyError if this doesn't exist
     osd_fsid = osd_lv.tags['ceph.osd_fsid']
     if not osd_journal_lv:
-        osd_journal = osd_lv.tags.get('ceph.journal_device')
+        # must be a disk partition; by querying blkid for the partition uuid
+        # we ensure that the device path is always correct
+        osd_journal = disk.get_device_from_partuuid(osd_lv.tags['ceph.journal_uuid'])
     else:
-        osd_journal = osd_journal.lv_path
+        osd_journal = osd_lv.tags['ceph.journal_device']
 
     if not osd_journal:
         raise RuntimeError('unable to detect an lv or device journal for OSD %s' % osd_id)
@@ -31,11 +32,10 @@ def activate_filestore(lvs):
     if not system.is_mounted(source, destination=destination):
         process.run(['sudo', 'mount', '-v', source, destination])
 
-    # ensure that the symlink for the journal is there
-    if not os.path.exists(osd_journal):
-        source = osd_journal
-        destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id)
-        process.run(['sudo', 'ln', '-s', source, destination])
+    # always re-do the symlink regardless of whether it exists, so that a
+    # journal device path that may have changed is re-mapped correctly every time
+    destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id)
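+    # ln -snf: symbolic link (-s), don't dereference an existing destination
+    # link (-n), and force-replace it if present (-f)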
+    process.run(['sudo', 'ln', '-snf', osd_journal, destination])
 
     # make sure that the journal has proper permissions
     system.chown(osd_journal)
@@ -63,7 +63,10 @@ class Activate(object):
     def activate(self, args):
         lvs = api.Volumes()
         # filter them down for the OSD ID and FSID we need to activate
-        lvs.filter(lv_tags={'ceph.osd_id': args.osd_id, 'ceph.osd_fsid': args.osd_fsid})
+        if args.osd_id and args.osd_fsid:
+            lvs.filter(lv_tags={'ceph.osd_id': args.osd_id, 'ceph.osd_fsid': args.osd_fsid})
+        elif args.osd_fsid and not args.osd_id:
+            lvs.filter(lv_tags={'ceph.osd_fsid': args.osd_fsid})
         if not lvs:
             raise RuntimeError('could not find osd.%s with fsid %s' % (args.osd_id, args.osd_fsid))
         activate_filestore(lvs)
index 944a4343dad438a7ffd1ffd88cd6d50408fd9972..e5bc26234715675543625d42484a14ef3c907114 100644 (file)
@@ -4,7 +4,7 @@ that prefixes tags with ``ceph.`` and uses ``=`` for assignment, and provides
 set of utilities for interacting with LVM.
 """
 from ceph_volume import process
-from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError
+from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
 
 
 def _output_parser(output, fields):
@@ -101,14 +101,37 @@ def get_api_lvs():
           ;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
 
     """
-    fields = 'lv_tags,lv_path,lv_name,vg_name'
+    fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
     stdout, stderr, returncode = process.call(
         ['sudo', 'lvs', '--noheadings', '--separator=";"', '-o', fields]
     )
     return _output_parser(stdout, fields)
 
 
-def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+def get_api_pvs():
+    """
+    Return the list of physical volumes configured for lvm and available in the
+    system using flags to include common metadata associated with them like the uuid
+
+    Command and delimited output should look like::
+
+        $ sudo pvs --noheadings --separator=';' -o pv_name,pv_tags,pv_uuid
+          /dev/sda1;;
+          /dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
+
+    """
+    fields = 'pv_name,pv_tags,pv_uuid'
+
+    # note the use of `pvs -a` which will return every physical volume including
+    # ones that have not been initialized as "pv" by LVM
+    stdout, stderr, returncode = process.call(
+        ['sudo', 'pvs', '-a', '--noheadings', '--separator=";"', '-o', fields]
+    )
+
+    return _output_parser(stdout, fields)
+
+
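Assuming _output_parser splits each report line on the ';' separator and pairs the values with the requested field names (the same helper the lvs query above goes through), the sample output in the docstring would parse to roughly:

    [
        {'pv_name': '/dev/sda1', 'pv_tags': '', 'pv_uuid': ''},
        {'pv_name': '/dev/sdv', 'pv_tags': '',
         'pv_uuid': '07A4F654-4162-4600-8EB3-88D1E42F368D'},
    ]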
+def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
     """
     Return a matching lv for the current system, requiring ``lv_name``,
     ``vg_name``, ``lv_path`` or ``tags``. Raises an error if more than one lv
@@ -118,10 +141,40 @@ def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
     but it can also lead to multiple lvs being found, since a lot of metadata
     is shared between lvs of a distinct OSD.
     """
-    if not any([lv_name, vg_name, lv_path, lv_tags]):
+    if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
         return None
     lvs = Volumes()
-    return lvs.get(lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_tags=lv_tags)
+    return lvs.get(
+        lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_uuid=lv_uuid,
+        lv_tags=lv_tags
+    )
+
+
+def get_pv(pv_name=None, pv_uuid=None, pv_tags=None):
+    """
+    Return a matching pv (physical volume) for the current system, requiring
+    ``pv_name``, ``pv_uuid``, or ``pv_tags``. Raises an error if more than one
+    pv is found.
+    """
+    if not any([pv_name, pv_uuid, pv_tags]):
+        return None
+    pvs = PVolumes()
+    return pvs.get(pv_name=pv_name, pv_uuid=pv_uuid, pv_tags=pv_tags)
+
+
+def create_pv(device):
+    """
+    Create a physical volume from a device, useful when devices need to be later mapped
+    to journals.
+    """
+    process.run([
+        'sudo',
+        'pvcreate',
+        '-v',  # verbose
+        '-f',  # force it
+        '--yes', # answer yes to any prompts
+        device
+    ])
 
 
 def create_lv(name, group, size=None, **tags):
@@ -231,13 +284,10 @@ class VolumeGroups(list):
         # actual filtered list if any filters were applied
         if vg_tags:
             tag_filtered = []
-            for k, v in vg_tags.items():
-                for volume in filtered:
-                    if volume.tags.get(k) == str(v):
-                        if volume not in tag_filtered:
-                            tag_filtered.append(volume)
-            # return the tag_filtered volumes here, the `filtered` list is no
-            # longer useable
+            for volume in filtered:
+                matches = all(volume.tags.get(k) == str(v) for k, v in vg_tags.items())
+                if matches:
+                    tag_filtered.append(volume)
             return tag_filtered
 
         return filtered
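Note the behavioral fix folded into this refactor: the old loop kept a volume as soon as any single tag matched, while the rewritten filter keeps it only when every requested tag agrees:

    # a group tagged ceph.group=dmcache,ceph.disk_type=ssd does NOT match
    # vg_tags={'ceph.group': 'data', 'ceph.disk_type': 'ssd'}; all() requires
    # every key/value pair to line up
    matches = all(volume.tags.get(k) == str(v) for k, v in vg_tags.items())

The same AND semantics are applied to Volumes._filter and the new PVolumes._filter below, and the added test_filter_by_tag_does_not_match_one tests pin the behavior down.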
@@ -314,7 +364,7 @@ class Volumes(list):
         """
         self[:] = []
 
-    def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+    def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
         """
         The actual method that filters using a new list. Useful so that other
         methods that do not want to alter the contents of the list (e.g.
@@ -327,6 +377,9 @@ class Volumes(list):
         if vg_name:
             filtered = [i for i in filtered if i.vg_name == vg_name]
 
+        if lv_uuid:
+            filtered = [i for i in filtered if i.lv_uuid == lv_uuid]
+
         if lv_path:
             filtered = [i for i in filtered if i.lv_path == lv_path]
 
@@ -334,18 +387,16 @@ class Volumes(list):
         # actual filtered list if any filters were applied
         if lv_tags:
             tag_filtered = []
-            for k, v in lv_tags.items():
-                for volume in filtered:
-                    if volume.tags.get(k) == str(v):
-                        if volume not in tag_filtered:
-                            tag_filtered.append(volume)
-            # return the tag_filtered volumes here, the `filtered` list is no
-            # longer useable
+            for volume in filtered:
+                # all the tags we got need to match on the volume
+                matches = all(volume.tags.get(k) == str(v) for k, v in lv_tags.items())
+                if matches:
+                    tag_filtered.append(volume)
             return tag_filtered
 
         return filtered
 
-    def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+    def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
         """
         Filter out volumes on top level attributes like ``lv_name`` or by
         ``lv_tags`` where a dict is required. For example, to find a volume
@@ -354,13 +405,14 @@ class Volumes(list):
             lv_tags={'ceph.osd_id': '0'}
 
         """
-        if not any([lv_name, vg_name, lv_path, lv_tags]):
-            raise TypeError('.filter() requires lv_name, vg_name, lv_path, or tags (none given)')
+        if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+            raise TypeError('.filter() requires lv_name, vg_name, lv_path, lv_uuid, or tags (none given)')
         # first find the filtered volumes with the values in self
         filtered_volumes = self._filter(
             lv_name=lv_name,
             vg_name=vg_name,
             lv_path=lv_path,
+            lv_uuid=lv_uuid,
             lv_tags=lv_tags
         )
         # then purge everything
@@ -368,7 +420,7 @@ class Volumes(list):
         # and add the filtered items
         self.extend(filtered_volumes)
 
-    def get(self, lv_name=None, vg_name=None, lv_path=None, lv_tags=None):
+    def get(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
         """
         This is a bit expensive, since it will try to filter out all the
         matching items in the list, filter them out applying anything that was
@@ -381,12 +433,13 @@ class Volumes(list):
         but it can also lead to multiple lvs being found, since a lot of metadata
         is shared between lvs of a distinct OSD.
         """
-        if not any([lv_name, vg_name, lv_path, lv_tags]):
+        if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
             return None
         lvs = self._filter(
             lv_name=lv_name,
             vg_name=vg_name,
             lv_path=lv_path,
+            lv_uuid=lv_uuid,
             lv_tags=lv_tags
         )
         if not lvs:
@@ -396,6 +449,104 @@ class Volumes(list):
         return lvs[0]
 
 
+class PVolumes(list):
+    """
+    A list of all known (physical) volumes for the current system, with the ability
+    to filter them via keyword arguments.
+    """
+
+    def __init__(self):
+        self._populate()
+
+    def _populate(self):
+        # get all the pvs in the current system
+        for pv_item in get_api_pvs():
+            self.append(PVolume(**pv_item))
+
+    def _purge(self):
+        """
+        Deplete all the items in the list, used internally only so that we can
+        dynamically allocate the items when filtering without the concern of
+        messing up the contents
+        """
+        self[:] = []
+
+    def _filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+        """
+        The actual method that filters using a new list. Useful so that other
+        methods that do not want to alter the contents of the list (e.g.
+        ``self.find``) can operate safely.
+        """
+        filtered = [i for i in self]
+        if pv_name:
+            filtered = [i for i in filtered if i.pv_name == pv_name]
+
+        if pv_uuid:
+            filtered = [i for i in filtered if i.pv_uuid == pv_uuid]
+
+        # at this point, `filtered` has either all the physical volumes in self
+        # or is an actual filtered list if any filters were applied
+        if pv_tags:
+            tag_filtered = []
+            for pvolume in filtered:
+                matches = all(pvolume.tags.get(k) == str(v) for k, v in pv_tags.items())
+                if matches:
+                    tag_filtered.append(pvolume)
+            # return the tag_filtered pvolumes here; the `filtered` list is no
+            # longer usable
+            return tag_filtered
+
+        return filtered
+
+    def filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+        """
+        Filter out volumes on top level attributes like ``pv_name`` or by
+        ``pv_tags`` where a dict is required. For example, to find a physical volume
+        that has an OSD ID of 0, the filter would look like::
+
+            pv_tags={'ceph.osd_id': '0'}
+
+        """
+        if not any([pv_name, pv_uuid, pv_tags]):
+            raise TypeError('.filter() requires pv_name, pv_uuid, or pv_tags (none given)')
+        # first find the filtered volumes with the values in self
+        filtered_volumes = self._filter(
+            pv_name=pv_name,
+            pv_uuid=pv_uuid,
+            pv_tags=pv_tags
+        )
+        # then purge everything
+        self._purge()
+        # and add the filtered items
+        self.extend(filtered_volumes)
+
+    def get(self, pv_name=None, pv_uuid=None, pv_tags=None):
+        """
+        This is a bit expensive, since it will try to filter out all the
+        matching items in the list, filter them out applying anything that was
+        added and return the matching item.
+
+        This method does *not* alter the list, and it will raise an error if
+        multiple pvs are matched
+
+        It is useful to use ``tags`` when trying to find a specific physical volume,
+        but it can also lead to multiple pvs being found, since a lot of metadata
+        is shared between pvs of the same OSD.
+        """
+        if not any([pv_name, pv_uuid, pv_tags]):
+            return None
+        pvs = self._filter(
+            pv_name=pv_name,
+            pv_uuid=pv_uuid,
+            pv_tags=pv_tags
+        )
+        if not pvs:
+            return None
+        if len(pvs) > 1:
+            raise MultiplePVsError(pv_name)
+        return pvs[0]
+
+
 class VolumeGroup(object):
     """
     Represents an LVM group, with some top-level attributes like ``vg_name``
@@ -470,3 +621,66 @@ class Volume(object):
                 '--addtag', '%s=%s' % (key, value), self.lv_path
             ]
         )
+
+
+class PVolume(object):
+    """
+    Represents a Physical Volume from LVM, with some top-level attributes like
+    ``pv_name`` and parsed tags as a dictionary of key/value pairs.
+    """
+
+    def __init__(self, **kw):
+        for k, v in kw.items():
+            setattr(self, k, v)
+        self.pv_api = kw
+        self.name = kw['pv_name']
+        self.tags = parse_tags(kw['pv_tags'])
+
+    def __str__(self):
+        return '<%s>' % self.pv_api['pv_name']
+
+    def __repr__(self):
+        return self.__str__()
+
+    def set_tags(self, tags):
+        """
+        :param tags: A dictionary of tag names and values, like::
+
+            {
+                "ceph.osd_fsid": "aaa-fff-bbbb",
+                "ceph.osd_id": "0"
+            }
+
+        At the end of all modifications, the tags are refreshed to reflect
+        LVM's most current view.
+        """
+        for k, v in tags.items():
+            self.set_tag(k, v)
+        # after setting all the tags, refresh them for the current object; use
+        # the pv_* identifiers to filter because those shouldn't change
+        pv_object = get_pv(pv_name=self.pv_name, pv_uuid=self.pv_uuid)
+        self.tags = pv_object.tags
+
+    def set_tag(self, key, value):
+        """
+        Set the key/value pair as an LVM tag. Does not "refresh" the values of
+        the current object for its tags. Meant to be a "fire and forget" type
+        of modification.
+
+        **warning**: Altering tags on a PV must be done while ensuring that
+        the device is actually the one intended. ``pv_name`` is *not* a
+        persistent value; only ``pv_uuid`` is. Using ``pv_uuid`` is the best
+        way to make sure the device being changed is the one needed.
+        """
+        # remove it first if it exists
+        if self.tags.get(key):
+            current_value = self.tags[key]
+            tag = "%s=%s" % (key, current_value)
+            process.call(['sudo', 'pvchange', '--deltag', tag, self.pv_name])
+
+        process.call(
+            [
+                'sudo', 'pvchange',
+                '--addtag', '%s=%s' % (key, value), self.pv_name
+            ]
+        )
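A minimal usage sketch tying get_pv() and set_tags() together (the uuid and tag values are illustrative): looking the PV up by pv_uuid first, per the warning above, guards against pv_name having moved between boots.

    pv = get_pv(pv_uuid='07A4F654-4162-4600-8EB3-88D1E42F368D')
    if pv is not None:
        pv.set_tags({'ceph.osd_id': '0', 'ceph.type': 'journal'})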
index a9630ce48d0f86e5dfd97664300564b96199b2f4..1ca5b0d88540f86546a49deff5d68de02676df05 100644 (file)
@@ -3,8 +3,8 @@ import json
 import os
 from textwrap import dedent
 from ceph_volume.util import prepare as prepare_utils
-from ceph_volume.util import system
-from ceph_volume import conf, decorators
+from ceph_volume.util import system, disk
+from ceph_volume import conf, decorators, terminal
 from . import api
 from .common import prepare_parser
 
@@ -51,6 +51,13 @@ class Prepare(object):
     def __init__(self, argv):
         self.argv = argv
 
+    def get_journal_ptuuid(self, argument):
+        uuid = disk.get_partuuid(argument)
+        if not uuid:
+            terminal.error('blkid could not detect a PARTUUID for device: %s' % argument)
+            raise RuntimeError('unable to use device for a journal')
+        return uuid
+
     def get_journal_lv(self, argument):
         """
         Perform some parsing of the value of ``--journal`` so that the process
@@ -88,36 +95,43 @@ class Prepare(object):
 
             if not args.journal:
                 raise RuntimeError('--journal is required when using --filestore')
-            journal_device = None
-            journal_lv = self.get_journal_lv(args.journal)
 
-            # check if we have an actual path to a device, which is allowed
-            if not journal_lv:
-                if os.path.exists(args.journal):
-                    journal_device = args.journal
-                else:
-                    raise RuntimeError(
-                        '--journal specified an invalid or non-existent device: %s' % args.journal
-                    )
-            # Otherwise the journal_device is the path to the lv
-            else:
+            journal_lv = self.get_journal_lv(args.journal)
+            if journal_lv:
                 journal_device = journal_lv.lv_path
+                journal_uuid = journal_lv.lv_uuid
+                # we can only set tags on an lv, the pv (if any) can't as we
+                # aren't making it part of an lvm group (vg)
                 journal_lv.set_tags({
                     'ceph.type': 'journal',
                     'ceph.osd_fsid': fsid,
                     'ceph.osd_id': osd_id,
                     'ceph.cluster_fsid': cluster_fsid,
                     'ceph.journal_device': journal_device,
+                    'ceph.journal_uuid': journal_uuid,
                     'ceph.data_device': data_lv.lv_path,
+                    'ceph.data_uuid': data_lv.lv_uuid,
                 })
 
+            # allow a file
+            elif os.path.isfile(args.journal):
+                journal_uuid = ''
+                journal_device = args.journal
+
+            # otherwise assume this is a regular disk partition
+            else:
+                journal_uuid = self.get_journal_ptuuid(args.journal)
+                journal_device = args.journal
+
             data_lv.set_tags({
                 'ceph.type': 'data',
                 'ceph.osd_fsid': fsid,
                 'ceph.osd_id': osd_id,
                 'ceph.cluster_fsid': cluster_fsid,
                 'ceph.journal_device': journal_device,
+                'ceph.journal_uuid': journal_uuid,
                 'ceph.data_device': data_lv.lv_path,
+                'ceph.data_uuid': data_lv.lv_uuid,
             })
 
             prepare_filestore(
index 7486bfaa9e0430d40b908f61d43b681dd4a7d50f..9111620729cdbc20242455f4dc89be414c632c49 100644 (file)
@@ -18,7 +18,7 @@ def parse_osd_id(string):
 def parse_osd_uuid(string):
     osd_id = '%s-' % parse_osd_id(string)
     # remove the id first
-    osd_uuid = string.split(osd_id)[-1]
+    osd_uuid = string.split(osd_id, 1)[-1]
     if not osd_uuid:
         raise SuffixParsingError('OSD uuid', string)
     return osd_uuid
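A worked example of why the maxsplit of 1 matters: for osd id '1' the prefix '1-' recurs inside the uuid itself ('...b141-3154...'), so an unbounded split truncates the result:

    >>> '1-abc959fd-1ec9-4864-b141-3154f9b9f8ed'.split('1-')[-1]
    '3154f9b9f8ed'
    >>> '1-abc959fd-1ec9-4864-b141-3154f9b9f8ed'.split('1-', 1)[-1]
    'abc959fd-1ec9-4864-b141-3154f9b9f8ed'

The new test_robust_double_id_in_uuid case below exercises exactly this input.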
index 75c6b6c64053697bfa86f042eb6072fbab9c419e..211d9d09b9e04d01e8a4006b1e2df34c7dee50a1 100644 (file)
@@ -50,6 +50,16 @@ class SuperUserError(Exception):
         return 'This command needs to be executed with sudo or as root'
 
 
+class MultiplePVsError(Exception):
+
+    def __init__(self, pv_name):
+        self.pv_name = pv_name
+
+    def __str__(self):
+        msg = "Got more than 1 result looking for physical volume: %s" % self.pv_name
+        return msg
+
+
 class MultipleLVsError(Exception):
 
     def __init__(self, lv_name, lv_path):
index 869979ebf2db047b90b58d1e8127a7e090e670a8..7a580e57c726dc7f55160134e78c9344a6732741 100644 (file)
@@ -1,4 +1,5 @@
 import pytest
+from ceph_volume.devices.lvm import api
 
 class Capture(object):
 
@@ -14,3 +15,28 @@ class Capture(object):
 @pytest.fixture
 def capture():
     return Capture()
+
+
+@pytest.fixture
+def volumes(monkeypatch):
+    monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
+    volumes = api.Volumes()
+    volumes._purge()
+    return volumes
+
+
+@pytest.fixture
+def volume_groups(monkeypatch):
+    monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
+    vgs = api.VolumeGroups()
+    vgs._purge()
+    return vgs
+
+
+@pytest.fixture
+def is_root(monkeypatch):
+    """
+    Patch ``os.getuid()`` so that ceph-volume's decorators that ensure a user
+    is root (or is sudoing to superuser) can continue as-is
+    """
+    monkeypatch.setattr('os.getuid', lambda: 0)
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py
new file mode 100644 (file)
index 0000000..40df775
--- /dev/null
@@ -0,0 +1,34 @@
+import pytest
+from ceph_volume.devices.lvm import activate, api
+
+
+class Args(object):
+
+    def __init__(self, **kw):
+        for k, v in kw.items():
+            setattr(self, k, v)
+
+
+class TestActivate(object):
+
+    # these tests are very functional, hence the heavy patching; it is hard to
+    # test the negative side effect with an actual functional run, so we must
+    # set up a perfect scenario for this test to check that it really works
+    # with/without osd_id
+    def test_no_osd_id_matches_fsid(self, is_root, volumes, monkeypatch, capture):
+        FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.osd_fsid=1234")
+        volumes.append(FooVolume)
+        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+        monkeypatch.setattr(activate, 'activate_filestore', capture)
+        args = Args(osd_id=None, osd_fsid='1234')
+        activate.Activate([]).activate(args)
+        assert capture.calls[0]['args'][0] == [FooVolume]
+
+    def test_no_osd_id_no_matching_fsid(self, is_root, volumes, monkeypatch, capture):
+        FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.osd_fsid=11234")
+        volumes.append(FooVolume)
+        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+        monkeypatch.setattr(activate, 'activate_filestore', capture)
+        args = Args(osd_id=None, osd_fsid='1234')
+        with pytest.raises(RuntimeError):
+            activate.Activate([]).activate(args)
index 089afa1a8c00557263c298c46c4da429eb8663ef..d6aa549040d76e3fccf0677eb517b5ea7e79fa93 100644 (file)
@@ -76,6 +76,14 @@ def volumes(monkeypatch):
     return volumes
 
 
+@pytest.fixture
+def pvolumes(monkeypatch):
+    monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+    pvolumes = api.PVolumes()
+    pvolumes._purge()
+    return pvolumes
+
+
 @pytest.fixture
 def volume_groups(monkeypatch):
     monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
@@ -96,6 +104,62 @@ class TestGetLV(object):
         monkeypatch.setattr(api, 'Volumes', lambda: volumes)
         assert api.get_lv(lv_name='foo') == FooVolume
 
+    def test_single_lv_is_matched_by_uuid(self, volumes, monkeypatch):
+        FooVolume = api.Volume(
+            lv_name='foo', lv_path='/dev/vg/foo',
+            lv_uuid='1111', lv_tags="ceph.type=data")
+        volumes.append(FooVolume)
+        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+        assert api.get_lv(lv_uuid='1111') == FooVolume
+
+
+class TestGetPV(object):
+
+    def test_nothing_is_passed_in(self):
+        # so we return a None
+        assert api.get_pv() is None
+
+    def test_single_pv_is_not_matched(self, pvolumes, monkeypatch):
+        FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+        pvolumes.append(FooPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        assert api.get_pv(pv_uuid='foo') is None
+
+    def test_single_pv_is_matched(self, pvolumes, monkeypatch):
+        FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+        pvolumes.append(FooPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        assert api.get_pv(pv_uuid='0000') == FooPVolume
+
+    def test_single_pv_is_matched_by_uuid(self, pvolumes, monkeypatch):
+        FooPVolume = api.PVolume(
+            pv_name='/dev/vg/foo',
+            pv_uuid='1111', pv_tags="ceph.type=data")
+        pvolumes.append(FooPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        assert api.get_pv(pv_uuid='1111') == FooPVolume
+
+
+class TestPVolumes(object):
+
+    def test_filter_by_tag_does_not_match_one(self, pvolumes, monkeypatch):
+        pv_tags = "ceph.type=journal,ceph.osd_id=1,ceph.fsid=000-aaa"
+        FooPVolume = api.PVolume(
+            pv_name='/dev/vg/foo',
+            pv_uuid='1111', pv_tags=pv_tags)
+        pvolumes.append(FooPVolume)
+        pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '2'})
+        assert pvolumes == []
+
+    def test_filter_by_tags_matches(self, pvolumes, monkeypatch):
+        pv_tags = "ceph.type=journal,ceph.osd_id=1"
+        FooPVolume = api.PVolume(
+            pv_name='/dev/vg/foo',
+            pv_uuid='1111', pv_tags=pv_tags)
+        pvolumes.append(FooPVolume)
+        pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '1'})
+        assert pvolumes == [FooPVolume]
+
 
 class TestGetVG(object):
 
@@ -142,6 +206,16 @@ class TestVolumes(object):
         assert len(volumes) == 1
         assert volumes[0].lv_name == 'volume1'
 
+    def test_filter_by_tag_does_not_match_one(self, volumes):
+        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.osd_id=1,ceph.type=journal')
+        volumes.append(osd)
+        volumes.append(journal)
+        # note the different osd_id!
+        volumes.filter(lv_tags={'ceph.type': 'data', 'ceph.osd_id': '2'})
+        assert volumes == []
+
     def test_filter_by_vg_name(self, volumes):
         lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
         osd = api.Volume(lv_name='volume1', vg_name='ceph_vg', lv_tags=lv_tags)
@@ -161,6 +235,23 @@ class TestVolumes(object):
         assert len(volumes) == 1
         assert volumes[0].lv_name == 'volume1'
 
+    def test_filter_by_lv_uuid(self, volumes):
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+        volumes.append(osd)
+        volumes.append(journal)
+        volumes.filter(lv_uuid='1111')
+        assert len(volumes) == 1
+        assert volumes[0].lv_name == 'volume1'
+
+    def test_filter_by_lv_uuid_nothing_found(self, volumes):
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+        volumes.append(osd)
+        volumes.append(journal)
+        volumes.filter(lv_uuid='22222')
+        assert volumes == []
+
     def test_filter_requires_params(self, volumes):
         with pytest.raises(TypeError):
             volumes.filter()
@@ -198,6 +289,13 @@ class TestVolumeGroups(object):
         assert len(volume_groups) == 1
         assert volume_groups[0].vg_name == 'volume1'
 
+    def test_filter_by_tag_does_not_match_one(self, volume_groups):
+        vg_tags = "ceph.group=dmcache,ceph.disk_type=ssd"
+        osd = api.VolumeGroup(vg_name='volume1', vg_path='/dev/vg/lv', vg_tags=vg_tags)
+        volume_groups.append(osd)
+        volume_groups.filter(vg_tags={'ceph.group': 'data', 'ceph.disk_type': 'ssd'})
+        assert volume_groups == []
+
     def test_filter_by_vg_name(self, volume_groups):
         vg_tags = "ceph.type=data,ceph.fsid=000-aaa"
         osd = api.VolumeGroup(vg_name='ceph_vg', vg_tags=vg_tags)
index f1dff2d32dd6c7d66629521d7dfbe43169bf15d6..b5280f931bfdb3b30c184e57cdc809d65b1047b9 100644 (file)
@@ -36,4 +36,10 @@ class TestParseOSDUUID(object):
         with pytest.raises(exceptions.SuffixParsingError):
             trigger.parse_osd_uuid('ljahs-dfa-slkjhdfa-foo')
 
+    def test_robust_double_id_in_uuid(self):
+        # the osd id digit can also appear inside the uuid itself; parsing
+        # should still return the full uuid
+        result = trigger.parse_osd_uuid("1-abc959fd-1ec9-4864-b141-3154f9b9f8ed")
+        assert result == 'abc959fd-1ec9-4864-b141-3154f9b9f8ed'
+
 
index 4a7949573a9a8e042fcabd3f760324c9eefe24b2..469ca33906df391b79d1532dbeb93f07aa361373 100644 (file)
@@ -65,7 +65,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
   (0..CLIENTS - 1).each do |i|
     config.vm.define "#{LABEL_PREFIX}client#{i}" do |client|
       client.vm.box = CLIENT_BOX
-      client.vm.hostname = "#{LABEL_PREFIX}ceph-client#{i}"
+      client.vm.hostname = "#{LABEL_PREFIX}client#{i}"
       if ASSIGN_STATIC_IP
         client.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.4#{i}"
@@ -88,7 +88,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
 
       # Parallels
       client.vm.provider "parallels" do |prl|
-        prl.name = "ceph-client#{i}"
+        prl.name = "client#{i}"
         prl.memory = "#{MEMORY}"
       end
 
@@ -102,7 +102,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     config.vm.define "#{LABEL_PREFIX}rgw#{i}" do |rgw|
       rgw.vm.box = BOX
       rgw.vm.box_url = BOX_URL
-      rgw.vm.hostname = "#{LABEL_PREFIX}ceph-rgw#{i}"
+      rgw.vm.hostname = "#{LABEL_PREFIX}rgw#{i}"
       if ASSIGN_STATIC_IP
         rgw.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.5#{i}"
@@ -126,7 +126,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
 
       # Parallels
       rgw.vm.provider "parallels" do |prl|
-        prl.name = "ceph-rgw#{i}"
+        prl.name = "rgw#{i}"
         prl.memory = "#{MEMORY}"
       end
 
@@ -140,7 +140,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     config.vm.define "nfs#{i}" do |nfs|
       nfs.vm.box = BOX
       nfs.vm.box_url = BOX_URL
-      nfs.vm.hostname = "ceph-nfs#{i}"
+      nfs.vm.hostname = "nfs#{i}"
       if ASSIGN_STATIC_IP
         nfs.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.6#{i}"
@@ -164,7 +164,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
 
       # Parallels
       nfs.vm.provider "parallels" do |prl|
-        prl.name = "ceph-nfs#{i}"
+        prl.name = "nfs#{i}"
         prl.memory = "#{MEMORY}"
       end
 
@@ -178,7 +178,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     config.vm.define "#{LABEL_PREFIX}mds#{i}" do |mds|
       mds.vm.box = BOX
       mds.vm.box_url = BOX_URL
-      mds.vm.hostname = "#{LABEL_PREFIX}ceph-mds#{i}"
+      mds.vm.hostname = "#{LABEL_PREFIX}mds#{i}"
       if ASSIGN_STATIC_IP
         mds.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.7#{i}"
@@ -200,7 +200,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
       end
       # Parallels
       mds.vm.provider "parallels" do |prl|
-        prl.name = "ceph-mds#{i}"
+        prl.name = "mds#{i}"
         prl.memory = "#{MEMORY}"
       end
 
@@ -214,7 +214,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     config.vm.define "#{LABEL_PREFIX}rbd_mirror#{i}" do |rbd_mirror|
       rbd_mirror.vm.box = BOX
       rbd_mirror.vm.box_url = BOX_URL
-      rbd_mirror.vm.hostname = "#{LABEL_PREFIX}ceph-rbd-mirror#{i}"
+      rbd_mirror.vm.hostname = "#{LABEL_PREFIX}rbd-mirror#{i}"
       if ASSIGN_STATIC_IP
         rbd_mirror.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.8#{i}"
@@ -236,7 +236,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
       end
       # Parallels
       rbd_mirror.vm.provider "parallels" do |prl|
-        prl.name = "ceph-rbd-mirror#{i}"
+        prl.name = "rbd-mirror#{i}"
         prl.memory = "#{MEMORY}"
       end
 
@@ -250,7 +250,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     config.vm.define "#{LABEL_PREFIX}iscsi_gw#{i}" do |iscsi_gw|
       iscsi_gw.vm.box = BOX
       iscsi_gw.vm.box_url = BOX_URL
-      iscsi_gw.vm.hostname = "#{LABEL_PREFIX}ceph-iscsi-gw#{i}"
+      iscsi_gw.vm.hostname = "#{LABEL_PREFIX}iscsi-gw#{i}"
       if ASSIGN_STATIC_IP
         iscsi_gw.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.9#{i}"
@@ -272,7 +272,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
       end
       # Parallels
       iscsi_gw.vm.provider "parallels" do |prl|
-        prl.name = "ceph-iscsi-gw#{i}"
+        prl.name = "iscsi-gw#{i}"
         prl.memory = "#{MEMORY}"
       end
 
@@ -286,7 +286,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     config.vm.define "#{LABEL_PREFIX}mon#{i}" do |mon|
       mon.vm.box = BOX
       mon.vm.box_url = BOX_URL
-      mon.vm.hostname = "#{LABEL_PREFIX}ceph-mon#{i}"
+      mon.vm.hostname = "#{LABEL_PREFIX}mon#{i}"
       if ASSIGN_STATIC_IP
         mon.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.1#{i}"
@@ -309,7 +309,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
 
       # Parallels
       mon.vm.provider "parallels" do |prl|
-        prl.name = "ceph-mon#{i}"
+        prl.name = "mon#{i}"
         prl.memory = "#{MEMORY}"
       end
 
@@ -323,7 +323,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     config.vm.define "#{LABEL_PREFIX}osd#{i}" do |osd|
       osd.vm.box = BOX
       osd.vm.box_url = BOX_URL
-      osd.vm.hostname = "#{LABEL_PREFIX}ceph-osd#{i}"
+      osd.vm.hostname = "#{LABEL_PREFIX}osd#{i}"
       if ASSIGN_STATIC_IP
         osd.vm.network :private_network,
           ip: "#{PUBLIC_SUBNET}.10#{i}"
@@ -378,7 +378,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
 
       # Parallels
       osd.vm.provider "parallels" do |prl|
-        prl.name = "ceph-osd#{i}"
+        prl.name = "osd#{i}"
         prl.memory = "#{MEMORY}"
         (0..1).each do |d|
           prl.customize ["set", :id,
index cd16377b1b1036bb49609f61382df2e27475aaa2..e7c1f72300ae0f4a11b08e33c9e808d5b101b415 100644 (file)
@@ -8,12 +8,18 @@ monitor_interface: eth1
 journal_size: 100
 osd_objectstore: "filestore"
 osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
 copy_admin_key: true
 # test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
 lvm_volumes:
-  - data: test_volume
-    journal: /dev/sdc
+  - data: data-lv1
+    journal: /dev/sdc1
     data_vg: test_group
+  - data: data-lv2
+    journal: journal1
+    data_vg: test_group
+    journal_vg: journals
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 6fe16a51e6bfbe089afaa2f07bde46506af895e9..6e0dfbf2d4c0de078e9c7ee96a9feb115a51958c 100644 (file)
@@ -15,7 +15,7 @@ setenv=
   VAGRANT_CWD = {changedir}
   CEPH_VOLUME_DEBUG = 1
 deps=
-  ansible==2.2.3
+  ansible==2.3.1
   testinfra==1.6.0
   pytest-xdist
 changedir=
index cd16377b1b1036bb49609f61382df2e27475aaa2..e7c1f72300ae0f4a11b08e33c9e808d5b101b415 100644 (file)
@@ -8,12 +8,18 @@ monitor_interface: eth1
 journal_size: 100
 osd_objectstore: "filestore"
 osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
 copy_admin_key: true
 # test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
 lvm_volumes:
-  - data: test_volume
-    journal: /dev/sdc
+  - data: data-lv1
+    journal: /dev/sdc1
     data_vg: test_group
+  - data: data-lv2
+    journal: journal1
+    data_vg: test_group
+    journal_vg: journals
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
diff --git a/ceph/src/ceph-volume/ceph_volume/util/disk.py b/ceph/src/ceph-volume/ceph_volume/util/disk.py
new file mode 100644 (file)
index 0000000..0d3061d
--- /dev/null
@@ -0,0 +1,24 @@
+from ceph_volume import process
+
+
+def get_partuuid(device):
+    """
+    If a device is a partition, it will usually have a persistent PARTUUID
+    that can be queried via `blkid` later to resolve the actual device
+    node
+    """
+    out, err, rc = process.call(
+        ['sudo', 'blkid', '-s', 'PARTUUID', '-o', 'value', device]
+    )
+    return ' '.join(out).strip()
+
+
+def get_device_from_partuuid(partuuid):
+    """
+    Given a PARTUUID, query `blkid` to find out which device node it
+    belongs to
+    """
+    out, err, rc = process.call(
+        ['sudo', 'blkid', '-t', 'PARTUUID="%s"' % partuuid, '-o', 'device']
+    )
+    return ' '.join(out).strip()
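
Both helpers shell out to blkid and flatten its stdout (process.call returns output as a list of lines, hence the ' '.join). Together they round-trip between a partition node and its persistent PARTUUID; usage would look roughly like this (device path illustrative):

    from ceph_volume.util import disk

    partuuid = disk.get_partuuid('/dev/sdc1')             # '' when none is set
    if partuuid:
        device = disk.get_device_from_partuuid(partuuid)  # -> '/dev/sdc1'
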
index bb2516c788f9953cccba5d71498f3c288dce4432..cb1bd86941950a99012db3e913cde9b90de4f2ca 100755 (executable)
@@ -173,15 +173,17 @@ def monids():
 
 
 def mdsids():
-    ret, outbuf, outs = json_command(cluster_handle, prefix='mds dump',
+    ret, outbuf, outs = json_command(cluster_handle, prefix='fs dump',
                                      argdict={'format': 'json'})
     if ret:
         raise RuntimeError('Can\'t contact mon for mds list')
     d = json.loads(outbuf.decode('utf-8'))
     l = []
-    infodict = d['info']
-    for mdsdict in infodict.values():
-        l.append(mdsdict['name'])
+    for info in d['standbys']:
+        l.append(info['name'])
+    for fs in d['filesystems']:
+        for info in fs['mdsmap']['info'].values():
+            l.append(info['name'])
     return l
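
The old `mds dump` reported a single MDSMap; with multiple filesystems, `fs dump` is authoritative, with standby daemons at the top level and active ones nested under each filesystem's mdsmap, so the new loop walks both. A runnable model of the parsing, with the JSON shape abbreviated:

    import json

    dump = json.loads('''{
        "standbys": [{"name": "c"}],
        "filesystems": [{"mdsmap": {"info": {
            "gid_4217": {"name": "a"},
            "gid_4218": {"name": "b"}}}}]
    }''')

    names = [s['name'] for s in dump['standbys']]
    for fs in dump['filesystems']:
        names += [i['name'] for i in fs['mdsmap']['info'].values()]
    # names -> ['c', 'a', 'b']
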
 
 
@@ -199,6 +201,14 @@ def mgrids():
     return l
 
 
+def ids_by_service(service):
+    ids = {"mon": monids,
+           "osd": osdids,
+           "mds": mdsids,
+           "mgr": mgrids}
+    return ids[service]()
+
+
 def validate_target(target):
     """
       this function will return true iff target is a correct
@@ -211,16 +221,9 @@ def validate_target(target):
     if len(target) == 2:
         # for case "service.id"
         service_name, service_id = target[0], target[1]
-        exist_ids = []
-        if service_name == "mon":
-            exist_ids = monids()
-        elif service_name == "osd":
-            exist_ids = osdids()
-        elif service_name == "mds":
-            exist_ids = mdsids()
-        elif service_name == "mgr":
-            exist_ids = mgrids()
-        else:
+        try:
+            exist_ids = ids_by_service(service_name)
+        except KeyError:
             print('WARN: {0} is not a legal service name, should be one of mon/osd/mds/mgr'.format(service_name),
                   file=sys.stderr)
             return False
@@ -1020,13 +1023,11 @@ def main():
     # of the form 'cmdNNN' followed by an array of argument descriptors)
     # as part of the validated argument JSON object
 
-    targets = [target]
-
     if target[1] == '*':
-        if target[0] == 'osd':
-            targets = [(target[0], o) for o in osdids()]
-        elif target[0] == 'mon':
-            targets = [(target[0], m) for m in monids()]
+        service = target[0]
+        targets = [(service, o) for o in ids_by_service(service)]
+    else:
+        targets = [target]
 
     final_ret = 0
     for target in targets:
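
The if/elif ladder in the hunks above becomes a name-to-function table, and the same table now drives wildcard expansion, so "mds.*" and "mgr.*" work rather than only osd and mon. The shape of the refactor, reduced to a runnable toy (only two services shown):

    def monids():
        return ['a', 'b']

    def osdids():
        return ['0', '1']

    def ids_by_service(service):
        # unknown service names raise KeyError, which the caller turns
        # into the WARN message above
        return {'mon': monids, 'osd': osdids}[service]()

    target = ('mon', '*')
    if target[1] == '*':
        targets = [(target[0], i) for i in ids_by_service(target[0])]
    else:
        targets = [target]
    # targets -> [('mon', 'a'), ('mon', 'b')]
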
index 5fbeda91da545b28fa0463e7431a1082ef40baf6..f3a62f672169b68bc8403153a5c66fa5f22c10ed 100644 (file)
@@ -1,3 +1,3 @@
 12
 luminous
-rc
+stable
index cc0a93c366778d22ec7e4fa4784dfece86c17303..29a9c49f79faf26e56c5f496734d58fa52dc712d 100644 (file)
@@ -271,7 +271,6 @@ Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
   if (cct->_conf->client_acl_type == "posix_acl")
     acl_type = POSIX_ACL;
 
-  lru.lru_set_max(cct->_conf->client_cache_size);
   lru.lru_set_midpoint(cct->_conf->client_cache_mid);
 
   // file handles
@@ -331,7 +330,6 @@ void Client::tear_down_cache()
   // *** FIXME ***
 
   // empty lru
-  lru.lru_set_max(0);
   trim_cache();
   assert(lru.lru_get_size() == 0);
 
@@ -600,12 +598,13 @@ void Client::shutdown()
 
 void Client::trim_cache(bool trim_kernel_dcache)
 {
-  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << lru.lru_get_max() << dendl;
+  uint64_t max = cct->_conf->client_cache_size;
+  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
   unsigned last = 0;
   while (lru.lru_get_size() != last) {
     last = lru.lru_get_size();
 
-    if (lru.lru_get_size() <= lru.lru_get_max())  break;
+    if (!unmounting && lru.lru_get_size() <= max)  break;
 
     // trim!
     Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
@@ -615,7 +614,7 @@ void Client::trim_cache(bool trim_kernel_dcache)
     trim_dentry(dn);
   }
 
-  if (trim_kernel_dcache && lru.lru_get_size() > lru.lru_get_max())
+  if (trim_kernel_dcache && lru.lru_get_size() > max)
     _invalidate_kernel_dcache();
 
   // hose root?
@@ -4771,7 +4770,7 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
       MetaSession *tsession = _get_or_open_mds_session(peer_mds);
       if (in->caps.count(peer_mds)) {
        Cap *tcap = in->caps[peer_mds];
-       if (tcap->cap_id != m->peer.cap_id ||
+       if (tcap->cap_id == m->peer.cap_id &&
            ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
          tcap->cap_id = m->peer.cap_id;
          tcap->seq = m->peer.seq - 1;
@@ -5212,7 +5211,7 @@ ostream& operator<<(ostream &out, const UserPerm& perm) {
 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
 {
-  ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
+  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
   int r = _getattr_for_perm(in, perms);
   if (r < 0)
     goto out;
@@ -5268,7 +5267,7 @@ out:
 
 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
 {
-  ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
+  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
   unsigned want = 0;
 
   if ((flags & O_ACCMODE) == O_WRONLY)
@@ -5305,7 +5304,7 @@ out:
 
 int Client::may_lookup(Inode *dir, const UserPerm& perms)
 {
-  ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
+  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
   int r = _getattr_for_perm(dir, perms);
   if (r < 0)
     goto out;
@@ -5318,7 +5317,7 @@ out:
 
 int Client::may_create(Inode *dir, const UserPerm& perms)
 {
-  ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
+  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
   int r = _getattr_for_perm(dir, perms);
   if (r < 0)
     goto out;
@@ -5331,7 +5330,7 @@ out:
 
 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
 {
-  ldout(cct, 20) << __func__ << *dir << "; " << "; name " << name << "; " << perms << dendl;
+  ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
   int r = _getattr_for_perm(dir, perms);
   if (r < 0)
     goto out;
@@ -5356,7 +5355,7 @@ out:
 
 int Client::may_hardlink(Inode *in, const UserPerm& perms)
 {
-  ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
+  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
   int r = _getattr_for_perm(in, perms);
   if (r < 0)
     goto out;
@@ -5562,7 +5561,8 @@ int Client::mds_command(
 {
   Mutex::Locker lock(client_lock);
 
-  assert(initialized);
+  if (!initialized)
+    return -ENOTCONN;
 
   int r;
   r = authenticate();
@@ -5825,7 +5825,8 @@ void Client::unmount()
 {
   Mutex::Locker lock(client_lock);
 
-  assert(mounted);  // caller is confused?
+  if (unmounting)
+    return;
 
   ldout(cct, 2) << "unmounting" << dendl;
   unmounting = true;
@@ -5913,7 +5914,6 @@ void Client::unmount()
   wait_sync_caps(last_flush_tid);
 
   // empty lru cache
-  lru.lru_set_max(0);
   trim_cache();
 
   while (lru.lru_get_size() > 0 ||
@@ -6303,6 +6303,9 @@ int Client::link(const char *relexisting, const char *relpath, const UserPerm& p
   tout(cct) << relexisting << std::endl;
   tout(cct) << relpath << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath existing(relexisting);
 
   InodeRef in, dir;
@@ -6342,6 +6345,9 @@ int Client::unlink(const char *relpath, const UserPerm& perm)
   tout(cct) << "unlink" << std::endl;
   tout(cct) << relpath << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (std::string(relpath) == "/")
     return -EISDIR;
 
@@ -6367,6 +6373,9 @@ int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
   tout(cct) << relfrom << std::endl;
   tout(cct) << relto << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (std::string(relfrom) == "/" || std::string(relto) == "/")
     return -EBUSY;
 
@@ -6408,6 +6417,9 @@ int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
   tout(cct) << mode << std::endl;
   ldout(cct, 10) << "mkdir: " << relpath << dendl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (std::string(relpath) == "/")
     return -EEXIST;
 
@@ -6434,6 +6446,9 @@ int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
   tout(cct) << relpath << std::endl;
   tout(cct) << mode << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   //get through existing parts of path
   filepath path(relpath);
   unsigned int i;
@@ -6486,6 +6501,9 @@ int Client::rmdir(const char *relpath, const UserPerm& perms)
   tout(cct) << "rmdir" << std::endl;
   tout(cct) << relpath << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (std::string(relpath) == "/")
     return -EBUSY;
 
@@ -6512,6 +6530,9 @@ int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t
   tout(cct) << mode << std::endl;
   tout(cct) << rdev << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (std::string(relpath) == "/")
     return -EEXIST;
 
@@ -6539,6 +6560,9 @@ int Client::symlink(const char *target, const char *relpath, const UserPerm& per
   tout(cct) << target << std::endl;
   tout(cct) << relpath << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (std::string(relpath) == "/")
     return -EEXIST;
 
@@ -6563,6 +6587,9 @@ int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm
   tout(cct) << "readlink" << std::endl;
   tout(cct) << relpath << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms, false);
@@ -6698,11 +6725,9 @@ int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
       mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_MODE;
       ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
-    } else if (kill_sguid && S_ISREG(in->mode)) {
+    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash any setuid/setgid bits with an ownership change */
-      in->mode &= ~S_ISUID;
-      if ((in->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
-       in->mode &= ~S_ISGID;
+      in->mode &= ~(S_ISUID|S_ISGID);
       mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
     }
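
Per POSIX, changing ownership of an executable regular file must clear its setuid/setgid bits. The old code cleared S_ISUID unconditionally and S_ISGID only when group-exec was set; the new check squashes both at once but leaves non-executable files alone (on Linux, setgid without group-exec marks mandatory locking, not privilege). The condition, restated as runnable Python:

    import stat

    def squash_setid(mode):
        # kill_sguid path from the hunk above: only executable regular
        # files lose their setuid/setgid bits on an ownership change
        if stat.S_ISREG(mode) and mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH):
            mode &= ~(stat.S_ISUID | stat.S_ISGID)
        return mode

    assert squash_setid(0o104755) == 0o100755   # setuid executable: squashed
    assert squash_setid(0o102644) == 0o102644   # setgid, non-exec: untouched
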
 
@@ -6845,6 +6870,14 @@ int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
 
   stat_to_statx(attr, &stx);
   mask &= ~CEPH_SETATTR_BTIME;
+
+  if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
+    mask &= ~CEPH_SETATTR_UID;
+  }
+  if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
+    mask &= ~CEPH_SETATTR_GID;
+  }
+
   return _setattrx(in, &stx, mask, perms);
 }
 
@@ -6856,6 +6889,9 @@ int Client::setattr(const char *relpath, struct stat *attr, int mask,
   tout(cct) << relpath << std::endl;
   tout(cct) << mask  << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms);
@@ -6872,6 +6908,9 @@ int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
   tout(cct) << relpath << std::endl;
   tout(cct) << mask  << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
@@ -6887,6 +6926,9 @@ int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
   tout(cct) << fd << std::endl;
   tout(cct) << mask  << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -6904,6 +6946,9 @@ int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm&
   tout(cct) << fd << std::endl;
   tout(cct) << mask  << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -6921,6 +6966,10 @@ int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
   Mutex::Locker lock(client_lock);
   tout(cct) << "stat" << std::endl;
   tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms, true, mask);
@@ -6966,6 +7015,10 @@ int Client::statx(const char *relpath, struct ceph_statx *stx,
   Mutex::Locker lock(client_lock);
   tout(cct) << "statx" << std::endl;
   tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
 
@@ -6993,6 +7046,10 @@ int Client::lstat(const char *relpath, struct stat *stbuf,
   Mutex::Locker lock(client_lock);
   tout(cct) << "lstat" << std::endl;
   tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   // don't follow symlinks
@@ -7135,6 +7192,10 @@ int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
   tout(cct) << "chmod" << std::endl;
   tout(cct) << relpath << std::endl;
   tout(cct) << mode << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms);
@@ -7151,6 +7212,10 @@ int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
   tout(cct) << "fchmod" << std::endl;
   tout(cct) << fd << std::endl;
   tout(cct) << mode << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -7169,6 +7234,10 @@ int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
   tout(cct) << "lchmod" << std::endl;
   tout(cct) << relpath << std::endl;
   tout(cct) << mode << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   // don't follow symlinks
@@ -7188,6 +7257,10 @@ int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
   tout(cct) << relpath << std::endl;
   tout(cct) << new_uid << std::endl;
   tout(cct) << new_gid << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms);
@@ -7196,10 +7269,7 @@ int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
   struct stat attr;
   attr.st_uid = new_uid;
   attr.st_gid = new_gid;
-  int mask = 0;
-  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
-  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
-  return _setattr(in, &attr, mask, perms);
+  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
 }
 
 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
@@ -7209,6 +7279,10 @@ int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
   tout(cct) << fd << std::endl;
   tout(cct) << new_uid << std::endl;
   tout(cct) << new_gid << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -7233,6 +7307,10 @@ int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
   tout(cct) << relpath << std::endl;
   tout(cct) << new_uid << std::endl;
   tout(cct) << new_gid << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   // don't follow symlinks
@@ -7256,6 +7334,10 @@ int Client::utime(const char *relpath, struct utimbuf *buf,
   tout(cct) << relpath << std::endl;
   tout(cct) << buf->modtime << std::endl;
   tout(cct) << buf->actime << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms);
@@ -7277,6 +7359,10 @@ int Client::lutime(const char *relpath, struct utimbuf *buf,
   tout(cct) << relpath << std::endl;
   tout(cct) << buf->modtime << std::endl;
   tout(cct) << buf->actime << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   // don't follow symlinks
@@ -7298,6 +7384,10 @@ int Client::flock(int fd, int operation, uint64_t owner)
   tout(cct) << fd << std::endl;
   tout(cct) << operation << std::endl;
   tout(cct) << owner << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -7310,6 +7400,10 @@ int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& p
   Mutex::Locker lock(client_lock);
   tout(cct) << "opendir" << std::endl;
   tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms, true);
@@ -7364,8 +7458,11 @@ void Client::_closedir(dir_result_t *dirp)
 void Client::rewinddir(dir_result_t *dirp)
 {
   Mutex::Locker lock(client_lock);
-
   ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
+
+  if (unmounting)
+    return;
+
   dir_result_t *d = static_cast<dir_result_t*>(dirp);
   _readdir_drop_dirp_buffer(d);
   d->reset();
@@ -7384,6 +7481,9 @@ void Client::seekdir(dir_result_t *dirp, loff_t offset)
 
   ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;
 
+  if (unmounting)
+    return;
+
   if (offset == dirp->offset)
     return;
 
@@ -7633,6 +7733,9 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
 
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   dir_result_t *dirp = static_cast<dir_result_t*>(d);
 
   ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
@@ -8010,6 +8113,9 @@ int Client::open(const char *relpath, int flags, const UserPerm& perms,
   tout(cct) << relpath << std::endl;
   tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *fh = NULL;
 
 #if defined(__linux__) && defined(O_PATH)
@@ -8094,6 +8200,9 @@ int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
   Mutex::Locker lock(client_lock);
   ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
   filepath path(ino);
   req->set_filepath(path);
@@ -8124,6 +8233,9 @@ int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
   Mutex::Locker lock(client_lock);
   ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
   filepath path(ino);
   req->set_filepath(path);
@@ -8152,6 +8264,9 @@ int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
   Mutex::Locker lock(client_lock);
   ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (!ino->dn_set.empty()) {
     // if we exposed the parent here, we'd need to check permissions,
     // but right now we just rely on the MDS doing so in make_request
@@ -8197,6 +8312,9 @@ int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
   Mutex::Locker lock(client_lock);
   ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
   req->set_filepath2(filepath(parent->ino));
   req->set_filepath(filepath(ino->ino));
@@ -8382,6 +8500,9 @@ int Client::close(int fd)
   tout(cct) << "close" << std::endl;
   tout(cct) << fd << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *fh = get_filehandle(fd);
   if (!fh)
     return -EBADF;
@@ -8404,6 +8525,9 @@ loff_t Client::lseek(int fd, loff_t offset, int whence)
   tout(cct) << offset << std::endl;
   tout(cct) << whence << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -8525,6 +8649,9 @@ int Client::read(int fd, char *buf, loff_t size, loff_t offset)
   tout(cct) << size << std::endl;
   tout(cct) << offset << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -8868,6 +8995,9 @@ int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
   tout(cct) << size << std::endl;
   tout(cct) << offset << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *fh = get_filehandle(fd);
   if (!fh)
     return -EBADF;
@@ -8893,6 +9023,9 @@ int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, in
     tout(cct) << fd << std::endl;
     tout(cct) << offset << std::endl;
 
+    if (unmounting)
+     return -ENOTCONN;
+
     Fh *fh = get_filehandle(fd);
     if (!fh)
         return -EBADF;
@@ -9015,8 +9148,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
     return r;
 
   /* clear the setuid/setgid bits, if any */
-  if (unlikely((in->mode & S_ISUID) ||
-              (in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) {
+  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
     struct ceph_statx stx = { 0 };
 
     put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
@@ -9201,6 +9333,9 @@ int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
   tout(cct) << fd << std::endl;
   tout(cct) << length << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -9220,6 +9355,9 @@ int Client::fsync(int fd, bool syncdataonly)
   tout(cct) << fd << std::endl;
   tout(cct) << syncdataonly << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -9323,6 +9461,9 @@ int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
   tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
   tout(cct) << fd << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -9341,6 +9482,9 @@ int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
   tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
   tout(cct) << fd << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -9369,6 +9513,10 @@ int Client::chdir(const char *relpath, std::string &new_cwd,
   Mutex::Locker lock(client_lock);
   tout(cct) << "chdir" << std::endl;
   tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms);
@@ -9425,7 +9573,8 @@ void Client::_getcwd(string& dir, const UserPerm& perms)
 void Client::getcwd(string& dir, const UserPerm& perms)
 {
   Mutex::Locker l(client_lock);
-  _getcwd(dir, perms);
+  if (!unmounting)
+    _getcwd(dir, perms);
 }
 
 int Client::statfs(const char *path, struct statvfs *stbuf,
@@ -9434,6 +9583,9 @@ int Client::statfs(const char *path, struct statvfs *stbuf,
   Mutex::Locker l(client_lock);
   tout(cct) << "statfs" << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   ceph_statfs stats;
   C_SaferCond cond;
 
@@ -9929,6 +10081,10 @@ int Client::_sync_fs()
 int Client::sync_fs()
 {
   Mutex::Locker l(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   return _sync_fs();
 }
 
@@ -9979,6 +10135,10 @@ int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
 {
   Mutex::Locker l(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perm);
@@ -9992,9 +10152,14 @@ int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
   Inode *snapdir = open_snapdir(in.get());
   return _mkdir(snapdir, name, 0, perm);
 }
+
 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
 {
   Mutex::Locker l(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms);
@@ -10016,6 +10181,9 @@ int Client::get_caps_issued(int fd) {
 
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -10026,6 +10194,10 @@ int Client::get_caps_issued(int fd) {
 int Client::get_caps_issued(const char *path, const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath p(path);
   InodeRef in;
   int r = path_walk(p, &in, perms, true);
@@ -10078,6 +10250,9 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
   tout(cct) << "ll_lookup" << std::endl;
   tout(cct) << name << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   int r = 0;
   if (!cct->_conf->fuse_default_permissions) {
     r = may_lookup(parent, perms);
@@ -10116,6 +10291,9 @@ int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
   tout(cct) << "ll_lookupx" << std::endl;
   tout(cct) << name << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   int r = 0;
   if (!cct->_conf->fuse_default_permissions) {
     r = may_lookup(parent, perms);
@@ -10148,6 +10326,10 @@ int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
                    unsigned int want, unsigned int flags, const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath fp(name, 0);
   InodeRef in;
   int rc;
@@ -10227,6 +10409,10 @@ bool Client::ll_forget(Inode *in, int count)
   tout(cct) << ino.val << std::endl;
   tout(cct) << count << std::endl;
 
+  // Ignore forget if we're no longer mounted
+  if (unmounting)
+    return true;
+
   if (ino == 1) return true;  // ignore forget on root.
 
   bool last = false;
@@ -10258,6 +10444,10 @@ snapid_t Client::ll_get_snapid(Inode *in)
 Inode *Client::ll_get_inode(ino_t ino)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return NULL;
+
   vinodeno_t vino = _map_faked_ino(ino);
   unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
   if (p == inode_map.end())
@@ -10270,6 +10460,10 @@ Inode *Client::ll_get_inode(ino_t ino)
 Inode *Client::ll_get_inode(vinodeno_t vino)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return NULL;
+
   unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
   if (p == inode_map.end())
     return NULL;
@@ -10296,6 +10490,9 @@ int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
 
   if (res == 0)
@@ -10309,6 +10506,9 @@ int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   int res = 0;
   unsigned mask = statx_to_mask(flags, want);
 
@@ -10354,6 +10554,10 @@ int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef target(in);
   int res = _ll_setattrx(in, stx, mask, perms, &target);
   if (res == 0) {
@@ -10372,6 +10576,10 @@ int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
   stat_to_statx(attr, &stx);
 
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef target(in);
   int res = _ll_setattrx(in, &stx, mask, perms, &target);
   if (res == 0) {
@@ -10391,6 +10599,10 @@ int Client::getxattr(const char *path, const char *name, void *value, size_t siz
                     const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
   if (r < 0)
@@ -10402,6 +10614,10 @@ int Client::lgetxattr(const char *path, const char *name, void *value, size_t si
                      const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
   if (r < 0)
@@ -10413,6 +10629,10 @@ int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
                      const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -10423,6 +10643,10 @@ int Client::listxattr(const char *path, char *list, size_t size,
                      const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
   if (r < 0)
@@ -10434,6 +10658,10 @@ int Client::llistxattr(const char *path, char *list, size_t size,
                       const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
   if (r < 0)
@@ -10444,6 +10672,10 @@ int Client::llistxattr(const char *path, char *list, size_t size,
 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -10454,6 +10686,10 @@ int Client::removexattr(const char *path, const char *name,
                        const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, true);
   if (r < 0)
@@ -10465,6 +10701,10 @@ int Client::lremovexattr(const char *path, const char *name,
                         const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, false);
   if (r < 0)
@@ -10475,6 +10715,10 @@ int Client::lremovexattr(const char *path, const char *name,
 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -10487,6 +10731,10 @@ int Client::setxattr(const char *path, const char *name, const void *value,
   _setxattr_maybe_wait_for_osdmap(name, value, size);
 
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, true);
   if (r < 0)
@@ -10500,6 +10748,10 @@ int Client::lsetxattr(const char *path, const char *name, const void *value,
   _setxattr_maybe_wait_for_osdmap(name, value, size);
 
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   InodeRef in;
   int r = Client::path_walk(path, &in, perms, false);
   if (r < 0)
@@ -10513,6 +10765,10 @@ int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
   _setxattr_maybe_wait_for_osdmap(name, value, size);
 
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -10594,6 +10850,9 @@ int Client::ll_getxattr(Inode *in, const char *name, void *value,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
@@ -10660,6 +10919,9 @@ int Client::ll_listxattr(Inode *in, char *names, size_t size,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
@@ -10842,6 +11104,9 @@ int Client::ll_setxattr(Inode *in, const char *name, const void *value,
 
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
@@ -10903,6 +11168,9 @@ int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
@@ -11141,6 +11409,9 @@ int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& per
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_readlink " << vino << dendl;
@@ -11218,6 +11489,9 @@ int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vparent = _get_vino(parent);
 
   ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
@@ -11254,6 +11528,9 @@ int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
   unsigned caps = statx_to_mask(flags, want);
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vparent = _get_vino(parent);
 
   ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
@@ -11437,6 +11714,9 @@ int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vparent = _get_vino(parent);
 
   ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
@@ -11470,6 +11750,9 @@ int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vparent = _get_vino(parent);
 
   ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
@@ -11551,6 +11834,9 @@ int Client::ll_symlink(Inode *parent, const char *name, const char *value,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vparent = _get_vino(parent);
 
   ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
@@ -11585,6 +11871,9 @@ int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vparent = _get_vino(parent);
 
   ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
@@ -11663,6 +11952,9 @@ int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
@@ -11730,6 +12022,9 @@ int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
@@ -11845,6 +12140,9 @@ int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vparent = _get_vino(parent);
   vinodeno_t vnewparent = _get_vino(newparent);
 
@@ -11917,6 +12215,9 @@ int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
   vinodeno_t vnewparent = _get_vino(newparent);
 
@@ -11955,6 +12256,7 @@ int Client::ll_num_osds(void)
 int Client::ll_osdaddr(int osd, uint32_t *addr)
 {
   Mutex::Locker lock(client_lock);
+
   entity_addr_t g;
   bool exists = objecter->with_osdmap([&](const OSDMap& o) {
       if (!o.exists(osd))
@@ -11968,6 +12270,7 @@ int Client::ll_osdaddr(int osd, uint32_t *addr)
   *addr = ntohl(nb_addr);
   return 0;
 }
+
 uint32_t Client::ll_stripe_unit(Inode *in)
 {
   Mutex::Locker lock(client_lock);
@@ -12003,6 +12306,7 @@ int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
                              file_layout_t* layout)
 {
   Mutex::Locker lock(client_lock);
+
   inodeno_t ino = ll_get_inodeno(in);
   uint32_t object_size = layout->object_size;
   uint32_t su = layout->stripe_unit;
@@ -12044,6 +12348,9 @@ int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_opendir " << vino << dendl;
@@ -12070,6 +12377,10 @@ int Client::ll_releasedir(dir_result_t *dirp)
   ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
   tout(cct) << "ll_releasedir" << std::endl;
   tout(cct) << (unsigned long)dirp << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
   _closedir(dirp);
   return 0;
 }
@@ -12081,6 +12392,9 @@ int Client::ll_fsyncdir(dir_result_t *dirp)
   tout(cct) << "ll_fsyncdir" << std::endl;
   tout(cct) << (unsigned long)dirp << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _fsync(dirp->inode.get(), false);
 }
 
@@ -12090,6 +12404,9 @@ int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
 
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = _get_vino(in);
 
   ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
@@ -12206,6 +12523,9 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode,
   Mutex::Locker lock(client_lock);
   InodeRef in;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
                      fhp, perms);
   if (r >= 0) {
@@ -12233,6 +12553,8 @@ int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
   Mutex::Locker lock(client_lock);
   InodeRef in;
 
+  if (unmounting)
+    return -ENOTCONN;
 
   int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
   if (r >= 0) {
@@ -12259,6 +12581,9 @@ loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
   tout(cct) << offset << std::endl;
   tout(cct) << whence << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _lseek(fh, offset, whence);
 }
 
@@ -12271,6 +12596,9 @@ int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
   tout(cct) << off << std::endl;
   tout(cct) << len << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _read(fh, off, len, bl);
 }
 
@@ -12281,6 +12609,10 @@ int Client::ll_read_block(Inode *in, uint64_t blockid,
                          file_layout_t* layout)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   vinodeno_t vino = ll_get_vino(in);
   object_t oid = file_object_t(vino.ino, blockid);
   C_SaferCond onfinish;
@@ -12320,7 +12652,7 @@ int Client::ll_write_block(Inode *in, uint64_t blockid,
   Cond cond;
   bool done;
   int r = 0;
-  Context *onsafe;
+  Context *onsafe = nullptr;
 
   if (length == 0) {
     return -EINVAL;
@@ -12352,6 +12684,11 @@ int Client::ll_write_block(Inode *in, uint64_t blockid,
 
   /* lock just in time */
   client_lock.Lock();
+  if (unmounting) {
+    client_lock.Unlock();
+    delete onsafe;
+    return -ENOTCONN;
+  }
 
   objecter->write(oid,
                  object_locator_t(layout->pool_id),
@@ -12414,6 +12751,9 @@ int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
   tout(cct) << off << std::endl;
   tout(cct) << len << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   int r = _write(fh, off, len, data, NULL, 0);
   ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
                << dendl;
@@ -12427,6 +12767,9 @@ int Client::ll_flush(Fh *fh)
   tout(cct) << "ll_flush" << std::endl;
   tout(cct) << (unsigned long)fh << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _flush(fh);
 }
 
@@ -12437,6 +12780,9 @@ int Client::ll_fsync(Fh *fh, bool syncdataonly)
   tout(cct) << "ll_fsync" << std::endl;
   tout(cct) << (unsigned long)fh << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   int r = _fsync(fh, syncdataonly);
   if (r) {
     // If we're returning an error, clear it from the FH
@@ -12598,6 +12944,9 @@ int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
   tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
   tout(cct) << (unsigned long)fh << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _fallocate(fh, mode, offset, length);
 }
 
@@ -12606,6 +12955,9 @@ int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
   Mutex::Locker lock(client_lock);
   tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *fh = get_filehandle(fd);
   if (!fh)
     return -EBADF;
@@ -12624,6 +12976,9 @@ int Client::ll_release(Fh *fh)
   tout(cct) << "ll_release (fh)" << std::endl;
   tout(cct) << (unsigned long)fh << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   if (ll_unclosed_fh_set.count(fh))
     ll_unclosed_fh_set.erase(fh);
   return _release_fh(fh);
@@ -12636,6 +12991,9 @@ int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
   ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
   tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _getlk(fh, fl, owner);
 }
 
@@ -12646,6 +13004,9 @@ int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
   ldout(cct, 3) << "ll_setlk  (fh) " << fh << " " << fh->inode->ino << dendl;
   tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _setlk(fh, fl, owner, sleep);
 }
 
@@ -12656,6 +13017,9 @@ int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
   ldout(cct, 3) << "ll_flock  (fh) " << fh << " " << fh->inode->ino << dendl;
   tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
 
+  if (unmounting)
+    return -ENOTCONN;
+
   return _flock(fh, cmd, owner);
 }
 
@@ -12693,6 +13057,9 @@ int Client::describe_layout(const char *relpath, file_layout_t *lp,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   filepath path(relpath);
   InodeRef in;
   int r = path_walk(path, &in, perms);
@@ -12709,6 +13076,9 @@ int Client::fdescribe_layout(int fd, file_layout_t *lp)
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -12723,6 +13093,10 @@ int Client::fdescribe_layout(int fd, file_layout_t *lp)
 int64_t Client::get_default_pool_id()
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   /* first data pool is the default */ 
   return mdsmap->get_first_data_pool(); 
 }
@@ -12732,6 +13106,10 @@ int64_t Client::get_default_pool_id()
 int64_t Client::get_pool_id(const char *pool_name)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
                               pool_name);
 }
@@ -12739,6 +13117,10 @@ int64_t Client::get_pool_id(const char *pool_name)
 string Client::get_pool_name(int64_t pool)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return string();
+
   return objecter->with_osdmap([pool](const OSDMap& o) {
       return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
     });
@@ -12747,6 +13129,10 @@ string Client::get_pool_name(int64_t pool)
 int Client::get_pool_replication(int64_t pool)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   return objecter->with_osdmap([pool](const OSDMap& o) {
       return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
     });
@@ -12756,6 +13142,9 @@ int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& o
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -12798,6 +13187,10 @@ int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& o
 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   if (id < 0)
     return -EINVAL;
   return objecter->with_osdmap([&](const OSDMap& o) {
@@ -12810,6 +13203,9 @@ int Client::get_file_stripe_address(int fd, loff_t offset,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -12839,6 +13235,10 @@ int Client::get_file_stripe_address(int fd, loff_t offset,
 int Client::get_osd_addr(int osd, entity_addr_t& addr)
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   return objecter->with_osdmap([&](const OSDMap& o) {
       if (!o.exists(osd))
        return -ENOENT;
@@ -12853,6 +13253,9 @@ int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
 {
   Mutex::Locker lock(client_lock);
 
+  if (unmounting)
+    return -ENOTCONN;
+
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
@@ -12872,6 +13275,10 @@ int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
 int Client::get_local_osd()
 {
   Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
   objecter->with_osdmap([this](const OSDMap& o) {
       if (o.get_epoch() != local_osd_epoch) {
        local_osd = o.find_osd_on_ip(messenger->get_myaddr());
@@ -13013,6 +13420,12 @@ Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
     if (cur == root_ancestor)
       break;
 
+    // deleted inode
+    if (cur->nlink == 0) {
+      cur = root_ancestor;
+      break;
+    }
+
     MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
     filepath path(cur->ino);
     req->set_filepath(path);
@@ -13342,9 +13755,7 @@ void Client::handle_conf_change(const struct md_config_t *conf,
 {
   Mutex::Locker lock(client_lock);
 
-  if (changed.count("client_cache_size") ||
-      changed.count("client_cache_mid")) {
-    lru.lru_set_max(cct->_conf->client_cache_size);
+  if (changed.count("client_cache_mid")) {
     lru.lru_set_midpoint(cct->_conf->client_cache_mid);
   }
   if (changed.count("client_acl_type")) {
index 8a1852e7af067e68b7c33b6cfafb17c0e670e7f6..e89a25440506ac0be1dd550cc011d7932251f9f5 100644 (file)
@@ -504,7 +504,6 @@ protected:
   friend void intrusive_ptr_release(Inode *in);
 
   //int get_cache_size() { return lru.lru_get_size(); }
-  //void set_cache_size(int m) { lru.lru_set_max(m); }
 
   /**
    * Don't call this with in==NULL, use get_or_create for that
index d10f7b8143a282251bf8310f36b4071c7fe133ee..d24ad5c3451a14a5211988510e313af7efd06589 100644 (file)
@@ -135,7 +135,7 @@ static int getgroups_cb(void *handle, gid_t **sgids)
 }
 
 #define GET_GROUPS(perms, req) {                               \
-  if (cfuse->client->cct->_conf->fuse_set_user_groups) {       \
+  if (g_conf->get_val<bool>("fuse_set_user_groups")) { \
     gid_t *gids = NULL;                                                \
     int count = getgroups(req, &gids);                         \
     perms.init_gids(gids, count);                              \
index 0c2d7aee595ec4ac52acc82cab009019225fe8dc..79795dbc37d142f916dacdef6744145257b132a2 100644 (file)
@@ -2630,7 +2630,7 @@ static const string metadata_name_from_key(const string &key)
  * Input:
  * @param start_after which name to begin listing after
  *        (use the empty string to start at the beginning)
- * @param max_return the maximum number of names to list(if 0 means no limit)
+ * @param max_return the maximum number of names to list
 
  * Output:
  * @param value
@@ -2649,33 +2649,33 @@ int metadata_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
     return -EINVAL;
   }
 
+  // TODO remove implicit support for zero during the N-release
+  if (max_return == 0) {
+    max_return = RBD_MAX_KEYS_READ;
+  }
+
   map<string, bufferlist> data;
   string last_read = metadata_key_for_name(start_after);
-  int max_read = max_return ? MIN(RBD_MAX_KEYS_READ, max_return) : RBD_MAX_KEYS_READ;
-  bool more;
+  bool more = true;
 
-  do {
+  while (more && data.size() < max_return) {
     map<string, bufferlist> raw_data;
+    int max_read = MIN(RBD_MAX_KEYS_READ, max_return - data.size());
     int r = cls_cxx_map_get_vals(hctx, last_read, RBD_METADATA_KEY_PREFIX,
                                  max_read, &raw_data, &more);
     if (r < 0) {
       CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str());
       return r;
     }
-    if (raw_data.empty())
-      break;
 
-    map<string, bufferlist>::iterator it = raw_data.begin();
-    for (; it != raw_data.end(); ++it)
-      data[metadata_name_from_key(it->first)].swap(it->second);
-
-    if (!more)
-      break;
+    for (auto& kv : raw_data) {
+      data[metadata_name_from_key(kv.first)].swap(kv.second);
+    }
 
-    last_read = raw_data.rbegin()->first;
-    if (max_return)
-      max_read = MIN(RBD_MAX_KEYS_READ, max_return - data.size());
-  } while (more);
+    if (!raw_data.empty()) {
+      last_read = raw_data.rbegin()->first;
+    }
+  }
 
   ::encode(data, *out);
   return 0;
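
The rewritten metadata_list loop pages through the image's omap in chunks of at most RBD_MAX_KEYS_READ until either max_return entries have been collected or the map is exhausted, resuming each read after the last key seen. The same chunked-pagination shape as a self-contained sketch (fetch is a hypothetical stand-in for cls_cxx_map_get_vals):

  #include <algorithm>
  #include <map>
  #include <string>

  // fetch(last, max_read, &more) returns up to max_read entries with keys
  // greater than 'last', and clears more once the source is exhausted.
  template <typename Fetch>
  std::map<std::string, std::string>
  collect_paged(Fetch fetch, size_t max_return, size_t chunk_size)
  {
    std::map<std::string, std::string> data;
    std::string last;                   // resume key, like last_read above
    bool more = true;
    while (more && data.size() < max_return) {
      size_t max_read = std::min(chunk_size, max_return - data.size());
      std::map<std::string, std::string> raw = fetch(last, max_read, &more);
      for (auto& kv : raw)
        data[kv.first] = kv.second;
      if (!raw.empty())
        last = raw.rbegin()->first;     // continue after the last key seen
    }
    return data;
  }
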
index 1a400aaa159bc27b7917cdeddba98d11a04b0b74..3b3589db805ca18ce52458ecb1de89f98ba8ead2 100644 (file)
@@ -127,6 +127,8 @@ void JSONFormatter::flush(std::ostream& os)
 {
   finish_pending_string();
   os << m_ss.str();
+  if (m_line_break_enabled)
+    os << "\n";
   m_ss.clear();
   m_ss.str("");
 }
@@ -326,6 +328,8 @@ void XMLFormatter::flush(std::ostream& os)
    * we should NOT output a newline. This primarily triggers on HTTP redirects */
   if (m_pretty && !m_ss_str.empty())
     os << "\n";
+  else if (m_line_break_enabled)
+    os << "\n";
   m_ss.clear();
   m_ss.str("");
 }
index aa695ba30f546e5ac54638de650e37a359f0a0e2..df6c0a99a5ebaf4743d850dc84369990cff3f9c9 100644 (file)
@@ -37,6 +37,7 @@ namespace ceph {
     Formatter();
     virtual ~Formatter();
 
+    virtual void enable_line_break() = 0;
     virtual void flush(std::ostream& os) = 0;
     void flush(bufferlist &bl);
     virtual void reset() = 0;
@@ -93,6 +94,7 @@ namespace ceph {
     void set_status(int status, const char* status_name) override {};
     void output_header() override {};
     void output_footer() override {};
+    void enable_line_break() override { m_line_break_enabled = true; }
     void flush(std::ostream& os) override;
     using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
     void reset() override;
@@ -128,6 +130,7 @@ namespace ceph {
     std::stringstream m_ss, m_pending_string;
     std::list<json_formatter_stack_entry_d> m_stack;
     bool m_is_pending_string;
+    bool m_line_break_enabled = false;
   };
 
   class XMLFormatter : public Formatter {
@@ -139,6 +142,7 @@ namespace ceph {
     void output_header() override;
     void output_footer() override;
 
+    void enable_line_break() override { m_line_break_enabled = true; }
     void flush(std::ostream& os) override;
     using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
     void reset() override;
@@ -176,6 +180,7 @@ namespace ceph {
     const bool m_underscored;
     std::string m_pending_string_name;
     bool m_header_done;
+    bool m_line_break_enabled = false;
   };
 
   class TableFormatter : public Formatter {
@@ -185,6 +190,7 @@ namespace ceph {
     void set_status(int status, const char* status_name) override {};
     void output_header() override {};
     void output_footer() override {};
+    void enable_line_break() override {};
     void flush(std::ostream& os) override;
     using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
     void reset() override;
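
enable_line_break() is opt-in, so existing consumers see byte-identical output; only a caller that requests it gets a trailing newline appended at flush time (the admin socket hook in a later hunk is the intended user). A minimal usage sketch with the JSONFormatter declared here:

  JSONFormatter jf;
  jf.open_object_section("status");
  jf.dump_int("epoch", 42);
  jf.close_section();
  jf.enable_line_break();    // ask flush() to terminate the output with '\n'
  std::ostringstream ss;
  jf.flush(ss);              // yields "...}\n" instead of "...}"
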
index be70e4512837b8c1975631c76514f95ef35739d1..77a98514b0d299f64bfda3fb810ac5648c38c345 100644 (file)
@@ -18,6 +18,7 @@
 #include <atomic>
 #include "common/LogEntry.h"
 #include "common/Mutex.h"
+#include "include/health.h"
 
 class LogClient;
 class MLog;
@@ -91,6 +92,23 @@ public:
   void debug(std::stringstream &s) {
     do_log(CLOG_DEBUG, s);
   }
+  /**
+   * Convenience function mapping health status to
+   * the appropriate cluster log severity.
+   */
+  LogClientTemp health(health_status_t health) {
+    switch(health) {
+      case HEALTH_OK:
+        return info();
+      case HEALTH_WARN:
+        return warn();
+      case HEALTH_ERR:
+        return error();
+      default:
+        // Invalid health_status_t value
+        ceph_abort();
+    }
+  }
   LogClientTemp info() {
     return LogClientTemp(CLOG_INFO, *this);
   }
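
The helper lets callers log at the severity a health status implies instead of switching on it by hand. A hypothetical call site (clog being an object exposing the methods above):

  clog->health(HEALTH_WARN) << "1 MDSs report oversized cache";
  // equivalent to: clog->warn() << "1 MDSs report oversized cache";
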
index 2bb767edcff7026e8a94f7bf6e967de9dd2282ed..dc15dbb84b9d7bcd7ceca940cb803a64cbff7ab4 100644 (file)
@@ -536,7 +536,7 @@ public:
   explicit GetdescsHook(AdminSocket *as) : m_as(as) {}
   bool call(string command, cmdmap_t &cmdmap, string format, bufferlist& out) override {
     int cmdnum = 0;
-    JSONFormatter jf(false);
+    JSONFormatter jf;
     jf.open_object_section("command_descriptions");
     for (map<string,string>::iterator p = m_as->m_descs.begin();
         p != m_as->m_descs.end();
@@ -550,6 +550,7 @@ public:
       cmdnum++;
     }
     jf.close_section(); // command_descriptions
+    jf.enable_line_break();
     ostringstream ss;
     jf.flush(ss);
     out.append(ss.str());
index 8d579cb99bbd2aaff2ea1275aad583732395c63b..6c8264a5efd0c21ba071736fa872027e8b4dbfca 100644 (file)
@@ -136,6 +136,7 @@ namespace cohort {
        for (int ix = 0; ix < n_lanes; ++ix,
               lane_ix = next_evict_lane()) {
          Lane& lane = qlane[lane_ix];
+         lane.lock.lock();
          /* if object at LRU has refcnt==1, it may be reclaimable */
          Object* o = &(lane.q.back());
          if (can_reclaim(o)) {
@@ -156,7 +157,6 @@ namespace cohort {
              return o;
            } else {
              // XXX can't make unreachable (means what?)
-             lane.lock.lock();
              --(o->lru_refcnt);
              o->lru_flags &= ~FLAG_EVICTING;
              /* unlock in next block */
index c65b52acb6061a4264cf237bbc080f184a09634f..cb6b406bb12ec1d870787c376969450b57ab449e 100644 (file)
@@ -403,7 +403,6 @@ OPTION(fuse_debug, OPT_BOOL)
 OPTION(fuse_multithreaded, OPT_BOOL)
 OPTION(fuse_require_active_mds, OPT_BOOL) // if ceph_fuse requires active mds server
 OPTION(fuse_syncfs_on_mksnap, OPT_BOOL)
-OPTION(fuse_set_user_groups, OPT_BOOL) // if ceph_fuse fills in group lists or not
 
 OPTION(client_try_dentry_invalidate, OPT_BOOL) // the client should try dentry invalidation instead of remounting, on kernels where it believes that will work
 OPTION(client_die_on_failed_remount, OPT_BOOL)
@@ -436,8 +435,6 @@ OPTION(mds_data, OPT_STR)
 OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
 // max xattr kv pairs size for each dir/file
 OPTION(mds_max_xattr_pairs_size, OPT_U32)
-OPTION(mds_cache_size, OPT_INT)
-OPTION(mds_cache_mid, OPT_FLOAT)
 OPTION(mds_max_file_recover, OPT_U32)
 OPTION(mds_dir_max_commit_size, OPT_INT) // MB
 OPTION(mds_dir_keys_per_op, OPT_INT)
@@ -457,7 +454,6 @@ OPTION(mds_recall_state_timeout, OPT_FLOAT)    // detect clients which aren't tr
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT)    // detecting freeze tree deadlock
 OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
 OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
-OPTION(mds_health_cache_threshold, OPT_FLOAT) // warn on cache size if it exceeds mds_cache_size by this factor
 OPTION(mds_reconnect_timeout, OPT_FLOAT)  // seconds to wait for clients during mds restart
              //  make it (mds_session_timeout - mds_beacon_grace)
 OPTION(mds_tick_interval, OPT_FLOAT)
@@ -1302,110 +1298,6 @@ OPTION(rados_mon_op_timeout, OPT_DOUBLE) // how many seconds to wait for a respo
 OPTION(rados_osd_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
 OPTION(rados_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
 
-OPTION(rbd_op_threads, OPT_INT)
-OPTION(rbd_op_thread_timeout, OPT_INT)
-OPTION(rbd_non_blocking_aio, OPT_BOOL) // process AIO ops from a worker thread to prevent blocking
-OPTION(rbd_cache, OPT_BOOL) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
-OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushs so that writeback is safe
-OPTION(rbd_cache_size, OPT_LONGLONG)         // cache size in bytes
-OPTION(rbd_cache_max_dirty, OPT_LONGLONG)    // dirty limit in bytes - set to 0 for write-through caching
-OPTION(rbd_cache_target_dirty, OPT_LONGLONG) // target dirty limit in bytes
-OPTION(rbd_cache_max_dirty_age, OPT_FLOAT)      // seconds in cache before writeback starts
-OPTION(rbd_cache_max_dirty_object, OPT_INT)       // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
-OPTION(rbd_cache_block_writes_upfront, OPT_BOOL) // whether to block writes to the cache before the aio_write call completes (true))
-OPTION(rbd_concurrent_management_ops, OPT_INT) // how many operations can be in flight for a management operation like deleting or resizing an image
-OPTION(rbd_balance_snap_reads, OPT_BOOL)
-OPTION(rbd_localize_snap_reads, OPT_BOOL)
-OPTION(rbd_balance_parent_reads, OPT_BOOL)
-OPTION(rbd_localize_parent_reads, OPT_BOOL)
-OPTION(rbd_readahead_trigger_requests, OPT_INT) // number of sequential requests necessary to trigger readahead
-OPTION(rbd_readahead_max_bytes, OPT_LONGLONG) // set to 0 to disable readahead
-OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG) // how many bytes are read in total before readahead is disabled
-OPTION(rbd_clone_copy_on_read, OPT_BOOL)
-OPTION(rbd_blacklist_on_break_lock, OPT_BOOL) // whether to blacklist clients whose lock was broken
-OPTION(rbd_blacklist_expire_seconds, OPT_INT) // number of seconds to blacklist - set to 0 for OSD default
-OPTION(rbd_request_timed_out_seconds, OPT_INT) // number of seconds before maint request times out
-OPTION(rbd_skip_partial_discard, OPT_BOOL) // when trying to discard a range inside an object, set to true to skip zeroing the range.
-OPTION(rbd_enable_alloc_hint, OPT_BOOL) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need
-OPTION(rbd_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
-OPTION(rbd_blkin_trace_all, OPT_BOOL) // create a blkin trace for all RBD requests
-OPTION(rbd_validate_pool, OPT_BOOL) // true if empty pools should be validated for RBD compatibility
-OPTION(rbd_validate_names, OPT_BOOL) // true if image specs should be validated
-OPTION(rbd_auto_exclusive_lock_until_manual_request, OPT_BOOL) // whether to automatically acquire/release exclusive lock until it is explicitly requested, i.e. before we know the user of librbd is properly using the lock API
-OPTION(rbd_mirroring_resync_after_disconnect, OPT_BOOL) // automatically start image resync after mirroring is disconnected due to being laggy
-OPTION(rbd_mirroring_replay_delay, OPT_INT) // time-delay in seconds for rbd-mirror asynchronous replication
-
-OPTION(rbd_default_pool, OPT_STR) // default pool for storing images
-
-/*
- * The following options change the behavior for librbd's image creation methods that
- * don't require all of the parameters. These are provided so that older programs
- * can take advantage of newer features without being rewritten to use new versions
- * of the image creation functions.
- *
- * rbd_create()/RBD::create() are affected by all of these options.
- *
- * rbd_create2()/RBD::create2() and rbd_clone()/RBD::clone() are affected by:
- * - rbd_default_order
- * - rbd_default_stripe_count
- * - rbd_default_stripe_size
- *
- * rbd_create3()/RBD::create3() and rbd_clone2/RBD::clone2() are only
- * affected by rbd_default_order.
- */
-OPTION(rbd_default_format, OPT_INT)
-OPTION(rbd_default_order, OPT_INT)
-OPTION(rbd_default_stripe_count, OPT_U64) // changing requires stripingv2 feature
-OPTION(rbd_default_stripe_unit, OPT_U64) // changing to non-object size requires stripingv2 feature
-OPTION(rbd_default_data_pool, OPT_STR) // optional default pool for storing image data blocks
-
-/**
- * RBD features are only applicable for v2 images. This setting accepts either
- * an integer bitmask value or comma-delimited string of RBD feature names.
- * This setting is always internally stored as an integer bitmask value. The
- * mapping between feature bitmask value and feature name is as follows:
- *
- *  +1 -> layering
- *  +2 -> striping
- *  +4 -> exclusive-lock
- *  +8 -> object-map
- *  +16 -> fast-diff
- *  +32 -> deep-flatten
- *  +64 -> journaling
- *  +128 -> data-pool
- */
-SAFE_OPTION(rbd_default_features, OPT_STR)
-
-OPTION(rbd_default_map_options, OPT_STR) // default rbd map -o / --options
-
-/**
- * RBD journal options.
- */
-OPTION(rbd_journal_order, OPT_U32) // bits to shift to compute journal object max size, between 12 and 64
-OPTION(rbd_journal_splay_width, OPT_U32) // number of active journal objects
-OPTION(rbd_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds
-OPTION(rbd_journal_object_flush_interval, OPT_INT) // maximum number of pending commits per journal object
-OPTION(rbd_journal_object_flush_bytes, OPT_INT) // maximum number of pending bytes per journal object
-OPTION(rbd_journal_object_flush_age, OPT_DOUBLE) // maximum age (in seconds) for pending commits
-OPTION(rbd_journal_pool, OPT_STR) // pool for journal objects
-OPTION(rbd_journal_max_payload_bytes, OPT_U32) // maximum journal payload size before splitting
-OPTION(rbd_journal_max_concurrent_object_sets, OPT_INT) // maximum number of object sets a journal client can be behind before it is automatically unregistered
-
-/**
- * RBD Mirror options
- */
-OPTION(rbd_mirror_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds
-OPTION(rbd_mirror_journal_poll_age, OPT_DOUBLE) // maximum age (in seconds) between successive journal polls
-OPTION(rbd_mirror_journal_max_fetch_bytes, OPT_U32) // maximum bytes to read from each journal data object per fetch
-OPTION(rbd_mirror_sync_point_update_age, OPT_DOUBLE) // number of seconds between each update of the image sync point object number
-OPTION(rbd_mirror_concurrent_image_syncs, OPT_U32) // maximum number of image syncs in parallel
-OPTION(rbd_mirror_pool_replayers_refresh_interval, OPT_INT) // interval to refresh peers in rbd-mirror daemon
-OPTION(rbd_mirror_delete_retry_interval, OPT_DOUBLE) // interval to check and retry the failed requests in deleter
-OPTION(rbd_mirror_image_state_check_interval, OPT_INT) // interval to get images from pool watcher and set sources in replayer
-OPTION(rbd_mirror_leader_heartbeat_interval, OPT_INT) // interval (in seconds) between mirror leader heartbeats
-OPTION(rbd_mirror_leader_max_missed_heartbeats, OPT_INT) // number of missed heartbeats for non-lock owner to attempt to acquire lock
-OPTION(rbd_mirror_leader_max_acquire_attempts_before_break, OPT_INT) // number of failed attempts to acquire lock after missing heartbeats before breaking lock
-
 OPTION(nss_db_path, OPT_STR) // path to nss db
 
 
index 6166ef3f8d50b5515f7308756faf3b27e97190ba..0be052e1bc551eaf3787bbc205afda66f52b3598 100644 (file)
@@ -1187,6 +1187,12 @@ std::vector<Option> get_global_options() {
     .set_default(50)
     .set_description(""),
 
+    Option("mon_health_log_update_period", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(5)
+    .set_description("Minimum time in seconds between log messages about "
+                     "each health check")
+    .set_min(0),
+
     Option("mon_data_avail_crit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(5)
     .set_description(""),
@@ -2510,7 +2516,7 @@ std::vector<Option> get_global_options() {
     .set_description(""),
 
     Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-    .set_default(3000)
+    .set_default(1500)
     .set_description("minimum number of entries to maintain in the PG log")
     .add_service("osd")
     .add_see_also("osd_max_pg_log_entries")
@@ -3085,7 +3091,7 @@ std::vector<Option> get_global_options() {
     .set_description(""),
 
     Option("bluefs_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
-    .set_default("bitmap")
+    .set_default("stupid")
     .set_description(""),
 
     Option("bluefs_preextend_wal_files", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
@@ -3397,9 +3403,9 @@ std::vector<Option> get_global_options() {
     .set_description("Key value database to use for bluestore"),
 
     Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
-    .set_default("bitmap")
-    .add_tag("mkfs")
-    .set_description(""),
+    .set_default("stupid")
+    .set_enum_allowed({"bitmap", "stupid"})
+    .set_description("Allocator policy"),
 
     Option("bluestore_freelist_blocks_per_key", Option::TYPE_INT, Option::LEVEL_DEV)
     .set_default(128)
@@ -4927,7 +4933,7 @@ std::vector<Option> get_rgw_options() {
     .set_description(""),
 
     Option("rgw_dynamic_resharding", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
-    .set_default(false)
+    .set_default(true)
     .set_description(""),
 
     Option("rgw_max_objs_per_shard", Option::TYPE_INT, Option::LEVEL_ADVANCED)
@@ -4944,7 +4950,7 @@ static std::vector<Option> get_rbd_options() {
   return std::vector<Option>({
     Option("rbd_default_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("rbd")
-    .set_description("")
+    .set_description("default pool for storing new images")
     .set_validator([](std::string *value, std::string *error_message){
       boost::regex pattern("^[^@/]+$");
       if (!boost::regex_match (*value, pattern)) {
@@ -4956,7 +4962,7 @@ static std::vector<Option> get_rbd_options() {
 
     Option("rbd_default_data_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description("")
+    .set_description("default pool for storing data blocks for new images")
     .set_validator([](std::string *value, std::string *error_message){
       boost::regex pattern("^[^@/]*$");
       if (!boost::regex_match (*value, pattern)) {
@@ -4968,7 +4974,15 @@ static std::vector<Option> get_rbd_options() {
 
     Option("rbd_default_features", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("layering,exclusive-lock,object-map,fast-diff,deep-flatten")
-    .set_description("")
+    .set_description("default v2 image features for new images")
+    .set_long_description(
+        "RBD features are only applicable for v2 images. This setting accepts "
+        "either an integer bitmask value or comma-delimited string of RBD "
+        "feature names. This setting is always internally stored as an integer "
+        "bitmask value. The mapping between feature bitmask value and feature "
+        "name is as follows: +1 -> layering, +2 -> striping, "
+        "+4 -> exclusive-lock, +8 -> object-map, +16 -> fast-diff, "
+        "+32 -> deep-flatten, +64 -> journaling, +128 -> data-pool")
     .set_safe()
     .set_validator([](std::string *value, std::string *error_message){
       static const std::map<std::string, uint64_t> FEATURE_MAP = {
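
Because the value is stored as a bitmask, the feature-name list and an integer are interchangeable; per the mapping in the long description, the default string above works out to:

  // layering + exclusive-lock + object-map + fast-diff + deep-flatten
  uint64_t default_features = 1 | 4 | 8 | 16 | 32;   // == 61

so configuring rbd_default_features = 61 is an equivalent spelling.
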
@@ -5029,234 +5043,241 @@ static std::vector<Option> get_rbd_options() {
 
     Option("rbd_op_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(1)
-    .set_description(""),
+    .set_description("number of threads to utilize for internal processing"),
 
     Option("rbd_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(60)
-    .set_description(""),
+    .set_description("time in seconds for detecting a hung thread"),
 
     Option("rbd_non_blocking_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("process AIO ops from a dispatch thread to prevent blocking"),
 
     Option("rbd_cache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("whether to enable caching (writeback unless rbd_cache_max_dirty is 0)"),
 
     Option("rbd_cache_writethrough_until_flush", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("whether to make writeback caching writethrough until "
+                     "flush is called, to be sure the user of librbd will send "
+                     "flushes so that writeback is safe"),
 
     Option("rbd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(32<<20)
-    .set_description(""),
+    .set_description("cache size in bytes"),
 
     Option("rbd_cache_max_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(24<<20)
-    .set_description(""),
+    .set_description("dirty limit in bytes - set to 0 for write-through caching"),
 
     Option("rbd_cache_target_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(16<<20)
-    .set_description(""),
+    .set_description("target dirty limit in bytes"),
 
     Option("rbd_cache_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(1.0)
-    .set_description(""),
+    .set_description("seconds in cache before writeback starts"),
 
     Option("rbd_cache_max_dirty_object", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("dirty limit for objects - set to 0 for auto calculate from rbd_cache_size"),
 
     Option("rbd_cache_block_writes_upfront", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("whether to block writes to the cache before the aio_write call completes"),
 
     Option("rbd_concurrent_management_ops", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(10)
     .set_min(1)
-    .set_description(""),
+    .set_description("how many operations can be in flight for a management operation like deleting or resizing an image"),
 
     Option("rbd_balance_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("distribute snap read requests to random OSD"),
 
     Option("rbd_localize_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("localize snap read requests to closest OSD"),
 
     Option("rbd_balance_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("distribute parent read requests to random OSD"),
 
     Option("rbd_localize_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("localize parent requests to closest OSD"),
 
     Option("rbd_readahead_trigger_requests", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(10)
-    .set_description(""),
+    .set_description("number of sequential requests necessary to trigger readahead"),
 
     Option("rbd_readahead_max_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(512 * 1024)
-    .set_description(""),
+    .set_description("set to 0 to disable readahead"),
 
     Option("rbd_readahead_disable_after_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(50 * 1024 * 1024)
-    .set_description(""),
+    .set_description("how many bytes are read in total before readahead is disabled"),
 
     Option("rbd_clone_copy_on_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("copy-up parent image blocks to clone upon read request"),
 
     Option("rbd_blacklist_on_break_lock", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("whether to blacklist clients whose lock was broken"),
 
     Option("rbd_blacklist_expire_seconds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("number of seconds to blacklist - set to 0 for OSD default"),
 
     Option("rbd_request_timed_out_seconds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(30)
-    .set_description(""),
+    .set_description("number of seconds before maintenance request times out"),
 
     Option("rbd_skip_partial_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("when trying to discard a range inside an object, set to true to skip zeroing the range"),
 
     Option("rbd_enable_alloc_hint", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("when writing a object, it will issue a hint to osd backend to indicate the expected size object need"),
 
     Option("rbd_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("true if LTTng-UST tracepoints should be enabled"),
 
     Option("rbd_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("create a blkin trace for all RBD requests"),
 
     Option("rbd_validate_pool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("validate empty pools for RBD compatibility"),
 
     Option("rbd_validate_names", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("validate new image names for RBD compatibility"),
 
     Option("rbd_auto_exclusive_lock_until_manual_request", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("automatically acquire/release exclusive lock until it is explicitly requested"),
 
     Option("rbd_mirroring_resync_after_disconnect", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("automatically start image resync after mirroring is disconnected due to being laggy"),
 
     Option("rbd_mirroring_replay_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("time-delay in seconds for rbd-mirror asynchronous replication"),
 
     Option("rbd_default_format", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(2)
-    .set_description(""),
+    .set_description("default image format for new images"),
 
     Option("rbd_default_order", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(22)
-    .set_description(""),
+    .set_description("default order (data block object size) for new images"),
 
     Option("rbd_default_stripe_count", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("default stripe count for new images"),
 
     Option("rbd_default_stripe_unit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("default stripe width for new images"),
 
     Option("rbd_default_map_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description(""),
+    .set_description("default krbd map options"),
 
     Option("rbd_journal_order", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_min(12)
     .set_default(24)
-    .set_description(""),
+    .set_description("default order (object size) for journal data objects"),
 
     Option("rbd_journal_splay_width", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(4)
-    .set_description(""),
+    .set_description("number of active journal objects"),
 
     Option("rbd_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(5)
-    .set_description(""),
+    .set_description("commit time interval, seconds"),
 
     Option("rbd_journal_object_flush_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("maximum number of pending commits per journal object"),
 
     Option("rbd_journal_object_flush_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("maximum number of pending bytes per journal object"),
 
     Option("rbd_journal_object_flush_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("maximum age (in seconds) for pending commits"),
 
     Option("rbd_journal_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description(""),
+    .set_description("pool for journal objects"),
 
     Option("rbd_journal_max_payload_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(16384)
-    .set_description(""),
+    .set_description("maximum journal payload size before splitting"),
 
     Option("rbd_journal_max_concurrent_object_sets", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("maximum number of object sets a journal client can be behind before it is automatically unregistered"),
+  });
+}
 
+static std::vector<Option> get_rbd_mirror_options() {
+  return std::vector<Option>({
     Option("rbd_mirror_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(5)
-    .set_description(""),
+    .set_description("commit time interval, seconds"),
 
     Option("rbd_mirror_journal_poll_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(5)
-    .set_description(""),
+    .set_description("maximum age (in seconds) between successive journal polls"),
 
     Option("rbd_mirror_journal_max_fetch_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(32768)
-    .set_description(""),
+    .set_description("maximum bytes to read from each journal data object per fetch"),
 
     Option("rbd_mirror_sync_point_update_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(30)
-    .set_description(""),
+    .set_description("number of seconds between each update of the image sync point object number"),
 
     Option("rbd_mirror_concurrent_image_syncs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(5)
-    .set_description(""),
+    .set_description("maximum number of image syncs in parallel"),
 
     Option("rbd_mirror_pool_replayers_refresh_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(30)
-    .set_description(""),
+    .set_description("interval to refresh peers in rbd-mirror daemon"),
 
     Option("rbd_mirror_delete_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(30)
-    .set_description(""),
+    .set_description("interval to check and retry the failed requests in deleter"),
 
     Option("rbd_mirror_image_state_check_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(30)
     .set_min(1)
-    .set_description(""),
+    .set_description("interval to get images from pool watcher and set sources in replayer"),
 
     Option("rbd_mirror_leader_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(5)
     .set_min(1)
-    .set_description(""),
+    .set_description("interval (in seconds) between mirror leader heartbeats"),
 
     Option("rbd_mirror_leader_max_missed_heartbeats", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(2)
-    .set_description(""),
+    .set_description("number of missed heartbeats for non-lock owner to attempt to acquire lock"),
 
     Option("rbd_mirror_leader_max_acquire_attempts_before_break", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(3)
-    .set_description(""),
+    .set_description("number of failed attempts to acquire lock after missing heartbeats before breaking lock"),
   });
 }
 
@@ -5275,8 +5296,22 @@ std::vector<Option> get_mds_options() {
     .set_description(""),
 
     Option("mds_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(100000)
-    .set_description(""),
+    .set_default(0)
+    .set_description("maximum number of inodes in MDS cache (<=0 is unlimited)")
+    .set_long_description("This tunable is no longer recommended. Use mds_cache_memory_limit."),
+
+    Option("mds_cache_memory_limit", Option::TYPE_UINT, Option::LEVEL_BASIC)
+    .set_default(1*(1LL<<30))
+    .set_description("target maximum memory usage of MDS cache")
+    .set_long_description("This sets a target maximum memory usage of the MDS cache and is the primary tunable to limit the MDS memory usage. The MDS will try to stay under a reservation of this limit (by default 95%; 1 - mds_cache_reservation) by trimming unused metadata in its cache and recalling cached items in the client caches. It is possible for the MDS to exceed this limit due to slow recall from clients. The mds_health_cache_threshold (150%) sets a cache full threshold for when the MDS signals a cluster health warning."),
+
+    Option("mds_cache_reservation", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.05)
+    .set_description("amount of memory to reserve"),
+
+    Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.5)
+    .set_description("threshold for cache size to generate health warning"),
 
     Option("mds_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(.7)
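
These options interact: the MDS trims so that cache usage stays below roughly (1 - mds_cache_reservation) * mds_cache_memory_limit, and the mds_health_cache_threshold multiple of the limit is where a health warning fires. A quick check with the defaults above:

  uint64_t limit       = 1ULL << 30;   // mds_cache_memory_limit: 1 GiB
  double   reservation = 0.05;         // mds_cache_reservation
  double   threshold   = 1.5;          // mds_health_cache_threshold
  uint64_t trim_target = limit * (1.0 - reservation);  // ~0.95 GiB working target
  uint64_t warn_at     = limit * threshold;            // 1.5 GiB -> HEALTH_WARN
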
@@ -5350,10 +5385,6 @@ std::vector<Option> get_mds_options() {
     .set_default(10)
     .set_description(""),
 
-    Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(1.5)
-    .set_description(""),
-
     Option("mds_reconnect_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(45)
     .set_description(""),
@@ -5899,11 +5930,11 @@ std::vector<Option> get_mds_client_options() {
     .set_description(""),
 
     Option("fuse_set_user_groups", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
-    .set_default(false)
-    .set_description(""),
+    .set_default(true)
+    .set_description("check for ceph-fuse to consider supplementary groups for permissions"),
 
     Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
-    .set_default(true)
+    .set_default(false)
     .set_description(""),
 
     Option("client_die_on_failed_remount", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
@@ -5939,6 +5970,7 @@ static std::vector<Option> build_options()
 
   ingest(get_rgw_options(), "rgw");
   ingest(get_rbd_options(), "rbd");
+  ingest(get_rbd_mirror_options(), "rbd-mirror");
   ingest(get_mds_options(), "mds");
   ingest(get_mds_client_options(), "mds_client");
 
index 762fe374bb578723671f667c1cefae1b815e760d..4f31b932faa4385127036f7180cb6ef2a5197882 100644 (file)
 #include <sys/mount.h>
 #endif
 
+#include <string>
+
+#include <inttypes.h>  // PRIu64, used by bytes2str below
+#include <stdio.h>
+
 int64_t unit_to_bytesize(string val, ostream *pss)
 {
   if (val.empty()) {
@@ -302,3 +306,15 @@ string cleanbin(string &str)
     result = "Base64:" + result;
   return result;
 }
+
+std::string bytes2str(uint64_t count) {
+  static char s[][2] = {"\0", "k", "M", "G", "T", "P", "E", "\0"};
+  int i = 0;
+  while (count >= 1024 && *s[i+1]) {
+    count >>= 10;
+    i++;
+  }
+  char str[128];
+  snprintf(str, sizeof str, "%" PRIu64 "%sB", count, s[i]);
+  return std::string(str);
+}
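
bytes2str picks the largest binary unit not exceeding the value and truncates via integer shifts, so results round down. A few expected outputs as a quick check:

  bytes2str(999);          // "999B"
  bytes2str(1536);         // "1kB"  (1.5k truncates down to 1)
  bytes2str(3ULL << 30);   // "3GB"
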
index 3737d57300230404dd6b086861a2d10fd28c5941..384af2c442692357969c9d483b27b5f821b3b2f2 100644 (file)
@@ -1258,14 +1258,15 @@ public:
 
   int find_rule(int ruleset, int type, int size) const {
     if (!crush) return -1;
-    if (!have_uniform_rules) {
-      return crush_find_rule(crush, ruleset, type, size);
-    } else {
-      if (ruleset < (int)crush->max_rules &&
-         crush->rules[ruleset])
-       return ruleset;
-      return -1;
+    if (have_uniform_rules &&
+       ruleset < (int)crush->max_rules &&
+       crush->rules[ruleset] &&
+       crush->rules[ruleset]->mask.type == type &&
+       crush->rules[ruleset]->mask.min_size <= size &&
+       crush->rules[ruleset]->mask.max_size >= size) {
+      return ruleset;
     }
+    return crush_find_rule(crush, ruleset, type, size);
   }
 
   bool ruleset_exists(const int ruleset) const {
diff --git a/ceph/src/include/alloc_ptr.h b/ceph/src/include/alloc_ptr.h
new file mode 100644 (file)
index 0000000..258c583
--- /dev/null
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ALLOC_PTR_H
+#define CEPH_ALLOC_PTR_H
+
+#include <memory>
+
+template <class T>
+class alloc_ptr
+{
+public:
+    typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer;
+    typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type;
+
+    alloc_ptr() : ptr() {}
+
+    template<class U>
+      alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {}
+
+    alloc_ptr(alloc_ptr&& rhs) : ptr(std::move(rhs.ptr)) {}
+    alloc_ptr(const alloc_ptr& rhs) = delete;
+    alloc_ptr& operator=(alloc_ptr&& rhs) {
+        ptr = std::move(rhs.ptr);
+        return *this;
+    }
+    alloc_ptr& operator=(const alloc_ptr& rhs) = delete;
+
+    void swap(alloc_ptr& rhs) {
+        ptr.swap(rhs.ptr);
+    }
+    element_type* release() {
+        return ptr.release();
+    }
+    void reset(element_type *p = nullptr) {
+        ptr.reset(p);
+    }
+    element_type* get() const {
+        if (!ptr)
+          ptr.reset(new element_type);
+        return ptr.get();
+    }
+    element_type& operator*() const {
+        if (!ptr)
+          ptr.reset(new element_type);
+        return *ptr;
+    }
+    element_type* operator->() const {
+        if (!ptr)
+          ptr.reset(new element_type);
+        return ptr.get();
+    }
+    operator bool() const {
+        return !!ptr;
+    }
+
+    friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return std::less<element_type>()(*lhs, *rhs);
+    }
+    friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return std::less_equal<element_type>()(*lhs, *rhs);
+    }
+    friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return std::greater<element_type>()(*lhs, *rhs);
+    }
+    friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return std::greater_equal<element_type>()(*lhs, *rhs);
+    }
+    friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs == *rhs;
+    }
+    friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs != *rhs;
+    }
+private:
+    mutable std::unique_ptr<element_type> ptr;
+};
+
+#endif
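
alloc_ptr behaves like a unique_ptr whose const accessors (get, operator*, operator->) default-construct the element on first use (which is why the member is declared mutable), making it a fit for large members that most instances never populate. A usage sketch (Example here is purely illustrative):

  #include <map>
  #include <string>

  struct Example {
    // big map most instances never touch; nothing allocated until first access
    alloc_ptr<std::map<std::string, std::string>> xattrs;
  };

  Example e;
  bool allocated = static_cast<bool>(e.xattrs);   // false: still empty
  e.xattrs->emplace("user.key", "value");         // operator-> allocates lazily
  e.xattrs.reset();                               // free the storage again
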
index c72d4ab081a435a782c53d88e365c6c38b704d85..828fead2ae0b9a984dfc089c42c5f99ada293863 100644 (file)
 #include "assert.h"
 #include "encoding_btree.h"
 
-template<typename T>
+template<typename T,
+        typename Alloc = std::allocator<std::pair<const T, T>>>
 class btree_interval_set {
  public:
 
-  typedef btree::btree_map<T,T> map_t;
+  typedef btree::btree_map<T,T, std::less<T>, Alloc> map_t;
 
   class const_iterator;
 
@@ -164,28 +165,28 @@ class btree_interval_set {
     return m.size();
   }
 
-  typename btree_interval_set<T>::iterator begin() {
-    return typename btree_interval_set<T>::iterator(m.begin());
+  typename btree_interval_set<T,Alloc>::iterator begin() {
+    return typename btree_interval_set<T,Alloc>::iterator(m.begin());
   }
 
-  typename btree_interval_set<T>::iterator lower_bound(T start) {
-    return typename btree_interval_set<T>::iterator(find_inc_m(start));
+  typename btree_interval_set<T,Alloc>::iterator lower_bound(T start) {
+    return typename btree_interval_set<T,Alloc>::iterator(find_inc_m(start));
   }
 
-  typename btree_interval_set<T>::iterator end() {
-    return typename btree_interval_set<T>::iterator(m.end());
+  typename btree_interval_set<T,Alloc>::iterator end() {
+    return typename btree_interval_set<T,Alloc>::iterator(m.end());
   }
 
-  typename btree_interval_set<T>::const_iterator begin() const {
-    return typename btree_interval_set<T>::const_iterator(m.begin());
+  typename btree_interval_set<T,Alloc>::const_iterator begin() const {
+    return typename btree_interval_set<T,Alloc>::const_iterator(m.begin());
   }
 
-  typename btree_interval_set<T>::const_iterator lower_bound(T start) const {
-    return typename btree_interval_set<T>::const_iterator(find_inc(start));
+  typename btree_interval_set<T,Alloc>::const_iterator lower_bound(T start) const {
+    return typename btree_interval_set<T,Alloc>::const_iterator(find_inc(start));
   }
 
-  typename btree_interval_set<T>::const_iterator end() const {
-    return typename btree_interval_set<T>::const_iterator(m.end());
+  typename btree_interval_set<T,Alloc>::const_iterator end() const {
+    return typename btree_interval_set<T,Alloc>::const_iterator(m.end());
   }
 
   // helpers
@@ -555,11 +556,11 @@ private:
 };
 
 
-template<class T>
-inline std::ostream& operator<<(std::ostream& out, const btree_interval_set<T> &s) {
+template<class T, class A>
+inline std::ostream& operator<<(std::ostream& out, const btree_interval_set<T,A> &s) {
   out << "[";
   const char *prequel = "";
-  for (typename btree_interval_set<T>::const_iterator i = s.begin();
+  for (auto i = s.begin();
        i != s.end();
        ++i)
   {
@@ -570,13 +571,13 @@ inline std::ostream& operator<<(std::ostream& out, const btree_interval_set<T> &
   return out;
 }
 
-template<class T>
-inline void encode(const btree_interval_set<T>& s, bufferlist& bl)
+template<class T,typename A>
+inline void encode(const btree_interval_set<T,A>& s, bufferlist& bl)
 {
   s.encode(bl);
 }
-template<class T>
-inline void decode(btree_interval_set<T>& s, bufferlist::iterator& p)
+template<class T,typename A>
+inline void decode(btree_interval_set<T,A>& s, bufferlist::iterator& p)
 {
   s.decode(p);
 }
index 12ba8b3504a30e21b8e7f71b9dc50a2105352373..975baa064d3dd312464bbcdb8394445607f49901 100644 (file)
@@ -61,6 +61,9 @@ protected:
       --it;
       return *this;
     }
+    const std::pair<const Key,T>& operator*() {
+      return *it;
+    }
     const std::pair<const Key,T>* operator->() {
       return it.operator->();
     }
@@ -103,6 +106,9 @@ protected:
       --it;
       return *this;
     }
+    std::pair<const Key,T>& operator*() {
+      return *it;
+    }
     std::pair<const Key,T>* operator->() {
       return it.operator->();
     }
index d161c0b7dac61ba20b066ae3aed79594008213b3..61ed7409c7e4ff3107a2dc9bfadd7595fef71262 100644 (file)
@@ -15,6 +15,8 @@
 #ifndef CEPH_COUNTER_H
 #define CEPH_COUNTER_H
 
+#include <atomic>
+
 template <typename T>
 class Counter {
 public:
@@ -30,23 +32,23 @@ public:
   ~Counter() {
     _count()--;
   }
-  static unsigned long count() {
+  static uint64_t count() {
     return _count();
   }
-  static unsigned long increments() {
+  static uint64_t increments() {
     return _increments();
   }
-  static unsigned long decrements() {
+  static uint64_t decrements() {
     return increments()-count();
   }
 
 private:
-  static unsigned long &_count() {
-    static unsigned long c;
+  static std::atomic<uint64_t> &_count() {
+    static std::atomic<uint64_t> c;
     return c;
   }
-  static unsigned long &_increments() {
-    static unsigned long i;
+  static std::atomic<uint64_t> &_increments() {
+    static std::atomic<uint64_t> i;
     return i;
   }
 };
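
Counter<T> tallies live instances, total constructions, and (by subtraction) destructions for any class that inherits from it; making the statics std::atomic<uint64_t> removes the data race when instances are created and destroyed across threads. Typical use, sketched with an illustrative type:

  struct TrackedObj : Counter<TrackedObj> { /* ... */ };

  TrackedObj a, b;
  uint64_t live = TrackedObj::count();        // 2 instances alive
  uint64_t made = TrackedObj::increments();   // 2 ever constructed
  uint64_t gone = TrackedObj::decrements();   // made - live == 0
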
index f7ee5d118285cf17e03ca83e028dade11d5c1171..5015e19024b2d274d79725a2a1376146d65ecb12 100644 (file)
@@ -297,16 +297,16 @@ inline void encode(const boost::optional<T> &p, bufferlist &bl)
 #pragma GCC diagnostic ignored "-Wpragmas"
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 template<typename T>
 inline void decode(boost::optional<T> &p, bufferlist::iterator &bp)
 {
   __u8 present;
   ::decode(present, bp);
   if (present) {
-    T t;
-    p = t;
+    p = T{};
     decode(p.get(), bp);
+  } else {
+    p = boost::none;
   }
 }
 #pragma GCC diagnostic pop
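
The behavioral fix: decoding an "absent" marker into an optional that previously held a value now clears it instead of leaving the stale value engaged, and present values are decoded in place after value-initialization. A round-trip illustrating the reuse case:

  boost::optional<int> p = 7;            // stale value from an earlier decode
  bufferlist bl;
  ::encode(boost::optional<int>(), bl);  // encode an absent optional
  auto it = bl.begin();
  ::decode(p, it);                       // p is now boost::none, not 7
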
index b84d8187052d68eaa2a2c14706207ef3dc3883c5..e05eacae41efacf6383731da8aa09706ea9e2bc8 100644 (file)
@@ -463,17 +463,31 @@ class interval_set {
 
     typename std::map<T,T>::const_iterator pa = a.m.begin();
     typename std::map<T,T>::const_iterator pb = b.m.begin();
-    
+    typename decltype(m)::iterator mi = m.begin();
+
     while (pa != a.m.end() && pb != b.m.end()) {
       // passing?
       if (pa->first + pa->second <= pb->first) 
         { pa++;  continue; }
       if (pb->first + pb->second <= pa->first) 
         { pb++;  continue; }
+
+      if (*pa == *pb) {
+        do {
+          mi = m.insert(mi, *pa);
+          _size += pa->second;
+          ++pa;
+          ++pb;
+        } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb);
+        continue;
+      }
+
       T start = MAX(pa->first, pb->first);
       T en = MIN(pa->first+pa->second, pb->first+pb->second);
       assert(en > start);
-      insert(start, en-start);
+      typename decltype(m)::value_type i{start, en - start};
+      mi = m.insert(mi, i);
+      _size += i.second;
       if (pa->first+pa->second > pb->first+pb->second)
         pb++;
       else
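
The new fast path pays off when intersecting largely identical sets: equal runs are appended with the hinted std::map::insert rather than the general insert(start, en-start), avoiding a tree search per interval (the loop above carries the hint in mi). The hinted-insert idiom in isolation, using the end iterator as the simplest guaranteed hint for ascending keys:

  std::map<uint64_t, uint64_t> m;  // offset -> length, as in interval_set
  m.insert(m.end(), {10, 5});      // interval [10, 15)
  m.insert(m.end(), {20, 5});      // interval [20, 25)
  // with keys arriving in ascending order and end() as the hint,
  // each insert is amortized O(1) instead of O(log n)
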
index e88cedc60b2dc0cfafa988d4d971d8ddc30b9344..d04e94f19a471f842e78cc74a335faaf5024bdd6 100644 (file)
 #ifndef CEPH_LRU_H
 #define CEPH_LRU_H
 
+#include <math.h>
 #include <stdint.h>
 
 #include "common/config.h"
-
-
+#include "xlist.h"
 
 class LRUObject {
- private:
-  LRUObject *lru_next, *lru_prev;
-  bool lru_pinned;
-  class LRU *lru;
-  class LRUList *lru_list;
-
- public:
-  LRUObject() {
-    lru_next = lru_prev = NULL;
-    lru_list = 0;
-    lru_pinned = false;
-    lru = 0;
-  }
+public:
+  LRUObject() : lru(), lru_link(this), lru_pinned(false) { }
+  ~LRUObject();
 
   // pin/unpin item in cache
-  void lru_pin(); 
+  void lru_pin();
   void lru_unpin();
   bool lru_is_expireable() const { return !lru_pinned; }
 
   friend class LRU;
-  friend class LRUList;
+private:
+  class LRU *lru;
+  xlist<LRUObject *>::item lru_link;
+  bool lru_pinned;
 };
 
+class LRU {
+public:
+  LRU() : num_pinned(0), midpoint(0.6) {}
 
-class LRUList {
- private:
-  LRUObject *head, *tail;
-  uint32_t len;
+  uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); }
+  uint64_t lru_get_top() const { return top.size(); }
+  uint64_t lru_get_bot() const{ return bottom.size(); }
+  uint64_t lru_get_pintail() const { return pintail.size(); }
+  uint64_t lru_get_num_pinned() const { return num_pinned; }
 
- public:
-  LRUList() {
-    head = tail = 0;
-    len = 0;
-  }
+  void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); }
   
-  uint32_t  get_length() const { return len; }
-
-  LRUObject *get_head() {
-    return head;
-  }
-  LRUObject *get_tail() {
-    return tail;
-  }
-
-  void clear() {
-    while (len > 0) {
-      remove(get_head());
+  void lru_clear() {
+    while (!top.empty()) {
+      lru_remove(top.front());
     }
-  }
-
-  void insert_head(LRUObject *o) {
-    o->lru_next = head;
-    o->lru_prev = NULL;
-    if (head) {
-      head->lru_prev = o;
-    } else {
-      tail = o;
+    while (!bottom.empty()) {
+      lru_remove(bottom.front());
     }
-    head = o;
-    o->lru_list = this;
-    len++;
-  }
-  void insert_tail(LRUObject *o) {
-    o->lru_next = NULL;
-    o->lru_prev = tail;
-    if (tail) {
-      tail->lru_next = o;
-    } else {
-      head = o;
+    while (!pintail.empty()) {
+      lru_remove(pintail.front());
     }
-    tail = o;
-    o->lru_list = this;
-    len++;
-  }
-
-  void remove(LRUObject *o) {
-    assert(o->lru_list == this);
-    if (o->lru_next)
-      o->lru_next->lru_prev = o->lru_prev;
-    else
-      tail = o->lru_prev;
-    if (o->lru_prev)
-      o->lru_prev->lru_next = o->lru_next;
-    else
-      head = o->lru_next;
-    o->lru_next = o->lru_prev = NULL;
-    o->lru_list = 0;
-    assert(len>0);
-    len--;
-  }
-  
-};
-
-
-class LRU {
- protected:
-  LRUList lru_top, lru_bot, lru_pintail;
-  uint32_t lru_num, lru_num_pinned;
-  uint32_t lru_max;   // max items
-  double lru_midpoint;
-
-  friend class LRUObject;
-  //friend class MDCache; // hack
-  
- public:
-  LRU(int max = 0) {
-    lru_num = 0;
-    lru_num_pinned = 0;
-    lru_midpoint = .6;
-    lru_max = max;
-  }
-
-  uint32_t lru_get_size() const { return lru_num; }
-  uint32_t lru_get_top() const { return lru_top.get_length(); }
-  uint32_t lru_get_bot() const{ return lru_bot.get_length(); }
-  uint32_t lru_get_pintail() const { return lru_pintail.get_length(); }
-  uint32_t lru_get_max() const { return lru_max; }
-  uint32_t lru_get_num_pinned() const { return lru_num_pinned; }
-
-  void lru_set_max(uint32_t m) { lru_max = m; }
-  void lru_set_midpoint(float f) { lru_midpoint = f; }
-  
-  void lru_clear() {
-    lru_top.clear();
-    lru_bot.clear();
-    lru_pintail.clear();
-    lru_num = 0;
+    assert(num_pinned == 0);
   }
 
   // insert at top of lru
   void lru_insert_top(LRUObject *o) {
-    //assert(!o->lru_in_lru);
-    //o->lru_in_lru = true;
     assert(!o->lru);
     o->lru = this;
-    lru_top.insert_head( o );
-    lru_num++;
-    if (o->lru_pinned) lru_num_pinned++;
-    lru_adjust();
+    top.push_front(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
   }
 
   // insert at mid point in lru
   void lru_insert_mid(LRUObject *o) {
-    //assert(!o->lru_in_lru);
-    //o->lru_in_lru = true;
     assert(!o->lru);
     o->lru = this;
-    lru_bot.insert_head(o);
-    lru_num++;
-    if (o->lru_pinned) lru_num_pinned++;
+    bottom.push_front(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
   }
 
   // insert at bottom of lru
   void lru_insert_bot(LRUObject *o) {
     assert(!o->lru);
     o->lru = this;
-    lru_bot.insert_tail(o);
-    lru_num++;
-    if (o->lru_pinned) lru_num_pinned++;
+    bottom.push_back(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
   }
 
-  /*
-  // insert at bottom of lru
-  void lru_insert_pintail(LRUObject *o) {
-    assert(!o->lru);
-    o->lru = this;
-    
-    assert(o->lru_pinned);
-
-    lru_pintail.insert_head(o);
-    lru_num++;
-    lru_num_pinned += o->lru_pinned;
-  }
-  */
-
-  
-
-
-  // adjust top/bot balance, as necessary
-  void lru_adjust() {
-    if (!lru_max) return;
-
-    unsigned toplen = lru_top.get_length();
-    unsigned topwant = (unsigned)(lru_midpoint * ((double)lru_max - lru_num_pinned));
-    while (toplen > 0 && 
-           toplen > topwant) {
-      // remove from tail of top, stick at head of bot
-      // FIXME: this could be way more efficient by moving a whole chain of items.
-
-      LRUObject *o = lru_top.get_tail();
-      lru_top.remove(o);
-      lru_bot.insert_head(o);
-      toplen--;
-    }
-  }
-
-
   // remove an item
   LRUObject *lru_remove(LRUObject *o) {
-    // not in list
-    //assert(o->lru_in_lru);
-    //if (!o->lru_in_lru) return o;  // might have expired and been removed that way.
     if (!o->lru) return o;
-
-    assert((o->lru_list == &lru_pintail) ||
-           (o->lru_list == &lru_top) ||
-           (o->lru_list == &lru_bot));
-    o->lru_list->remove(o);
-
-    lru_num--;
-    if (o->lru_pinned) lru_num_pinned--;
-    o->lru = 0;
+    auto list = o->lru_link.get_list();
+    assert(list == &top || list == &bottom || list == &pintail);
+    o->lru_link.remove_myself();
+    if (o->lru_pinned) num_pinned--;
+    o->lru = nullptr;
+    adjust();
     return o;
   }
 
   // touch item -- move to head of lru
   bool lru_touch(LRUObject *o) {
-    lru_remove(o);
-    lru_insert_top(o);
+    if (!o->lru) {
+      lru_insert_top(o);
+    } else {
+      assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      assert(list == &top || list == &bottom || list == &pintail);
+      top.push_front(&o->lru_link);
+      adjust();
+    }
     return true;
   }
 
   // touch item -- move to midpoint (unless already higher)
   bool lru_midtouch(LRUObject *o) {
-    if (o->lru_list == &lru_top) return false;
-    
-    lru_remove(o);
-    lru_insert_mid(o);
+    if (!o->lru) {
+      lru_insert_mid(o);
+    } else {
+      assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      assert(list == &top || list == &bottom || list == &pintail);
+      if (list == &top) return false;
+      bottom.push_front(&o->lru_link);
+      adjust();
+    }
     return true;
   }
 
   // touch item -- move to bottom
   bool lru_bottouch(LRUObject *o) {
-    lru_remove(o);
-    lru_insert_bot(o);
+    if (!o->lru) {
+      lru_insert_bot(o);
+    } else {
+      assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      assert(list == &top || list == &bottom || list == &pintail);
+      bottom.push_back(&o->lru_link);
+      adjust();
+    }
     return true;
   }
 
   void lru_touch_entire_pintail() {
     // promote entire pintail to the top lru
-    while (lru_pintail.get_length() > 0) {
-      LRUObject *o = lru_pintail.get_head();
-      lru_pintail.remove(o);
-      lru_top.insert_tail(o);
+    while (pintail.size() > 0) {
+      top.push_back(&pintail.front()->lru_link);
+      adjust();
     }
   }
 
-
   // expire -- expire a single item
   LRUObject *lru_get_next_expire() {
-    LRUObject *p;
-    
     // look through tail of bot
-    while (lru_bot.get_length()) {
-      p = lru_bot.get_tail();
+    while (bottom.size()) {
+      LRUObject *p = bottom.back();
       if (!p->lru_pinned) return p;
 
       // move to pintail
-      lru_bot.remove(p);
-      lru_pintail.insert_head(p);
+      pintail.push_front(&p->lru_link);
+      adjust();
     }
 
     // ok, try head then
-    while (lru_top.get_length()) {
-      p = lru_top.get_tail();
+    while (top.size()) {
+      LRUObject *p = top.back();
       if (!p->lru_pinned) return p;
 
       // move to pintail
-      lru_top.remove(p);
-      lru_pintail.insert_head(p);
+      pintail.push_front(&p->lru_link);
+      adjust();
     }
     
     // no luck!
@@ -307,32 +188,55 @@ class LRU {
     return NULL;
   }
 
-
   void lru_status() {
-    //generic_dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << dendl;
+    //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl;
   }
 
+protected:
+  // adjust top/bot balance, as necessary
+  void adjust() {
+    uint64_t toplen = top.size();
+    uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned));
+    /* move items from below midpoint (bottom) to top: move midpoint forward */
+    for (uint64_t i = toplen; i < topwant; i++) {
+      top.push_back(&bottom.front()->lru_link);
+    }
+    /* or: move items from above midpoint (top) to bottom: move midpoint backwards */
+    for (uint64_t i = toplen; i > topwant; i--) {
+      bottom.push_front(&top.back()->lru_link);
+    }
+  }
+
+  uint64_t num_pinned;
+  double midpoint;
+
+  friend class LRUObject;
+private:
+  typedef xlist<LRUObject *> LRUList;
+  LRUList top, bottom, pintail;
 };
 
+inline LRUObject::~LRUObject() {
+  if (lru) {
+    lru->lru_remove(this);
+  }
+}
 
 inline void LRUObject::lru_pin() {
   if (lru && !lru_pinned) {
-    lru->lru_num_pinned++;
-    lru->lru_adjust();
+    lru->num_pinned++;
   }
   lru_pinned = true;
 }
 
 inline void LRUObject::lru_unpin() {
   if (lru && lru_pinned) {
-    lru->lru_num_pinned--;
+    lru->num_pinned--;
 
     // move from pintail -> bot
-    if (lru_list == &lru->lru_pintail) {
-      lru->lru_pintail.remove(this);
-      lru->lru_bot.insert_tail(this);
+    if (lru_link.get_list() == &lru->pintail) {
+      lru->lru_bottouch(this);
     }
-    lru->lru_adjust();
   }
   lru_pinned = false;
 }
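
The rewrite replaces the hand-rolled LRUList with intrusive xlist links and drops the lru_max cap entirely: the hot/cold split is now maintained purely as a fraction (midpoint) of the current unpinned size, rebalanced by adjust() after each mutation, and the new LRUObject destructor unlinks itself so objects can simply be deleted. External usage is essentially unchanged; a sketch with an illustrative Dentry type:

  struct Dentry : public LRUObject { /* ... */ };

  LRU lru;
  lru.lru_set_midpoint(0.7);       // clamped to [0, 1]
  Dentry *d = new Dentry;
  lru.lru_insert_mid(d);           // enters the cold (bottom) segment
  lru.lru_touch(d);                // promote to the head of the hot segment
  if (LRUObject *victim = lru.lru_get_next_expire()) {
    lru.lru_remove(victim);               // unlink; caller owns the object
    delete static_cast<Dentry*>(victim);  // safe if only Dentry objects were inserted
  }
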
index b0f29baa00bcc84367f036823d6ee4eb78ad0451..2cd61ad2afc3453d40fd875c78e89c93ea986d93 100644 (file)
@@ -100,6 +100,12 @@ BlueStore::Onode, we need to do
 (This is just because we need to name some static variables and we
 can't use :: in a variable name.)
 
+XXX Note: the new operator hard-codes the allocation size to the size of the
+object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot
+incorporate mempools into a base class without also defining a helper/factory
+for each derived class, since the base class is usually smaller than the
+derived class.
+
 In order to use the STL containers, simply use the namespaced variant
 of the container type.  For example,
 
@@ -154,6 +160,7 @@ namespace mempool {
   f(osdmap)                          \
   f(osdmap_mapping)                  \
   f(pgmap)                           \
+  f(mds_co)                          \
   f(unittest_1)                              \
   f(unittest_2)
 
index 0a242b72b1520392ec749ddd634ef6a358e758b8..7fd874ff11717874436e1813d9f1947cb50d120d 100644 (file)
@@ -156,13 +156,15 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_REQUIRE_KRAKEN   (1<<17) /* require kraken for booting osds */
 #define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
 #define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+#define CEPH_OSDMAP_PURGED_SNAPDIRS  (1<<20) /* osds have converted snapsets */
 
 /* these are hidden in 'ceph status' view */
 #define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL|       \
                                      CEPH_OSDMAP_REQUIRE_KRAKEN |      \
                                      CEPH_OSDMAP_REQUIRE_LUMINOUS |    \
                                      CEPH_OSDMAP_RECOVERY_DELETES |    \
-                                     CEPH_OSDMAP_SORTBITWISE)
+                                     CEPH_OSDMAP_SORTBITWISE |         \
+                                     CEPH_OSDMAP_PURGED_SNAPDIRS)
 #define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL |  \
                                          CEPH_OSDMAP_REQUIRE_KRAKEN |  \
                                          CEPH_OSDMAP_REQUIRE_LUMINOUS)
index b152c1130df9455684fcd4a82285bbfc705fb873..3de4c3d3ec1e3ed3e58616877af2325ccfd8cc93 100644 (file)
@@ -19,6 +19,8 @@
 
 int64_t unit_to_bytesize(string val, ostream *pss);
 
+std::string bytes2str(uint64_t count);
+
 struct ceph_data_stats
 {
   uint64_t byte_total;
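
The new bytes2str() helper is only declared here; judging from its callers later in this diff (Beacon::notify_health(), MDCache::trim()), it presumably renders a byte count in human-readable form (something like "1024MB"; the exact format is an assumption, as the implementation is not part of this section).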
index 9c36e172c98158fdf4d351ad64c906e40f75306e..3b3cd9fcaac7f8c2a0d336b5fa2dcfe576794907 100644 (file)
@@ -63,7 +63,7 @@ public:
 
 private:
   item *_front, *_back;
-  int _size;
+  size_t _size;
 
 public:
   xlist(const xlist& other) {
@@ -79,7 +79,7 @@ public:
     assert(_back == 0);
   }
 
-  int size() const {
+  size_t size() const {
     assert((bool)_front == (bool)_size);
     return _size;
   }
index c97b11505d7744e7c8ed6d5e1395d6b1edd5f382..3db79a7ed30c24a2faeefbd9fb4dc60f3081880b 100644 (file)
@@ -60,7 +60,7 @@ public:
     : ThreadPool(cct, "librbd::thread_pool", "tp_librbd", 1,
                  "rbd_op_threads"),
       op_work_queue(new ContextWQ("librbd::op_work_queue",
-                                  cct->_conf->rbd_op_thread_timeout,
+                                  cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
                                   this)) {
     start();
   }
@@ -211,10 +211,11 @@ struct C_InvalidateCache : public Context {
     ThreadPool *thread_pool;
     get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
     io_work_queue = new io::ImageRequestWQ<>(
-      this, "librbd::io_work_queue", cct->_conf->rbd_op_thread_timeout,
+      this, "librbd::io_work_queue",
+      cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
       thread_pool);
 
-    if (cct->_conf->rbd_auto_exclusive_lock_until_manual_request) {
+    if (cct->_conf->get_val<bool>("rbd_auto_exclusive_lock_until_manual_request")) {
       exclusive_lock_policy = new exclusive_lock::AutomaticPolicy(this);
     } else {
       exclusive_lock_policy = new exclusive_lock::StandardPolicy(this);
@@ -1013,50 +1014,51 @@ struct C_InvalidateCache : public Context {
       }
     }
 
-#define ASSIGN_OPTION(config)                                                  \
+#define ASSIGN_OPTION(config, type)                                            \
     do {                                                                       \
       string key = "rbd_";                                                    \
       key = key + #config;                                                    \
       if (configs[key])                                                        \
-        config = local_config_t.rbd_##config;                                  \
+        config = local_config_t.get_val<type>("rbd_"#config);                  \
       else                                                                     \
-        config = cct->_conf->rbd_##config;                                     \
+        config = cct->_conf->get_val<type>("rbd_"#config);                     \
     } while (0);
 
-    ASSIGN_OPTION(non_blocking_aio);
-    ASSIGN_OPTION(cache);
-    ASSIGN_OPTION(cache_writethrough_until_flush);
-    ASSIGN_OPTION(cache_size);
-    ASSIGN_OPTION(cache_max_dirty);
-    ASSIGN_OPTION(cache_target_dirty);
-    ASSIGN_OPTION(cache_max_dirty_age);
-    ASSIGN_OPTION(cache_max_dirty_object);
-    ASSIGN_OPTION(cache_block_writes_upfront);
-    ASSIGN_OPTION(concurrent_management_ops);
-    ASSIGN_OPTION(balance_snap_reads);
-    ASSIGN_OPTION(localize_snap_reads);
-    ASSIGN_OPTION(balance_parent_reads);
-    ASSIGN_OPTION(localize_parent_reads);
-    ASSIGN_OPTION(readahead_trigger_requests);
-    ASSIGN_OPTION(readahead_max_bytes);
-    ASSIGN_OPTION(readahead_disable_after_bytes);
-    ASSIGN_OPTION(clone_copy_on_read);
-    ASSIGN_OPTION(blacklist_on_break_lock);
-    ASSIGN_OPTION(blacklist_expire_seconds);
-    ASSIGN_OPTION(request_timed_out_seconds);
-    ASSIGN_OPTION(enable_alloc_hint);
-    ASSIGN_OPTION(journal_order);
-    ASSIGN_OPTION(journal_splay_width);
-    ASSIGN_OPTION(journal_commit_age);
-    ASSIGN_OPTION(journal_object_flush_interval);
-    ASSIGN_OPTION(journal_object_flush_bytes);
-    ASSIGN_OPTION(journal_object_flush_age);
-    ASSIGN_OPTION(journal_pool);
-    ASSIGN_OPTION(journal_max_payload_bytes);
-    ASSIGN_OPTION(journal_max_concurrent_object_sets);
-    ASSIGN_OPTION(mirroring_resync_after_disconnect);
-    ASSIGN_OPTION(mirroring_replay_delay);
-    ASSIGN_OPTION(skip_partial_discard);
+    ASSIGN_OPTION(non_blocking_aio, bool);
+    ASSIGN_OPTION(cache, bool);
+    ASSIGN_OPTION(cache_writethrough_until_flush, bool);
+    ASSIGN_OPTION(cache_size, int64_t);
+    ASSIGN_OPTION(cache_max_dirty, int64_t);
+    ASSIGN_OPTION(cache_target_dirty, int64_t);
+    ASSIGN_OPTION(cache_max_dirty_age, double);
+    ASSIGN_OPTION(cache_max_dirty_object, int64_t);
+    ASSIGN_OPTION(cache_block_writes_upfront, bool);
+    ASSIGN_OPTION(concurrent_management_ops, int64_t);
+    ASSIGN_OPTION(balance_snap_reads, bool);
+    ASSIGN_OPTION(localize_snap_reads, bool);
+    ASSIGN_OPTION(balance_parent_reads, bool);
+    ASSIGN_OPTION(localize_parent_reads, bool);
+    ASSIGN_OPTION(readahead_trigger_requests, int64_t);
+    ASSIGN_OPTION(readahead_max_bytes, int64_t);
+    ASSIGN_OPTION(readahead_disable_after_bytes, int64_t);
+    ASSIGN_OPTION(clone_copy_on_read, bool);
+    ASSIGN_OPTION(blacklist_on_break_lock, bool);
+    ASSIGN_OPTION(blacklist_expire_seconds, int64_t);
+    ASSIGN_OPTION(request_timed_out_seconds, int64_t);
+    ASSIGN_OPTION(enable_alloc_hint, bool);
+    ASSIGN_OPTION(journal_order, uint64_t);
+    ASSIGN_OPTION(journal_splay_width, uint64_t);
+    ASSIGN_OPTION(journal_commit_age, double);
+    ASSIGN_OPTION(journal_object_flush_interval, int64_t);
+    ASSIGN_OPTION(journal_object_flush_bytes, int64_t);
+    ASSIGN_OPTION(journal_object_flush_age, double);
+    ASSIGN_OPTION(journal_pool, std::string);
+    ASSIGN_OPTION(journal_max_payload_bytes, uint64_t);
+    ASSIGN_OPTION(journal_max_concurrent_object_sets, int64_t);
+    ASSIGN_OPTION(mirroring_resync_after_disconnect, bool);
+    ASSIGN_OPTION(mirroring_replay_delay, int64_t);
+    ASSIGN_OPTION(skip_partial_discard, bool);
+    ASSIGN_OPTION(blkin_trace_all, bool);
   }
 
   ExclusiveLock<ImageCtx> *ImageCtx::create_exclusive_lock() {
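
For reference, one invocation of the reworked macro above, say ASSIGN_OPTION(cache, bool), expands to roughly the following (a preprocessor sketch; "rbd_" #config pastes the string literals together):

    do {
      string key = "rbd_";
      key = key + "cache";
      if (configs[key])
        cache = local_config_t.get_val<bool>("rbd_cache");
      else
        cache = cct->_conf->get_val<bool>("rbd_cache");
    } while (0);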
index 2bda521f154230e997d1429fb76da19e04fe7f74..6fcad3c27e9ce69a9dfe92a2cf30713245bd1e27 100644 (file)
@@ -196,6 +196,7 @@ namespace librbd {
     bool mirroring_resync_after_disconnect;
     int mirroring_replay_delay;
     bool skip_partial_discard;
+    bool blkin_trace_all;
 
     LibrbdAdminSocketHook *asok_hook;
 
index ba4151a0005d6b31da26d90c8d0647a004da54cd..5c2d92a4eedc75b124e2f8b3e7b4df69377a2279 100644 (file)
@@ -216,7 +216,7 @@ private:
     m_cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
       thread_pool_singleton, "librbd::ImageUpdateWatchers::thread_pool");
     m_work_queue = new ContextWQ("librbd::ImageUpdateWatchers::op_work_queue",
-                                m_cct->_conf->rbd_op_thread_timeout,
+                                m_cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
                                 thread_pool_singleton);
   }
 
index ca612a9ded86a4278b439b5175d2f4ff8d7bbc42..487eabaa69c18f5c7b374362982ce071b40de1b2 100644 (file)
@@ -425,7 +425,8 @@ void ImageWatcher<I>::handle_request_lock(int r) {
     schedule_request_lock(true);
   } else {
     // lock owner acked -- but resend if we don't see them release the lock
-    int retry_timeout = m_image_ctx.cct->_conf->client_notify_timeout;
+    int retry_timeout = m_image_ctx.cct->_conf->template get_val<int64_t>(
+      "client_notify_timeout");
     ldout(m_image_ctx.cct, 15) << this << " will retry in " << retry_timeout
                                << " seconds" << dendl;
     schedule_request_lock(true, retry_timeout);
index aa73cdeddac73c45ffd15307a5c5430694aca329..9fe57cd3f7b87874f619fdf259023d6c37ba22d7 100644 (file)
@@ -337,7 +337,7 @@ Journal<I>::Journal(I &image_ctx)
   cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
     thread_pool_singleton, "librbd::journal::thread_pool");
   m_work_queue = new ContextWQ("librbd::journal::work_queue",
-                               cct->_conf->rbd_op_thread_timeout,
+                               cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
                                thread_pool_singleton);
   ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
 }
index f7376db9a85e6a21105918a950f4d4e68486f87c..9f43ebc6a2e607214f3a37bafc2e477285f2972c 100644 (file)
@@ -164,29 +164,29 @@ CreateRequest<I>::CreateRequest(IoCtx &ioctx, const std::string &image_name,
 
   if (image_options.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &m_stripe_unit) != 0 ||
       m_stripe_unit == 0) {
-    m_stripe_unit = m_cct->_conf->rbd_default_stripe_unit;
+    m_stripe_unit = m_cct->_conf->get_val<uint64_t>("rbd_default_stripe_unit");
   }
   if (image_options.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &m_stripe_count) != 0 ||
       m_stripe_count == 0) {
-    m_stripe_count = m_cct->_conf->rbd_default_stripe_count;
+    m_stripe_count = m_cct->_conf->get_val<uint64_t>("rbd_default_stripe_count");
   }
   if (get_image_option(image_options, RBD_IMAGE_OPTION_ORDER, &m_order) != 0 ||
       m_order == 0) {
-    m_order = m_cct->_conf->rbd_default_order;
+    m_order = m_cct->_conf->get_val<int64_t>("rbd_default_order");
   }
   if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_ORDER,
       &m_journal_order) != 0) {
-    m_journal_order = m_cct->_conf->rbd_journal_order;
+    m_journal_order = m_cct->_conf->get_val<uint64_t>("rbd_journal_order");
   }
   if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH,
                        &m_journal_splay_width) != 0) {
-    m_journal_splay_width = m_cct->_conf->rbd_journal_splay_width;
+    m_journal_splay_width = m_cct->_conf->get_val<uint64_t>("rbd_journal_splay_width");
   }
   if (image_options.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &m_journal_pool) != 0) {
-    m_journal_pool = m_cct->_conf->rbd_journal_pool;
+    m_journal_pool = m_cct->_conf->get_val<std::string>("rbd_journal_pool");
   }
   if (image_options.get(RBD_IMAGE_OPTION_DATA_POOL, &m_data_pool) != 0) {
-    m_data_pool = m_cct->_conf->rbd_default_data_pool;
+    m_data_pool = m_cct->_conf->get_val<std::string>("rbd_default_data_pool");
   }
 
   m_layout.object_size = 1ull << m_order;
@@ -268,7 +268,7 @@ void CreateRequest<I>::send() {
 
 template<typename I>
 void CreateRequest<I>::validate_pool() {
-  if (!m_cct->_conf->rbd_validate_pool) {
+  if (!m_cct->_conf->get_val<bool>("rbd_validate_pool")) {
     create_id_object();
     return;
   }
index 70c706c6d4beec42e45430c0e2f894f157ad6363..acbe336629ccd4c52aee6ed94e4e06a533865b30 100644 (file)
@@ -75,7 +75,7 @@ namespace librbd {
 namespace {
 
 int validate_pool(IoCtx &io_ctx, CephContext *cct) {
-  if (!cct->_conf->rbd_validate_pool) {
+  if (!cct->_conf->get_val<bool>("rbd_validate_pool")) {
     return 0;
   }
 
@@ -833,7 +833,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
 
     uint64_t format;
     if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
-      format = cct->_conf->rbd_default_format;
+      format = cct->_conf->get_val<int64_t>("rbd_default_format");
     bool old_format = format == 1;
 
     // make sure it doesn't already exist, in either format
@@ -850,7 +850,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
 
     uint64_t order = 0;
     if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
-      order = cct->_conf->rbd_default_order;
+      order = cct->_conf->get_val<int64_t>("rbd_default_order");
     }
     r = image::CreateRequest<>::validate_order(cct, order);
     if (r < 0) {
@@ -970,8 +970,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
     ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
     int r = ictx->state->open(false);
     if (r < 0) {
-      lderr(ictx->cct) << "error opening source image: " << cpp_strerror(r)
-                      << dendl;
+      lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
       return r;
     }
     BOOST_SCOPE_EXIT((ictx)) {
@@ -1906,7 +1905,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
     }
 
     ZTracer::Trace trace;
-    if (cct->_conf->rbd_blkin_trace_all) {
+    if (src->blkin_trace_all) {
       trace.init("copy", &src->trace_endpoint);
     }
 
@@ -2147,7 +2146,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
     uint64_t left = mylen;
 
     ZTracer::Trace trace;
-    if (ictx->cct->_conf->rbd_blkin_trace_all) {
+    if (ictx->blkin_trace_all) {
       trace.init("read_iterate", &ictx->trace_endpoint);
     }
 
index 4cc8e0148dd19ab1d57825abb0df9c000b602c7b..81800cc4ae73cf904fb245ce98c91b83a1f210e9 100644 (file)
@@ -200,7 +200,7 @@ void ImageRequestWQ<I>::aio_read(AioCompletion *c, uint64_t off, uint64_t len,
                                 bool native_async) {
   CephContext *cct = m_image_ctx.cct;
   ZTracer::Trace trace;
-  if (cct->_conf->rbd_blkin_trace_all) {
+  if (m_image_ctx.blkin_trace_all) {
     trace.init("wq: read", &m_image_ctx.trace_endpoint);
     trace.event("start");
   }
@@ -241,7 +241,7 @@ void ImageRequestWQ<I>::aio_write(AioCompletion *c, uint64_t off, uint64_t len,
                                  bool native_async) {
   CephContext *cct = m_image_ctx.cct;
   ZTracer::Trace trace;
-  if (cct->_conf->rbd_blkin_trace_all) {
+  if (m_image_ctx.blkin_trace_all) {
     trace.init("wq: write", &m_image_ctx.trace_endpoint);
     trace.event("init");
   }
@@ -278,7 +278,7 @@ void ImageRequestWQ<I>::aio_discard(AioCompletion *c, uint64_t off,
                                    bool native_async) {
   CephContext *cct = m_image_ctx.cct;
   ZTracer::Trace trace;
-  if (cct->_conf->rbd_blkin_trace_all) {
+  if (m_image_ctx.blkin_trace_all) {
     trace.init("wq: discard", &m_image_ctx.trace_endpoint);
     trace.event("init");
   }
@@ -313,7 +313,7 @@ template <typename I>
 void ImageRequestWQ<I>::aio_flush(AioCompletion *c, bool native_async) {
   CephContext *cct = m_image_ctx.cct;
   ZTracer::Trace trace;
-  if (cct->_conf->rbd_blkin_trace_all) {
+  if (m_image_ctx.blkin_trace_all) {
     trace.init("wq: flush", &m_image_ctx.trace_endpoint);
     trace.event("init");
   }
@@ -346,7 +346,7 @@ void ImageRequestWQ<I>::aio_writesame(AioCompletion *c, uint64_t off,
                                      int op_flags, bool native_async) {
   CephContext *cct = m_image_ctx.cct;
   ZTracer::Trace trace;
-  if (cct->_conf->rbd_blkin_trace_all) {
+  if (m_image_ctx.blkin_trace_all) {
     trace.init("wq: writesame", &m_image_ctx.trace_endpoint);
     trace.event("init");
   }
@@ -387,7 +387,7 @@ void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c,
                                               int op_flags, bool native_async) {
   CephContext *cct = m_image_ctx.cct;
   ZTracer::Trace trace;
-  if (cct->_conf->rbd_blkin_trace_all) {
+  if (m_image_ctx.blkin_trace_all) {
     trace.init("wq: compare_and_write", &m_image_ctx.trace_endpoint);
     trace.event("init");
   }
index 35119e0f5cd2dc3e91ef5eed5cda288f28154c69..1e4b854bddd500f8e0ff7d77427ee42d3c2aab0e 100644 (file)
@@ -256,7 +256,7 @@ void ObjectMapIterateRequest<I>::send_verify_objects() {
   AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
     this, m_image_ctx, context_factory, this->create_callback_context(),
     &m_prog_ctx, 0, num_objects);
-  throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
+  throttle->start_ops(m_image_ctx.concurrent_management_ops);
 }
 
 template <typename I>
index 602240bfe12c41db52128f5e56c70b9d47a21125..10ee8c242a0ba3ab0137d99c9e9df2f40ee34de1 100644 (file)
@@ -15,6 +15,7 @@
 
 #include "common/dout.h"
 #include "common/HeartbeatMap.h"
+
 #include "include/stringify.h"
 #include "include/util.h"
 
@@ -483,11 +484,10 @@ void Beacon::notify_health(MDSRank const *mds)
   }
 
   // Report if we have significantly exceeded our cache size limit
-  if (mds->mdcache->get_cache_size() >
-        g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+  if (mds->mdcache->cache_overfull()) {
     std::ostringstream oss;
-    oss << "Too many inodes in cache (" << mds->mdcache->get_cache_size()
-        << "/" << g_conf->mds_cache_size << "), "
+    oss << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
+        << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
         << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
         << mds->mdcache->get_num_strays() << " stray files";
 
index 44dc6d52a522b32405c98d896b900b98fc260ff5..591e8d8ff6e3caf8e7c7e26c68acb1bc54f9ddd8 100644 (file)
@@ -620,3 +620,5 @@ std::string CDentry::linkage_t::get_remote_d_type_string() const
     default: ceph_abort(); return "";
   }
 }
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDentry, co_dentry, mds_co);
index 01e3d3c4992041daa9ee9f8e753e3e97263a7c0a..e9416104ad15f1fc81723729cfa940f922414231 100644 (file)
@@ -49,6 +49,7 @@ bool operator<(const CDentry& l, const CDentry& r);
 // dentry
 class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry> {
 public:
+  MEMPOOL_CLASS_HELPERS();
   friend class CDir;
 
   struct linkage_t {
@@ -266,7 +267,7 @@ public:
     ::encode(version, bl);
     ::encode(projected_version, bl);
     ::encode(lock, bl);
-    ::encode(replica_map, bl);
+    ::encode(get_replicas(), bl);
     get(PIN_TEMPEXPORTING);
   }
   void finish_export() {
@@ -288,14 +289,14 @@ public:
     ::decode(version, blp);
     ::decode(projected_version, blp);
     ::decode(lock, blp);
-    ::decode(replica_map, blp);
+    ::decode(get_replicas(), blp);
 
     // twiddle
     state &= MASK_STATE_IMPORT_KEPT;
     state_set(CDentry::STATE_AUTH);
     if (nstate & STATE_DIRTY)
       _mark_dirty(ls);
-    if (!replica_map.empty())
+    if (is_replicated())
       get(PIN_REPLICATED);
     replica_nonce = 0;
   }
index c190cca175b479f89f74d8dc55c118adff491351..3c4a5428193846935902d3949f1bd943cf916665 100644 (file)
@@ -932,7 +932,7 @@ void CDir::finish_old_fragment(list<MDSInternalContextBase*>& waiters, bool repl
 
 void CDir::init_fragment_pins()
 {
-  if (!replica_map.empty())
+  if (is_replicated())
     get(PIN_REPLICATED);
   if (state_test(STATE_DIRTY))
     get(PIN_DIRTY);
@@ -976,7 +976,7 @@ void CDir::split(int bits, list<CDir*>& subs, list<MDSInternalContextBase*>& wai
   for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
     CDir *f = new CDir(inode, *p, cache, is_auth());
     f->state_set(state & (MASK_STATE_FRAGMENT_KEPT | STATE_COMPLETE));
-    f->replica_map = replica_map;
+    f->get_replicas() = get_replicas();
     f->dir_auth = dir_auth;
     f->init_fragment_pins();
     f->set_version(get_version());
@@ -1085,12 +1085,10 @@ void CDir::merge(list<CDir*>& subs, list<MDSInternalContextBase*>& waiters, bool
       steal_dentry(dir->items.begin()->second);
     
     // merge replica map
-    for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
-        p != dir->replicas_end();
-        ++p) {
-      unsigned cur = replica_map[p->first];
-      if (p->second > cur)
-       replica_map[p->first] = p->second;
+    for (const auto &p : dir->get_replicas()) {
+      unsigned cur = get_replicas()[p.first];
+      if (p.second > cur)
+       get_replicas()[p.first] = p.second;
     }
 
     // merge version
@@ -2432,7 +2430,7 @@ void CDir::encode_export(bufferlist& bl)
   ::encode(pop_auth_subtree, bl);
 
   ::encode(dir_rep_by, bl);  
-  ::encode(replica_map, bl);
+  ::encode(get_replicas(), bl);
 
   get(PIN_TEMPEXPORTING);
 }
@@ -2473,8 +2471,8 @@ void CDir::decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls)
   pop_auth_subtree_nested.add(now, cache->decayrate, pop_auth_subtree);
 
   ::decode(dir_rep_by, blp);
-  ::decode(replica_map, blp);
-  if (!replica_map.empty()) get(PIN_REPLICATED);
+  ::decode(get_replicas(), blp);
+  if (is_replicated()) get(PIN_REPLICATED);
 
   replica_nonce = 0;  // no longer defined
 
@@ -3295,3 +3293,4 @@ bool CDir::should_split_fast() const
   return effective_size > fast_limit;
 }
 
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
index 5b0079a74de4dda79950314ecd01afba8556ac6f..45b2998b3d24d00ec67268b9d00b51978094f1d6 100644 (file)
@@ -46,6 +46,7 @@ class CDir : public MDSCacheObject, public Counter<CDir> {
   friend ostream& operator<<(ostream& out, const class CDir& dir);
 
 public:
+  MEMPOOL_CLASS_HELPERS();
   // -- pins --
   static const int PIN_DNWAITER =     1;
   static const int PIN_INOWAITER =    2;
index 149660b65b1ac9ed84a46007beeb3943975884de..633e6477553dda671318f2991d28dc1b2e4a621c 100644 (file)
@@ -3622,7 +3622,7 @@ void CInode::encode_export(bufferlist& bl)
 
   ::encode(pop, bl);
 
-  ::encode(replica_map, bl);
+  ::encode(get_replicas(), bl);
 
   // include scatterlock info for any bounding CDirs
   bufferlist bounding;
@@ -3687,8 +3687,8 @@ void CInode::decode_import(bufferlist::iterator& p,
 
   ::decode(pop, ceph_clock_now(), p);
 
-  ::decode(replica_map, p);
-  if (!replica_map.empty())
+  ::decode(get_replicas(), p);
+  if (is_replicated())
     get(PIN_REPLICATED);
   replica_nonce = 0;
 
@@ -4515,3 +4515,5 @@ bool CInode::is_exportable(mds_rank_t dest) const
     return true;
   }
 }
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);
index 030c86e62a14539430d8771c265c1139d655ad43..8d868d0786a76ce2cd1f017b5aa9b3c54a9f4894 100644 (file)
@@ -130,6 +130,7 @@ WRITE_CLASS_ENCODER_FEATURES(InodeStore)
 // cached inode wrapper
 class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> {
  public:
+  MEMPOOL_CLASS_HELPERS();
   // -- pins --
   static const int PIN_DIRFRAG =         -1; 
   static const int PIN_CAPS =             2;  // client caps
index c605dfcd15716a1dc1585750f110ff58c60f51e6..95e78b23425734faa591dd765eb74147f3c60e71 100644 (file)
@@ -355,7 +355,7 @@ void FSMap::get_health_checks(health_check_map_t *checks) const
     if (!stuck_failed.empty()) {
       health_check_t& fscheck = checks->get_or_add(
         "FS_WITH_FAILED_MDS", HEALTH_WARN,
-        "%num% filesystem%plurals% %isorare% have a failed mds daemon");
+        "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
       ostringstream ss;
       ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
          << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
index 5656e3a5e8449d15976c6e309eea0046ed4a2c9e..a0ccf96016be557ca2ed2661894243659344e9da 100644 (file)
@@ -130,28 +130,24 @@ void Locker::tick()
 
 void Locker::send_lock_message(SimpleLock *lock, int msg)
 {
-  for (compact_map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
-       it != lock->get_parent()->replicas_end();
-       ++it) {
+  for (const auto &it : lock->get_parent()->get_replicas()) {
     if (mds->is_cluster_degraded() &&
-       mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) 
+       mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN)
       continue;
     MLock *m = new MLock(lock, msg, mds->get_nodeid());
-    mds->send_message_mds(m, it->first);
+    mds->send_message_mds(m, it.first);
   }
 }
 
 void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data)
 {
-  for (compact_map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
-       it != lock->get_parent()->replicas_end();
-       ++it) {
+  for (const auto &it : lock->get_parent()->get_replicas()) {
     if (mds->is_cluster_degraded() &&
-       mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) 
+       mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN)
       continue;
     MLock *m = new MLock(lock, msg, mds->get_nodeid());
     m->set_data(data);
-    mds->send_message_mds(m, it->first);
+    mds->send_message_mds(m, it.first);
   }
 }
 
index e6600dbd4ea77e4cd6b10d2fb93842c24299f505..a2510f989819d5a66f07eb7822f4dd0c58faf665 100644 (file)
 
 #include "include/ceph_fs.h"
 #include "include/filepath.h"
+#include "include/util.h"
 
 #include "msg/Message.h"
 #include "msg/Messenger.h"
 
+#include "common/MemoryModel.h"
 #include "common/errno.h"
-#include "common/safe_io.h"
 #include "common/perf_counters.h"
-#include "common/MemoryModel.h"
+#include "common/safe_io.h"
+
 #include "osdc/Journaler.h"
 #include "osdc/Filer.h"
 
@@ -202,10 +204,8 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
   cap_imports_num_opening = 0;
 
   opening_root = open = false;
-  lru.lru_set_max(g_conf->mds_cache_size);
-  lru.lru_set_midpoint(g_conf->mds_cache_mid);
+  lru.lru_set_midpoint(cache_mid());
 
-  bottom_lru.lru_set_max(0);
   bottom_lru.lru_set_midpoint(0);
 
   decayrate.set_halflife(g_conf->mds_decay_halflife);
@@ -224,7 +224,7 @@ MDCache::~MDCache()
 
 void MDCache::log_stat()
 {
-  mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
+  mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
   mds->logger->set(l_mds_inodes, lru.lru_get_size());
   mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
   mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
@@ -272,8 +272,7 @@ void MDCache::add_inode(CInode *in)
       base_inodes.insert(in);
   }
 
-  if (CInode::count() >
-        g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+  if (cache_toofull()) {
     exceeded_size_limit = true;
   }
 }
@@ -941,9 +940,9 @@ void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
 
     if (to_eval && dir->get_inode()->is_auth())
       to_eval->insert(dir->get_inode());
-  } 
 
-  show_subtrees(15);
+    show_subtrees(15);
+  }
 }
 
 void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
@@ -2034,12 +2033,10 @@ update:
     msg->quota = i->quota;
     mds->send_message_client_counted(msg, session->connection);
   }
-  for (compact_map<mds_rank_t, unsigned>::iterator it = in->replicas_begin();
-       it != in->replicas_end();
-       ++it) {
+  for (const auto &it : in->get_replicas()) {
     MGatherCaps *msg = new MGatherCaps;
     msg->ino = in->ino();
-    mds->send_message_mds(msg, it->first);
+    mds->send_message_mds(msg, it.first);
   }
 }
 
@@ -4563,9 +4560,10 @@ void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *a
         p != dfs.end();
         ++p) {
       CDir *dir = *p;
+      if (!dir->is_auth())
+       continue;
       
-      if (dir->is_auth() &&
-         dir->is_replica(from) &&
+      if (dir->is_replica(from) &&
          (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
        dir->remove_replica(from);
        dout(10) << " rem " << *dir << dendl;
@@ -5973,13 +5971,11 @@ void MDCache::rejoin_send_acks()
       dq.pop_front();
       
       // dir
-      for (compact_map<mds_rank_t,unsigned>::iterator r = dir->replicas_begin();
-          r != dir->replicas_end();
-          ++r) {
-       auto it = acks.find(r->first);
+      for (auto &r : dir->get_replicas()) {
+       auto it = acks.find(r.first);
        if (it == acks.end())
          continue;
-       it->second->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
+       it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
        it->second->add_dirfrag_base(dir);
       }
           
@@ -5995,36 +5991,32 @@ void MDCache::rejoin_send_acks()
          in = dnl->get_inode();
 
        // dentry
-       for (compact_map<mds_rank_t,unsigned>::iterator r = dn->replicas_begin();
-            r != dn->replicas_end();
-            ++r) {
-         auto it = acks.find(r->first);
+       for (auto &r : dn->get_replicas()) {
+         auto it = acks.find(r.first);
          if (it == acks.end())
            continue;
          it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
                                           dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
                                           dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
                                           dnl->is_remote() ? dnl->get_remote_d_type():0,
-                                          ++r->second,
+                                          ++r.second,
                                           dn->lock.get_replica_state());
          // peer missed MDentrylink message ?
-         if (in && !in->is_replica(r->first))
-           in->add_replica(r->first);
+         if (in && !in->is_replica(r.first))
+           in->add_replica(r.first);
        }
        
        if (!in)
          continue;
 
-       for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
-            r != in->replicas_end();
-            ++r) {
-         auto it = acks.find(r->first);
+       for (auto &r : in->get_replicas()) {
+         auto it = acks.find(r.first);
          if (it == acks.end())
            continue;
          it->second->add_inode_base(in, mds->mdsmap->get_up_features());
          bufferlist bl;
-         in->_encode_locks_state_for_rejoin(bl, r->first);
-         it->second->add_inode_locks(in, ++r->second, bl);
+         in->_encode_locks_state_for_rejoin(bl, r.first);
+         it->second->add_inode_locks(in, ++r.second, bl);
        }
        
        // subdirs in this subtree?
@@ -6035,28 +6027,24 @@ void MDCache::rejoin_send_acks()
 
   // base inodes too
   if (root && root->is_auth()) 
-    for (compact_map<mds_rank_t,unsigned>::iterator r = root->replicas_begin();
-        r != root->replicas_end();
-        ++r) {
-      auto it = acks.find(r->first);
+    for (auto &r : root->get_replicas()) {
+      auto it = acks.find(r.first);
       if (it == acks.end())
        continue;
       it->second->add_inode_base(root, mds->mdsmap->get_up_features());
       bufferlist bl;
-      root->_encode_locks_state_for_rejoin(bl, r->first);
-      it->second->add_inode_locks(root, ++r->second, bl);
+      root->_encode_locks_state_for_rejoin(bl, r.first);
+      it->second->add_inode_locks(root, ++r.second, bl);
     }
   if (myin)
-    for (compact_map<mds_rank_t,unsigned>::iterator r = myin->replicas_begin();
-        r != myin->replicas_end();
-        ++r) {
-      auto it = acks.find(r->first);
+    for (auto &r : myin->get_replicas()) {
+      auto it = acks.find(r.first);
       if (it == acks.end())
        continue;
       it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
       bufferlist bl;
-      myin->_encode_locks_state_for_rejoin(bl, r->first);
-      it->second->add_inode_locks(myin, ++r->second, bl);
+      myin->_encode_locks_state_for_rejoin(bl, r.first);
+      it->second->add_inode_locks(myin, ++r.second, bl);
     }
 
   // include inode base for any inodes whose scatterlocks may have updated
@@ -6064,10 +6052,8 @@ void MDCache::rejoin_send_acks()
        p != rejoin_potential_updated_scatterlocks.end();
        ++p) {
     CInode *in = *p;
-    for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
-        r != in->replicas_end();
-        ++r) {
-      auto it = acks.find(r->first);
+    for (const auto &r : in->get_replicas()) {
+      auto it = acks.find(r.first);
       if (it == acks.end())
        continue;
       it->second->add_inode_base(in, mds->mdsmap->get_up_features());
@@ -6452,34 +6438,19 @@ void MDCache::start_recovered_truncates()
 // ================================================================================
 // cache trimming
 
-
-/*
- * note: only called while MDS is active or stopping... NOT during recovery.
- * however, we may expire a replica whose authority is recovering.
- * 
- */
-bool MDCache::trim(int max, int count)
-{
-  // trim LRU
-  if (count > 0) {
-    max = lru.lru_get_size() - count;
-    if (max <= 0)
-      max = 1;
-  } else if (max < 0) {
-    max = g_conf->mds_cache_size;
-    if (max <= 0)
-      return false;
-  }
-  dout(7) << "trim max=" << max << "  cur=" << lru.lru_get_size()
-         << "/" << bottom_lru.lru_get_size() << dendl;
-
-  // process delayed eval_stray()
-  stray_manager.advance_delayed();
-
-  map<mds_rank_t, MCacheExpire*> expiremap;
+void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
+{
   bool is_standby_replay = mds->is_standby_replay();
-  int unexpirable = 0;
-  list<CDentry*> unexpirables;
+  std::vector<CDentry *> unexpirables;
+  uint64_t trimmed = 0;
+
+  dout(7) << "trim_lru trimming " << count
+          << " items from LRU"
+          << " size=" << lru.lru_get_size()
+          << " mid=" << lru.lru_get_top()
+          << " pintail=" << lru.lru_get_pintail()
+          << " pinned=" << lru.lru_get_num_pinned()
+          << dendl;
 
   for (;;) {
     CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
@@ -6487,34 +6458,65 @@ bool MDCache::trim(int max, int count)
       break;
     if (trim_dentry(dn, expiremap)) {
       unexpirables.push_back(dn);
-      ++unexpirable;
+    } else {
+      trimmed++;
     }
   }
 
-  for(auto dn : unexpirables)
+  for (auto &dn : unexpirables) {
     bottom_lru.lru_insert_mid(dn);
+  }
   unexpirables.clear();
 
-  // trim dentries from the LRU: only enough to satisfy `max`,
-  while (lru.lru_get_size() + unexpirable > (unsigned)max) {
+  // trim dentries from the LRU while the cache is too full, or until count is exhausted
+  while (cache_toofull() || count > 0) {
     CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
     if (!dn) {
       break;
     }
     if ((is_standby_replay && dn->get_linkage()->inode &&
-        dn->get_linkage()->inode->item_open_file.is_on_list()) ||
-       trim_dentry(dn, expiremap)) {
+        dn->get_linkage()->inode->item_open_file.is_on_list())) {
       unexpirables.push_back(dn);
-      ++unexpirable;
+    } else if (trim_dentry(dn, expiremap)) {
+      unexpirables.push_back(dn);
+    } else {
+      trimmed++;
     }
+    if (count > 0) count--;  // count is unsigned: guard against wrapping past zero
   }
-  for(auto dn : unexpirables)
+
+  for (auto &dn : unexpirables) {
     lru.lru_insert_mid(dn);
+  }
   unexpirables.clear();
 
+  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
+}
+
+/*
+ * note: only called while MDS is active or stopping... NOT during recovery.
+ * however, we may expire a replica whose authority is recovering.
+ *
+ * @param count is the number of dentries to try to expire
+ */
+bool MDCache::trim(uint64_t count)
+{
+  uint64_t used = cache_size();
+  uint64_t limit = cache_limit_memory();
+  map<mds_rank_t, MCacheExpire*> expiremap;
+
+  dout(7) << "trim bytes_used=" << bytes2str(used)
+          << " limit=" << bytes2str(limit)
+          << " reservation=" << cache_reservation()
+          << "% count=" << count << dendl;
+
+  // process delayed eval_stray()
+  stray_manager.advance_delayed();
+
+  trim_lru(count, expiremap);
+
   // trim non-auth, non-bound subtrees
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();) {
+  for (auto p = subtrees.begin(); p != subtrees.end();) {
     CDir *dir = p->first;
     ++p;
     CInode *diri = dir->get_inode();
@@ -6522,6 +6524,7 @@ bool MDCache::trim(int max, int count)
       if (!diri->is_auth() && !diri->is_base() &&
          dir->get_num_head_items() == 0) {
        if (dir->state_test(CDir::STATE_EXPORTING) ||
+           !(mds->is_active() || mds->is_stopping()) ||
            dir->is_freezing() || dir->is_frozen())
          continue;
 
@@ -6547,7 +6550,7 @@ bool MDCache::trim(int max, int count)
   }
 
   // trim root?
-  if (max == 0 && root) {
+  if (mds->is_stopping() && root) {
     list<CDir*> ls;
     root->get_dirfrags(ls);
     for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
@@ -6590,7 +6593,7 @@ bool MDCache::trim(int max, int count)
   }
 
   // Other rank's base inodes (when I'm stopping)
-  if (max == 0) {
+  if (mds->is_stopping()) {
     for (set<CInode*>::iterator p = base_inodes.begin();
          p != base_inodes.end(); ++p) {
       if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
@@ -7277,7 +7280,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
       if (nonce == dir->get_replica_nonce(from)) {
        // remove from our cached_by
        dout(7) << " dir expire on " << *dir << " from mds." << from
-               << " replicas was " << dir->replica_map << dendl;
+               << " replicas was " << dir->get_replicas() << dendl;
        dir->remove_replica(from);
       } 
       else {
@@ -7442,9 +7445,9 @@ void MDCache::check_memory_usage()
 
   // check client caps
   assert(CInode::count() == inode_map.size());
-  float caps_per_inode = 0.0;
+  double caps_per_inode = 0.0;
   if (CInode::count())
-    caps_per_inode = (float)Capability::count() / (float)CInode::count();
+    caps_per_inode = (double)Capability::count() / (double)CInode::count();
 
   dout(2) << "check_memory_usage"
           << " total " << last.get_total()
@@ -7460,20 +7463,15 @@ void MDCache::check_memory_usage()
   mds->mlogger->set(l_mdm_rss, last.get_rss());
   mds->mlogger->set(l_mdm_heap, last.get_heap());
 
-  if (num_inodes_with_caps > g_conf->mds_cache_size) {
-    float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
-    if (ratio < 1.0) {
-      last_recall_state = ceph_clock_now();
-      mds->server->recall_client_state(ratio);
-    }
+  if (cache_toofull()) {
+    last_recall_state = ceph_clock_now();
+    mds->server->recall_client_state();
   }
 
   // If the cache size had exceeded its limit, but we're back in bounds
   // now, free any unused pool memory so that our memory usage isn't
   // permanently bloated.
-  if (exceeded_size_limit
-      && CInode::count() <=
-        g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+  if (exceeded_size_limit && !cache_toofull()) {
     // Only do this once we are back in bounds: otherwise the releases would
     // slow down whatever process caused us to exceed bounds to begin with
     if (ceph_using_tcmalloc()) {
@@ -7565,7 +7563,7 @@ bool MDCache::shutdown_pass()
   }
 
   // trim cache
-  trim(0);
+  trim(UINT64_MAX);
   dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
 
   // SUBTREES
@@ -7631,12 +7629,6 @@ bool MDCache::shutdown_pass()
   assert(!migrator->is_exporting());
   assert(!migrator->is_importing());
 
-  if ((myin && myin->is_auth_pinned()) ||
-      (mydir && mydir->is_auth_pinned())) {
-    dout(7) << "still have auth pinned objects" << dendl;
-    return false;
-  }
-
   // flush what we can from the log
   mds->mdlog->trim(0);
   if (mds->mdlog->get_num_segments() > 1) {
@@ -7644,6 +7636,12 @@ bool MDCache::shutdown_pass()
     return false;
   }
 
+  if ((myin && myin->is_auth_pinned()) ||
+      (mydir && mydir->is_auth_pinned())) {
+    dout(7) << "still have auth pinned objects" << dendl;
+    return false;
+  }
+
   // (only do this once!)
   if (!mds->mdlog->is_capped()) {
     dout(7) << "capping the log" << dendl;
@@ -10321,10 +10319,9 @@ int MDCache::send_dir_updates(CDir *dir, bool bcast)
   if (bcast) {
     mds->get_mds_map()->get_active_mds_set(who);
   } else {
-    for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
-        p != dir->replicas_end();
-        ++p)
-      who.insert(p->first);
+    for (const auto &p : dir->get_replicas()) {
+      who.insert(p.first);
+    }
   }
   
   dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
@@ -10407,22 +10404,20 @@ void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
   dout(7) << "send_dentry_link " << *dn << dendl;
 
   CDir *subtree = get_subtree_root(dn->get_dir());
-  for (compact_map<mds_rank_t,unsigned>::iterator p = dn->replicas_begin();
-       p != dn->replicas_end();
-       ++p) {
+  for (const auto &p : dn->get_replicas()) {
     // don't tell (rename) witnesses; they already know
-    if (mdr.get() && mdr->more()->witnessed.count(p->first))
+    if (mdr.get() && mdr->more()->witnessed.count(p.first))
       continue;
-    if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
-       (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
-        rejoin_gather.count(p->first)))
+    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
+       (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
+        rejoin_gather.count(p.first)))
       continue;
     CDentry::linkage_t *dnl = dn->get_linkage();
     MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
                                     dn->name, dnl->is_primary());
     if (dnl->is_primary()) {
       dout(10) << "  primary " << *dnl->get_inode() << dendl;
-      replicate_inode(dnl->get_inode(), p->first, m->bl,
+      replicate_inode(dnl->get_inode(), p.first, m->bl,
                      mds->mdsmap->get_up_features());
     } else if (dnl->is_remote()) {
       inodeno_t ino = dnl->get_remote_ino();
@@ -10432,7 +10427,7 @@ void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
       ::encode(d_type, m->bl);
     } else
       ceph_abort();   // aie, bad caller!
-    mds->send_message_mds(m, p->first);
+    mds->send_message_mds(m, p.first);
   }
 }
 
@@ -11348,12 +11343,10 @@ void MDCache::_fragment_stored(MDRequestRef& mdr)
 
   // tell peers
   CDir *first = *info.resultfrags.begin();
-  for (compact_map<mds_rank_t,unsigned>::iterator p = first->replicas_begin();
-       p != first->replicas_end();
-       ++p) {
-    if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
-       (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
-        rejoin_gather.count(p->first)))
+  for (const auto &p : first->get_replicas()) {
+    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
+       (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
+        rejoin_gather.count(p.first)))
       continue;
 
     MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
@@ -11362,9 +11355,9 @@ void MDCache::_fragment_stored(MDRequestRef& mdr)
     for (list<CDir*>::iterator q = info.resultfrags.begin();
         q != info.resultfrags.end();
         ++q)
-      replicate_dir(*q, p->first, notify->basebl);
+      replicate_dir(*q, p.first, notify->basebl);
 
-    mds->send_message_mds(notify, p->first);
+    mds->send_message_mds(notify, p.first);
   }
 
   // journal commit
@@ -11865,6 +11858,18 @@ void MDCache::show_cache()
   }
 }
 
+int MDCache::cache_status(Formatter *f)
+{
+  f->open_object_section("cache");
+
+  f->open_object_section("pool");
+  mempool::get_pool(mempool::mds_co::id).dump(f);
+  f->close_section();
+
+  f->close_section();
+  return 0;
+}
+
 int MDCache::dump_cache(std::string const &file_name)
 {
   return dump_cache(file_name.c_str(), NULL);
index 8282ad7c44e778632f3cf5cbe9d8e574dbd3c616..61a170bf6fd57b60baae3a613b89c0a9770afdce 100644 (file)
@@ -146,6 +146,38 @@ class MDCache {
   bool exceeded_size_limit;
 
 public:
+  static uint64_t cache_limit_inodes(void) {
+    return g_conf->get_val<int64_t>("mds_cache_size");
+  }
+  static uint64_t cache_limit_memory(void) {
+    return g_conf->get_val<uint64_t>("mds_cache_memory_limit");
+  }
+  static double cache_reservation(void) {
+    return g_conf->get_val<double>("mds_cache_reservation");
+  }
+  static double cache_mid(void) {
+    return g_conf->get_val<double>("mds_cache_mid");
+  }
+  static double cache_health_threshold(void) {
+    return g_conf->get_val<double>("mds_health_cache_threshold");
+  }
+  double cache_toofull_ratio(void) const {
+    uint64_t inode_limit = cache_limit_inodes();
+    double inode_reserve = inode_limit*(1.0-cache_reservation());
+    double memory_reserve = cache_limit_memory()*(1.0-cache_reservation());
+    return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve,
+                          inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
+  }
+  bool cache_toofull(void) const {
+    return cache_toofull_ratio() > 0.0;
+  }
+  uint64_t cache_size(void) const {
+    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
+  }
+  bool cache_overfull(void) const {
+    uint64_t inode_limit = cache_limit_inodes();
+    return (inode_limit > 0 && CInode::count() > inode_limit*cache_health_threshold()) ||
+           (cache_size() > cache_limit_memory()*cache_health_threshold());
+  }
+
   void advance_stray() {
     stray_index = (stray_index+1)%NUM_STRAY;
   }
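
A worked example of the thresholds above, assuming the defaults shipped around this release (mds_cache_memory_limit = 1 GiB, mds_cache_reservation = 0.05, mds_health_cache_threshold = 1.5; all three values are assumptions, not shown in this section): memory_reserve = 1 GiB * 0.95 ~= 0.95 GiB, so a cache_size() of 1.14 GiB yields cache_toofull_ratio() ~= (1.14 - 0.95) / 0.95 ~= 0.2, making cache_toofull() true and triggering trimming, while cache_overfull(), the health-warning condition used by Beacon::notify_health(), only trips once cache_size() exceeds 1.5 * 1 GiB.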
@@ -671,12 +703,12 @@ public:
   CInode *get_root() { return root; }
   CInode *get_myin() { return myin; }
 
-  // cache
-  void set_cache_size(size_t max) { lru.lru_set_max(max); }
   size_t get_cache_size() { return lru.lru_get_size(); }
 
   // trimming
-  bool trim(int max=-1, int count=-1);   // trim cache
+  bool trim(uint64_t count=0);
+private:
+  void trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*>& expiremap);
   bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap);
   void trim_dirfrag(CDir *dir, CDir *con,
                    map<mds_rank_t, MCacheExpire*>& expiremap);
@@ -684,6 +716,7 @@ public:
                  map<mds_rank_t,class MCacheExpire*>& expiremap);
   void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap);
   void trim_non_auth();      // trim out trimmable non-auth items
+public:
   bool trim_non_auth_subtree(CDir *directory);
   void standby_trim_segment(LogSegment *ls);
   void try_trim_non_auth_subtree(CDir *dir);
@@ -1135,6 +1168,8 @@ public:
   int dump_cache(Formatter *f);
   int dump_cache(const std::string& dump_root, int depth, Formatter *f);
 
+  int cache_status(Formatter *f);
+
   void dump_resolve_status(Formatter *f) const;
   void dump_rejoin_status(Formatter *f) const;
 
index cb1f9b558b24c7ad98a70334f14ca4115d369029..806974a1d0b7dc0d6f1d87c96ecce7738183ad68 100644 (file)
@@ -1446,7 +1446,7 @@ void MDLog::standby_trim_segments()
 
   if (removed_segment) {
     dout(20) << " calling mdcache->trim!" << dendl;
-    mds->mdcache->trim(-1);
+    mds->mdcache->trim();
   } else {
     dout(20) << " removed no segments!" << dendl;
   }
index 1265a4a9e8912d93822e0174a77cd6f3889f6a72..6286bbe5d1cbd35f6c2df1913f96a2d93fc64a4c 100644 (file)
@@ -8,7 +8,7 @@
 uint64_t MDSCacheObject::last_wait_seq = 0;
 
 void MDSCacheObject::finish_waiting(uint64_t mask, int result) {
-  list<MDSInternalContextBase*> finished;
+  std::list<MDSInternalContextBase*> finished;
   take_waiting(mask, finished);
   finish_contexts(g_ceph_context, finished, result);
 }
@@ -21,12 +21,10 @@ void MDSCacheObject::dump(Formatter *f) const
   f->open_object_section("auth_state");
   {
     f->open_object_section("replicas");
-    const compact_map<mds_rank_t,unsigned>& replicas = get_replicas();
-    for (compact_map<mds_rank_t,unsigned>::const_iterator i = replicas.begin();
-         i != replicas.end(); ++i) {
+    for (const auto &it : get_replicas()) {
       std::ostringstream rank_str;
-      rank_str << i->first;
-      f->dump_int(rank_str.str().c_str(), i->second);
+      rank_str << it.first;
+      f->dump_int(rank_str.str().c_str(), it.second);
     }
     f->close_section();
   }
index 1bc80cfed67307f6f22792f44e1d397e35022907..9a90a1db0c8c315c2b04298b86011776425720c5 100644 (file)
@@ -1,17 +1,17 @@
 #ifndef CEPH_MDSCACHEOBJECT_H
 #define CEPH_MDSCACHEOBJECT_H
 
-#include <set>
-#include <map>
 #include <ostream>
-using namespace std;
-
 
 #include "common/config.h"
+
+#include "include/Context.h"
+#include "include/alloc_ptr.h"
 #include "include/assert.h"
+#include "include/mempool.h"
 #include "include/types.h"
 #include "include/xlist.h"
-#include "include/Context.h"
+
 #include "mdstypes.h"
 
 #define MDS_REF_SET      // define me for improved debug output, sanity checking
@@ -145,7 +145,7 @@ class MDSCacheObject {
 protected:
   __s32      ref;       // reference count
 #ifdef MDS_REF_SET
-  std::map<int,int> ref_map;
+  mempool::mds_co::map<int,int> ref_map;
 #endif
 
  public:
@@ -226,7 +226,7 @@ protected:
   int auth_pins;
   int nested_auth_pins;
 #ifdef MDS_AUTHPIN_SET
-  multiset<void*> auth_pin_set;
+  mempool::mds_co::multiset<void*> auth_pin_set;
 #endif
 
   public:
@@ -253,47 +253,47 @@ protected:
   // replication (across mds cluster)
  protected:
   unsigned             replica_nonce; // [replica] defined on replica
-  compact_map<mds_rank_t,unsigned>     replica_map;   // [auth] mds -> nonce
+  typedef compact_map<mds_rank_t,unsigned> replica_map_type;
+  replica_map_type replica_map;   // [auth] mds -> nonce
 
  public:
-  bool is_replicated() const { return !replica_map.empty(); }
-  bool is_replica(mds_rank_t mds) const { return replica_map.count(mds); }
-  int num_replicas() const { return replica_map.size(); }
+  bool is_replicated() const { return !get_replicas().empty(); }
+  bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); }
+  int num_replicas() const { return get_replicas().size(); }
   unsigned add_replica(mds_rank_t mds) {
-    if (replica_map.count(mds)) 
-      return ++replica_map[mds];  // inc nonce
-    if (replica_map.empty()) 
+    if (get_replicas().count(mds))
+      return ++get_replicas()[mds];  // inc nonce
+    if (get_replicas().empty())
       get(PIN_REPLICATED);
-    return replica_map[mds] = 1;
+    return get_replicas()[mds] = 1;
   }
   void add_replica(mds_rank_t mds, unsigned nonce) {
-    if (replica_map.empty()) 
+    if (get_replicas().empty())
       get(PIN_REPLICATED);
-    replica_map[mds] = nonce;
+    get_replicas()[mds] = nonce;
   }
   unsigned get_replica_nonce(mds_rank_t mds) {
-    assert(replica_map.count(mds));
-    return replica_map[mds];
+    assert(get_replicas().count(mds));
+    return get_replicas()[mds];
   }
   void remove_replica(mds_rank_t mds) {
-    assert(replica_map.count(mds));
-    replica_map.erase(mds);
-    if (replica_map.empty())
+    assert(get_replicas().count(mds));
+    get_replicas().erase(mds);
+    if (get_replicas().empty()) {
       put(PIN_REPLICATED);
+    }
   }
   void clear_replica_map() {
-    if (!replica_map.empty())
+    if (!get_replicas().empty())
       put(PIN_REPLICATED);
     replica_map.clear();
   }
-  compact_map<mds_rank_t,unsigned>::iterator replicas_begin() { return replica_map.begin(); }
-  compact_map<mds_rank_t,unsigned>::iterator replicas_end() { return replica_map.end(); }
-  const compact_map<mds_rank_t,unsigned>& get_replicas() const { return replica_map; }
+  replica_map_type& get_replicas() { return replica_map; }
+  const replica_map_type& get_replicas() const { return replica_map; }
   void list_replicas(std::set<mds_rank_t>& ls) const {
-    for (compact_map<mds_rank_t,unsigned>::const_iterator p = replica_map.begin();
-        p != replica_map.end();
-        ++p)
-      ls.insert(p->first);
+    for (const auto &p : get_replicas()) {
+      ls.insert(p.first);
+    }
   }
 
   unsigned get_replica_nonce() const { return replica_nonce; }
@@ -302,8 +302,8 @@ protected:
 
   // ---------------------------------------------
   // waiting
- protected:
-  compact_multimap<uint64_t, pair<uint64_t, MDSInternalContextBase*> > waiting;
+ private:
+  alloc_ptr<mempool::mds_co::multimap<uint64_t, std::pair<uint64_t, MDSInternalContextBase*>>> waiting;
   static uint64_t last_wait_seq;
 
  public:
@@ -311,18 +311,18 @@ protected:
     if (!min) {
       min = mask;
       while (min & (min-1))  // if more than one bit is set
-       min &= min-1;        //  clear LSB
+        min &= min-1;        //  clear LSB
     }
-    for (auto p = waiting.lower_bound(min);
-        p != waiting.end();
-        ++p) {
-      if (p->first & mask) return true;
-      if (p->first > mask) return false;
+    if (waiting) {
+      for (auto p = waiting->lower_bound(min); p != waiting->end(); ++p) {
+        if (p->first & mask) return true;
+        if (p->first > mask) return false;
+      }
     }
     return false;
   }
   virtual void add_waiter(uint64_t mask, MDSInternalContextBase *c) {
-    if (waiting.empty())
+    if (waiting->empty())
       get(PIN_WAITER);
 
     uint64_t seq = 0;
@@ -330,7 +330,7 @@ protected:
       seq = ++last_wait_seq;
       mask &= ~WAIT_ORDERED;
     }
-    waiting.insert(pair<uint64_t, pair<uint64_t, MDSInternalContextBase*> >(
+    waiting->insert(pair<uint64_t, pair<uint64_t, MDSInternalContextBase*> >(
                            mask,
                            pair<uint64_t, MDSInternalContextBase*>(seq, c)));
 //    pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this)) 
@@ -339,41 +339,40 @@ protected:
 //                            << dendl;
     
   }
-  virtual void take_waiting(uint64_t mask, list<MDSInternalContextBase*>& ls) {
-    if (waiting.empty()) return;
+  virtual void take_waiting(uint64_t mask, std::list<MDSInternalContextBase*>& ls) {
+    if (!waiting || waiting->empty()) return;
 
     // process ordered waiters in the same order that they were added.
     std::map<uint64_t, MDSInternalContextBase*> ordered_waiters;
 
-    for (auto it = waiting.begin();
-        it != waiting.end(); ) {
+    for (auto it = waiting->begin(); it != waiting->end(); ) {
       if (it->first & mask) {
-
-       if (it->second.first > 0)
-         ordered_waiters.insert(it->second);
-       else
-         ls.push_back(it->second.second);
+        if (it->second.first > 0) {
+          ordered_waiters.insert(it->second);
+        } else {
+          ls.push_back(it->second.second);
+        }
 //     pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this))
 //                                << "take_waiting mask " << hex << mask << dec << " took " << it->second
 //                                << " tag " << hex << it->first << dec
 //                                << " on " << *this
 //                                << dendl;
-       waiting.erase(it++);
+        waiting->erase(it++);
       } else {
 //     pdout(10,g_conf->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second
 //                                << " tag " << hex << it->first << dec
 //                                << " on " << *this 
 //                                << dendl;
-       ++it;
+        ++it;
       }
     }
-    for (auto it = ordered_waiters.begin();
-        it != ordered_waiters.end();
-        ++it) {
+    for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) {
       ls.push_back(it->second);
     }
-    if (waiting.empty())
+    if (waiting->empty()) {
       put(PIN_WAITER);
+      waiting.release();
+    }
   }
   void finish_waiting(uint64_t mask, int result = 0);
 
index 1e4558121a586c9691deb53dc4322ade15836021..087c995a830df2ae31de81c35f682119c892a934 100644 (file)
@@ -255,6 +255,11 @@ void MDSDaemon::set_up_admin_socket()
                                      asok_hook,
                                      "dump metadata cache (optionally to a file)");
   assert(r == 0);
+  r = admin_socket->register_command("cache status",
+                                     "cache status",
+                                     asok_hook,
+                                     "show cache status");
+  assert(r == 0);
   r = admin_socket->register_command("dump tree",
                                     "dump tree "
                                     "name=root,type=CephString,req=true "
@@ -329,6 +334,7 @@ void MDSDaemon::clean_up_admin_socket()
   admin_socket->unregister_command("flush_path");
   admin_socket->unregister_command("export dir");
   admin_socket->unregister_command("dump cache");
+  admin_socket->unregister_command("cache status");
   admin_socket->unregister_command("dump tree");
   admin_socket->unregister_command("session evict");
   admin_socket->unregister_command("osdmap barrier");
index 52e357f0742e0ccaac5b9c1ccbf108685b7e29b0..77f1819788fc5977338a2821ac6c34f04d3317d0 100644 (file)
@@ -1937,6 +1937,13 @@ bool MDSRankDispatcher::handle_asok_command(
 
     if (r != 0) {
       ss << "Failed to dump cache: " << cpp_strerror(r);
+      f->reset();
+    }
+  } else if (command == "cache status") {
+    Mutex::Locker l(mds_lock);
+    int r = mdcache->cache_status(f);
+    if (r != 0) {
+      ss << "Failed to get cache status: " << cpp_strerror(r);
     }
   } else if (command == "dump tree") {
     string root;
@@ -1949,6 +1956,7 @@ bool MDSRankDispatcher::handle_asok_command(
       int r = mdcache->dump_cache(root, depth, f);
       if (r != 0) {
         ss << "Failed to dump tree: " << cpp_strerror(r);
+        f->reset();
       }
     }
   } else if (command == "force_readonly") {
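These hunks follow the standard admin-socket lifecycle: the command string is registered against the hook at daemon startup, matched by exact name in handle_asok_command(), and unregistered on shutdown; the new f->reset() calls ensure a failed command does not emit a half-built Formatter document next to the error text. A hedged sketch of that dispatch shape, where CommandTable and the lambda handler are illustrative stand-ins for AdminSocket and the asok hook:

    #include <functional>
    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    // Sketch: register at startup, dispatch by exact command string,
    // unregister on shutdown. The real AdminSocket also carries an
    // argument schema, help text, and a Formatter that is reset on error.
    class CommandTable {
      std::map<std::string, std::function<int(std::ostream&)>> handlers;
    public:
      void register_command(const std::string& n,
                            std::function<int(std::ostream&)> h) {
        handlers.emplace(n, std::move(h));
      }
      void unregister_command(const std::string& n) { handlers.erase(n); }
      bool handle(const std::string& cmd, std::ostream& out) {
        auto it = handlers.find(cmd);
        if (it == handlers.end()) return false;
        std::ostringstream buf;                      // stand-in for the Formatter
        if (it->second(buf) == 0) out << buf.str();  // emit only on success
        return true;
      }
    };

    int main() {
      CommandTable t;
      t.register_command("cache status",
                         [](std::ostream& o) { o << "{}\n"; return 0; });
      t.handle("cache status", std::cout);
      t.unregister_command("cache status");
    }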
index ad32f6d34649b99e695f7e35a4fe86b2a749dfe9..6603e4f1a96662a6806214aa69509bc9b5ef723a 100644 (file)
@@ -776,6 +776,10 @@ void Migrator::export_dir(CDir *dir, mds_rank_t dest)
   assert(dir->is_auth());
   assert(dest != mds->get_nodeid());
    
+  if (!(mds->is_active() || mds->is_stopping())) {
+    dout(7) << "i'm not active, no exports for now" << dendl;
+    return;
+  }
   if (mds->mdcache->is_readonly()) {
     dout(7) << "read-only FS, no exports for now" << dendl;
     return;
@@ -1087,12 +1091,10 @@ void Migrator::export_frozen(CDir *dir, uint64_t tid)
   MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);
 
   // include list of bystanders
-  for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
-       p != dir->replicas_end();
-       ++p) {
-    if (p->first != it->second.peer) {
-      dout(10) << "bystander mds." << p->first << dendl;
-      prep->add_bystander(p->first);
+  for (const auto &p : dir->get_replicas()) {
+    if (p.first != it->second.peer) {
+      dout(10) << "bystander mds." << p.first << dendl;
+      prep->add_bystander(p.first);
     }
   }
 
@@ -1272,22 +1274,20 @@ void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
          it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
   assert(it->second.notify_ack_waiting.empty());
 
-  for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
-       p != dir->replicas_end();
-       ++p) {
-    if (p->first == it->second.peer) continue;
+  for (const auto &p : dir->get_replicas()) {
+    if (p.first == it->second.peer) continue;
     if (mds->is_cluster_degraded() &&
-       !mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first))
+       !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
       continue;  // only if active
-    it->second.warning_ack_waiting.insert(p->first);
-    it->second.notify_ack_waiting.insert(p->first);  // we'll eventually get a notifyack, too!
+    it->second.warning_ack_waiting.insert(p.first);
+    it->second.notify_ack_waiting.insert(p.first);  // we'll eventually get a notifyack, too!
 
     MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
                                                    mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
                                                    mds_authority_t(mds->get_nodeid(),it->second.peer));
     for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
       notify->get_bounds().push_back((*q)->dirfrag());
-    mds->send_message_mds(notify, p->first);
+    mds->send_message_mds(notify, p.first);
     
   }
 
@@ -2011,7 +2011,7 @@ void Migrator::export_finish(CDir *dir)
   cache->show_subtrees();
   audit();
 
-  cache->trim(-1, num_dentries); // try trimming exported dentries
+  cache->trim(num_dentries); // try trimming exported dentries
 
   // send pending import_maps?
   mds->mdcache->maybe_send_pending_resolves();
@@ -2654,7 +2654,7 @@ void Migrator::import_reverse(CDir *dir)
   // log our failure
   mds->mdlog->start_submit_entry(new EImportFinish(dir, false));       // log failure
 
-  cache->trim(-1, num_dentries); // try trimming dentries
+  cache->trim(num_dentries); // try trimming dentries
 
   // notify bystanders; wait in aborting state
   import_notify_abort(dir, bounds);
@@ -3264,8 +3264,9 @@ void Migrator::handle_export_caps(MExportCaps *ex)
   assert(in->is_auth());
 
   // FIXME
-  if (in->is_frozen())
+  if (!in->can_auth_pin())
     return;
+  in->auth_pin(this);
 
   C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
       this, in, mds_rank_t(ex->get_source().num()));
@@ -3306,4 +3307,5 @@ void Migrator::logged_import_caps(CInode *in,
   // clients will release caps from the exporter when they receive the cap import message.
   finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
   mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+  in->auth_unpin(this);
 }
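handle_export_caps() used to bail out only on frozen inodes; it now takes an auth-pin before starting the logged import, and logged_import_caps() drops it on completion, so the inode cannot be frozen (for instance by a competing migration) while the journal entry is in flight. A small sketch of pinning across an asynchronous completion, with illustrative stand-in types:

    #include <functional>
    #include <memory>

    struct Pinnable {
      bool frozen = false;
      int auth_pins = 0;
      bool can_auth_pin() const { return !frozen; }
      void auth_pin()   { ++auth_pins; }
      void auth_unpin() { --auth_pins; }
    };

    // Pin before kicking off async work; unpin inside the completion so
    // the object stays pinned for the whole in-flight window.
    void start_async_import(std::shared_ptr<Pinnable> in,
                            std::function<void(std::function<void()>)> submit) {
      if (!in->can_auth_pin())
        return;                  // caller retries later, as the Migrator does
      in->auth_pin();
      submit([in] {
        // ... apply the imported caps ...
        in->auth_unpin();        // matches the pin taken above
      });
    }

    int main() {
      auto in = std::make_shared<Pinnable>();
      start_async_import(in, [](std::function<void()> fin) { fin(); });
    }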
index 952df343995eac50b17a691e53da796abedf97e2..3d34bd4c1caa0f45835b4af664da5215d5a5b4fc 100644 (file)
@@ -1082,10 +1082,16 @@ void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
  * to trim some caps, and consequently unpin some inodes in the MDCache so
  * that it can trim too.
  */
-void Server::recall_client_state(float ratio)
+void Server::recall_client_state(void)
 {
-  int max_caps_per_client = (int)(g_conf->mds_cache_size * .8);
-  int min_caps_per_client = 100;
+  /* try to recall at least 80% of all caps */
+  uint64_t max_caps_per_client = (Capability::count() * .8);
+  uint64_t min_caps_per_client = 100;
+  /* unless this ratio is smaller: */
+  /* ratio: determine the amount of caps to recall from each client. Use
+   * percentage full over the cache reservation. Cap the ratio at 80% of client
+   * caps. */
+  double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());
 
   dout(10) << "recall_client_state " << ratio
           << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
@@ -1093,10 +1099,7 @@ void Server::recall_client_state(float ratio)
 
   set<Session*> sessions;
   mds->sessionmap.get_client_session_set(sessions);
-  for (set<Session*>::const_iterator p = sessions.begin();
-       p != sessions.end();
-       ++p) {
-    Session *session = *p;
+  for (auto &session : sessions) {
     if (!session->is_open() ||
        !session->info.inst.name.is_client())
       continue;
@@ -1107,7 +1110,7 @@ void Server::recall_client_state(float ratio)
             << dendl;
 
     if (session->caps.size() > min_caps_per_client) {  
-      int newlim = MIN((int)(session->caps.size() * ratio), max_caps_per_client);
+      uint64_t newlim = MIN((session->caps.size() * ratio), max_caps_per_client);
       if (session->caps.size() > newlim) {
           MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
           m->head.max_caps = newlim;
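The recall target is now sized from live cache pressure instead of the removed mds_cache_size knob. Working the arithmetic with illustrative numbers: if cache_toofull_ratio() reports 0.25 and 100000 caps are outstanding cluster-wide, then ratio = 1.0 - min(0.80, 0.25) = 0.75 and max_caps_per_client = 80000, so a session holding 40000 caps is asked to shrink to min(40000 * 0.75, 80000) = 30000, while a session at or below the 100-cap floor is skipped. The same computation as a runnable check:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Illustrative numbers, not Ceph defaults.
      double toofull = 0.25;            // mdcache->cache_toofull_ratio()
      uint64_t total_caps = 100000;     // Capability::count()
      uint64_t session_caps = 40000;    // one client's session->caps.size()

      double ratio = 1.0 - std::fmin(0.80, toofull);
      uint64_t max_caps = total_caps * 0.8;
      uint64_t newlim = std::min<uint64_t>(session_caps * ratio, max_caps);
      std::printf("recall to %llu caps\n", (unsigned long long)newlim);  // 30000
    }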
@@ -3166,7 +3169,8 @@ void Server::handle_client_open(MDRequestRef& mdr)
     return;
   }
   
-  bool need_auth = !file_mode_is_readonly(cmode) || (flags & CEPH_O_TRUNC);
+  bool need_auth = !file_mode_is_readonly(cmode) ||
+                  (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
 
   if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
     dout(7) << "read-only FS" << dendl;
@@ -3668,12 +3672,11 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
   bufferlist dnbl;
   __u32 numfiles = 0;
   bool start = !offset_hash && offset_str.empty();
-  bool end = (dir->begin() == dir->end());
   // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
   dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
-  for (CDir::map_t::iterator it = start ? dir->begin() : dir->lower_bound(skip_key);
-       !end && numfiles < max;
-       end = (it == dir->end())) {
+  auto it = start ? dir->begin() : dir->lower_bound(skip_key);
+  bool end = (it == dir->end());
+  for (; !end && numfiles < max; end = (it == dir->end())) {
     CDentry *dn = it->second;
     ++it;
 
@@ -4068,10 +4071,9 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
   if (mask & CEPH_SETATTR_MODE)
     pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
   else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
-           S_ISREG(pi->mode)) {
-    pi->mode &= ~S_ISUID;
-    if ((pi->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
-      pi->mode &= ~S_ISGID;
+           S_ISREG(pi->mode) &&
+            (pi->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
+    pi->mode &= ~(S_ISUID|S_ISGID);
   }
 
   if (mask & CEPH_SETATTR_MTIME)
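The rewritten condition brings kill-sguid handling closer to POSIX: setuid and setgid are now cleared together on ownership changes, but only for regular files with at least one execute bit, so a non-executable file whose setgid bit marks mandatory locking is left alone. For example, chowning a mode 04755 binary yields 0755, while a 02644 data file keeps its bits. Distilled into a runnable check:

    #include <cassert>
    #include <sys/stat.h>

    // New rule distilled: strip setuid/setgid on ownership change only
    // when a regular file is executable by user, group, or other.
    static mode_t kill_sguid(mode_t mode) {
      if (S_ISREG(mode) && (mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
        mode &= ~(S_ISUID | S_ISGID);
      return mode;
    }

    int main() {
      assert(kill_sguid(S_IFREG | 04755) == (S_IFREG | 0755));   // stripped
      assert(kill_sguid(S_IFREG | 02644) == (S_IFREG | 02644));  // kept
    }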
@@ -5214,9 +5216,15 @@ void Server::handle_client_link(MDRequestRef& mdr)
   dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
   dout(7) << "target is " << *targeti << dendl;
   if (targeti->is_dir()) {
-    dout(7) << "target is a dir, failing..." << dendl;
-    respond_to_request(mdr, -EINVAL);
-    return;
+    // if srcdn is replica, need to make sure its linkage is correct
+    vector<CDentry*>& trace = mdr->dn[1];
+    if (trace.empty() ||
+       trace.back()->is_auth() ||
+       trace.back()->lock.can_read(mdr->get_client())) {
+      dout(7) << "target is a dir, failing..." << dendl;
+      respond_to_request(mdr, -EINVAL);
+      return;
+    }
   }
 
   xlocks.insert(&targeti->linklock);
@@ -6502,25 +6510,30 @@ void Server::handle_client_rename(MDRequestRef& mdr)
     oldin = mdcache->get_dentry_inode(destdn, mdr, true);
     if (!oldin) return;
     dout(10) << " oldin " << *oldin << dendl;
-    
-    // mv /some/thing /to/some/existing_other_thing
-    if (oldin->is_dir() && !srci->is_dir()) {
-      respond_to_request(mdr, -EISDIR);
-      return;
-    }
-    if (!oldin->is_dir() && srci->is_dir()) {
-      respond_to_request(mdr, -ENOTDIR);
-      return;
-    }
 
     // non-empty dir? do trivial fast unlocked check, do another check later with read locks
     if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
       respond_to_request(mdr, -ENOTEMPTY);
       return;
     }
-    if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
-      respond_to_request(mdr, 0);  // no-op.  POSIX makes no sense.
-      return;
+
+    // if srcdn is replica, need to make sure its linkage is correct
+    if (srcdn->is_auth() ||
+       srcdn->lock.can_read(mdr->get_client()) ||
+       (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
+      // mv /some/thing /to/some/existing_other_thing
+      if (oldin->is_dir() && !srci->is_dir()) {
+       respond_to_request(mdr, -EISDIR);
+       return;
+      }
+      if (!oldin->is_dir() && srci->is_dir()) {
+       respond_to_request(mdr, -ENOTDIR);
+       return;
+      }
+      if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
+       respond_to_request(mdr, 0);  // no-op.  POSIX makes no sense.
+       return;
+      }
     }
   }
 
index 752ba10b326ee01c8be60f6f6cc15c5342f17a97..2543953bab5fec2436641b2e0efbd78508c638dd 100644 (file)
@@ -133,7 +133,7 @@ public:
   void reconnect_tick();
   void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
 
-  void recall_client_state(float ratio);
+  void recall_client_state(void);
   void force_clients_readonly();
 
   // -- requests --
index 3057ba577b13e497dcabda0959ed2978e85299ec..9d0bd4c87b66849adc640cd9872f2912ac0cbc77 100644 (file)
@@ -826,7 +826,7 @@ void Session::notify_cap_release(size_t n_caps)
  * in order to generate health metrics if the session doesn't see
  * a commensurate number of calls to ::notify_cap_release
  */
-void Session::notify_recall_sent(const int new_limit)
+void Session::notify_recall_sent(const size_t new_limit)
 {
   if (recalled_at.is_zero()) {
     // Entering recall phase, set up counters so we can later
index ebd4921cafb41b42702297b570acca835994b6bd..50ffde9cc04f3477dacca53705f2c890307b001c 100644 (file)
@@ -148,7 +148,7 @@ public:
   interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
 
   void notify_cap_release(size_t n_caps);
-  void notify_recall_sent(const int new_limit);
+  void notify_recall_sent(const size_t new_limit);
   void clear_recalled_at();
 
   inodeno_t next_ino() const {
index 27eae7bebd4a0937b4daad52406eb2a608106678..56d1c4b0b7ed52751b8a1858ace82a2cc6d0dffe 100644 (file)
@@ -373,10 +373,9 @@ public:
   }
 
   void init_gather() {
-    for (compact_map<mds_rank_t,unsigned>::iterator p = parent->replicas_begin();
-        p != parent->replicas_end();
-        ++p)
-      more()->gather_set.insert(p->first);
+    for (const auto p : parent->get_replicas()) {
+      more()->gather_set.insert(p.first);
+    }
   }
   bool is_gathering() const {
     return have_more() && !more()->gather_set.empty();
index 3941f9f53e8c435f01b2a7b9f8e1e653132315a5..2643acd376437903f85ff5f6c71d7953ef57ee85 100644 (file)
@@ -5,6 +5,7 @@
 #define CEPH_MOSDPGRECOVERYDELETE_H
 
 #include "MOSDFastDispatchOp.h"
+#include "include/ceph_features.h"
 
 /*
  * instruct non-primary to remove some objects during recovery
@@ -12,7 +13,7 @@
 
 struct MOSDPGRecoveryDelete : public MOSDFastDispatchOp {
 
-  static const int HEAD_VERSION = 1;
+  static const int HEAD_VERSION = 2;
   static const int COMPAT_VERSION = 1;
 
   pg_shard_t from;
@@ -70,7 +71,9 @@ public:
     ::encode(from, payload);
     ::encode(pgid, payload);
     ::encode(map_epoch, payload);
-    ::encode(min_epoch, payload);
+    if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+      ::encode(min_epoch, payload);
+    }
     ::encode(cost, payload);
     ::encode(objects, payload);
   }
@@ -79,7 +82,12 @@ public:
     ::decode(from, p);
     ::decode(pgid, p);
     ::decode(map_epoch, p);
-    ::decode(min_epoch, p);
+    if (header.version == 1 &&
+       !HAVE_FEATURE(get_connection()->get_features(), SERVER_LUMINOUS)) {
+      min_epoch = map_epoch;
+    } else {
+      ::decode(min_epoch, p);
+    }
     ::decode(cost, p);
     ::decode(objects, p);
   }
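Bumping HEAD_VERSION to 2 while leaving COMPAT_VERSION at 1 is the usual Ceph wire-compatibility move: min_epoch is only encoded when the connection advertises SERVER_LUMINOUS, and a version-1 message decoded from a pre-luminous peer falls back to min_epoch = map_epoch. A self-contained sketch of the idiom; the feature bit, Encoder, and word-vector buffer are stand-ins for ceph_features.h and bufferlist:

    #include <cstdint>
    #include <vector>

    constexpr uint64_t FEATURE_LUMINOUS = 1ull << 0;  // stand-in feature bit

    struct Encoder {                 // stand-in for bufferlist plus ::encode
      std::vector<uint32_t> words;
      void put(uint32_t v) { words.push_back(v); }
    };

    struct Msg {
      static constexpr int HEAD_VERSION = 2;    // current encoding
      static constexpr int COMPAT_VERSION = 1;  // oldest decoder supported
      uint32_t map_epoch = 0, min_epoch = 0;

      void encode(Encoder& bl, uint64_t peer_features) const {
        bl.put(map_epoch);
        if (peer_features & FEATURE_LUMINOUS)
          bl.put(min_epoch);                 // v2 field: new peers only
      }

      void decode(const std::vector<uint32_t>& w, int header_version,
                  uint64_t peer_features) {
        size_t i = 0;
        map_epoch = w[i++];
        if (header_version == 1 && !(peer_features & FEATURE_LUMINOUS))
          min_epoch = map_epoch;             // old peer never sent it
        else
          min_epoch = w[i++];
      }
    };

    int main() {
      Msg m;
      m.map_epoch = 40;
      m.min_epoch = 38;
      Encoder bl;
      m.encode(bl, FEATURE_LUMINOUS);
      Msg out;
      out.decode(bl.words, Msg::HEAD_VERSION, FEATURE_LUMINOUS);  // min_epoch == 38
    }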
index fe936388da972a0e5c9895392a360ee74102cda4..b4f835edb30c7698df2ab4046532699601f3a718 100644 (file)
@@ -5,9 +5,10 @@
 #define MOSDRECOVERYDELETEREPLY_H
 
 #include "MOSDFastDispatchOp.h"
+#include "include/ceph_features.h"
 
 struct MOSDPGRecoveryDeleteReply : public MOSDFastDispatchOp {
-  static const int HEAD_VERSION = 1;
+  static const int HEAD_VERSION = 2;
   static const int COMPAT_VERSION = 1;
 
   pg_shard_t from;
@@ -34,7 +35,12 @@ struct MOSDPGRecoveryDeleteReply : public MOSDFastDispatchOp {
     bufferlist::iterator p = payload.begin();
     ::decode(pgid.pgid, p);
     ::decode(map_epoch, p);
-    ::decode(min_epoch, p);
+    if (header.version == 1 &&
+       !HAVE_FEATURE(get_connection()->get_features(), SERVER_LUMINOUS)) {
+      min_epoch = map_epoch;
+    } else {
+      ::decode(min_epoch, p);
+    }
     ::decode(objects, p);
     ::decode(pgid.shard, p);
     ::decode(from, p);
@@ -43,7 +49,9 @@ struct MOSDPGRecoveryDeleteReply : public MOSDFastDispatchOp {
   void encode_payload(uint64_t features) override {
     ::encode(pgid.pgid, payload);
     ::encode(map_epoch, payload);
-    ::encode(min_epoch, payload);
+    if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+      ::encode(min_epoch, payload);
+    }
     ::encode(objects, payload);
     ::encode(pgid.shard, payload);
     ::encode(from, payload);
index 924f5822abe807b4a4594483e72bfb0d833e8954..ec4dad81c2901985a59070ae6916704591164b7d 100644 (file)
@@ -76,7 +76,7 @@ public:
   void set_status(int status, const char* status_name) override {}
   void output_header() override {};
   void output_footer() override {};
-
+  void enable_line_break() override {};
 
   void open_array_section(const char *name) override;
   void open_object_section(const char *name) override;
index f03929102e4753ccad1c864bbe7c5375748763f0..fb6b831de727aef0a9d443342728ac60de90dd1d 100644 (file)
@@ -58,7 +58,9 @@ public:
     dout(10) << "MonCommandCompletion::finish()" << dendl;
     {
       // Scoped so the Gil is released before calling notify_all()
-      Gil gil(pThreadState);
+      // Create new thread state because this is called via the MonClient
+      // Finisher, not the PyModules finisher.
+      Gil gil(pThreadState, true);
 
       auto set_fn = PyObject_GetAttrString(python_completion, "complete");
       assert(set_fn != nullptr);
index 4ba91575f3cc0d25317dc04b8100963f7974fbfa..49935108c442fdd02b4a805c28e494b2a07cdd78 100644 (file)
@@ -60,6 +60,7 @@ struct creating_pgs_t {
     auto last = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool + 1});
     pgs.erase(first, last);
     created_pools.erase(removed_pool);
+    queue.erase(removed_pool);
     return total - pgs.size();
   }
   void encode(bufferlist& bl) const {
index 9d0b50d18cfbada9920b38178b048c4c82921e4b..32a80a7b8b16a84d3d783bd4eb3d83d9f0cc71ec 100644 (file)
@@ -234,6 +234,10 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
       p.second.summary,
       boost::regex("%isorare%"),
       p.second.detail.size() > 1 ? "are" : "is");
+    p.second.summary = boost::regex_replace(
+      p.second.summary,
+      boost::regex("%hasorhave%"),
+      p.second.detail.size() > 1 ? "have" : "has");
   }
   encode_health(new_checks, t);
 }
@@ -662,10 +666,11 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
     if (state == MDSMap::STATE_STOPPED) {
       const auto fscid = pending_fsmap.mds_roles.at(gid);
       auto fs = pending_fsmap.get_filesystem(fscid);
+
       mon->clog->info() << info.human_name() << " finished "
                         << "deactivating rank " << info.rank << " in filesystem "
                         << fs->mds_map.fs_name << " (now has "
-                        << fs->mds_map.get_num_in_mds() << " ranks)";
+                        << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
 
       auto erased = pending_fsmap.stop(gid);
       erased.push_back(gid);
@@ -2012,7 +2017,7 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
 
     mon->clog->info() << new_info.human_name() << " assigned to "
                          "filesystem " << fs->mds_map.fs_name << " as rank "
-                      << mds << " (now has " << fs->mds_map.get_num_in_mds()
+                      << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
                       << " ranks)";
     pending_fsmap.promote(newgid, fs, mds);
     do_propose = true;
index fc4f08d8d70c51ec41bf23a1943f0e79d876410f..3840b642b956cd9269522f8d172f0749690bf229 100644 (file)
@@ -332,7 +332,7 @@ bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
   } else if (pending_map.active_gid == 0) {
     // There is no currently active daemon, select this one.
     if (pending_map.standbys.count(m->get_gid())) {
-      drop_standby(m->get_gid());
+      drop_standby(m->get_gid(), false);
     }
     dout(4) << "selecting new active " << m->get_gid()
            << " " << m->get_name()
@@ -599,7 +599,8 @@ bool MgrMonitor::promote_standby()
     pending_map.available = false;
     pending_map.active_addr = entity_addr_t();
 
-    drop_standby(replacement_gid);
+    drop_standby(replacement_gid, false);
+
     return true;
   } else {
     return false;
@@ -624,10 +625,12 @@ void MgrMonitor::drop_active()
   cancel_timer();
 }
 
-void MgrMonitor::drop_standby(uint64_t gid)
+void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
 {
-  pending_metadata_rm.insert(pending_map.standbys[gid].name);
-  pending_metadata.erase(pending_map.standbys[gid].name);
+  if (drop_meta) {
+    pending_metadata_rm.insert(pending_map.standbys[gid].name);
+    pending_metadata.erase(pending_map.standbys[gid].name);
+  }
   pending_map.standbys.erase(gid);
   if (last_beacon.count(gid) > 0) {
     last_beacon.erase(gid);
index 65451633dbefc0aa85c71bd4d65b50e222548922..563ae7c5de8de937626fdf2be61823809a7e6fb9 100644 (file)
@@ -43,7 +43,16 @@ class MgrMonitor: public PaxosService
    */
   bool promote_standby();
   void drop_active();
-  void drop_standby(uint64_t gid);
+
+  /**
+   * Remove this gid from the list of standbys.  By default,
+   * also remove metadata (i.e. forget the daemon entirely).
+   *
+   * Set `drop_meta` to false if you would like to keep
+   * the daemon's metadata, for example if you're dropping
+   * it as a standby before reinstating it as the active daemon.
+   */
+  void drop_standby(uint64_t gid, bool drop_meta=true);
 
   Context *digest_event = nullptr;
   void cancel_timer();
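The drop_meta flag splits two cases that used to be conflated: expiring a dead standby should forget its metadata, but the prepare_beacon() and promote_standby() call sites pass false because the same gid is about to become the active mgr and its metadata must survive the standby-list removal. An illustrative sketch of the two-phase removal (MgrMapSketch is a stand-in, not the real MgrMap):

    #include <cstdint>
    #include <map>
    #include <string>

    // Sketch: the standby entry always goes away; its metadata only when
    // we are really forgetting the daemon.
    struct MgrMapSketch {
      std::map<uint64_t, std::string> standbys;     // gid -> name
      std::map<std::string, std::string> metadata;  // name -> info

      void drop_standby(uint64_t gid, bool drop_meta = true) {
        if (drop_meta)
          metadata.erase(standbys[gid]);
        standbys.erase(gid);
      }
    };

    int main() {
      MgrMapSketch m;
      m.standbys[42] = "x";
      m.metadata["x"] = "arch=x86_64";
      m.drop_standby(42, /*drop_meta=*/false);  // promoting: metadata survives
    }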
index c1846a5447798e8fc2a3258581dd4b6da8f86270..1dc264f86b6d06671f5634d023146b4d93eda0e3 100644 (file)
@@ -37,6 +37,10 @@ public:
   void encode_pending(MonitorDBStore::TransactionRef t) override;
   version_t get_trim_to() override;
 
+  bool definitely_converted_snapsets() const {
+    return digest.definitely_converted_snapsets();
+  }
+
   bool preprocess_query(MonOpRequestRef op) override;
   bool prepare_update(MonOpRequestRef op) override;
 
index 61d0ccabc773d67c53f83aaba65c7f1e724d8960..946a95756d88b8c98b39ac16c93aa2ee2cd88521 100644 (file)
@@ -981,6 +981,12 @@ COMMAND("osd pool application rm " \
         "name=key,type=CephString",
         "removes application <app> metadata key <key> on pool <poolname>",
         "osd", "rw", "cli,rest")
+COMMAND("osd pool application get " \
+        "name=pool,type=CephPoolname,req=false " \
+        "name=app,type=CephString,req=false " \
+        "name=key,type=CephString,req=false",
+        "get value of key <key> of application <app> on pool <poolname>",
+        "osd", "r", "cli,rest")
 COMMAND("osd utilization",
        "get basic pg distribution stats",
        "osd", "r", "cli,rest")
index a5000efe44e23bcc418cb6f5175f4c62a045ff56..7a43bff2f60256335c77e6248c1cfcb3979e4eb7 100644 (file)
@@ -88,7 +88,7 @@ private:
   MonOpRequest(Message *req, OpTracker *tracker) :
     TrackedOp(tracker,
       req->get_recv_stamp().is_zero() ?
-      req->get_recv_stamp() : ceph_clock_now()),
+      ceph_clock_now() : req->get_recv_stamp()),
     request(req),
     session(NULL),
     con(NULL),
index d98a40967baf7003fd07f4fcdbea7f6917336c8c..4e5914aa7caa447b70d83488668398505b218c52 100644 (file)
@@ -370,6 +370,7 @@ CompatSet Monitor::get_supported_features()
   compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2);
   compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3);
   compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN);
+  compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS);
   return compat;
 }
 
@@ -1060,6 +1061,7 @@ void Monitor::_reset()
   cancel_probe_timeout();
   timecheck_finish();
   health_events_cleanup();
+  health_check_log_times.clear();
   scrub_event_cancel();
 
   leader_since = utime_t();
@@ -2078,6 +2080,13 @@ void Monitor::apply_monmap_to_compatset_features()
     assert(HAVE_FEATURE(quorum_con_features, SERVER_KRAKEN));
     new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN);
   }
+  if (monmap_features.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) {
+    assert(ceph::features::mon::get_persistent().contains_all(
+           ceph::features::mon::FEATURE_LUMINOUS));
+    // this feature should only ever be set if the quorum supports it.
+    assert(HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS));
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS);
+  }
 
   dout(5) << __func__ << dendl;
   _apply_compatset_features(new_features);
@@ -2103,6 +2112,9 @@ void Monitor::calc_quorum_requirements()
   if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_KRAKEN)) {
     required_features |= CEPH_FEATUREMASK_SERVER_KRAKEN;
   }
+  if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS)) {
+    required_features |= CEPH_FEATUREMASK_SERVER_LUMINOUS;
+  }
 
   // monmap
   if (monmap->get_required_features().contains_all(
@@ -2420,14 +2432,7 @@ void Monitor::do_health_to_clog(bool force)
        summary == health_status_cache.summary &&
        level == health_status_cache.overall)
       return;
-    if (level == HEALTH_OK)
-      clog->info() << "overall " << summary;
-    else if (level == HEALTH_WARN)
-      clog->warn() << "overall " << summary;
-    else if (level == HEALTH_ERR)
-      clog->error() << "overall " << summary;
-    else
-      ceph_abort();
+    clog->health(level) << "overall " << summary;
     health_status_cache.summary = summary;
     health_status_cache.overall = level;
   } else {
@@ -2538,31 +2543,60 @@ void Monitor::log_health(
   if (!g_conf->mon_health_to_clog) {
     return;
   }
+
+  const utime_t now = ceph_clock_now();
+
   // FIXME: log atomically as part of @t instead of using clog.
   dout(10) << __func__ << " updated " << updated.checks.size()
           << " previous " << previous.checks.size()
           << dendl;
+  const auto min_log_period = g_conf->get_val<int64_t>(
+      "mon_health_log_update_period");
   for (auto& p : updated.checks) {
     auto q = previous.checks.find(p.first);
+    bool logged = false;
     if (q == previous.checks.end()) {
       // new
       ostringstream ss;
       ss << "Health check failed: " << p.second.summary << " ("
          << p.first << ")";
-      if (p.second.severity == HEALTH_WARN)
-       clog->warn() << ss.str();
-      else
-       clog->error() << ss.str();
+      clog->health(p.second.severity) << ss.str();
+
+      logged = true;
     } else {
       if (p.second.summary != q->second.summary ||
          p.second.severity != q->second.severity) {
-       // summary or severity changed (ignore detail changes at this level)
-       ostringstream ss;
+
+        auto status_iter = health_check_log_times.find(p.first);
+        if (status_iter != health_check_log_times.end()) {
+          if (p.second.severity == q->second.severity &&
+              now - status_iter->second.updated_at < min_log_period) {
+            // We already logged this recently and the severity is unchanged,
+            // so skip emitting an update of the summary string.
+            // We'll get an update out of tick() later if the check
+            // is still failing.
+            continue;
+          }
+        }
+
+        // summary or severity changed (ignore detail changes at this level)
+        ostringstream ss;
         ss << "Health check update: " << p.second.summary << " (" << p.first << ")";
-       if (p.second.severity == HEALTH_WARN)
-         clog->warn() << ss.str();
-       else
-         clog->error() << ss.str();
+        clog->health(p.second.severity) << ss.str();
+
+        logged = true;
+      }
+    }
+    // Record the time at which we last logged, so that we can check this
+    // when considering whether/when to print update messages.
+    if (logged) {
+      auto iter = health_check_log_times.find(p.first);
+      if (iter == health_check_log_times.end()) {
+        health_check_log_times.emplace(p.first, HealthCheckLogStatus(
+          p.second.severity, p.second.summary, now));
+      } else {
+        iter->second = HealthCheckLogStatus(
+          p.second.severity, p.second.summary, now);
       }
     }
   }
@@ -2578,6 +2612,10 @@ void Monitor::log_health(
         clog->info() << "Health check cleared: " << p.first << " (was: "
                      << p.second.summary << ")";
       }
+
+      if (health_check_log_times.count(p.first)) {
+        health_check_log_times.erase(p.first);
+      }
     }
   }
 
@@ -3363,8 +3401,12 @@ void Monitor::handle_command(MonOpRequestRef op)
       tagstr = tagstr.substr(0, tagstr.find_last_of(' '));
     f->dump_string("tag", tagstr);
 
-    list<string> hs;
-    get_health(hs, NULL, f.get());
+    if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+      get_health_status(true, f.get(), nullptr);
+    } else {
+      list<string> health_str;
+      get_health(health_str, nullptr, f.get());
+    }
 
     monmon()->dump_info(f.get());
     osdmon()->dump_info(f.get());
@@ -4399,8 +4441,13 @@ void Monitor::handle_ping(MonOpRequestRef op)
   boost::scoped_ptr<Formatter> f(new JSONFormatter(true));
   f->open_object_section("pong");
 
-  list<string> health_str;
-  get_health(health_str, NULL, f.get());
+  if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+    get_health_status(false, f.get(), nullptr);
+  } else {
+    list<string> health_str;
+    get_health(health_str, nullptr, f.get());
+  }
+
   {
     stringstream ss;
     get_mon_status(f.get(), ss);
@@ -4765,10 +4812,7 @@ void Monitor::handle_timecheck_leader(MonOpRequestRef op)
 
   ostringstream ss;
   health_status_t status = timecheck_status(ss, skew_bound, latency);
-  if (status == HEALTH_ERR)
-    clog->error() << other << " " << ss.str();
-  else if (status == HEALTH_WARN)
-    clog->warn() << other << " " << ss.str();
+  clog->health(status) << other << " " << ss.str();
 
   dout(10) << __func__ << " from " << other << " ts " << m->timestamp
           << " delta " << delta << " skew_bound " << skew_bound
@@ -5429,14 +5473,49 @@ void Monitor::tick()
 {
   // ok go.
   dout(11) << "tick" << dendl;
+  const utime_t now = ceph_clock_now();
   
+  // Check if we need to emit any delayed health check updated messages
+  if (is_leader()) {
+    const auto min_period = g_conf->get_val<int64_t>(
+                              "mon_health_log_update_period");
+    for (auto& svc : paxos_service) {
+      auto health = svc->get_health_checks();
+
+      for (const auto &i : health.checks) {
+        const std::string &code = i.first;
+        const std::string &summary = i.second.summary;
+        const health_status_t severity = i.second.severity;
+
+        auto status_iter = health_check_log_times.find(code);
+        if (status_iter == health_check_log_times.end()) {
+          continue;
+        }
+
+        auto &log_status = status_iter->second;
+        bool const changed = log_status.last_message != summary
+                             || log_status.severity != severity;
+
+        if (changed && now - log_status.updated_at > min_period) {
+          log_status.last_message = summary;
+          log_status.updated_at = now;
+          log_status.severity = severity;
+
+          ostringstream ss;
+          ss << "Health check update: " << summary << " (" << code << ")";
+          clog->health(severity) << ss.str();
+        }
+      }
+    }
+  }
+
+
   for (vector<PaxosService*>::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) {
     (*p)->tick();
     (*p)->maybe_trim();
   }
   
   // trim sessions
-  utime_t now = ceph_clock_now();
   {
     Mutex::Locker l(session_map_lock);
     auto p = session_map.sessions.begin();
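log_health() and the new tick() block together form a per-check rate limiter: a summary-only change arriving within mon_health_log_update_period is suppressed and remembered in health_check_log_times, and tick() emits it later once the period has elapsed; a severity change is always logged immediately. A compact sketch of that keyed throttle, with a steady clock and a hard-coded period standing in for utime_t and the config option:

    #include <chrono>
    #include <map>
    #include <string>

    using Clock = std::chrono::steady_clock;

    struct LogStatus {                 // one record per health-check code
      int severity;
      std::string last_message;
      Clock::time_point updated_at;
    };

    class HealthLogThrottle {
      std::map<std::string, LogStatus> times;
      std::chrono::seconds min_period{5};  // stand-in for the config option

    public:
      // Returns true if this update should be logged now; otherwise the
      // caller is expected to retry from its periodic tick.
      bool should_log(const std::string& code, int severity,
                      const std::string& summary) {
        auto now = Clock::now();
        auto it = times.find(code);
        if (it != times.end() &&
            it->second.severity == severity &&
            now - it->second.updated_at < min_period)
          return false;              // same severity, too soon: defer
        times[code] = {severity, summary, now};
        return true;
      }
    };

    int main() {
      HealthLogThrottle t;
      t.should_log("MDS_SLOW", 1, "1 MDS reports slow requests");   // true
      t.should_log("MDS_SLOW", 1, "2 MDSs report slow requests");   // false: throttled
    }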
index fa4ba9fa8f56f26034c3335b1232037a7ec6911f..fc9772601cb9871f4eb3a39d024bab3616d02a56 100644 (file)
@@ -756,6 +756,25 @@ public:
     const health_check_map_t& previous,
     MonitorDBStore::TransactionRef t);
 
+protected:
+
+  class HealthCheckLogStatus {
+    public:
+    health_status_t severity;
+    std::string last_message;
+    utime_t updated_at = 0;
+    HealthCheckLogStatus(health_status_t severity_,
+                         const std::string &last_message_,
+                         utime_t updated_at_)
+      : severity(severity_),
+        last_message(last_message_),
+        updated_at(updated_at_)
+    {}
+  };
+  std::map<std::string, HealthCheckLogStatus> health_check_log_times;
+
+public:
+
   void get_cluster_status(stringstream &ss, Formatter *f);
 
   void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
@@ -993,6 +1012,7 @@ public:
 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
 #define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
+#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
 // make sure you add your feature to Monitor::get_supported_features
 
 
index 5643261748fe164fc539ecd8b4b19477e76f7e85..46f702f4023e47810a1d12ba7cbf123a14c92720 100644 (file)
@@ -240,7 +240,9 @@ void OSDMonitor::create_initial()
     derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
   } else {
     newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
-    newmap.flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+    newmap.flags |=
+      CEPH_OSDMAP_RECOVERY_DELETES |
+      CEPH_OSDMAP_PURGED_SNAPDIRS;
     newmap.full_ratio = g_conf->mon_osd_full_ratio;
     if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
     newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
@@ -3325,6 +3327,15 @@ void OSDMonitor::tick()
       do_propose = true;
     }
   }
+  if (!osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) &&
+      osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+      mon->mgrstatmon()->is_readable() &&
+      mon->mgrstatmon()->definitely_converted_snapsets()) {
+    dout(1) << __func__ << " all snapsets converted, setting purged_snapdirs"
+           << dendl;
+    add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS);
+    do_propose = true;
+  }
 
   // mark osds down?
   if (check_failures(now))
@@ -5160,6 +5171,87 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       rs << "\n";
       rdata.append(rs.str());
     }
+  } else if (prefix == "osd pool application get") {
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+                                                     "json-pretty"));
+    string pool_name;
+    cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+    string app;
+    cmd_getval(g_ceph_context, cmdmap, "app", app);
+    string key;
+    cmd_getval(g_ceph_context, cmdmap, "key", key);
+
+    if (pool_name.empty()) {
+      // all
+      f->open_object_section("pools");
+      for (const auto &pool : osdmap.pools) {
+        std::string name("<unknown>");
+        const auto &pni = osdmap.pool_name.find(pool.first);
+        if (pni != osdmap.pool_name.end())
+          name = pni->second;
+        f->open_object_section(name.c_str());
+        for (auto &app_pair : pool.second.application_metadata) {
+          f->open_object_section(app_pair.first.c_str());
+          for (auto &kv_pair : app_pair.second) {
+            f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+          }
+          f->close_section();
+        }
+        f->close_section(); // name
+      }
+      f->close_section(); // pools
+      f->flush(rdata);
+    } else {
+      int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+      if (pool < 0) {
+        ss << "unrecognized pool '" << pool_name << "'";
+        r = -ENOENT;
+        goto reply;
+      }
+      auto p = osdmap.get_pg_pool(pool);
+      // filter by pool
+      if (app.empty()) {
+        f->open_object_section(pool_name.c_str());
+        for (auto &app_pair : p->application_metadata) {
+          f->open_object_section(app_pair.first.c_str());
+          for (auto &kv_pair : app_pair.second) {
+            f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+          }
+          f->close_section(); // application
+        }
+        f->close_section(); // pool_name
+        f->flush(rdata);
+        goto reply;
+      }
+
+      auto app_it = p->application_metadata.find(app);
+      if (app_it == p->application_metadata.end()) {
+        ss << "pool '" << pool_name << "' has no application '" << app << "'";
+        r = -ENOENT;
+        goto reply;
+      }
+      // filter by pool + app
+      if (key.empty()) {
+        f->open_object_section(app_it->first.c_str());
+        for (auto &kv_pair : app_it->second) {
+          f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+        }
+        f->close_section(); // application
+        f->flush(rdata);
+        goto reply;
+      }
+      // filter by pool + app + key
+      auto key_it = app_it->second.find(key);
+      if (key_it == app_it->second.end()) {
+        ss << "application '" << app << "' on pool '" << pool_name
+           << "' does not have key '" << key << "'";
+        r = -ENOENT;
+        goto reply;
+      }
+      ss << key_it->second << "\n";
+      rdata.append(ss.str());
+      ss.str("");
+    }
   } else {
     // try prepare update
     return false;
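The handler filters progressively: with no pool it dumps application metadata for every pool, with a pool it dumps that pool's applications, with pool and app the application's key/value pairs, and with all three just the bare value, returning -ENOENT at whichever level the lookup fails. Illustrative invocations (the pool, app, and key names are made up):

    ceph osd pool application get
    ceph osd pool application get mypool
    ceph osd pool application get mypool rbd
    ceph osd pool application get mypool rbd somekey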
@@ -5775,7 +5867,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
       *ss << "crush test failed with " << r << ": " << err.str();
       return r;
     }
-    dout(10) << __func__ << " crush somke test duration: "
+    dout(10) << __func__ << " crush smoke test duration: "
              << duration << dendl;
   }
   unsigned size, min_size;
@@ -7528,16 +7620,11 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     CrushWrapper newcrush;
     _get_pending_crush(newcrush);
-
-    if (!newcrush.class_exists(srcname)) {
-      err = -ENOENT;
-      ss << "class '" << srcname << "' does not exist";
-      goto reply;
-    }
-
-    if (newcrush.class_exists(dstname)) {
-      err = -EEXIST;
-      ss << "class '" << dstname << "' already exists";
+    if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
+      // suppose this is a replay and return success
+      // so command is idempotent
+      ss << "already renamed to '" << dstname << "'";
+      err = 0;
       goto reply;
     }
 
@@ -8546,6 +8633,15 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     CrushWrapper newcrush;
     _get_pending_crush(newcrush);
+    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
+      // srcname does not exist and dstname already exists
+      // suppose this is a replay and return success
+      // (so this command is idempotent)
+      ss << "already renamed to '" << dstname << "'";
+      err = 0;
+      goto reply;
+    }
+
     err = newcrush.rename_rule(srcname, dstname, &ss);
     if (err < 0) {
       // ss has reason for failure
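Both rename paths (crush class above, crush rule here) gain the same replay guard: when the source name is gone and the destination already exists, the command is treated as an already-applied rename and succeeds, so a client retrying after a lost reply no longer sees a spurious ENOENT or EEXIST. The guard in isolation, over a stand-in name set:

    #include <set>
    #include <string>

    // Idempotent rename: a retried/replayed command that already took
    // effect reports success instead of ENOENT/EEXIST.
    int rename_idempotent(std::set<std::string>& names,
                          const std::string& src, const std::string& dst) {
      if (!names.count(src) && names.count(dst))
        return 0;                   // assume a replay: already renamed
      if (!names.count(src))
        return -2;                  // -ENOENT
      if (names.count(dst))
        return -17;                 // -EEXIST
      names.erase(src);
      names.insert(dst);
      return 0;
    }

    int main() {
      std::set<std::string> rules{"new-rule"};
      return rename_idempotent(rules, "old-rule", "new-rule");  // replay: 0
    }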
index b1aa8a5e3b22863f0cce5352af98668bae322b89..fde038fc5806a866c5310e1321b0c0d4cc69fd01 100644 (file)
@@ -835,7 +835,7 @@ void PGMapDigest::dump_object_stat_sum(
     f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
     f->dump_int("bytes_used", sum.num_bytes);
     f->dump_format_unquoted("percent_used", "%.2f", (used*100));
-    f->dump_unsigned("max_avail", avail);
+    f->dump_unsigned("max_avail", avail / raw_used_rate);
     f->dump_int("objects", sum.num_objects);
     if (verbose) {
       f->dump_int("quota_objects", pool->quota_max_objects);
@@ -850,7 +850,7 @@ void PGMapDigest::dump_object_stat_sum(
   } else {
     tbl << stringify(si_t(sum.num_bytes));
     tbl << percentify(used*100);
-    tbl << si_t(avail);
+    tbl << si_t(avail / raw_used_rate);
     tbl << sum.num_objects;
     if (verbose) {
       tbl << stringify(si_t(sum.num_objects_dirty))
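raw_used_rate is the pool's raw-space multiplier, so dividing converts device bytes back to user-visible bytes: previously a size=3 replicated pool with 3 TiB of raw space free reported max_avail as 3 TiB when only 1 TiB of user data would actually fit; an EC k=4,m=2 pool has rate 6/4 = 1.5, so 3 TiB raw now shows as 2 TiB. A one-line check of the arithmetic (values illustrative):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t raw_avail = 3ull << 40;   // 3 TiB of raw device space
      double raw_used_rate = 1.5;        // e.g. EC k=4,m=2: 6 shards / 4 data
      std::printf("max_avail = %llu bytes\n",
                  (unsigned long long)(raw_avail / raw_used_rate));  // 2 TiB
    }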
index 3432f796633cf07c25e91a434c8a5a4d6b1ee705..257a9c75aaf839c1d3488553b52d682c0f4834e9 100644 (file)
@@ -188,6 +188,15 @@ public:
       return 0;
   }
 
+  // kill me post-mimic or -nautilus
+  bool definitely_converted_snapsets() const {
+    // false negative is okay; false positive is not!
+    return
+      num_pg &&
+      num_pg_unknown == 0 &&
+      pg_sum.stats.sum.num_legacy_snapsets == 0;
+  }
+
   // kill me post-luminous:
   virtual float get_fallback_full_ratio() const {
     return .95;
index 2a30e36e953d11ea6f58738ea4e6078278eab85b..1d39d76358c4859e02defeaa0e1dbb0742d0ff85 100644 (file)
@@ -1186,6 +1186,11 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
   assert(!new_log);
   assert(!new_log_writer);
 
+  // create a new log [writer] so that we know compaction is in progress
+  // (see _should_compact_log)
+  new_log = new File;
+  new_log->fnode.ino = 0;   // so that _flush_range won't try to log the fnode
+
   // 1. allocate new log space and jump to it.
   old_log_jump_to = log_file->fnode.get_allocated();
   uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
@@ -1227,9 +1232,7 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
   dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
           << std::dec << dendl;
 
-  // create a new log [writer]
-  new_log = new File;
-  new_log->fnode.ino = 0;   // so that _flush_range won't try to log the fnode
+  // allocate
   int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
                     &new_log->fnode.extents);
   assert(r == 0);
index b87d0302b306a9fc5526e001391e0f67cf9f2b4d..3566005cd01a76132fdbf37d97e6bb10764e80bb 100644 (file)
@@ -2133,7 +2133,7 @@ void BlueStore::ExtentMap::reshard(
             << needs_reshard_end << ")" << std::dec << dendl;
   }
 
-  fault_range(db, needs_reshard_begin, needs_reshard_end);
+  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
 
   // we may need to fault in a larger interval later must have all
   // referring extents for spanning blobs loaded in order to have
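fault_range() takes an (offset, length) pair, so passing needs_reshard_end as the second argument faulted in too much whenever needs_reshard_begin was non-zero: for a reshard window of [0x2000, 0x5000) the correct length is 0x3000, but the old call read 0x5000 bytes starting at 0x2000.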
@@ -3474,6 +3474,7 @@ BlueStore::BlueStore(CephContext *cct, const string& path)
     throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                       cct->_conf->bluestore_throttle_bytes +
                       cct->_conf->bluestore_throttle_deferred_bytes),
+    deferred_finisher(cct, "defered_finisher", "dfin"),
     kv_sync_thread(this),
     kv_finalize_thread(this),
     mempool_thread(this)
@@ -3492,6 +3493,7 @@ BlueStore::BlueStore(CephContext *cct,
     throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                       cct->_conf->bluestore_throttle_bytes +
                       cct->_conf->bluestore_throttle_deferred_bytes),
+    deferred_finisher(cct, "defered_finisher", "dfin"),
     kv_sync_thread(this),
     kv_finalize_thread(this),
     min_alloc_size(_min_alloc_size),
@@ -3538,6 +3540,8 @@ const char **BlueStore::get_tracked_conf_keys() const
     "bluestore_compression_required_ratio",
     "bluestore_max_alloc_size",
     "bluestore_prefer_deferred_size",
+    "bluestore_prefer_deferred_size_hdd",
+    "bluestore_prefer_deferred_size_ssd",
     "bluestore_deferred_batch_ops",
     "bluestore_deferred_batch_ops_hdd",
     "bluestore_deferred_batch_ops_ssd",
@@ -3577,6 +3581,8 @@ void BlueStore::handle_conf_change(const struct md_config_t *conf,
     }
   }
   if (changed.count("bluestore_prefer_deferred_size") ||
+      changed.count("bluestore_prefer_deferred_size_hdd") ||
+      changed.count("bluestore_prefer_deferred_size_ssd") ||
       changed.count("bluestore_max_alloc_size") ||
       changed.count("bluestore_deferred_batch_ops") ||
       changed.count("bluestore_deferred_batch_ops_hdd") ||
@@ -8292,6 +8298,7 @@ void BlueStore::_kv_start()
     finishers.push_back(f);
   }
 
+  deferred_finisher.start();
   for (auto f : finishers) {
     f->start();
   }
@@ -8329,6 +8336,8 @@ void BlueStore::_kv_stop()
     kv_finalize_stop = false;
   }
   dout(10) << __func__ << " stopping finishers" << dendl;
+  deferred_finisher.wait_for_empty();
+  deferred_finisher.stop();
   for (auto f : finishers) {
     f->wait_for_empty();
     f->stop();
@@ -8696,9 +8705,16 @@ void BlueStore::deferred_try_submit()
     osrs.push_back(&osr);
   }
   for (auto& osr : osrs) {
-    if (osr->deferred_pending && !osr->deferred_running) {
-      _deferred_submit_unlock(osr.get());
-      deferred_lock.lock();
+    if (osr->deferred_pending) {
+      if (!osr->deferred_running) {
+       _deferred_submit_unlock(osr.get());
+       deferred_lock.lock();
+      } else {
+       dout(20) << __func__ << "  osr " << osr << " already has running"
+                << dendl;
+      }
+    } else {
+      dout(20) << __func__ << "  osr " << osr << " has no pending" << dendl;
     }
   }
 }
@@ -8752,8 +8768,6 @@ void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
     ++i;
   }
 
-  // demote to deferred_submit_lock, then drop that too
-  std::lock_guard<std::mutex> l(deferred_submit_lock);
   deferred_lock.unlock();
   bdev->aio_submit(&b->ioc);
 }
@@ -8769,13 +8783,16 @@ void BlueStore::_deferred_aio_finish(OpSequencer *osr)
     assert(osr->deferred_running == b);
     osr->deferred_running = nullptr;
     if (!osr->deferred_pending) {
+      dout(20) << __func__ << " dequeueing" << dendl;
       auto q = deferred_queue.iterator_to(*osr);
       deferred_queue.erase(q);
     } else if (deferred_aggressive) {
       dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
-      finishers[0]->queue(new FunctionContext([&](int) {
+      deferred_finisher.queue(new FunctionContext([&](int) {
            deferred_try_submit();
          }));
+    } else {
+      dout(20) << __func__ << " leaving queued, more pending" << dendl;
     }
   }
 
@@ -9981,12 +9998,20 @@ int BlueStore::_do_alloc_write(
       if ((suggested_boff % (1 << csum_order)) == 0 &&
            suggested_boff + final_length <= max_bsize &&
            suggested_boff > b_off) {
-        dout(20) << __func__ << " forcing blob_offset to "
+        dout(20) << __func__ << " forcing blob_offset to 0x"
                  << std::hex << suggested_boff << std::dec << dendl;
         assert(suggested_boff >= b_off);
         csum_length += suggested_boff - b_off;
         b_off = suggested_boff;
       }
+      if (csum != Checksummer::CSUM_NONE) {
+        dout(20) << __func__ << " initialize csum setting for new blob " << *b
+                 << " csum_type " << Checksummer::get_csum_type_string(csum)
+                 << " csum_order " << csum_order
+                 << " csum_length 0x" << std::hex << csum_length << std::dec
+                 << dendl;
+        dblob.init_csum(csum, csum_order, csum_length);
+      }
     }
 
     AllocExtentVector extents;
@@ -10004,18 +10029,11 @@ int BlueStore::_do_alloc_write(
     }
     dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
 
-    dout(20) << __func__ << " blob " << *b
-            << " csum_type " << Checksummer::get_csum_type_string(csum)
-            << " csum_order " << csum_order
-            << " csum_length 0x" << std::hex << csum_length << std::dec
-            << dendl;
-
-    if (csum != Checksummer::CSUM_NONE) {
-      if (!dblob.has_csum()) {
-       dblob.init_csum(csum, csum_order, csum_length);
-      }
+    dout(20) << __func__ << " blob " << *b << dendl;
+    if (dblob.has_csum()) {
       dblob.calc_csum(b_off, *l);
     }
+
     if (wi.mark_unused) {
       auto b_end = b_off + wi.bl.length();
       if (b_off) {
index 8c5eb4a02b4e5de2b3c7a3f00d49f2a4633194eb..cf89f243895db7caf556c64670b03049feafc942 100644 (file)
@@ -1841,11 +1841,12 @@ private:
   interval_set<uint64_t> bluefs_extents;  ///< block extents owned by bluefs
   interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
 
-  std::mutex deferred_lock, deferred_submit_lock;
+  std::mutex deferred_lock;
   std::atomic<uint64_t> deferred_seq = {0};
   deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
   int deferred_queue_size = 0;         ///< num txc's queued across all osrs
   atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
+  Finisher deferred_finisher;
 
   int m_finisher_num = 1;
   vector<Finisher*> finishers;
index a64df0458f61fb9251477bd1e4631318748c2243..5720432b648ba3688cb63c70c70968c511e3d92c 100644 (file)
@@ -71,8 +71,9 @@ void StupidAllocator::unreserve(uint64_t unused)
 }
 
 /// return the effective length of the extent if we align to alloc_unit
-static uint64_t aligned_len(btree_interval_set<uint64_t>::iterator p,
-                           uint64_t alloc_unit)
+uint64_t StupidAllocator::_aligned_len(
+  btree_interval_set<uint64_t,allocator>::iterator p,
+  uint64_t alloc_unit)
 {
   uint64_t skew = p.get_start() % alloc_unit;
   if (skew)
@@ -106,7 +107,7 @@ int64_t StupidAllocator::allocate_int(
     for (bin = orig_bin; bin < (int)free.size(); ++bin) {
       p = free[bin].lower_bound(hint);
       while (p != free[bin].end()) {
-       if (aligned_len(p, alloc_unit) >= want_size) {
+       if (_aligned_len(p, alloc_unit) >= want_size) {
          goto found;
        }
        ++p;
@@ -119,7 +120,7 @@ int64_t StupidAllocator::allocate_int(
     p = free[bin].begin();
     auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
     while (p != end) {
-      if (aligned_len(p, alloc_unit) >= want_size) {
+      if (_aligned_len(p, alloc_unit) >= want_size) {
        goto found;
       }
       ++p;
@@ -131,7 +132,7 @@ int64_t StupidAllocator::allocate_int(
     for (bin = orig_bin; bin >= 0; --bin) {
       p = free[bin].lower_bound(hint);
       while (p != free[bin].end()) {
-       if (aligned_len(p, alloc_unit) >= alloc_unit) {
+       if (_aligned_len(p, alloc_unit) >= alloc_unit) {
          goto found;
        }
        ++p;
@@ -144,7 +145,7 @@ int64_t StupidAllocator::allocate_int(
     p = free[bin].begin();
     auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
     while (p != end) {
-      if (aligned_len(p, alloc_unit) >= alloc_unit) {
+      if (_aligned_len(p, alloc_unit) >= alloc_unit) {
        goto found;
       }
       ++p;
@@ -284,10 +285,10 @@ void StupidAllocator::init_rm_free(uint64_t offset, uint64_t length)
   std::lock_guard<std::mutex> l(lock);
   dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
-  btree_interval_set<uint64_t> rm;
+  btree_interval_set<uint64_t,allocator> rm;
   rm.insert(offset, length);
   for (unsigned i = 0; i < free.size() && !rm.empty(); ++i) {
-    btree_interval_set<uint64_t> overlap;
+    btree_interval_set<uint64_t,allocator> overlap;
     overlap.intersection_of(rm, free[i]);
     if (!overlap.empty()) {
       dout(20) << __func__ << " bin " << i << " rm 0x" << std::hex << overlap
index 445e8a6bc8a1c4e3b32d4d4211d36726bd807e69..431c636a61022a82eca8b88143ff12d39e4e53ca 100644 (file)
@@ -9,6 +9,7 @@
 #include "Allocator.h"
 #include "include/btree_interval_set.h"
 #include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
 
 class StupidAllocator : public Allocator {
   CephContext* cct;
@@ -17,13 +18,19 @@ class StupidAllocator : public Allocator {
   int64_t num_free;     ///< total bytes in freelist
   int64_t num_reserved; ///< reserved bytes
 
-  std::vector<btree_interval_set<uint64_t> > free;        ///< leading-edge copy
+  typedef mempool::bluestore_alloc::pool_allocator<
+    pair<const uint64_t,uint64_t>> allocator;
+  std::vector<btree_interval_set<uint64_t,allocator>> free;  ///< leading-edge copy
 
   uint64_t last_alloc;
 
   unsigned _choose_bin(uint64_t len);
   void _insert_free(uint64_t offset, uint64_t len);
 
+  uint64_t _aligned_len(
+    btree_interval_set<uint64_t,allocator>::iterator p,
+    uint64_t alloc_unit);
+
 public:
   StupidAllocator(CephContext* cct);
   ~StupidAllocator() override;
index cfe0c5cf8c899f0fd71a7a46f6ef2c2a77711098..4996e73452b119a8c9c50085d5c858e71534ece5 100644 (file)
@@ -39,15 +39,16 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
 
   aio_iter cur = begin;
   struct iocb *piocb[aios_size];
-  int r, pos = 0;
+  int left = 0;
   while (cur != end) {
     cur->priv = priv;
-    *(piocb+pos) = &cur->iocb;
-    ++pos;
+    *(piocb+left) = &cur->iocb;
+    ++left;
     ++cur;
   }
-  while (true) {
-    r = io_submit(ctx, pos, piocb);
+  int done = 0;
+  while (left > 0) {
+    int r = io_submit(ctx, left, piocb + done);
     if (r < 0) {
       if (r == -EAGAIN && attempts-- > 0) {
        usleep(delay);
@@ -55,10 +56,13 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
        (*retries)++;
        continue;
       }
+      return r;
     }
-    break;
+    assert(r > 0);
+    done += r;
+    left -= r;
   }
-  return r;
+  return done;
 }
 
 int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
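io_submit(2) may queue only a prefix of the iocb array, and the old loop treated any non-negative return as success, handing callers the size of the last batch rather than the total. The rewrite advances done/left until everything is queued and returns early on real errors. The same resubmission shape, runnable against a mock submit function (no libaio needed):

    #include <cassert>
    #include <cstdio>

    // Mock of io_submit(): accepts at most 2 entries per call.
    static int mock_submit(int nr, int* /*iocbs*/) { return nr > 2 ? 2 : nr; }

    // Submit-everything loop in the style of the fixed submit_batch().
    int submit_all(int* iocbs, int total) {
      int done = 0, left = total;
      while (left > 0) {
        int r = mock_submit(left, iocbs + done);
        if (r < 0)
          return r;      // the real code retries -EAGAIN a few times first
        assert(r > 0);
        done += r;
        left -= r;
      }
      return done;       // total submitted, not just the last partial batch
    }

    int main() {
      int iocbs[5] = {0};
      std::printf("submitted %d\n", submit_all(iocbs, 5));  // submitted 5
    }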
index 659f3e5bcc06823b35fa866b40c435988867829e..9fb7ce84b626f00bcc28fc5e9074600eddebf8f2 100644 (file)
@@ -674,7 +674,7 @@ ostream& operator<<(ostream& out, const bluestore_blob_t& o)
   if (o.flags) {
     out << " " << o.get_flags_string();
   }
-  if (o.csum_type) {
+  if (o.has_csum()) {
     out << " " << Checksummer::get_csum_type_string(o.csum_type)
        << "/0x" << std::hex << (1ull << o.csum_chunk_order) << std::dec;
   }
index 28608f1250a9cdad07cd584d1f0f8390972c0841..426da2440089115065c5c2e191580b1137636ee0 100644 (file)
@@ -585,6 +585,11 @@ void OSDService::activate_map()
   agent_lock.Unlock();
 }
 
+void OSDService::request_osdmap_update(epoch_t e)
+{
+  osd->osdmap_subscribe(e, false);
+}
+
 class AgentTimeoutCB : public Context {
   PGRef pg;
 public:
@@ -1913,6 +1918,7 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
   disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
   command_tp(cct, "OSD::command_tp", "tp_osd_cmd",  1),
   session_waiting_lock("OSD::session_waiting_lock"),
+  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
   heartbeat_lock("OSD::heartbeat_lock"),
   heartbeat_stop(false),
   heartbeat_need_update(true),
@@ -4441,17 +4447,21 @@ void OSD::build_initial_pg_history(
       &debug);
     if (new_interval) {
       h->same_interval_since = e;
-    }
-    if (up != new_up) {
-      h->same_up_since = e;
-    }
-    if (acting_primary != new_acting_primary) {
-      h->same_primary_since = e;
-    }
-    if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
-                          osdmap->get_pg_num(pgid.pgid.pool()),
-                          nullptr)) {
-      h->last_epoch_split = e;
+      if (up != new_up) {
+        h->same_up_since = e;
+      }
+      if (acting_primary != new_acting_primary) {
+        h->same_primary_since = e;
+      }
+      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
+                             osdmap->get_pg_num(pgid.pgid.pool()),
+                             nullptr)) {
+        h->last_epoch_split = e;
+      }
+      up = new_up;
+      acting = new_acting;
+      up_primary = new_up_primary;
+      acting_primary = new_acting_primary;
     }
     lastmap = osdmap;
   }
@@ -5785,6 +5795,9 @@ void OSD::start_waiting_for_healthy()
   dout(1) << "start_waiting_for_healthy" << dendl;
   set_state(STATE_WAITING_FOR_HEALTHY);
   last_heartbeat_resample = utime_t();
+
+  // subscribe to osdmap updates, in case our peers really are known to be dead
+  osdmap_subscribe(osdmap->get_epoch() + 1, false);
 }
 
 bool OSD::_is_healthy()
@@ -7482,10 +7495,12 @@ struct C_OnMapApply : public Context {
 
 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
 {
-  OSDMapRef osdmap = service.get_osdmap();
-  if (osdmap->get_epoch() >= epoch)
+  Mutex::Locker l(osdmap_subscribe_lock);
+  if (latest_subscribed_epoch >= epoch && !force_request)
     return;
 
+  latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
+
   if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
       force_request) {
     monc->renew_subs();
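Gating on latest_subscribed_epoch under its own mutex, instead of on the currently applied map epoch, stops concurrent callers from re-requesting epochs whose subscription is already in flight but not yet applied. The dedup idiom in isolation:

    #include <algorithm>
    #include <cstdint>
    #include <mutex>

    class SubscribeDedup {
      std::mutex lock;
      uint64_t latest_subscribed = 0;

    public:
      // Returns true when a new subscribe request should actually be sent.
      bool want(uint64_t epoch, bool force) {
        std::lock_guard<std::mutex> l(lock);
        if (latest_subscribed >= epoch && !force)
          return false;            // already asked for this epoch (or newer)
        latest_subscribed = std::max(epoch, latest_subscribed);
        return true;
      }
    };

    int main() {
      SubscribeDedup d;
      d.want(10, false);   // true: first request
      d.want(9, false);    // false: epoch 10 already requested
    }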
index 88e84710ff531bf4f9eee2a59d4df2762c469a97..42a152b3c9adc6aca9fc27ef89cab5ee20edde84 100644 (file)
@@ -1144,6 +1144,8 @@ public:
     return ret;
   }
 
+  void request_osdmap_update(epoch_t e);
+
   // -- stopping --
   Mutex is_stopping_lock;
   Cond is_stopping_cond;
@@ -1455,6 +1457,9 @@ private:
   void osdmap_subscribe(version_t epoch, bool force_request);
   /** @} monc helpers */
 
+  Mutex osdmap_subscribe_lock;
+  epoch_t latest_subscribed_epoch{0};
+
   // -- heartbeat --
   /// information about a heartbeat peer
   struct HeartbeatInfo {
index 99a768897c142ec97f1182ee143752c12e35b172..5d7eb423880c6958ca9791268dd85d9c7f299405 100644 (file)
@@ -1939,17 +1939,28 @@ void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) co
 
   auto q = pg_upmap_items.find(pg);
   if (q != pg_upmap_items.end()) {
-    for (auto& i : *raw) {
-      for (auto& r : q->second) {
-        if (r.first != i) {
-          continue;
-        }
-        if (!(r.second != CRUSH_ITEM_NONE &&
-              r.second < max_osd &&
-              osd_weight[r.second] == 0)) {
-          i = r.second;
-        }
-        break;
+    // NOTE: this approach does not allow a bidirectional swap,
+    // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+    for (auto& r : q->second) {
+      // make sure the replacement value doesn't already appear
+      bool exists = false;
+      ssize_t pos = -1;
+      for (unsigned i = 0; i < raw->size(); ++i) {
+       int osd = (*raw)[i];
+       if (osd == r.second) {
+         exists = true;
+         break;
+       }
+       // ignore mapping if target is marked out (or invalid osd id)
+       if (osd == r.first &&
+           pos < 0 &&
+           !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
+             osd_weight[r.second] == 0)) {
+         pos = i;
+       }
+      }
+      if (!exists && pos >= 0) {
+       (*raw)[pos] = r.second;
       }
     }
   }
@@ -2947,6 +2958,8 @@ string OSDMap::get_flag_string(unsigned f)
     s += ",require_luminous_osds";
   if (f & CEPH_OSDMAP_RECOVERY_DELETES)
     s += ",recovery_deletes";
+  if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
+    s += ",purged_snapdirs";
   if (s.length())
     s.erase(0, 1);
   return s;
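
The new _apply_upmap() loop above is easiest to see with the example from its NOTE. A self-contained sketch of the same rule (the marked-out/invalid-OSD test is elided):

    #include <utility>
    #include <vector>

    // Each (from, to) pair rewrites at most one slot, and only when
    // 'to' is not already present in the mapping.
    void apply_upmap_items(const std::vector<std::pair<int, int>>& items,
                           std::vector<int>* raw) {
      for (const auto& r : items) {
        bool exists = false;  // does r.second already appear?
        int pos = -1;         // first slot holding r.first
        for (int i = 0; i < (int)raw->size(); ++i) {
          int osd = (*raw)[i];
          if (osd == r.second) { exists = true; break; }
          if (osd == r.first && pos < 0) pos = i;
        }
        if (!exists && pos >= 0)
          (*raw)[pos] = r.second;
      }
    }

With items [[1,2],[2,1]] and raw [0,1,2], neither pair applies, because each replacement OSD is already mapped; the set stays [0,1,2] rather than swapping to [0,2,1].
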
index 5849c64bb070f3fa28fc8d5d12d92ba26c535f6b..da4da001e0404e2b8daca753cdafc320359e303b 100644 (file)
@@ -5700,12 +5700,21 @@ bool PG::can_discard_replica_op(OpRequestRef& op)
   const T *m = static_cast<const T *>(op->get_req());
   assert(m->get_type() == MSGTYPE);
 
+  int from = m->get_source().num();
+
+  // if a repop reply arrives after a replica goes down in a new osdmap, but
+  // before the pg advances to that new osdmap, the reply cannot be discarded
+  // by the down-at check below, because the primary resets the connection
+  // to that replica when handling the new osdmap marking it down, and also
+  // resets the messenger session when the replica reconnects. to avoid such
+  // out-of-order replies, messages from a down replica are discarded here.
+  if (osd->get_osdmap()->is_down(from))
+    return true;
   /* Mostly, this overlaps with the old_peering_msg
    * condition.  An important exception is pushes
    * sent by replicas not in the acting set, since
    * if such a replica goes down it does not cause
    * a new interval. */
-  int from = m->get_source().num();
   if (get_osdmap()->get_down_at(from) >= m->map_epoch)
     return true;
 
@@ -5959,10 +5968,8 @@ void PG::update_store_on_load()
     // legacy filestore didn't store collection bit width; fix.
     int bits = osd->store->collection_bits(coll);
     if (bits < 0) {
-      if (coll.is_meta())
-       bits = 0;
-      else
-       bits = info.pgid.get_split_bits(pool.info.get_pg_num());
+      assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
+      bits = info.pgid.get_split_bits(pool.info.get_pg_num());
       lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
       ObjectStore::Transaction t;
       t.collection_set_bits(coll, bits);
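
The new early check in can_discard_replica_op() drops any message whose sender is down in the primary's current map, before the older down-at-epoch test runs. A sketch with hypothetical stand-in types (Msg and Map are not the real Ceph classes):

    #include <map>
    #include <set>

    struct Msg { int from; unsigned map_epoch; };
    struct Map {
      std::set<int> down;               // OSDs down in the current map
      std::map<int, unsigned> down_at;  // epoch each OSD was marked down
      bool is_down(int osd) const { return down.count(osd) > 0; }
      unsigned get_down_at(int osd) const {
        auto i = down_at.find(osd);
        return i == down_at.end() ? 0 : i->second;
      }
    };

    bool can_discard_replica_op(const Msg& m, const Map& map) {
      if (map.is_down(m.from))
        return true;  // connection was reset; replies could be reordered
      // older check: sender went down at or after the message's epoch
      return map.get_down_at(m.from) >= m.map_epoch;
    }
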
index 7b086eb30a08999f2aae2ed47e0058c414abdafe..96f49fd9d85011330de2cd036f9397c06b250727 100644 (file)
@@ -48,7 +48,7 @@ void PGLog::IndexedLog::trim(
   eversion_t s,
   set<eversion_t> *trimmed,
   set<string>* trimmed_dups,
-  bool* dirty_dups)
+  eversion_t *write_from_dups)
 {
   if (complete_to != log.end() &&
       complete_to->version <= s) {
@@ -75,8 +75,12 @@ void PGLog::IndexedLog::trim(
     unindex(e);         // remove from index,
 
     // add to dup list
+    generic_dout(20) << "earliest_dup_version = " << earliest_dup_version << dendl;
     if (e.version.version >= earliest_dup_version) {
-      if (dirty_dups) *dirty_dups = true;
+      if (write_from_dups != nullptr && *write_from_dups > e.version) {
+       generic_dout(20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
+       *write_from_dups = e.version;
+      }
       dups.push_back(pg_log_dup_t(e));
       index(dups.back());
       for (const auto& extra : e.extra_reqids) {
@@ -166,7 +170,7 @@ void PGLog::trim(
     assert(trim_to <= info.last_complete);
 
     dout(10) << "trim " << log << " to " << trim_to << dendl;
-    log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups);
+    log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
     info.log_tail = log.tail;
   }
 }
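
trim() now reports the lowest version it moved into the dup list via write_from_dups, so the next flush can write just the newly added dup keys instead of setting a whole-list dirty_dups flag. A stand-in sketch with plain integer versions:

    #include <limits>
    #include <vector>

    using version = unsigned long long;
    constexpr version kMax = std::numeric_limits<version>::max();

    // Record the lowest trimmed version that became a dup; the flush
    // then (re)writes only dup keys >= this value.
    version track_write_from_dups(const std::vector<version>& trimmed,
                                  version earliest_dup_version) {
      version write_from_dups = kMax;
      for (version v : trimmed) {
        if (v >= earliest_dup_version && v < write_from_dups)
          write_from_dups = v;
      }
      return write_from_dups;
    }
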
@@ -445,7 +449,6 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
 
   // now handle dups
   if (merge_log_dups(olog)) {
-    dirty_dups = true;
     changed = true;
   }
 
@@ -469,6 +472,8 @@ bool PGLog::merge_log_dups(const pg_log_t& olog) {
        olog.dups.front().version << " to " <<
        olog.dups.back().version << dendl;
       changed = true;
+      dirty_from_dups = eversion_t();
+      dirty_to_dups = eversion_t::max();
       // since our log.dups is empty just copy them
       for (const auto& i : olog.dups) {
        log.dups.push_back(i);
@@ -486,9 +491,11 @@ bool PGLog::merge_log_dups(const pg_log_t& olog) {
        auto log_tail_version = log.dups.back().version;
 
        auto insert_cursor = log.dups.end();
+       eversion_t last_shared = eversion_t::max();
        for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
          if (i->version <= log_tail_version) break;
          log.dups.insert(insert_cursor, *i);
+         last_shared = i->version;
 
          auto prev = insert_cursor;
          --prev;
@@ -497,6 +504,7 @@ bool PGLog::merge_log_dups(const pg_log_t& olog) {
 
          --insert_cursor; // make sure we insert in reverse order
        }
+       mark_dirty_from_dups(last_shared);
       }
 
       if (olog.dups.front().version < log.dups.front().version) {
@@ -505,15 +513,18 @@ bool PGLog::merge_log_dups(const pg_log_t& olog) {
          olog.dups.front().version << dendl;
        changed = true;
 
+       eversion_t last;
        auto insert_cursor = log.dups.begin();
        for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
          if (i->version >= insert_cursor->version) break;
          log.dups.insert(insert_cursor, *i);
+         last = i->version;
          auto prev = insert_cursor;
          --prev;
          // be sure to pass address of copy in log.dups
          log.index(*prev);
        }
+       mark_dirty_to_dups(last);
       }
     }
   }
@@ -526,6 +537,7 @@ bool PGLog::merge_log_dups(const pg_log_t& olog) {
 
     while (!log.dups.empty() && log.dups.back().version >= log.tail) {
       log.unindex(log.dups.back());
+      mark_dirty_from_dups(log.dups.back().version);
       log.dups.pop_back();
     }
   }
@@ -587,7 +599,9 @@ void PGLog::write_log_and_missing(
       !touched_log,
       require_rollback,
       clear_divergent_priors,
-      dirty_dups,
+      dirty_to_dups,
+      dirty_from_dups,
+      write_from_dups,
       &rebuilt_missing_with_deletes,
       (pg_log_debug ? &log_keys_debug : nullptr));
     undirty();
@@ -603,15 +617,16 @@ void PGLog::write_log_and_missing_wo_missing(
     pg_log_t &log,
     const coll_t& coll, const ghobject_t &log_oid,
     map<eversion_t, hobject_t> &divergent_priors,
-    bool require_rollback,
-    bool dirty_dups)
+    bool require_rollback
+    )
 {
   _write_log_and_missing_wo_missing(
     t, km, log, coll, log_oid,
     divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
     set<eversion_t>(),
     set<string>(),
-    true, true, require_rollback, dirty_dups, nullptr);
+    true, true, require_rollback,
+    eversion_t::max(), eversion_t(), eversion_t(), nullptr);
 }
 
 // static
@@ -623,7 +638,6 @@ void PGLog::write_log_and_missing(
     const ghobject_t &log_oid,
     const pg_missing_tracker_t &missing,
     bool require_rollback,
-    bool dirty_dups,
     bool *rebuilt_missing_with_deletes)
 {
   _write_log_and_missing(
@@ -634,7 +648,11 @@ void PGLog::write_log_and_missing(
     set<eversion_t>(),
     set<string>(),
     missing,
-    true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr);
+    true, require_rollback, false,
+    eversion_t::max(),
+    eversion_t(),
+    eversion_t(),
+    rebuilt_missing_with_deletes, nullptr);
 }
 
 // static
@@ -652,7 +670,9 @@ void PGLog::_write_log_and_missing_wo_missing(
   bool dirty_divergent_priors,
   bool touch_log,
   bool require_rollback,
-  bool dirty_dups,
+  eversion_t dirty_to_dups,
+  eversion_t dirty_from_dups,
+  eversion_t write_from_dups,
   set<string> *log_keys_debug
   )
 {
@@ -713,18 +733,40 @@ void PGLog::_write_log_and_missing_wo_missing(
     }
   }
 
-  // process dirty_dups after log_keys_debug is filled, so dups do not
+  // process dups after log_keys_debug is filled, so dups do not
   // end up in that set
-  if (dirty_dups) {
-    pg_log_dup_t min;
+  if (dirty_to_dups != eversion_t()) {
+    pg_log_dup_t min, dirty_to_dup;
+    dirty_to_dup.version = dirty_to_dups;
     t.omap_rmkeyrange(
       coll, log_oid,
-      min.get_key_name(), log.dups.begin()->get_key_name());
-    for (const auto& entry : log.dups) {
-      bufferlist bl;
-      ::encode(entry, bl);
-      (*km)[entry.get_key_name()].claim(bl);
-    }
+      min.get_key_name(), dirty_to_dup.get_key_name());
+  }
+  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+    pg_log_dup_t max, dirty_from_dup;
+    max.version = eversion_t::max();
+    dirty_from_dup.version = dirty_from_dups;
+    t.omap_rmkeyrange(
+      coll, log_oid,
+      dirty_from_dup.get_key_name(), max.get_key_name());
+  }
+
+  for (const auto& entry : log.dups) {
+    if (entry.version > dirty_to_dups)
+      break;
+    bufferlist bl;
+    ::encode(entry, bl);
+    (*km)[entry.get_key_name()].claim(bl);
+  }
+
+  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
+       p != log.dups.rend() &&
+        (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+        p->version >= dirty_to_dups;
+       ++p) {
+    bufferlist bl;
+    ::encode(*p, bl);
+    (*km)[p->get_key_name()].claim(bl);
   }
 
   if (dirty_divergent_priors) {
@@ -759,7 +801,9 @@ void PGLog::_write_log_and_missing(
   bool touch_log,
   bool require_rollback,
   bool clear_divergent_priors,
-  bool dirty_dups,
+  eversion_t dirty_to_dups,
+  eversion_t dirty_from_dups,
+  eversion_t write_from_dups,
   bool *rebuilt_missing_with_deletes, // in/out param
   set<string> *log_keys_debug
   ) {
@@ -819,18 +863,40 @@ void PGLog::_write_log_and_missing(
     }
   }
 
-  // process dirty_dups after log_keys_debug is filled, so dups do not
+  // process dups after log_keys_debug is filled, so dups do not
   // end up in that set
-  if (dirty_dups) {
-    pg_log_dup_t min;
+  if (dirty_to_dups != eversion_t()) {
+    pg_log_dup_t min, dirty_to_dup;
+    dirty_to_dup.version = dirty_to_dups;
     t.omap_rmkeyrange(
       coll, log_oid,
-      min.get_key_name(), log.dups.begin()->get_key_name());
-    for (const auto& entry : log.dups) {
-      bufferlist bl;
-      ::encode(entry, bl);
-      (*km)[entry.get_key_name()].claim(bl);
-    }
+      min.get_key_name(), dirty_to_dup.get_key_name());
+  }
+  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+    pg_log_dup_t max, dirty_from_dup;
+    max.version = eversion_t::max();
+    dirty_from_dup.version = dirty_from_dups;
+    t.omap_rmkeyrange(
+      coll, log_oid,
+      dirty_from_dup.get_key_name(), max.get_key_name());
+  }
+
+  for (const auto& entry : log.dups) {
+    if (entry.version > dirty_to_dups)
+      break;
+    bufferlist bl;
+    ::encode(entry, bl);
+    (*km)[entry.get_key_name()].claim(bl);
+  }
+
+  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
+       p != log.dups.rend() &&
+        (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+        p->version >= dirty_to_dups;
+       ++p) {
+    bufferlist bl;
+    ::encode(*p, bl);
+    (*km)[p->get_key_name()].claim(bl);
   }
 
   if (clear_divergent_priors) {
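
Both copies of the flush above follow the same contract: dup keys are cleared on [min, dirty_to_dups] and [dirty_from_dups, max], then rewritten from the front up to dirty_to_dups and from the back down to min(dirty_from_dups, write_from_dups); keys from write_from_dups upward are new entries added by trim and need no prior remove. A reduced sketch of which versions must be written, with versions as plain integers:

    #include <algorithm>
    #include <cstdint>

    struct DupIntervals {
      uint64_t dirty_to_dups;    // rewrite every dup <= this
      uint64_t dirty_from_dups;  // clear and rewrite every dup >= this
      uint64_t write_from_dups;  // also write (new) dups >= this

      bool must_write(uint64_t v) const {
        return v <= dirty_to_dups ||
               v >= std::min(dirty_from_dups, write_from_dups);
      }
    };
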
index 23b53b78f3877ac03e34baad284c321d5e40b708..5e7d10b2e409948b8afb672920005c852683a2fb 100644 (file)
@@ -538,7 +538,7 @@ public:
       eversion_t s,
       set<eversion_t> *trimmed,
       set<string>* trimmed_dups,
-      bool* dirty_dups);
+      eversion_t *write_from_dups);
 
     ostream& print(ostream& out) const;
   }; // IndexedLog
@@ -554,13 +554,15 @@ protected:
   eversion_t dirty_from;       ///< must clear/writeout all keys >= dirty_from
   eversion_t writeout_from;    ///< must writout keys >= writeout_from
   set<eversion_t> trimmed;     ///< must clear keys in trimmed
+  eversion_t dirty_to_dups;    ///< must clear/writeout all dups <= dirty_to_dups
+  eversion_t dirty_from_dups;  ///< must clear/writeout all dups >= dirty_from_dups
+  eversion_t write_from_dups;  ///< must write keys >= write_from_dups
   set<string> trimmed_dups;    ///< must clear keys in trimmed_dups
   CephContext *cct;
   bool pg_log_debug;
   /// Log is clean on [dirty_to, dirty_from)
   bool touched_log;
   bool clear_divergent_priors;
-  bool dirty_dups; /// log.dups is updated
   bool rebuilt_missing_with_deletes = false;
 
   void mark_dirty_to(eversion_t to) {
@@ -575,6 +577,14 @@ protected:
     if (from < writeout_from)
       writeout_from = from;
   }
+  void mark_dirty_to_dups(eversion_t to) {
+    if (to > dirty_to_dups)
+      dirty_to_dups = to;
+  }
+  void mark_dirty_from_dups(eversion_t from) {
+    if (from < dirty_from_dups)
+      dirty_from_dups = from;
+  }
 public:
   bool is_dirty() const {
     return !touched_log ||
@@ -584,12 +594,16 @@ public:
       !(trimmed.empty()) ||
       !missing.is_clean() ||
       !(trimmed_dups.empty()) ||
-      dirty_dups ||
+      (dirty_to_dups != eversion_t()) ||
+      (dirty_from_dups != eversion_t::max()) ||
+      (write_from_dups != eversion_t::max()) ||
       rebuilt_missing_with_deletes;
   }
   void mark_log_for_rewrite() {
     mark_dirty_to(eversion_t::max());
     mark_dirty_from(eversion_t());
+    mark_dirty_to_dups(eversion_t::max());
+    mark_dirty_from_dups(eversion_t());
     touched_log = false;
   }
   bool get_rebuilt_missing_with_deletes() const {
@@ -624,7 +638,9 @@ protected:
     writeout_from = eversion_t::max();
     check();
     missing.flush();
-    dirty_dups = false;
+    dirty_to_dups = eversion_t();
+    dirty_from_dups = eversion_t::max();
+    write_from_dups = eversion_t::max();
   }
 public:
 
@@ -633,11 +649,12 @@ public:
     prefix_provider(dpp),
     dirty_from(eversion_t::max()),
     writeout_from(eversion_t::max()),
+    dirty_from_dups(eversion_t::max()),
+    write_from_dups(eversion_t::max()),
     cct(cct),
     pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
     touched_log(false),
-    clear_divergent_priors(false),
-    dirty_dups(false)
+    clear_divergent_priors(false)
   { }
 
   void reset_backfill();
@@ -715,6 +732,7 @@ public:
     log.claim_log_and_clear_rollback_info(o);
     missing.clear();
     mark_dirty_to(eversion_t::max());
+    mark_dirty_to_dups(eversion_t::max());
   }
 
   void split_into(
@@ -724,7 +742,9 @@ public:
     log.split_out_child(child_pgid, split_bits, &opg_log->log);
     missing.split_into(child_pgid, split_bits, &(opg_log->missing));
     opg_log->mark_dirty_to(eversion_t::max());
+    opg_log->mark_dirty_to_dups(eversion_t::max());
     mark_dirty_to(eversion_t::max());
+    mark_dirty_to_dups(eversion_t::max());
     if (missing.may_include_deletes)
       opg_log->rebuilt_missing_with_deletes = true;
   }
@@ -1181,8 +1201,7 @@ public:
     pg_log_t &log,
     const coll_t& coll,
     const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
-    bool require_rollback,
-    bool dirty_dups);
+    bool require_rollback);
 
   static void write_log_and_missing(
     ObjectStore::Transaction& t,
@@ -1192,7 +1211,6 @@ public:
     const ghobject_t &log_oid,
     const pg_missing_tracker_t &missing,
     bool require_rollback,
-    bool dirty_dups,
     bool *rebuilt_missing_set_with_deletes);
 
   static void _write_log_and_missing_wo_missing(
@@ -1209,7 +1227,9 @@ public:
     bool dirty_divergent_priors,
     bool touch_log,
     bool require_rollback,
-    bool dirty_dups,
+    eversion_t dirty_to_dups,
+    eversion_t dirty_from_dups,
+    eversion_t write_from_dups,
     set<string> *log_keys_debug
     );
 
@@ -1227,7 +1247,9 @@ public:
     bool touch_log,
     bool require_rollback,
     bool clear_divergent_priors,
-    bool dirty_dups,
+    eversion_t dirty_to_dups,
+    eversion_t dirty_from_dups,
+    eversion_t write_from_dups,
     bool *rebuilt_missing_with_deletes,
     set<string> *log_keys_debug
     );
index 6e50990addda35bb1b3d0d5ec2c3d8dfb0e5119c..2877c28d67a618da2577b9b744aee684a599c3b9 100644 (file)
@@ -1655,6 +1655,7 @@ void PrimaryLogPG::do_request(
             << ", queue on waiting_for_map " << op->get_source() << dendl;
     waiting_for_map[op->get_source()].push_back(op);
     op->mark_delayed("op must wait for map");
+    osd->request_osdmap_update(op->min_epoch);
     return;
   }
 
@@ -4918,9 +4919,6 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
         bufferlist t;
         uint64_t len = miter->first - last;
         r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
-       if (r == -EIO) {
-         r = rep_repair_primary_object(soid, ctx->op);
-       }
         if (r < 0) {
           osd->clog->error() << coll << " " << soid
                             << " sparse-read failed to read: "
@@ -4935,6 +4933,9 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
       bufferlist tmpbl;
       r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
                                       op.flags, &tmpbl);
+      if (r == -EIO) {
+        r = rep_repair_primary_object(soid, ctx->op);
+      }
       if (r < 0) {
        return r;
       }
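
The sparse-read hunk moves the EIO hook from the first objects_read_sync() call to the one that reads the extent data, so repair is attempted where a disk error on real payload is surfaced. The generic shape of the pattern, as a hedged sketch:

    #include <cerrno>
    #include <functional>

    // Attempt a read; on EIO, kick off a repair and surface its
    // status so the caller can retry or fail the op.
    int read_with_repair(const std::function<int()>& read_payload,
                         const std::function<int()>& repair) {
      int r = read_payload();
      if (r == -EIO)
        r = repair();  // may re-queue the op once the object is repaired
      return r;
    }
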
index 9d9a142b0f3ff27b1fc795f4d00bbcf18d618529..43d9a98e73753ea8bb14c45ba32f01ea19a24d0b 100644 (file)
@@ -3440,6 +3440,16 @@ struct pg_log_dup_t {
   void dump(Formatter *f) const;
   static void generate_test_instances(list<pg_log_dup_t*>& o);
 
+  bool operator==(const pg_log_dup_t &rhs) const {
+    return reqid == rhs.reqid &&
+      version == rhs.version &&
+      user_version == rhs.user_version &&
+      return_code == rhs.return_code;
+  }
+  bool operator!=(const pg_log_dup_t &rhs) const {
+    return !(*this == rhs);
+  }
+
   friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
 };
 WRITE_CLASS_ENCODER(pg_log_dup_t)
index 1d4947c26de3f5b22ce982777603196f47ea9895..146bd92d0c807a2d7aaa7fd06b60ac42288b3e02 100644 (file)
             font-weight: bold;
         }
 
+        .chart-container {
+            width: 100%;
+            height: 100%;
+        }
+
         .dataTables_wrapper .dataTables_filter {
             color: #ddd
         }
                 <li rv-each-pool="rbd_pools">
                   <a rv-href="pool.url"><i class="fa fa-circle-o"></i> {pool.name}</a>
                 </li>
+                <li class="ceph-none-found" rv-hide="rbd_pools | length">None found</li>
               </ul>
             </li>
-            <li class="ceph-none-found" rv-hide="rbd_pools | length">None found</li>
           </ul>
         </li>
         <li class="treeview{%if path_info.startswith(('/filesystem/', '/clients/'))%} active{%endif%}">
index 59dc312d9fa11ec7bfa9dd64f94018345316f4ea..48cea82f1097c69476fc43923ae49e5db7e82957 100644 (file)
                     },
                     options: {
                         center_text: raw_usage_text,
-                        responsive: false,
+                        responsive: true,
                         legend: {display: false},
                         animation: {duration: 0}
                     }
                         ]
                     },
                     options: {
-                        responsive: false,
+                        responsive: true,
                         legend: {display: false},
                         animation: {duration: 0}
                     }
                                 <td>
                                     <span style="font-size: 45px;">{df.stats.total_objects | dimless}</span>
                                 </td>
-                                <td>
-                                    <canvas id="raw_usage_chart"
-                                            style="height:120px; width:120px;"></canvas>
+                                <td>
+                                    <div style="height:120px; width: 120px;">
+                                        <canvas id="raw_usage_chart"></canvas>
+                                    </div>
                                 </td>
                                 <td>
-                                    <canvas id="pool_usage_chart"
-                                            style="height:120px; width: 120px;"></canvas>
+                                    <div style="height:120px; width: 120px;">
+                                        <canvas id="pool_usage_chart"></canvas>
+                                    </div>
                                 </td>
                             </tr>
                             <tr>
index ea1e8a7e88bd0c6103468989d4c40f28b60a3da8..10b5c37edb30794d3ef363b31e80485b90dfd50e 100644 (file)
@@ -294,7 +294,7 @@ class Module(MgrModule):
                     ) + "/s"
 
                 metadata = self.get_metadata('mds', info['name'])
-                mds_versions[metadata['ceph_version']].append(info['name'])
+                mds_versions[metadata.get('ceph_version', 'unknown')].append(info['name'])
                 rank_table.append(
                     {
                         "rank": rank,
@@ -363,7 +363,7 @@ class Module(MgrModule):
         standby_table = []
         for standby in fsmap['standbys']:
             metadata = self.get_metadata('mds', standby['name'])
-            mds_versions[metadata['ceph_version']].append(standby['name'])
+            mds_versions[metadata.get('ceph_version', 'unknown')].append(standby['name'])
 
             standby_table.append({
                 'name': standby['name']
index fb820e6384d06c044271d3dc131af8f5ae9e1657..9c0826822de8e28a2966749fd499192fb4e3d6e6 100644 (file)
@@ -2396,6 +2396,7 @@ int main(int argc, const char **argv)
   string start_marker;
   string end_marker;
   int max_entries = -1;
+  bool max_entries_specified = false;
   int admin = false;
   bool admin_specified = false;
   int system = false;
@@ -2557,6 +2558,7 @@ int main(int argc, const char **argv)
       max_buckets_specified = true;
     } else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) {
       max_entries = (int)strict_strtol(val.c_str(), 10, &err);
+      max_entries_specified = true;
       if (!err.empty()) {
         cerr << "ERROR: failed to parse max entries: " << err << std::endl;
         return EINVAL;
@@ -5852,6 +5854,11 @@ next:
     if (inconsistent_index == false) {
       RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, true);
     } else {
+      if (!yes_i_really_mean_it) {
+       cerr << "using --inconsistent_index can corrupt the bucket index " << std::endl
+            << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+       return 1;
+      }
       RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, false);
     }
   }
@@ -6093,31 +6100,47 @@ next:
     }
     void *handle;
     int max = 1000;
-    int ret = store->meta_mgr->list_keys_init(metadata_key, &handle);
+    int ret = store->meta_mgr->list_keys_init(metadata_key, marker, &handle);
     if (ret < 0) {
       cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
       return -ret;
     }
 
     bool truncated;
+    uint64_t count = 0;
 
+    if (max_entries_specified) {
+      formatter->open_object_section("result");
+    }
     formatter->open_array_section("keys");
 
+    uint64_t left;
     do {
       list<string> keys;
-      ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
+      left = (max_entries_specified ? max_entries - count : max);
+      ret = store->meta_mgr->list_keys_next(handle, left, keys, &truncated);
       if (ret < 0 && ret != -ENOENT) {
         cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
         return -ret;
       } if (ret != -ENOENT) {
        for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
          formatter->dump_string("key", *iter);
+          ++count;
        }
        formatter->flush(cout);
       }
-    } while (truncated);
+    } while (truncated && left > 0);
 
     formatter->close_section();
+
+    if (max_entries_specified) {
+      encode_json("truncated", truncated, formatter);
+      encode_json("count", count, formatter);
+      if (truncated) {
+        encode_json("marker", store->meta_mgr->get_marker(handle), formatter);
+      }
+      formatter->close_section();
+    }
     formatter->flush(cout);
 
     store->meta_mgr->list_keys_complete(handle);
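
With --max-entries, the metadata listing above becomes marker-based pagination: each round asks the handler only for the remaining budget, and the final JSON reports truncated, count, and the resume marker. A sketch of the loop contract, with fetch standing in for meta_mgr->list_keys_next():

    #include <cstdint>
    #include <functional>
    #include <list>
    #include <string>

    uint64_t list_bounded(
        uint64_t max_entries, bool bounded,
        const std::function<int(uint64_t, std::list<std::string>*, bool*)>& fetch) {
      const uint64_t batch = 1000;      // unbounded round size, as above
      uint64_t count = 0, left = 0;
      bool truncated = false;
      do {
        std::list<std::string> keys;
        left = bounded ? max_entries - count : batch;
        if (fetch(left, &keys, &truncated) < 0)
          break;
        count += keys.size();
      } while (truncated && left > 0);  // stop when done or budget spent
      return count;
    }
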
index 5fc1deec42e4b323d7281e2e14db2dc43647386d..ee6be6256a5ba69b588c612a517b4d190eeee5db 100644 (file)
@@ -111,11 +111,11 @@ static void handle_connection(RGWProcessEnv& env, tcp::socket socket,
     rgw::asio::ClientIO real_client{socket, parser, buffer};
 
     auto real_client_io = rgw::io::add_reordering(
-                            rgw::io::add_buffering(
+                            rgw::io::add_buffering(cct,
                               rgw::io::add_chunking(
                                 rgw::io::add_conlen_controlling(
                                   &real_client))));
-    RGWRestfulIO client(&real_client_io);
+    RGWRestfulIO client(cct, &real_client_io);
     process_request(env.store, env.rest, &req, env.uri_prefix,
                     *env.auth_registry, &client, env.olog);
 
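
Several frontends in this commit (asio here, civetweb and fcgi below) gain a CephContext argument on add_buffering() and RGWRestfulIO so the IO filters can log. The shape of that API change in miniature, with a stub context type (not the real CephContext):

    #include <utility>

    struct CephContextStub {};

    template <typename T>
    struct Buffering {
      CephContextStub* cct;  // retained for logging in the real filter
      T inner;
    };

    template <typename T>
    Buffering<T> add_buffering(CephContextStub* cct, T&& t) {
      return Buffering<T>{cct, std::forward<T>(t)};
    }

The nesting order is unchanged: reordering wraps buffering, which wraps chunking and content-length control.
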
index 41b7d12315066a5ffd7188bd4a9e275e6806069c..0da6dd587b672226d14d35bdd9967d840954d713 100644 (file)
@@ -12,6 +12,7 @@
 
 #include "common/errno.h"
 #include "common/ceph_json.h"
+#include "common/backport14.h"
 #include "rgw_rados.h"
 #include "rgw_acl.h"
 #include "rgw_acl_s3.h"
@@ -2164,13 +2165,17 @@ public:
     pool = store->get_zone_params().domain_root;
   }
 
-  int list_keys_init(RGWRados *store, void **phandle) override
-  {
-    list_keys_info *info = new list_keys_info;
+  int list_keys_init(RGWRados *store, const string& marker, void **phandle) override {
+    auto info = ceph::make_unique<list_keys_info>();
 
     info->store = store;
 
-    *phandle = (void *)info;
+    int ret = store->list_raw_objects_init(store->get_zone_params().domain_root, marker,
+                                           &info->ctx);
+    if (ret < 0) {
+      return ret;
+    }
+    *phandle = (void *)info.release();
 
     return 0;
   }
@@ -2186,8 +2191,8 @@ public:
 
     list<string> unfiltered_keys;
 
-    int ret = store->list_raw_objects(store->get_zone_params().domain_root, no_filter,
-                                      max, info->ctx, unfiltered_keys, truncated);
+    int ret = store->list_raw_objects_next(no_filter, max, info->ctx,
+                                           unfiltered_keys, truncated);
     if (ret < 0 && ret != -ENOENT)
       return ret;
     if (ret == -ENOENT) {
@@ -2213,6 +2218,11 @@ public:
     list_keys_info *info = static_cast<list_keys_info *>(handle);
     delete info;
   }
+
+  string get_marker(void *handle) {
+    list_keys_info *info = static_cast<list_keys_info *>(handle);
+    return info->store->list_raw_objs_get_cursor(info->ctx);
+  }
 };
 
 class RGWBucketInstanceMetadataHandler : public RGWMetadataHandler {
@@ -2354,13 +2364,17 @@ public:
     pool = store->get_zone_params().domain_root;
   }
 
-  int list_keys_init(RGWRados *store, void **phandle) override
-  {
-    list_keys_info *info = new list_keys_info;
+  int list_keys_init(RGWRados *store, const string& marker, void **phandle) override {
+    auto info = ceph::make_unique<list_keys_info>();
 
     info->store = store;
 
-    *phandle = (void *)info;
+    int ret = store->list_raw_objects_init(store->get_zone_params().domain_root, marker,
+                                           &info->ctx);
+    if (ret < 0) {
+      return ret;
+    }
+    *phandle = (void *)info.release();
 
     return 0;
   }
@@ -2376,8 +2390,8 @@ public:
 
     list<string> unfiltered_keys;
 
-    int ret = store->list_raw_objects(store->get_zone_params().domain_root, no_filter,
-                                      max, info->ctx, unfiltered_keys, truncated);
+    int ret = store->list_raw_objects_next(no_filter, max, info->ctx,
+                                           unfiltered_keys, truncated);
     if (ret < 0 && ret != -ENOENT)
       return ret;
     if (ret == -ENOENT) {
@@ -2407,6 +2421,11 @@ public:
     delete info;
   }
 
+  string get_marker(void *handle) {
+    list_keys_info *info = static_cast<list_keys_info *>(handle);
+    return info->store->list_raw_objs_get_cursor(info->ctx);
+  }
+
   /*
    * hash entry for mdlog placement. Use the same hash key we'd have for the bucket entry
    * point, so that the log entries end up at the same log shard, so that we process them
index 4b1fd766cbfaa13175d7889f55ad96a86d8b4cc7..8cf7a20b0dc0110f355f95ecb633a0c651de3919 100644 (file)
@@ -24,11 +24,11 @@ int RGWCivetWebFrontend::process(struct mg_connection*  const conn)
 
   RGWCivetWeb cw_client(conn);
   auto real_client_io = rgw::io::add_reordering(
-                          rgw::io::add_buffering(
+                          rgw::io::add_buffering(dout_context,
                             rgw::io::add_chunking(
                               rgw::io::add_conlen_controlling(
                                 &cw_client))));
-  RGWRestfulIO client_io(&real_client_io);
+  RGWRestfulIO client_io(dout_context, &real_client_io);
 
   RGWRequest req(env.store->get_new_req_id());
   int ret = process_request(env.store, env.rest, &req, env.uri_prefix,
index d06eaea3c2d16bdae092ba9cfccbfb352030ea23..8579b2383526956dee5774136cf2786b8d6c9121 100644 (file)
@@ -339,8 +339,8 @@ class RGWRestfulIO : public rgw::io::AccountingFilter<rgw::io::RestfulClient*> {
 public:
   ~RGWRestfulIO() override = default;
 
-  RGWRestfulIO(rgw::io::RestfulClient* engine)
-    : AccountingFilter<rgw::io::RestfulClient*>(std::move(engine)) {
+  RGWRestfulIO(CephContext *_cx, rgw::io::RestfulClient* engine)
+    : AccountingFilter<rgw::io::RestfulClient*>(_cx, std::move(engine)) {
   }
 
   void add_filter(std::shared_ptr<DecoratedRestfulClient> new_filter) {
index 0862f33d3ef570a9d29471bd5e2efba09c9bbedc..04761fc782905236a4fdbcb5fcc72993fcc38769 100644 (file)
@@ -20,20 +20,24 @@ class AccountingFilter : public DecoratedRestfulClient<T>,
   bool enabled;
   uint64_t total_sent;
   uint64_t total_received;
+  CephContext *cct;
 
 public:
   template <typename U>
-  AccountingFilter(U&& decoratee)
+  AccountingFilter(CephContext *cct, U&& decoratee)
     : DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
       enabled(false),
       total_sent(0),
-      total_received(0) {
+      total_received(0), cct(cct) {
   }
 
   size_t send_status(const int status,
                      const char* const status_name) override {
     const auto sent = DecoratedRestfulClient<T>::send_status(status,
                                                              status_name);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_status: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
     if (enabled) {
       total_sent += sent;
     }
@@ -42,6 +46,9 @@ public:
 
   size_t send_100_continue() override {
     const auto sent = DecoratedRestfulClient<T>::send_100_continue();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_100_continue: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
     if (enabled) {
       total_sent += sent;
     }
@@ -51,6 +58,9 @@ public:
   size_t send_header(const boost::string_ref& name,
                      const boost::string_ref& value) override {
     const auto sent = DecoratedRestfulClient<T>::send_header(name, value);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_header: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
     if (enabled) {
       total_sent += sent;
     }
@@ -59,6 +69,9 @@ public:
 
   size_t send_content_length(const uint64_t len) override {
     const auto sent = DecoratedRestfulClient<T>::send_content_length(len);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_content_length: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
     if (enabled) {
       total_sent += sent;
     }
@@ -67,6 +80,9 @@ public:
 
   size_t send_chunked_transfer_encoding() override {
     const auto sent = DecoratedRestfulClient<T>::send_chunked_transfer_encoding();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_chunked_transfer_encoding: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
     if (enabled) {
       total_sent += sent;
     }
@@ -75,6 +91,9 @@ public:
 
   size_t complete_header() override {
     const auto sent = DecoratedRestfulClient<T>::complete_header();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::complete_header: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
     if (enabled) {
       total_sent += sent;
     }
@@ -83,6 +102,8 @@ public:
 
   size_t recv_body(char* buf, size_t max) override {
     const auto received = DecoratedRestfulClient<T>::recv_body(buf, max);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::recv_body: e="
+        << (enabled ? "1" : "0") << ", received=" << received << dendl;
     if (enabled) {
       total_received += received;
     }
@@ -92,6 +113,20 @@ public:
   size_t send_body(const char* const buf,
                    const size_t len) override {
     const auto sent = DecoratedRestfulClient<T>::send_body(buf, len);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_body: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t complete_request() override {
+    const auto sent = DecoratedRestfulClient<T>::complete_request();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::complete_request: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
     if (enabled) {
       total_sent += sent;
     }
@@ -108,6 +143,8 @@ public:
 
   void set_account(bool enabled) override {
     this->enabled = enabled;
+    lsubdout(cct, rgw, 30) << "AccountingFilter::set_account: e="
+        << (enabled ? "1" : "0") << dendl;
   }
 };
 
@@ -122,13 +159,14 @@ protected:
 
   bool has_content_length;
   bool buffer_data;
+  CephContext *cct;
 
 public:
   template <typename U>
-  BufferingFilter(U&& decoratee)
+  BufferingFilter(CephContext *cct, U&& decoratee)
     : DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
       has_content_length(false),
-      buffer_data(false) {
+      buffer_data(false), cct(cct) {
   }
 
   size_t send_content_length(const uint64_t len) override;
@@ -144,6 +182,9 @@ size_t BufferingFilter<T>::send_body(const char* const buf,
 {
   if (buffer_data) {
     data.append(buf, len);
+
+    lsubdout(cct, rgw, 30) << "BufferingFilter<T>::send_body: defer count = "
+        << len << dendl;
     return 0;
   }
 
@@ -170,6 +211,8 @@ size_t BufferingFilter<T>::complete_header()
   if (! has_content_length) {
     /* We will dump everything in complete_request(). */
     buffer_data = true;
+    lsubdout(cct, rgw, 30) << "BufferingFilter<T>::complete_header: has_content_length="
+        << (has_content_length ? "1" : "0") << dendl;
     return 0;
   }
 
@@ -182,8 +225,16 @@ size_t BufferingFilter<T>::complete_request()
   size_t sent = 0;
 
   if (! has_content_length) {
+    /* It is not correct to count these bytes here,
+     * because they can only be part of the header.
+     * Therefore force count to 0.
+     */
     sent += DecoratedRestfulClient<T>::send_content_length(data.length());
     sent += DecoratedRestfulClient<T>::complete_header();
+    lsubdout(cct, rgw, 30) <<
+        "BufferingFilter::complete_request: !has_content_length: IGNORE: sent="
+        << sent << dendl;
+    sent = 0;
   }
 
   if (buffer_data) {
@@ -195,14 +246,18 @@ size_t BufferingFilter<T>::complete_request()
     }
     data.clear();
     buffer_data = false;
+    lsubdout(cct, rgw, 30) << "BufferingFilter::complete_request: buffer_data: sent="
+        << sent << dendl;
   }
 
   return sent + DecoratedRestfulClient<T>::complete_request();
 }
 
 template <typename T> static inline
-BufferingFilter<T> add_buffering(T&& t) {
-  return BufferingFilter<T>(std::forward<T>(t));
+BufferingFilter<T> add_buffering(
+    CephContext *cct,
+    T&& t) {
+  return BufferingFilter<T>(cct, std::forward<T>(t));
 }
 
 
index 4fb476859e70b3baec11ce99fc87358247e63882..8d299cf39d475c4818491ec3c6c3fee0ef785d20 100644 (file)
@@ -67,6 +67,7 @@ using ceph::crypto::MD5;
 #define RGW_ATTR_EXPIRES       RGW_ATTR_PREFIX "expires"
 #define RGW_ATTR_DELETE_AT     RGW_ATTR_PREFIX "delete_at"
 #define RGW_ATTR_ID_TAG        RGW_ATTR_PREFIX "idtag"
+#define RGW_ATTR_TAIL_TAG      RGW_ATTR_PREFIX "tail_tag"
 #define RGW_ATTR_SHADOW_OBJ            RGW_ATTR_PREFIX "shadow_name"
 #define RGW_ATTR_MANIFEST      RGW_ATTR_PREFIX "manifest"
 #define RGW_ATTR_USER_MANIFEST  RGW_ATTR_PREFIX "user_manifest"
index 88abd3b833d687c782b5f3c45073842d91f12b17..4d5b00ff6baafd0edce91996d6e64c49bed9530c 100644 (file)
@@ -1155,6 +1155,12 @@ int rgw_s3_prepare_decrypt(struct req_state* s,
   int res = 0;
   std::string stored_mode = get_str_attribute(attrs, RGW_ATTR_CRYPT_MODE);
   ldout(s->cct, 15) << "Encryption mode: " << stored_mode << dendl;
+
+  const char *req_sse = s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION", NULL);
+  if (nullptr != req_sse && (s->op == OP_GET || s->op == OP_HEAD)) {
+    return -ERR_INVALID_REQUEST;
+  }
+
   if (stored_mode == "SSE-C-AES256") {
     if (s->cct->_conf->rgw_crypt_require_ssl &&
         !s->info.env->exists("SERVER_PORT_SECURE")) {
index 5d070e5a45426a2c8fd22f6b7e76cb9770661c53..8fe6497f29b61a0d2ddc3b3da13b5e351189a6d6 100644 (file)
@@ -1020,6 +1020,12 @@ public:
 #define BUCKET_SHARD_SYNC_SPAWN_WINDOW 20
 #define DATA_SYNC_MAX_ERR_ENTRIES 10
 
+enum RemoteDatalogStatus {
+  RemoteNotTrimmed = 0,
+  RemoteTrimmed = 1,
+  RemoteMightTrimmed = 2
+};
+
 class RGWDataSyncShardCR : public RGWCoroutine {
   RGWDataSyncEnv *sync_env;
 
@@ -1042,6 +1048,7 @@ class RGWDataSyncShardCR : public RGWCoroutine {
   RGWDataChangesLogInfo shard_info;
   string datalog_marker;
 
+  RemoteDatalogStatus remote_trimmed;
   Mutex inc_lock;
   Cond inc_cond;
 
@@ -1089,7 +1096,7 @@ public:
                                                      pool(_pool),
                                                      shard_id(_shard_id),
                                                      sync_marker(_marker),
-                                                      marker_tracker(NULL), truncated(false), inc_lock("RGWDataSyncShardCR::inc_lock"),
+                                                      marker_tracker(NULL), truncated(false), remote_trimmed(RemoteNotTrimmed), inc_lock("RGWDataSyncShardCR::inc_lock"),
                                                       total_entries(0), spawn_window(BUCKET_SHARD_SYNC_SPAWN_WINDOW), reset_backoff(NULL),
                                                       lease_cr(nullptr), lease_stack(nullptr), error_repo(nullptr), max_error_entries(DATA_SYNC_MAX_ERR_ENTRIES),
                                                       retry_backoff_secs(RETRY_BACKOFF_SECS_DEFAULT) {
@@ -1294,10 +1301,13 @@ public:
           return set_cr_error(retcode);
         }
         datalog_marker = shard_info.marker;
+        remote_trimmed = RemoteNotTrimmed;
 #define INCREMENTAL_MAX_ENTRIES 100
        ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " datalog_marker=" << datalog_marker << " sync_marker.marker=" << sync_marker.marker << dendl;
        if (datalog_marker > sync_marker.marker) {
           spawned_keys.clear();
+          if (sync_marker.marker.empty())
+            remote_trimmed = RemoteMightTrimmed; // the remote data log shard might have been trimmed
           yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, &sync_marker.marker, &log_entries, &truncated));
           if (retcode < 0) {
             ldout(sync_env->cct, 0) << "ERROR: failed to read remote data log info: ret=" << retcode << dendl;
@@ -1305,6 +1315,10 @@ public:
             drain_all();
             return set_cr_error(retcode);
           }
+          if ((remote_trimmed == RemoteMightTrimmed) && sync_marker.marker.empty() && log_entries.empty())
+            remote_trimmed = RemoteTrimmed;
+          else
+            remote_trimmed = RemoteNotTrimmed;
           for (log_iter = log_entries.begin(); log_iter != log_entries.end(); ++log_iter) {
             ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " log_entry: " << log_iter->log_id << ":" << log_iter->log_timestamp << ":" << log_iter->entry.key << dendl;
             if (!marker_tracker->index_key_to_marker(log_iter->entry.key, log_iter->log_id)) {
@@ -1343,7 +1357,7 @@ public:
           }
        }
        ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " datalog_marker=" << datalog_marker << " sync_marker.marker=" << sync_marker.marker << dendl;
-       if (datalog_marker == sync_marker.marker) {
+       if (datalog_marker == sync_marker.marker || remote_trimmed == RemoteTrimmed) {
 #define INCREMENTAL_INTERVAL 20
          yield wait(utime_t(INCREMENTAL_INTERVAL, 0));
        }
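
The RemoteDatalogStatus three-state flag distinguishes "we have no sync marker yet" from "the remote shard really is empty because it was trimmed", and only the confirmed case makes the shard back off for the incremental interval. The decision in isolation:

    enum RemoteDatalogStatus {
      RemoteNotTrimmed = 0,
      RemoteTrimmed = 1,
      RemoteMightTrimmed = 2
    };

    // An empty local sync marker only *suggests* trimming; it is
    // confirmed when the fetch also returns no entries.
    RemoteDatalogStatus classify(bool marker_empty, bool fetched_no_entries) {
      RemoteDatalogStatus s =
          marker_empty ? RemoteMightTrimmed : RemoteNotTrimmed;
      if (s == RemoteMightTrimmed && fetched_no_entries)
        return RemoteTrimmed;
      return RemoteNotTrimmed;
    }
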
index 332cde29719f3517268e0b887611ecd7f5c98d98..3d6eac2b5dde3f3851193bb21714740ab68ceeda 100644 (file)
@@ -118,10 +118,10 @@ void RGWFCGXProcess::handle_request(RGWRequest* r)
 
   RGWFCGX fcgxfe(req->fcgx);
   auto real_client_io = rgw::io::add_reordering(
-                          rgw::io::add_buffering(
+                          rgw::io::add_buffering(cct,
                             rgw::io::add_chunking(
                               &fcgxfe)));
-  RGWRestfulIO client_io(&real_client_io);
+  RGWRestfulIO client_io(cct, &real_client_io);
 
  
   int ret = process_request(store, rest, req, uri_prefix,
index 6c7d006282baf2d335d6538239ea542d86319661..a50965873836fe07a255a7d1a4d888f3159758ac 100644 (file)
@@ -29,6 +29,7 @@ public:
   void set_status(int status, const char* status_name) override {};
   void output_header() override {};
   void output_footer() override {};
+  void enable_line_break() override {};
   void flush(ostream& os) override;
   void reset() override;
 
index 2990bff1e816c1fd5269659db91dc08f3f0ea64c..cad0304e3527cb15e5f5b8f1bed0fa9f21f0e077 100644 (file)
@@ -67,7 +67,14 @@ bool RGWLifecycleConfiguration::_add_rule(LCRule *rule)
     op.mp_expiration = rule->get_mp_expiration().get_days();
   }
   op.dm_expiration = rule->get_dm_expiration();
-  auto ret = prefix_map.insert(pair<string, lc_op>(rule->get_prefix(), op));
+
+  std::string prefix;
+  if (rule->get_filter().has_prefix()){
+    prefix = rule->get_filter().get_prefix();
+  } else {
+    prefix = rule->get_prefix();
+  }
+  auto ret = prefix_map.emplace(std::move(prefix), std::move(op));
   return ret.second;
 }
 
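
_add_rule() now keys the prefix map on the Filter prefix when one is present, falling back to the legacy top-level Prefix. The precedence rule in miniature:

    #include <string>

    std::string effective_prefix(bool filter_has_prefix,
                                 const std::string& filter_prefix,
                                 const std::string& legacy_prefix) {
      return filter_has_prefix ? filter_prefix : legacy_prefix;
    }
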
index dd6a1a7f04664f048cd12962cf2aca6accad507c..0a655c5c3e32a1d651052953befc55853d19a945 100644 (file)
@@ -91,6 +91,47 @@ public:
 };
 WRITE_CLASS_ENCODER(LCExpiration)
 
+class LCFilter
+{
+ protected:
+  std::string prefix;
+  // TODO add support for tagging
+ public:
+  const std::string& get_prefix() const{
+    return prefix;
+  }
+
+  void set_prefix(const string& _prefix){
+    prefix = _prefix;
+  }
+
+  void set_prefix(std::string&& _prefix){
+    prefix = std::move(_prefix);
+  }
+
+  bool empty() const {
+    return prefix.empty();
+  }
+
+  bool has_prefix() const {
+    return !prefix.empty();
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(prefix, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(prefix, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(LCFilter);
+
+
+
 class LCRule
 {
 protected:
@@ -100,6 +141,7 @@ protected:
   LCExpiration expiration;
   LCExpiration noncur_expiration;
   LCExpiration mp_expiration;
+  LCFilter filter;
   bool dm_expiration = false;
 
 public:
@@ -115,11 +157,15 @@ public:
   string& get_status() {
       return status;
   }
-  
+
   string& get_prefix() {
       return prefix;
   }
 
+  LCFilter& get_filter() {
+    return filter;
+  }
+
   LCExpiration& get_expiration() {
     return expiration;
   }
@@ -167,7 +213,7 @@ public:
   bool valid();
   
   void encode(bufferlist& bl) const {
-     ENCODE_START(4, 1, bl);
+     ENCODE_START(5, 1, bl);
      ::encode(id, bl);
      ::encode(prefix, bl);
      ::encode(status, bl);
@@ -175,10 +221,11 @@ public:
      ::encode(noncur_expiration, bl);
      ::encode(mp_expiration, bl);
      ::encode(dm_expiration, bl);
+     ::encode(filter, bl);
      ENCODE_FINISH(bl);
    }
    void decode(bufferlist::iterator& bl) {
-     DECODE_START_LEGACY_COMPAT_LEN(4, 1, 1, bl);
+     DECODE_START_LEGACY_COMPAT_LEN(5, 1, 1, bl);
      ::decode(id, bl);
      ::decode(prefix, bl);
      ::decode(status, bl);
@@ -192,6 +239,9 @@ public:
      if (struct_v >= 4) {
         ::decode(dm_expiration, bl);
      }
+     if (struct_v >= 5) {
+       ::decode(filter, bl);
+     }
      DECODE_FINISH(bl);
    }
 
index ea64847a0c733af1ca0eef4583cce7a3812fdc95..b03c4c32b9efa5d9b19397de49066a023a156750 100644 (file)
@@ -80,15 +80,41 @@ bool LCRule_S3::xml_end(const char *el) {
   status.clear();
   dm_expiration = false;
 
+  // S3 generates a 48 bit random ID, maybe we could generate shorter IDs
+  static constexpr auto LC_ID_LENGTH = 48;
+
   lc_id = static_cast<LCID_S3 *>(find_first("ID"));
-  if (!lc_id)
-    return false;
-  id = lc_id->get_data();
+  if (lc_id){
+    id = lc_id->get_data();
+  } else {
+    gen_rand_alphanumeric_lower(nullptr, &id, LC_ID_LENGTH);
+  }
+
+
+  XMLObj *obj = find_first("Filter");
+
+  if (obj){
+    string _prefix;
+    RGWXMLDecoder::decode_xml("Prefix", _prefix, obj);
+    filter.set_prefix(std::move(_prefix));
+  } else {
+    // Ideally the following code should be deprecated and we should return
+    // False here, The new S3 LC configuration xml spec. makes Filter mandatory
+    // and Prefix optional. However older clients including boto2 still generate
+    // xml according to the older spec, where Prefix existed outside of Filter
+    // and S3 itself seems to be sloppy on enforcing the mandatory Filter
+    // argument. A day will come when S3 enforces their own xml-spec, but it is
+    // not this day
+
+    lc_prefix = static_cast<LCPrefix_S3 *>(find_first("Prefix"));
+
+    if (!lc_prefix){
+      return false;
+    }
+
+    prefix = lc_prefix->get_data();
+  }
 
-  lc_prefix = static_cast<LCPrefix_S3 *>(find_first("Prefix"));
-  if (!lc_prefix)
-    return false;
-  prefix = lc_prefix->get_data();
 
   lc_status = static_cast<LCStatus_S3 *>(find_first("Status"));
   if (!lc_status)
@@ -126,7 +152,12 @@ bool LCRule_S3::xml_end(const char *el) {
 void LCRule_S3::to_xml(CephContext *cct, ostream& out) {
   out << "<Rule>" ;
   out << "<ID>" << id << "</ID>";
-  out << "<Prefix>" << prefix << "</Prefix>";
+  if (!filter.empty()) {
+    LCFilter_S3& lc_filter = static_cast<LCFilter_S3&>(filter);
+    lc_filter.to_xml(out);
+  } else {
+    out << "<Prefix>" << prefix << "</Prefix>";
+  }
   out << "<Status>" << status << "</Status>";
   if (!expiration.empty() || dm_expiration) {
     LCExpiration_S3 expir(expiration.get_days_str(), expiration.get_date(), dm_expiration);
index ed1af0c0053e3527d3ec27ac96742cf6e88c26b8..7ff1bf71ba5a69994db4df35b85c41aa048ff46f 100644 (file)
@@ -26,6 +26,25 @@ public:
   string& to_str() { return data; }
 };
 
+class LCFilter_S3 : public LCFilter, public XMLObj
+{
+ public:
+  ~LCFilter_S3() override {}
+  string& to_str() { return data; }
+  void to_xml(ostream& out){
+    out << "<Filter>";
+      if (!prefix.empty())
+        out << "<Prefix>" << prefix << "</Prefix>";
+    out << "</Filter>";
+  }
+  void dump_xml(Formatter *f) const {
+    f->open_object_section("Filter");
+    if (!prefix.empty())
+      encode_xml("Prefix", prefix, f);
+    f->close_section(); // Filter
+  }
+};
+
 class LCStatus_S3 : public XMLObj
 {
 public:
@@ -150,7 +169,13 @@ public:
   void dump_xml(Formatter *f) const {
     f->open_object_section("Rule");
     encode_xml("ID", id, f);
-    encode_xml("Prefix", prefix, f);
+    // In case of an empty filter and an empty Prefix, we defer to Prefix.
+    if (!filter.empty()) {
+      const LCFilter_S3& lc_filter = static_cast<const LCFilter_S3&>(filter);
+      lc_filter.dump_xml(f);
+    } else {
+      encode_xml("Prefix", prefix, f);
+    }
     encode_xml("Status", status, f);
     if (!expiration.empty() || dm_expiration) {
       LCExpiration_S3 expir(expiration.get_days_str(), expiration.get_date(), dm_expiration);
@@ -164,6 +189,7 @@ public:
       const LCMPExpiration_S3& mp_expir = static_cast<const LCMPExpiration_S3&>(mp_expiration);
       mp_expir.dump_xml(f);
     }
+
     f->close_section(); // Rule
   }
 };
index fef37f805bac9cca414756d006c6344f38089944..13a344e3b46946dc2ae0b39cf86abbfa215b2e00 100644 (file)
@@ -131,7 +131,7 @@ void RGWLoadGenProcess::handle_request(RGWRequest* r)
   env.sign(access_key);
 
   RGWLoadGenIO real_client_io(&env);
-  RGWRestfulIO client_io(&real_client_io);
+  RGWRestfulIO client_io(cct, &real_client_io);
 
   int ret = process_request(store, rest, req, uri_prefix,
                             *auth_registry, &client_io, olog);
index 7bceab0aa689636031a1eef3f075fad869116a91..588aa0a26fde84b8b626c0864ed5977f06fb7129 100644 (file)
@@ -221,6 +221,11 @@ static void log_usage(struct req_state *s, const string& op_name)
   if (!s->is_err())
     data.successful_ops = 1;
 
+  ldout(s->cct, 30) << "log_usage: bucket_name=" << bucket_name
+       << " tenant=" << s->bucket_tenant
+       << ", bytes_sent=" << bytes_sent << ", bytes_received="
+       << bytes_received << ", success=" << data.successful_ops << dendl;
+
   entry.add(op_name, data);
 
   utime_t ts = ceph_clock_now();
index f81c4490ae82160d605346f72706b1e05145a824..6c554810e98a81b181503bc0b81e2aa85609a84f 100644 (file)
@@ -273,8 +273,8 @@ obj_version& RGWMetadataObject::get_version()
 
 class RGWMetadataTopHandler : public RGWMetadataHandler {
   struct iter_data {
-    list<string> sections;
-    list<string>::iterator iter;
+    set<string> sections;
+    set<string>::iterator iter;
   };
 
 public:
@@ -290,10 +290,14 @@ public:
 
   int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { return -ENOTSUP; }
 
-  int list_keys_init(RGWRados *store, void **phandle) override {
+  int list_keys_init(RGWRados *store, const string& marker, void **phandle) override {
     iter_data *data = new iter_data;
-    store->meta_mgr->get_sections(data->sections);
-    data->iter = data->sections.begin();
+    list<string> sections;
+    store->meta_mgr->get_sections(sections);
+    for (auto& s : sections) {
+      data->sections.insert(s);
+    }
+    data->iter = data->sections.lower_bound(marker);
 
     *phandle = data;
 
@@ -314,6 +318,16 @@ public:
 
     delete data;
   }
+
+  virtual string get_marker(void *handle) {
+    iter_data *data = static_cast<iter_data *>(handle);
+
+    if (data->iter != data->sections.end()) {
+      return *(data->iter);
+    }
+
+    return string();
+  }
 };
 
 static RGWMetadataTopHandler md_top_handler;
@@ -830,8 +844,12 @@ struct list_keys_handle {
   RGWMetadataHandler *handler;
 };
 
-
 int RGWMetadataManager::list_keys_init(string& section, void **handle)
+{
+  return list_keys_init(section, string(), handle);
+}
+
+int RGWMetadataManager::list_keys_init(string& section, const string& marker, void **handle)
 {
   string entry;
   RGWMetadataHandler *handler;
@@ -845,7 +863,7 @@ int RGWMetadataManager::list_keys_init(string& section, void **handle)
 
   list_keys_handle *h = new list_keys_handle;
   h->handler = handler;
-  ret = handler->list_keys_init(store, &h->handle);
+  ret = handler->list_keys_init(store, marker, &h->handle);
   if (ret < 0) {
     delete h;
     return ret;
@@ -865,7 +883,6 @@ int RGWMetadataManager::list_keys_next(void *handle, int max, list<string>& keys
   return handler->list_keys_next(h->handle, max, keys, truncated);
 }
 
-
 void RGWMetadataManager::list_keys_complete(void *handle)
 {
   list_keys_handle *h = static_cast<list_keys_handle *>(handle);
@@ -876,6 +893,13 @@ void RGWMetadataManager::list_keys_complete(void *handle)
   delete h;
 }
 
+string RGWMetadataManager::get_marker(void *handle)
+{
+  list_keys_handle *h = static_cast<list_keys_handle *>(handle);
+
+  return h->handler->get_marker(h->handle);
+}
+
 void RGWMetadataManager::dump_log_entry(cls_log_entry& entry, Formatter *f)
 {
   f->open_object_section("entry");
index 4d077e8f888cac395dfa6d7b9dae0170275582bd..f6dc2db03bc3be4cb85e034f4acd648506ac5b27 100644 (file)
@@ -79,10 +79,12 @@ public:
                   real_time mtime, JSONObj *obj, sync_type_t type) = 0;
   virtual int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) = 0;
 
-  virtual int list_keys_init(RGWRados *store, void **phandle) = 0;
+  virtual int list_keys_init(RGWRados *store, const string& marker, void **phandle) = 0;
   virtual int list_keys_next(void *handle, int max, list<string>& keys, bool *truncated) = 0;
   virtual void list_keys_complete(void *handle) = 0;
 
+  virtual string get_marker(void *handle) = 0;
+
   /* key to use for hashing entries for log shard placement */
   virtual void get_hash_key(const string& section, const string& key, string& hash_key) {
     hash_key = section + ":" + key;
@@ -352,9 +354,12 @@ public:
   int remove(string& metadata_key);
 
   int list_keys_init(string& section, void **phandle);
+  int list_keys_init(string& section, const string& marker, void **phandle);
   int list_keys_next(void *handle, int max, list<string>& keys, bool *truncated);
   void list_keys_complete(void *handle);
 
+  string get_marker(void *handle);
+
   void dump_log_entry(cls_log_entry& entry, Formatter *f);
 
   void get_sections(list<string>& sections);
index c59c5376653234a84876bdc2978299aacb897aed..ac6f7b70081aece857f51238abbd19cf778f217e 100644 (file)
@@ -3017,6 +3017,7 @@ int RGWPutObjProcessor_Multipart::do_complete(size_t accounted_size,
   head_obj_op.meta.owner = s->owner.get_id();
   head_obj_op.meta.delete_at = delete_at;
   head_obj_op.meta.zones_trace = zones_trace;
+  head_obj_op.meta.modify_tail = true;
 
   int r = head_obj_op.write_meta(obj_len, accounted_size, attrs);
   if (r < 0)
@@ -5450,6 +5451,7 @@ void RGWCompleteMultipart::execute()
   obj_op.meta.ptag = &s->req_id; /* use req_id as operation tag */
   obj_op.meta.owner = s->owner.get_id();
   obj_op.meta.flags = PUT_OBJ_CREATE;
+  obj_op.meta.modify_tail = true;
   op_ret = obj_op.write_meta(ofs, accounted_size, attrs);
   if (op_ret < 0)
     return;
index d3a63aee73d17c0818d3e7ea1afec1ade642f944..d9ce2b400011ff815b3c800cad4b9364e9d04809 100644 (file)
@@ -203,6 +203,7 @@ protected:
   bool partial_content;
   bool range_parsed;
   bool skip_manifest;
+  bool skip_decrypt{false};
   rgw_obj obj;
   utime_t gc_invalidate_time;
   bool is_slo;
index 8e050a0592d68182f6a737d7be76c11b97c1154d..d94dc0ca4cf74798d0f5b1317eaa601b8d5df434 100644 (file)
@@ -241,19 +241,19 @@ public:
     const uint64_t rounded_added = rgw_rounded_objsize(added_bytes);
     const uint64_t rounded_removed = rgw_rounded_objsize(removed_bytes);
 
-    if ((entry->stats.size + added_bytes - removed_bytes) >= 0) {
+    if (((int64_t)(entry->stats.size + added_bytes - removed_bytes)) >= 0) {
       entry->stats.size += added_bytes - removed_bytes;
     } else {
       entry->stats.size = 0;
     }
 
-    if ((entry->stats.size_rounded + rounded_added - rounded_removed) >= 0) {
+    if (((int64_t)(entry->stats.size_rounded + rounded_added - rounded_removed)) >= 0) {
       entry->stats.size_rounded += rounded_added - rounded_removed;
     } else {
       entry->stats.size_rounded = 0;
     }
 
-    if ((entry->stats.num_objects + objs_delta) >= 0) {
+    if (((int64_t)(entry->stats.num_objects + objs_delta)) >= 0) {
       entry->stats.num_objects += objs_delta;
     } else {
       entry->stats.num_objects = 0;
index 7a1fce857151b99077ef251e0afaa9ba02cfd1cd..9df547a19ade5cabb20b5cc96fd146aa71b47f37 100644 (file)
@@ -2780,6 +2780,7 @@ int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string&
   obj_op.meta.delete_at = delete_at;
   obj_op.meta.user_data = user_data;
   obj_op.meta.zones_trace = zones_trace;
+  obj_op.meta.modify_tail = true;
 
   r = obj_op.write_meta(obj_len, accounted_size, attrs);
   if (r < 0) {
@@ -6823,7 +6824,8 @@ int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
  * Returns: 0 on success, -ERR# otherwise.
  */
 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
-                                           map<string, bufferlist>& attrs, bool assume_noent,
+                                           map<string, bufferlist>& attrs,
+                                           bool assume_noent, bool modify_tail,
                                            void *_index_op)
 {
   RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
@@ -6856,7 +6858,7 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si
   if (!ptag && !index_op->get_optag()->empty()) {
     ptag = index_op->get_optag();
   }
-  r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false);
+  r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
   if (r < 0)
     return r;
 
@@ -7072,13 +7074,13 @@ int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
   bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
   int r;
   if (assume_noent) {
-    r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
+    r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
     if (r == -EEXIST) {
       assume_noent = false;
     }
   }
   if (!assume_noent) {
-    r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
+    r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
   }
   return r;
 }
@@ -7391,6 +7393,12 @@ static void set_copy_attrs(map<string, bufferlist>& src_attrs,
     if (!attrs[RGW_ATTR_ETAG].length()) {
       attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
     }
+    if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
+      auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
+      if (ttiter != src_attrs.end()) {
+        attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
+      }
+    }
     break;
   case RGWRados::ATTRSMOD_MERGE:
     for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
@@ -7422,6 +7430,7 @@ int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
     return ret;
 
   attrset.erase(RGW_ATTR_ID_TAG);
+  attrset.erase(RGW_ATTR_TAIL_TAG);
 
   uint64_t max_chunk_size;
 
@@ -7579,10 +7588,15 @@ int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
 
   obj_time_weight dest_mtime_weight;
 
+  constexpr bool prepend_meta = true;
+  constexpr bool get_op = true;
+  constexpr bool rgwx_stat = true;
+  constexpr bool sync_manifest = true;
+  constexpr bool skip_decrypt = true;
   int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
                       dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
-                      true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
-                      true /* sync manifest */, &cb, &in_stream_req);
+                      prepend_meta, get_op, rgwx_stat,
+                      sync_manifest, skip_decrypt, &cb, &in_stream_req);
   if (ret < 0) {
     return ret;
   }
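
A readability note on this hunk: the run of positional true/false literals (previously explained only by trailing comments) is replaced by one named constexpr flag per argument, which matters here because the same commit appends a new skip_decrypt parameter to RGWRESTConn::get_obj() — with bare literals, an added boolean can silently shift the meaning of every argument after it.

  // before: conn->get_obj(..., true /* prepend_meta */, true /* GET */, ...);
  // after:  each flag is named once, so inserting skip_decrypt is an explicit
  //         change at every call site rather than a comment edit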
@@ -7750,10 +7764,15 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
     }
   }
 
+  static constexpr bool prepend_meta = true;
+  static constexpr bool get_op = true;
+  static constexpr bool rgwx_stat = false;
+  static constexpr bool sync_manifest = true;
+  static constexpr bool skip_decrypt = true;
   ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
                       dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
-                      true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
-                      true /* sync manifest */, &cb, &in_stream_req);
+                      prepend_meta, get_op, rgwx_stat,
+                      sync_manifest, skip_decrypt, &cb, &in_stream_req);
   if (ret < 0) {
     goto set_err_state;
   }
@@ -8117,6 +8136,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
   }
 
   if (!copy_itself) {
+    attrs.erase(RGW_ATTR_TAIL_TAG);
     manifest = astate->manifest;
     const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
     if (tail_placement.bucket.name.empty()) {
@@ -8163,6 +8183,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
   write_op.meta.category = category;
   write_op.meta.olh_epoch = olh_epoch;
   write_op.meta.delete_at = delete_at;
+  write_op.meta.modify_tail = !copy_itself;
 
   ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
   if (ret < 0) {
@@ -8475,7 +8496,7 @@ int RGWRados::Object::complete_atomic_modification()
     return 0;
   }
 
-  string tag = state->obj_tag.to_str();
+  string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
   return store->gc->send_chain(chain, tag, false);  // do it async
 }
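
Context for the tag change: this commit introduces a dedicated tail tag (RGW_ATTR_TAIL_TAG, surfaced as the new RGWObjState::tail_tag field) alongside the head's ID tag. Garbage-collection chains for an object's tail parts are now keyed by the tail tag when present, falling back to obj_tag for objects written before the upgrade, so a copy that shares tail rados objects (see the modify_tail plumbing above) cannot have those tails reclaimed under it. The preference rule, as used here and again in defer_gc() below:

  // sketch: tail tag wins when set, legacy head tag otherwise
  string gc_tag = state->tail_tag.length() > 0 ? state->tail_tag.to_str()
                                               : state->obj_tag.to_str();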
 
@@ -8692,13 +8713,17 @@ int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_ob
     return -EINVAL;
   }
 
-  if (state->obj_tag.length() == 0) {// check for backward compatibility
+  string tag;
+
+  if (state->tail_tag.length() > 0) {
+    tag = state->tail_tag.c_str();
+  } else if (state->obj_tag.length() > 0) {
+    tag = state->obj_tag.c_str();
+  } else {
     ldout(cct, 20) << "both obj_tag and tail_tag are empty, not deferring gc operation" << dendl;
     return -EINVAL;
   }
 
-  string tag = state->obj_tag.c_str();
-
   ldout(cct, 0) << "defer chain tag=" << tag << dendl;
 
   return gc->defer_chain(tag, false);
@@ -8861,7 +8886,7 @@ int RGWRados::Object::Delete::delete_obj()
     return -ENOENT;
   }
 
-  r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true);
+  r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
   if (r < 0)
     return r;
 
@@ -9195,6 +9220,10 @@ int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket
     s->shadow_obj[bl.length()] = '\0';
   }
   s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
+  auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
+  if (ttiter != s->attrset.end()) {
+    s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
+  }
 
   bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
   if (manifest_bl.length()) {
@@ -9467,7 +9496,8 @@ void RGWRados::SystemObject::invalidate_state()
 }
 
 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
-                                                  const char *if_match, const char *if_nomatch, bool removal_op)
+                                                  const char *if_match, const char *if_nomatch, bool removal_op,
+                                                  bool modify_tail)
 {
   int r = get_state(&state, false);
   if (r < 0)
@@ -9551,6 +9581,9 @@ int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool
   ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
 
   op.setxattr(RGW_ATTR_ID_TAG, bl);
+  if (modify_tail) {
+    op.setxattr(RGW_ATTR_TAIL_TAG, bl);
+  }
 
   return 0;
 }
@@ -12203,6 +12236,31 @@ int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
   return 0;
 }
 
+int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
+{
+  librados::IoCtx& io_ctx = ctx.io_ctx;
+  librados::NObjectIterator& iter = ctx.iter;
+
+  int r = open_pool_ctx(pool, io_ctx);
+  if (r < 0)
+    return r;
+
+  librados::ObjectCursor oc;
+  if (!oc.from_str(cursor)) {
+    ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
+    return -EINVAL;
+  }
+
+  iter = io_ctx.nobjects_begin(oc);
+
+  return 0;
+}
+
+string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
+{
+  return ctx.iter.get_cursor().to_str();
+}
+
 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
                            bool *is_truncated, RGWAccessListFilter *filter)
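
These two helpers expose librados object cursors through RGWRados: the new pool_iterate_begin() overload resumes a pool listing from a serialized librados::ObjectCursor, and pool_iterate_get_cursor() exports the current position as a string. They are the storage-level half of the resumable metadata listing; the REST marker handed to clients is exactly this cursor string. A hedged round-trip sketch using the wrappers declared in the next hunks:

  RGWListRawObjsCtx ctx;
  int r = store->list_raw_objects_init(pool, marker /* "" to start */, &ctx);
  if (r < 0) return r;
  list<string> oids;
  bool truncated;
  r = store->list_raw_objects_next(prefix_filter, 1000, ctx, oids, &truncated);
  string resume = store->list_raw_objs_get_cursor(ctx);  // valid marker for a later init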
 {
@@ -12242,21 +12300,27 @@ struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
   }
 };
 
-int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
-                              int max, RGWListRawObjsCtx& ctx, list<string>& oids,
-                              bool *is_truncated)
+int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
 {
-  RGWAccessListFilterPrefix filter(prefix_filter);
-
-  if (!ctx.initialized) {
-    int r = pool_iterate_begin(pool, ctx.iter_ctx);
+  if (!ctx->initialized) {
+    int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
     if (r < 0) {
       ldout(cct, 10) << "failed to list objects: pool_iterate_begin() returned r=" << r << dendl;
       return r;
     }
-    ctx.initialized = true;
+    ctx->initialized = true;
   }
+  return 0;
+}
 
+int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
+                                    RGWListRawObjsCtx& ctx, list<string>& oids,
+                                    bool *is_truncated)
+{
+  if (!ctx.initialized) {
+    return -EINVAL;
+  }
+  RGWAccessListFilterPrefix filter(prefix_filter);
   vector<rgw_bucket_dir_entry> objs;
   int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
   if (r < 0) {
@@ -12273,6 +12337,25 @@ int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter
   return oids.size();
 }
 
+int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
+                              int max, RGWListRawObjsCtx& ctx, list<string>& oids,
+                              bool *is_truncated)
+{
+  if (!ctx.initialized) {
+    int r = list_raw_objects_init(pool, string(), &ctx);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
+}
+
+string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
+{
+  return pool_iterate_get_cursor(ctx.iter_ctx);
+}
+
 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
                                   std::list<rgw_bi_log_entry>& result, bool *truncated)
 {
index 23caafd6314f48a58ba33ee624ec8ca836b58c65..6984192f0f497cdd999c63ada76ce236e3f43087 100644 (file)
@@ -874,6 +874,7 @@ struct RGWObjState {
   ceph::real_time mtime;
   uint64_t epoch;
   bufferlist obj_tag;
+  bufferlist tail_tag;
   string write_tag;
   bool fake_tag;
   RGWObjManifest manifest;
@@ -908,6 +909,9 @@ struct RGWObjState {
     if (rhs.obj_tag.length()) {
       obj_tag = rhs.obj_tag;
     }
+    if (rhs.tail_tag.length()) {
+      tail_tag = rhs.tail_tag;
+    }
     write_tag = rhs.write_tag;
     fake_tag = rhs.fake_tag;
     if (rhs.has_manifest) {
@@ -2517,11 +2521,17 @@ public:
     return rgw_shards_max();
   }
 
+
   int get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref);
 
+  int list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx);
+  int list_raw_objects_next(const string& prefix_filter, int max,
+                            RGWListRawObjsCtx& ctx, list<string>& oids,
+                            bool *is_truncated);
   int list_raw_objects(const rgw_pool& pool, const string& prefix_filter, int max,
                        RGWListRawObjsCtx& ctx, list<string>& oids,
                        bool *is_truncated);
+  string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
 
   int list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result);
   int list_zonegroups(list<string>& zonegroups);
@@ -2702,7 +2712,7 @@ public:
     void invalidate_state();
 
     int prepare_atomic_modification(librados::ObjectWriteOperation& op, bool reset_obj, const string *ptag,
-                                    const char *ifmatch, const char *ifnomatch, bool removal_op);
+                                    const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail);
     int complete_atomic_modification();
 
   public:
@@ -2798,17 +2808,19 @@ public:
         bool canceled;
         const string *user_data;
         rgw_zone_set *zones_trace;
+        bool modify_tail;
 
         MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
                  remove_objs(NULL), category(RGW_OBJ_CATEGORY_MAIN), flags(0),
-                 if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false), user_data(nullptr), zones_trace(nullptr) {}
+                 if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false), user_data(nullptr), zones_trace(nullptr),
+                 modify_tail(false) {}
       } meta;
 
       explicit Write(RGWRados::Object *_target) : target(_target) {}
 
       int _do_write_meta(uint64_t size, uint64_t accounted_size,
                      map<std::string, bufferlist>& attrs,
-                     bool assume_noent,
+                     bool assume_noent, bool modify_tail,
                      void *index_op);
       int write_meta(uint64_t size, uint64_t accounted_size,
                      map<std::string, bufferlist>& attrs);
@@ -3644,6 +3656,22 @@ public:
    */
   int pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx);
 
+  /**
+   * Init pool iteration
+   * pool: pool to use
+   * cursor: position to start iteration
+   * ctx: context object to use for the iteration
+   * Returns: 0 on success, -ERR# otherwise.
+   */
+  int pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx);
+
+  /**
+   * Get pool iteration position
+   * ctx: context object to use for the iteration
+   * Returns: string representation of position
+   */
+  string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
+
   /**
    * Iterate over pool return object names, use optional filter
    * ctx: iteration context, initialized with pool_iterate_begin()
index fb61f326bd2f87d33d18b8972ebacac4b124fed7..22bbfe80f363fed7245dcdc3b856671be04d2aee 100644 (file)
@@ -391,7 +391,7 @@ struct grant_type_to_header grants_headers_def[] = {
 
 static bool grants_by_type_check_perm(map<int, string>& grants_by_type, int perm, ACLGrant& grant, int check_perm)
 {
-  if ((perm & check_perm) == perm) {
+  if ((perm & check_perm) == check_perm) {
     grants_by_type_add_one_grant(grants_by_type, check_perm, grant);
     return true;
   }
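
This one-character change fixes an inverted bitmask test in ACL header generation: (perm & check_perm) == perm asks whether perm is a subset of check_perm, but the function needs the opposite — whether the grant's permission set contains the permission being checked. Worked example (illustrative bit values in the style of RGW_PERM_*):

  int perm = RGW_PERM_READ | RGW_PERM_WRITE;  // e.g. 0x01 | 0x02 = 0x03
  int check_perm = RGW_PERM_READ;             // 0x01
  // old: (0x03 & 0x01) == 0x03  -> false: a READ grant was wrongly skipped
  // new: (0x03 & 0x01) == 0x01  -> true:  the READ header is emitted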
index aa3e0c8881f355857d4097ee4ca08c0ae53f211c..d94a5643f340b875b28159f6184d2bed60054beb 100644 (file)
@@ -156,7 +156,7 @@ int RGWRESTConn::get_obj(const rgw_user& uid, req_info *info /* optional */, rgw
                          const real_time *mod_ptr, const real_time *unmod_ptr,
                          uint32_t mod_zone_id, uint64_t mod_pg_ver,
                          bool prepend_metadata, bool get_op, bool rgwx_stat,
-                         bool sync_manifest, RGWGetDataCB *cb,
+                         bool sync_manifest, bool skip_decrypt, RGWGetDataCB *cb,
                          RGWRESTStreamRWRequest **req)
 {
   string url;
@@ -175,6 +175,9 @@ int RGWRESTConn::get_obj(const rgw_user& uid, req_info *info /* optional */, rgw
   if (sync_manifest) {
     params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "sync-manifest", ""));
   }
+  if (skip_decrypt) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "skip-decrypt", ""));
+  }
   if (!obj.key.instance.empty()) {
     const string& instance = obj.key.instance;
     params.push_back(param_pair_t("versionId", instance));
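
The new skip_decrypt argument follows the same pattern as the flags around it: an empty-valued system query parameter is appended when the flag is set, and the S3 GET handler (later in this commit) honors it only for system requests, so multisite sync transfers ciphertext together with the attributes the destination zone needs to decrypt it. Assuming RGW_SYS_PARAM_PREFIX carries its usual "rgwx-" value, the two ends of the handshake are:

  // sender (sync fetch): flag the request            -> ...&rgwx-skip-decrypt=
  params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "skip-decrypt", ""));
  // receiver (RGWGetObj_ObjStore_S3::get_params): only trusted system requests may opt out
  if (s->system_request) {
    skip_decrypt = s->info.args.exists(RGW_SYS_PARAM_PREFIX "skip-decrypt");
  }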
index f4a1005b6a3d676c3f5451ff7b85de23924de827..e9941856f577132603a3fa08009fbec6ac31d505 100644 (file)
@@ -94,7 +94,7 @@ public:
               const ceph::real_time *mod_ptr, const ceph::real_time *unmod_ptr,
               uint32_t mod_zone_id, uint64_t mod_pg_ver,
               bool prepend_metadata, bool get_op, bool rgwx_stat, bool sync_manifest,
-              RGWGetDataCB *cb, RGWRESTStreamRWRequest **req);
+              bool skip_decrypt, RGWGetDataCB *cb, RGWRESTStreamRWRequest **req);
   int complete_request(RGWRESTStreamRWRequest *req, string& etag, ceph::real_time *mtime, uint64_t *psize, map<string, string>& attrs);
 
   int get_resource(const string& resource,
index 099a206e50f6f614becf603c655dcd75f4c2d9aa..e6239347d2bb6931472fa1b8f13d3c0d9496de3b 100644 (file)
@@ -68,6 +68,24 @@ const string RGWOp_Metadata_List::name() {
 }
 
 void RGWOp_Metadata_List::execute() {
+  string marker = s->info.args.get("marker");
+  bool max_entries_specified;
+  string max_entries_str = s->info.args.get("max-entries", &max_entries_specified);
+
+  bool extended_response = (max_entries_specified); /* for backward compatibility: if max-entries
+                                                       is not specified, send the old response format */
+  uint64_t max_entries = 0;
+
+  if (max_entries_specified) {
+    string err;
+    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      dout(5) << "Error parsing max-entries " << max_entries_str << dendl;
+      http_ret = -EINVAL;
+      return;
+    }
+  }
+
   string metadata_key;
 
   frame_metadata_key(s, metadata_key);
@@ -75,19 +93,26 @@ void RGWOp_Metadata_List::execute() {
   void *handle;
   int max = 1000;
 
-  http_ret = store->meta_mgr->list_keys_init(metadata_key, &handle);
+  http_ret = store->meta_mgr->list_keys_init(metadata_key, marker, &handle);
   if (http_ret < 0) {
     dout(5) << "ERROR: can't get key: " << cpp_strerror(http_ret) << dendl;
     return;
   }
 
   bool truncated;
+  uint64_t count = 0;
+
+  if (extended_response) {
+    s->formatter->open_object_section("result");
+  }
 
   s->formatter->open_array_section("keys");
 
+  uint64_t left;
   do {
     list<string> keys;
-    http_ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
+    left = (max_entries_specified ? max_entries - count : max);
+    http_ret = store->meta_mgr->list_keys_next(handle, left, keys, &truncated);
     if (http_ret < 0) {
       dout(5) << "ERROR: list_keys_next(): " << cpp_strerror(http_ret)
              << dendl;
@@ -97,12 +122,21 @@ void RGWOp_Metadata_List::execute() {
     for (list<string>::iterator iter = keys.begin(); iter != keys.end();
         ++iter) {
       s->formatter->dump_string("key", *iter);
+      ++count;
     }
 
-  } while (truncated);
+  } while (truncated && left > 0);
 
   s->formatter->close_section();
 
+  if (extended_response) {
+    encode_json("truncated", truncated, s->formatter);
+    encode_json("count", count, s->formatter);
+    if (truncated) {
+      encode_json("marker", store->meta_mgr->get_marker(handle), s->formatter);
+    }
+    s->formatter->close_section();
+  }
   store->meta_mgr->list_keys_complete(handle);
 
   http_ret = 0;
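
The upshot of this hunk: when the caller supplies max-entries, the handler wraps the key list in a "result" object and reports pagination state; without it, the bare "keys" array is preserved for old clients. Reading off the formatter calls above, the extended response is shaped approximately like:

  // GET /admin/metadata/<section>?max-entries=N[&marker=M]
  // {
  //   "keys":      [ "...", ... ],
  //   "truncated": <bool>,
  //   "count":     <keys returned>,
  //   "marker":    "<opaque cursor>"   // emitted only when truncated
  // }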
index 9c504ca591023275963e9991c1d6d0568755576e..aa5b525d04da84cf73837af223f094fa94b0e9c9 100644 (file)
@@ -129,6 +129,12 @@ int RGWGetObj_ObjStore_S3::get_params()
   // all of the data from its parts. the parts will sync as separate objects
   skip_manifest = s->info.args.exists(RGW_SYS_PARAM_PREFIX "sync-manifest");
 
+  // multisite sync requests should fetch encrypted data, along with the
+  // attributes needed to support decryption on the other zone
+  if (s->system_request) {
+    skip_decrypt = s->info.args.exists(RGW_SYS_PARAM_PREFIX "skip-decrypt");
+  }
+
   return RGWGetObj_ObjStore::get_params();
 }
 
@@ -337,6 +343,10 @@ send_data:
 
 int RGWGetObj_ObjStore_S3::get_decrypt_filter(std::unique_ptr<RGWGetDataCB> *filter, RGWGetDataCB* cb, bufferlist* manifest_bl)
 {
+  if (skip_decrypt) { // bypass decryption for multisite sync requests
+    return 0;
+  }
+
   int res = 0;
   std::unique_ptr<BlockCrypt> block_crypt;
   res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses);
index 0f6a270848ef902a8f6026a6aa93281211e1681e..d45d9a92e3302c751042203ca33d0b0cc5c23bf9 100644 (file)
@@ -775,7 +775,20 @@ class RGWFetchAllMetaCR : public RGWCoroutine {
 
   list<string> sections;
   list<string>::iterator sections_iter;
-  list<string> result;
+
+  struct meta_list_result {
+    list<string> keys;
+    string marker;
+    uint64_t count{0};
+    bool truncated{false};
+
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("keys", keys, obj);
+      JSONDecoder::decode_json("marker", marker, obj);
+      JSONDecoder::decode_json("count", count, obj);
+      JSONDecoder::decode_json("truncated", truncated, obj);
+    }
+  } result;
   list<string>::iterator iter;
 
   std::unique_ptr<RGWShardedOmapCRManager> entries_index;
@@ -785,6 +798,8 @@ class RGWFetchAllMetaCR : public RGWCoroutine {
   bool lost_lock;
   bool failed;
 
+  string marker;
+
   map<uint32_t, rgw_meta_sync_marker>& markers;
 
 public:
@@ -863,41 +878,47 @@ public:
       rearrange_sections();
       sections_iter = sections.begin();
       for (; sections_iter != sections.end(); ++sections_iter) {
-        yield {
-         string entrypoint = string("/admin/metadata/") + *sections_iter;
-          /* FIXME: need a better scaling solution here, requires streaming output */
-         call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
-                                      entrypoint, NULL, &result));
-       }
-        if (get_ret_status() < 0) {
-          ldout(cct, 0) << "ERROR: failed to fetch metadata section: " << *sections_iter << dendl;
-          yield entries_index->finish();
-          yield lease_cr->go_down();
-          drain_all();
-          return set_cr_error(get_ret_status());
-        }
-        iter = result.begin();
-        for (; iter != result.end(); ++iter) {
-          if (!lease_cr->is_locked()) {
-            lost_lock = true;
-            break;
+        do {
+          yield {
+#define META_FULL_SYNC_CHUNK_SIZE "1000"
+            string entrypoint = string("/admin/metadata/") + *sections_iter;
+            rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE },
+              { "marker", result.marker.c_str() },
+              { NULL, NULL } };
+            result.keys.clear();
+            call(new RGWReadRESTResourceCR<meta_list_result >(cct, conn, sync_env->http_manager,
+                                                              entrypoint, pairs, &result));
           }
-          yield; // allow entries_index consumer to make progress
-
-          ldout(cct, 20) << "list metadata: section=" << *sections_iter << " key=" << *iter << dendl;
-          string s = *sections_iter + ":" + *iter;
-          int shard_id;
-          RGWRados *store = sync_env->store;
-          int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id);
-          if (ret < 0) {
-            ldout(cct, 0) << "ERROR: could not determine shard id for " << *sections_iter << ":" << *iter << dendl;
-            ret_status = ret;
-            break;
+          if (get_ret_status() < 0) {
+            ldout(cct, 0) << "ERROR: failed to fetch metadata section: " << *sections_iter << dendl;
+            yield entries_index->finish();
+            yield lease_cr->go_down();
+            drain_all();
+            return set_cr_error(get_ret_status());
           }
-          if (!entries_index->append(s, shard_id)) {
-            break;
+          iter = result.keys.begin();
+          for (; iter != result.keys.end(); ++iter) {
+            if (!lease_cr->is_locked()) {
+              lost_lock = true;
+              break;
+            }
+            yield; // allow entries_index consumer to make progress
+
+            ldout(cct, 20) << "list metadata: section=" << *sections_iter << " key=" << *iter << dendl;
+            string s = *sections_iter + ":" + *iter;
+            int shard_id;
+            RGWRados *store = sync_env->store;
+            int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id);
+            if (ret < 0) {
+              ldout(cct, 0) << "ERROR: could not determine shard id for " << *sections_iter << ":" << *iter << dendl;
+              ret_status = ret;
+              break;
+            }
+            if (!entries_index->append(s, shard_id)) {
+              break;
+            }
           }
-       }
+        } while (result.truncated);
       }
       yield {
         if (!entries_index->finish()) {
index 41a5492b93089129c22db4f626d12d7f3c4e76b4..ebe795e7d6f6bb5deac831d4d87000ce9171b96d 100644 (file)
@@ -11,6 +11,7 @@
 #include "common/Formatter.h"
 #include "common/ceph_json.h"
 #include "common/RWLock.h"
+#include "common/backport14.h"
 #include "rgw_rados.h"
 #include "rgw_acl.h"
 
@@ -2772,13 +2773,19 @@ public:
     pool = store->get_zone_params().user_uid_pool;
   }
 
-  int list_keys_init(RGWRados *store, void **phandle) override
+  int list_keys_init(RGWRados *store, const string& marker, void **phandle) override
   {
-    list_keys_info *info = new list_keys_info;
+    auto info = ceph::make_unique<list_keys_info>();
 
     info->store = store;
 
-    *phandle = (void *)info;
+    int ret = store->list_raw_objects_init(store->get_zone_params().user_uid_pool, marker,
+                                           &info->ctx);
+    if (ret < 0) {
+      return ret;
+    }
+
+    *phandle = (void *)info.release();
 
     return 0;
   }
@@ -2794,8 +2801,8 @@ public:
 
     list<string> unfiltered_keys;
 
-    int ret = store->list_raw_objects(store->get_zone_params().user_uid_pool, no_filter,
-                                      max, info->ctx, unfiltered_keys, truncated);
+    int ret = store->list_raw_objects_next(no_filter, max, info->ctx,
+                                           unfiltered_keys, truncated);
     if (ret < 0 && ret != -ENOENT)
      return ret;
     if (ret == -ENOENT) {
@@ -2821,6 +2828,11 @@ public:
     list_keys_info *info = static_cast<list_keys_info *>(handle);
     delete info;
   }
+
+  string get_marker(void *handle) override {
+    list_keys_info *info = static_cast<list_keys_info *>(handle);
+    return info->store->list_raw_objs_get_cursor(info->ctx);
+  }
 };
 
 void rgw_user_init(RGWRados *store)
index 1b1719de5776a23a594e0865901948eb16999352..5f82b33af921badfbfbf7df70b7088a68b8f58d2 100644 (file)
@@ -125,7 +125,7 @@ if(WIN32)
     endif()
   endif()
 else()
-  option(WITH_SSE42 "build with SSE4.2" ON)
+  option(WITH_SSE42 "build with SSE4.2" OFF)
   if(WITH_SSE42)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
   endif()
index 42bedc2445a5464cc74953e443de53164f45e4ee..2c8cb097d93f2fc0f25c28cba7ba2eed674f7bd5 100644 (file)
@@ -327,6 +327,15 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
   table0_[c >> 24];
 }
 
+#if defined(HAVE_SSE42) && defined(__GNUC__)
+#if defined(__clang__)
+#if __has_cpp_attribute(gnu::target)
+__attribute__ ((target ("sse4.2")))
+#endif
+#else  // gcc supports this since 4.4
+__attribute__ ((target ("sse4.2")))
+#endif
+#endif
 static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
 #ifdef __SSE4_2__
 #ifdef __LP64__
@@ -397,8 +406,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
 static bool isSSE42() {
 #if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
   uint32_t c_;
-  uint32_t d_;
-  __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
   return c_ & (1U << 20);  // copied from CpuId.h in Folly.
 #elif defined(_WIN64)
   int info[4];
@@ -425,7 +433,7 @@ bool IsFastCrc32Supported() {
 #endif
 }
 
-Function ChosenExtend = Choose_Extend();
+static Function ChosenExtend = Choose_Extend();
 
 uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
   return ChosenExtend(crc, buf, size);
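
The rocksdb changes above pair with the WITH_SSE42 default flip: since the tree is no longer built with -msse4.2 globally, Fast_CRC32 alone is compiled for SSE4.2 via a per-function target attribute, and isSSE42() probes CPUID at startup so ChosenExtend picks an implementation once (the asm fix also declares edx as a clobber instead of capturing an unused output). A minimal, self-contained sketch of this dispatch pattern — not rocksdb's exact code — assuming GCC/Clang on x86-64:

  #include <cpuid.h>
  #include <cstddef>
  #include <cstdint>

  __attribute__((target("sse4.2")))
  static uint32_t crc32c_hw(uint32_t crc, const char* p, size_t n) {
    while (n--)  // hardware CRC-32C, one byte per step for brevity
      crc = __builtin_ia32_crc32qi(crc, (unsigned char)*p++);
    return crc;
  }

  static uint32_t crc32c_sw(uint32_t crc, const char* p, size_t n) {
    while (n--) {  // bitwise CRC-32C, reflected polynomial 0x82F63B78
      crc ^= (unsigned char)*p++;
      for (int i = 0; i < 8; ++i)
        crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
    return crc;
  }

  static bool has_sse42() {
    unsigned a, b, c, d;
    return __get_cpuid(1, &a, &b, &c, &d) && (c & (1u << 20));  // CPUID.1:ECX bit 20
  }

  // resolved once during static initialization, as ChosenExtend is above
  static uint32_t (*crc32c)(uint32_t, const char*, size_t) =
      has_sse42() ? crc32c_hw : crc32c_sw;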
index 69d7679d9d7e910e63535b7484e10e149853e312..54c63733f25308ea28f0ab1c8e34491cbd8dc3d3 100644 (file)
@@ -6,9 +6,9 @@ ls on empty pool never containing images
   $ rados -p rbd rm rbd_directory >/dev/null 2>&1 || true
   $ rbd ls
   $ rbd ls --format json
-  [] (no-eol)
+  []
   $ rbd ls --format xml
-  <images></images> (no-eol)
+  <images></images>
 
 create
 =======
index a509ffd4e2d3d3aa9afecfc458daaf581c63146b..06cfa294f814be5e4229a0fc64f22ebef87cf01c 100644 (file)
@@ -141,7 +141,7 @@ Skip test on FreeBSD as it generates different output there.
     -p [ --pool ] arg    pool name
     --image arg          image name
     --snap arg           snapshot name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help clone
@@ -301,7 +301,7 @@ Skip test on FreeBSD as it generates different output there.
     --snap arg            snapshot name
     --from-snap arg       snapshot starting point
     --whole-object        compare whole object
-    --format arg          output format [plain, json, or xml]
+    --format arg          output format (plain, json, or xml) [default: plain]
     --pretty-format       pretty formatting (json and xml)
   
   rbd help disk-usage
@@ -320,7 +320,7 @@ Skip test on FreeBSD as it generates different output there.
     -p [ --pool ] arg     pool name
     --image arg           image name
     --snap arg            snapshot name
-    --format arg          output format [plain, json, or xml]
+    --format arg          output format (plain, json, or xml) [default: plain]
     --pretty-format       pretty formatting (json and xml)
     --from-snap arg       snapshot starting point
   
@@ -453,7 +453,7 @@ Skip test on FreeBSD as it generates different output there.
   Optional arguments
     -p [ --pool ] arg    pool name
     --image arg          image name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help image-meta remove
@@ -575,7 +575,7 @@ Skip test on FreeBSD as it generates different output there.
     --image arg           image name
     --snap arg            snapshot name
     --image-id arg        image id
-    --format arg          output format [plain, json, or xml]
+    --format arg          output format (plain, json, or xml) [default: plain]
     --pretty-format       pretty formatting (json and xml)
   
   rbd help journal client disconnect
@@ -654,7 +654,7 @@ Skip test on FreeBSD as it generates different output there.
     -p [ --pool ] arg    pool name
     --image arg          image name
     --journal arg        journal name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help journal inspect
@@ -706,7 +706,7 @@ Skip test on FreeBSD as it generates different output there.
     -p [ --pool ] arg    pool name
     --image arg          image name
     --journal arg        journal name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help list
@@ -721,7 +721,7 @@ Skip test on FreeBSD as it generates different output there.
   Optional arguments
     -l [ --long ]        long listing format
     -p [ --pool ] arg    pool name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help lock add
@@ -754,7 +754,7 @@ Skip test on FreeBSD as it generates different output there.
   Optional arguments
     -p [ --pool ] arg    pool name
     --image arg          image name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help lock remove
@@ -893,7 +893,7 @@ Skip test on FreeBSD as it generates different output there.
   Optional arguments
     -p [ --pool ] arg    pool name
     --image arg          image name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help mirror pool demote
@@ -945,7 +945,7 @@ Skip test on FreeBSD as it generates different output there.
   
   Optional arguments
     -p [ --pool ] arg    pool name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help mirror pool peer add
@@ -1019,7 +1019,7 @@ Skip test on FreeBSD as it generates different output there.
   
   Optional arguments
     -p [ --pool ] arg    pool name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
     --verbose            be verbose
   
@@ -1165,7 +1165,7 @@ Skip test on FreeBSD as it generates different output there.
   Show the rbd images mapped by the kernel.
   
   Optional arguments
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help snap create
@@ -1227,7 +1227,7 @@ Skip test on FreeBSD as it generates different output there.
     -p [ --pool ] arg    pool name
     --image arg          image name
     --image-id arg       image id
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help snap protect
@@ -1351,7 +1351,7 @@ Skip test on FreeBSD as it generates different output there.
   Optional arguments
     -p [ --pool ] arg    pool name
     --image arg          image name
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help trash list
@@ -1368,7 +1368,7 @@ Skip test on FreeBSD as it generates different output there.
     -p [ --pool ] arg    pool name
     -a [ --all ]         list images from all sources
     -l [ --long ]        long listing format
-    --format arg         output format [plain, json, or xml]
+    --format arg         output format (plain, json, or xml) [default: plain]
     --pretty-format      pretty formatting (json and xml)
   
   rbd help trash move
index 50c59cf058b199fd891d6553bbdc38fddd03b9f8..a13c183293116a15d02bcd097fa144288745d6bd 100644 (file)
@@ -100,7 +100,8 @@ TEST_F(TestClsRbd, get_all_features)
 
   uint64_t all_features = 0;
   ASSERT_EQ(0, get_all_features(&ioctx, oid, &all_features));
-  ASSERT_EQ(RBD_FEATURES_ALL, all_features);
+  ASSERT_EQ(static_cast<uint64_t>(RBD_FEATURES_ALL),
+            static_cast<uint64_t>(all_features & RBD_FEATURES_ALL));
 
   ioctx.close();
 }
index b3630792aef81bd61750fee4557ffc13cfa13ba9..5db18314c4bd9789d2d7085c34f290b54c15b9c5 100644 (file)
 class Item : public LRUObject {
 public:
   int id;
-  explicit Item(int v) : id(v) {}
+  Item() : id(0) {}
+  Item(int i) : id(i) {}
+  void set(int i) {id = i;}
 };
 
 
 TEST(lru, InsertTop) {
-  LRU lru = LRU(10);
-
-  lru.lru_set_midpoint(.5); // 50% of 10 elements.
-  for (int i=0; i<100; i++) {
-    lru.lru_insert_top(new Item(i));
+  LRU lru;
+  static const int n = 100;
+  Item items[n];
+
+  lru.lru_set_midpoint(.5); // 50% of elements.
+  for (int i=0; i<n; i++) {
+    items[i].set(i);
+    lru.lru_insert_top(&items[i]);
   }
-  ASSERT_EQ(5U, lru.lru_get_top());
-  ASSERT_EQ(95U, lru.lru_get_bot());
+  ASSERT_EQ(50U, lru.lru_get_top());
+  ASSERT_EQ(50U, lru.lru_get_bot());
   ASSERT_EQ(100U, lru.lru_get_size());
 
   ASSERT_EQ(0, (static_cast<Item*>(lru.lru_expire()))->id);
 }
 
 TEST(lru, InsertMid) {
-  LRU lru = LRU(10);
-
-  for (int i=0; i<100; i++) {
-    lru.lru_insert_mid(new Item(i));
+  LRU lru;
+  static const int n = 102;
+  Item items[n];
+
+  lru.lru_set_midpoint(.7); // 70% of elements.
+  for (int i=0; i<n; i++) {
+    items[i].set(i);
+    lru.lru_insert_mid(&items[i]);
   }
-  ASSERT_EQ(0U, lru.lru_get_top());
-  ASSERT_EQ(100U, lru.lru_get_bot());
-  ASSERT_EQ(100U, lru.lru_get_size());
+  ASSERT_EQ(71U, lru.lru_get_top());
+  ASSERT_EQ(31U, lru.lru_get_bot());
+  ASSERT_EQ(102U, lru.lru_get_size());
 
   ASSERT_EQ(0, (static_cast<Item*>(lru.lru_expire()))->id);
 }
 
 TEST(lru, InsertBot) {
-  LRU lru = LRU(10);
-
-  for (int i=0; i<100; i++) {
-    lru.lru_insert_bot(new Item(i));
+  LRU lru;
+  static const int n = 100;
+  Item items[n];
+
+  lru.lru_set_midpoint(.7); // 70% of elements.
+  for (int i=0; i<n; i++) {
+    items[i].set(i);
+    lru.lru_insert_bot(&items[i]);
   }
-  ASSERT_EQ(0U, lru.lru_get_top());
-  ASSERT_EQ(100U, lru.lru_get_bot());
+  ASSERT_EQ(70U, lru.lru_get_top());
+  ASSERT_EQ(30U, lru.lru_get_bot());
   ASSERT_EQ(100U, lru.lru_get_size());
 
   ASSERT_EQ(99, (static_cast<Item*>(lru.lru_expire()))->id);
 }
 
 TEST(lru, Adjust) {
-  LRU lru = LRU(10);
-
-  lru.lru_set_midpoint(.6); // 60% of 10 elements.
-  for (int i=0; i<100; i++) {
-    lru.lru_touch(new Item(i));
+  LRU lru;
+  static const int n = 100;
+  Item items[n];
+
+  lru.lru_set_midpoint(.6); // 60% of elements.
+  for (int i=0; i<n; i++) {
+    items[i].set(i);
+    lru.lru_insert_top(&items[i]);
+    if (i % 5 == 0)
+      items[i].lru_pin();
   }
-  ASSERT_EQ(6U, lru.lru_get_top());
-  ASSERT_EQ(94U, lru.lru_get_bot());
+  ASSERT_EQ(48U, lru.lru_get_top()); /* 60% of unpinned */
+  ASSERT_EQ(52U, lru.lru_get_bot());
   ASSERT_EQ(100U, lru.lru_get_size());
 
-  lru.lru_clear();
-
-  lru.lru_set_midpoint(1.2); // 120% of 10 elements.
-  for (int i=0; i<100; i++) {
-    lru.lru_touch(new Item(i));
-  }
-  ASSERT_EQ(12U, lru.lru_get_top());
-  ASSERT_EQ(88U, lru.lru_get_bot());
-  ASSERT_EQ(100U, lru.lru_get_size());
+  ASSERT_EQ(1, (static_cast<Item*>(lru.lru_expire()))->id);
+  ASSERT_EQ(1U, lru.lru_get_pintail());
+  ASSERT_EQ(47U, lru.lru_get_top()); /* 60% of unpinned */
+  ASSERT_EQ(51U, lru.lru_get_bot());
+  ASSERT_EQ(99U, lru.lru_get_size());
+  ASSERT_EQ(2, (static_cast<Item*>(lru.lru_expire()))->id);
+  ASSERT_EQ(1U, lru.lru_get_pintail());
+  ASSERT_EQ(46U, lru.lru_get_top()); /* 60% of unpinned */
+  ASSERT_EQ(51U, lru.lru_get_bot());
+  ASSERT_EQ(98U, lru.lru_get_size());
+  ASSERT_EQ(3, (static_cast<Item*>(lru.lru_expire()))->id);
+  ASSERT_EQ(4, (static_cast<Item*>(lru.lru_expire()))->id);
+  ASSERT_EQ(6, (static_cast<Item*>(lru.lru_expire()))->id);
+  ASSERT_EQ(2U, lru.lru_get_pintail());
+  ASSERT_EQ(45U, lru.lru_get_top()); /* 60% of unpinned */
+  ASSERT_EQ(48U, lru.lru_get_bot());
+  ASSERT_EQ(95U, lru.lru_get_size());
 }
 
 TEST(lru, Pinning) {
-  LRU lru = LRU();
+  LRU lru;
 
-  Item *ob0 = new Item(0);
-  Item *ob1 = new Item(1);
+  Item ob0(0), ob1(1);
 
   // test before ob1 is in an LRU
-  ob1->lru_pin();
-  ASSERT_FALSE(ob1->lru_is_expireable());
+  ob1.lru_pin();
+  ASSERT_FALSE(ob1.lru_is_expireable());
 
-  ob1->lru_unpin();
-  ASSERT_TRUE(ob1->lru_is_expireable());
+  ob1.lru_unpin();
+  ASSERT_TRUE(ob1.lru_is_expireable());
 
   // test when ob1 is in an LRU
-  lru.lru_touch(ob0);
-  lru.lru_touch(ob1);
+  lru.lru_insert_top(&ob0);
+  lru.lru_insert_top(&ob1);
 
-  ob1->lru_pin();
-  ob1->lru_pin(); // Verify that, one incr.
+  ob1.lru_pin();
+  ob1.lru_pin(); // second pin of the same item; pinned count stays at one
   ASSERT_EQ(1U, lru.lru_get_num_pinned());
-  ASSERT_FALSE(ob1->lru_is_expireable());
+  ASSERT_FALSE(ob1.lru_is_expireable());
 
-  ob1->lru_unpin();
-  ob1->lru_unpin(); // Verify that, one decr.
+  ob1.lru_unpin();
+  ob1.lru_unpin(); // second unpin releases the double pin; count drops once
   ASSERT_EQ(0U, lru.lru_get_num_pinned());
-  ASSERT_TRUE(ob1->lru_is_expireable());
+  ASSERT_TRUE(ob1.lru_is_expireable());
 
   ASSERT_EQ(0, (static_cast<Item*>(lru.lru_expire()))->id);
-  ob0->lru_pin();
+  ob0.lru_pin();
   ASSERT_EQ(1, (static_cast<Item*>(lru.lru_expire()))->id);
 }
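
The rewritten tests capture two behavioral changes in LRU: items are intrusive and caller-owned (hence the default constructor, set(), and stack arrays instead of new), and the constructor no longer takes a capacity — lru_set_midpoint() now sets a fraction of the current number of unpinned items rather than of a fixed size. The assertion arithmetic, for reference:

  // InsertTop: midpoint 0.5, 100 items            -> top = 50, bot = 50
  // Adjust:    midpoint 0.6, 100 items, 20 pinned -> top = 0.6 * 80 = 48, bot = 52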
 
index 53fcb694e00e35cdd6375a8a9d3dd133844768a7..e492708fb2e31e464edbc5c15f9a44850486de0c 100644 (file)
@@ -1739,8 +1739,8 @@ TEST(LibCephFS, ClearSetuid) {
   Fh *fh;
   Inode *in;
   struct ceph_statx stx;
-  const mode_t after_mode = S_IRWXU | S_IRWXG;
-  const mode_t before_mode = S_IRWXU | S_IRWXG | S_ISUID | S_ISGID;
+  const mode_t after_mode = S_IRWXU;
+  const mode_t before_mode = S_IRWXU | S_ISUID | S_ISGID;
   const unsigned want = CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_MODE;
   UserPerm *usercred = ceph_mount_perms(cmount);
 
@@ -1788,6 +1788,35 @@ TEST(LibCephFS, ClearSetuid) {
   ASSERT_TRUE(stx.stx_mask & CEPH_STATX_MODE);
   ASSERT_EQ(stx.stx_mode & (mode_t)ALLPERMS, after_mode);
 
+  /* test chown with supplementary groups, and chown with/without exe bit */
+  uid_t u = 65534;
+  gid_t g = 65534;
+  gid_t gids[] = {65533,65532};
+  UserPerm *altcred = ceph_userperm_new(u, g, sizeof gids / sizeof gids[0], gids);
+  stx.stx_uid = u;
+  stx.stx_gid = g;
+  mode_t m = S_ISGID|S_ISUID|S_IRUSR|S_IWUSR;
+  stx.stx_mode = m;
+  ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_STATX_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID, rootcred), 0);
+  ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+  ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m);
+  /* not dropped without exe bit */
+  stx.stx_gid = gids[0];
+  ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_SETATTR_GID, altcred), 0);
+  ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+  ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m);
+  /* now check dropped with exe bit */
+  m = S_ISGID|S_ISUID|S_IRWXU;
+  stx.stx_mode = m;
+  ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_STATX_MODE, altcred), 0);
+  ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+  ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m);
+  stx.stx_gid = gids[1];
+  ASSERT_EQ(ceph_ll_setattr(cmount, in, &stx, CEPH_SETATTR_GID, altcred), 0);
+  ASSERT_EQ(ceph_ll_getattr(cmount, in, &stx, CEPH_STATX_MODE, 0, altcred), 0);
+  ASSERT_EQ(stx.stx_mode&(mode_t)ALLPERMS, m&(S_IRWXU|S_IRWXG|S_IRWXO));
+  ceph_userperm_destroy(altcred);
+
   ASSERT_EQ(ceph_ll_close(cmount, fh), 0);
   ceph_shutdown(cmount);
 }
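
The added block tests the setuid/setgid clearing rules for ownership changes by an unprivileged caller, including the subtlety the asserts encode: while the file lacks its group-execute bit, a gid change leaves S_ISUID/S_ISGID alone (on Linux, setgid without group-exec traditionally marks mandatory locking rather than an executable); once the execute bits are present, the same change strips both. In mode terms:

  // mode 06600 (rw-, setuid+setgid): chgrp by owner -> mode unchanged
  // mode 06700 (rwx, setuid+setgid): chgrp by owner -> 0700 (both special bits cleared)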
index 33e162a0f5e6324963080fce7accc2ba75d7dde9..54be1487faed851a2472e30aa7566ab1b57e7e88 100644 (file)
@@ -164,7 +164,7 @@ TEST_F(TestJournalEntries, AioDiscard) {
   REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
 
   CephContext* cct = reinterpret_cast<CephContext*>(_rados.cct());
-  REQUIRE(!cct->_conf->rbd_skip_partial_discard);
+  REQUIRE(!cct->_conf->get_val<bool>("rbd_skip_partial_discard"));
 
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
@@ -175,7 +175,7 @@ TEST_F(TestJournalEntries, AioDiscard) {
   C_SaferCond cond_ctx;
   auto c = librbd::io::AioCompletion::create(&cond_ctx);
   c->get();
-  ictx->io_work_queue->aio_discard(c, 123, 234, cct->_conf->rbd_skip_partial_discard);
+  ictx->io_work_queue->aio_discard(c, 123, 234, ictx->skip_partial_discard);
   ASSERT_EQ(0, c->wait_for_complete());
   c->put();
 
index 7c735695b8e89fae73fbcbbeaf5ac3e555318e73..d8c651858a822f9d58999825fcf14858ad1242c9 100644 (file)
@@ -143,7 +143,8 @@ TEST_F(TestJournalReplay, AioDiscardEvent) {
 
   // inject a discard operation into the journal
   inject_into_journal(ictx,
-                      librbd::journal::AioDiscardEvent(0, payload.size(), ictx->skip_partial_discard));
+                      librbd::journal::AioDiscardEvent(0, payload.size(),
+                                                       ictx->skip_partial_discard));
   close_image(ictx);
 
   // re-open the journal so that it replays the new entry
@@ -155,7 +156,7 @@ TEST_F(TestJournalReplay, AioDiscardEvent) {
                                 librbd::io::ReadResult{read_result}, 0);
   ASSERT_EQ(0, aio_comp->wait_for_complete());
   aio_comp->release();
-  if (ictx->cct->_conf->rbd_skip_partial_discard) {
+  if (ictx->skip_partial_discard) {
     ASSERT_EQ(payload, read_payload);
   } else {
     ASSERT_EQ(std::string(read_payload.size(), '\0'), read_payload);
@@ -170,9 +171,11 @@ TEST_F(TestJournalReplay, AioDiscardEvent) {
 
   // replay several events and check the commit position
   inject_into_journal(ictx,
-                      librbd::journal::AioDiscardEvent(0, payload.size(), ictx->cct->_conf->rbd_skip_partial_discard));
+                      librbd::journal::AioDiscardEvent(0, payload.size(),
+                                                       ictx->skip_partial_discard));
   inject_into_journal(ictx,
-                      librbd::journal::AioDiscardEvent(0, payload.size(), ictx->cct->_conf->rbd_skip_partial_discard));
+                      librbd::journal::AioDiscardEvent(0, payload.size(),
+                                                       ictx->skip_partial_discard));
   close_image(ictx);
 
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
@@ -183,7 +186,8 @@ TEST_F(TestJournalReplay, AioDiscardEvent) {
 
   // verify lock ordering constraints
   aio_comp = new librbd::io::AioCompletion();
-  ictx->io_work_queue->aio_discard(aio_comp, 0, read_payload.size(), ictx->cct->_conf->rbd_skip_partial_discard);
+  ictx->io_work_queue->aio_discard(aio_comp, 0, read_payload.size(),
+                                   ictx->skip_partial_discard);
   ASSERT_EQ(0, aio_comp->wait_for_complete());
   aio_comp->release();
 }
index 174e97be49bf8e8a88c71d33ee3d9ffc1e6ef48a..8f2f5f6ef2774bf5566e195e473358a5e1008a58 100644 (file)
@@ -34,8 +34,9 @@ struct BreakRequest<librbd::MockImageCtx> {
                               uint32_t blacklist_expire_seconds,
                               bool force_break_lock, Context *on_finish) {
     CephContext *cct = reinterpret_cast<CephContext *>(ioctx.cct());
-    EXPECT_EQ(cct->_conf->rbd_blacklist_on_break_lock, blacklist_locker);
-    EXPECT_EQ(cct->_conf->rbd_blacklist_expire_seconds,
+    EXPECT_EQ(cct->_conf->get_val<bool>("rbd_blacklist_on_break_lock"),
+              blacklist_locker);
+    EXPECT_EQ(cct->_conf->get_val<int64_t>("rbd_blacklist_expire_seconds"),
               (int)blacklist_expire_seconds);
     EXPECT_FALSE(force_break_lock);
     assert(s_instance != nullptr);
index 483b063e56f330a75c793f43013788ab3cc9d1f6..be439b985f722badaf5b2abcded7ffb0c57f80a6 100644 (file)
@@ -105,7 +105,8 @@ struct MockImageCtx {
       mirroring_resync_after_disconnect(
           image_ctx.mirroring_resync_after_disconnect),
       mirroring_replay_delay(image_ctx.mirroring_replay_delay),
-      non_blocking_aio(image_ctx.non_blocking_aio)
+      non_blocking_aio(image_ctx.non_blocking_aio),
+      blkin_trace_all(image_ctx.blkin_trace_all)
   {
     md_ctx.dup(image_ctx.md_ctx);
     data_ctx.dup(image_ctx.data_ctx);
@@ -297,6 +298,7 @@ struct MockImageCtx {
   bool mirroring_resync_after_disconnect;
   int mirroring_replay_delay;
   bool non_blocking_aio;
+  bool blkin_trace_all;
 };
 
 } // namespace librbd
index f29ff933fdac11e9fb9065ba9014c1b25cba41f5..7b49b71a8d8bda79ca2cabffe64bd6ddaf3d24c4 100644 (file)
@@ -32,6 +32,7 @@ struct MockMirroringWatcher : public MirroringWatcher<> {
 } // anonymous namespace
 
 using ::testing::_;
+using ::testing::AtLeast;
 using ::testing::Invoke;
 using ::testing::StrEq;
 using ::testing::WithArg;
@@ -72,10 +73,13 @@ public:
 };
 
 TEST_F(TestMirroringWatcher, ModeUpdated) {
-  EXPECT_CALL(*m_image_watcher, handle_mode_updated(cls::rbd::MIRROR_MODE_DISABLED));
+  EXPECT_CALL(*m_image_watcher,
+              handle_mode_updated(cls::rbd::MIRROR_MODE_DISABLED))
+    .Times(AtLeast(1));
 
   C_SaferCond ctx;
-  MockMirroringWatcher::notify_mode_updated(m_ioctx, cls::rbd::MIRROR_MODE_DISABLED, &ctx);
+  MockMirroringWatcher::notify_mode_updated(
+    m_ioctx, cls::rbd::MIRROR_MODE_DISABLED, &ctx);
   ASSERT_EQ(0, ctx.wait());
 }
 
@@ -83,12 +87,13 @@ TEST_F(TestMirroringWatcher, ImageStatusUpdated) {
   EXPECT_CALL(*m_image_watcher,
               handle_image_updated(cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
                                    StrEq("image id"),
-                                   StrEq("global image id")));
+                                   StrEq("global image id")))
+    .Times(AtLeast(1));
 
   C_SaferCond ctx;
-  MockMirroringWatcher::notify_image_updated(m_ioctx,
-                                             cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
-                                             "image id", "global image id", &ctx);
+  MockMirroringWatcher::notify_image_updated(
+    m_ioctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED, "image id",
+    "global image id", &ctx);
   ASSERT_EQ(0, ctx.wait());
 }
 
index dd8d5a8cb0c63ca383fc836e2276bc21190bbc27..7cc6e12cab9b9ac0c5d07e88f0322f893d6627e2 100644 (file)
@@ -718,7 +718,7 @@ TEST_F(TestInternal, DiscardCopyup)
   REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
 
   CephContext* cct = reinterpret_cast<CephContext*>(_rados.cct());
-  REQUIRE(!cct->_conf->rbd_skip_partial_discard);
+  REQUIRE(!cct->_conf->get_val<bool>("rbd_skip_partial_discard"));
 
   m_image_name = get_temp_image_name();
   m_image_size = 1 << 14;
index 60de6b7ddae91d146682e9e252b59ba8ba40c9a2..d7e8a18fe41c893ac5647100bbf8f87cfbd2a343 100644 (file)
@@ -180,7 +180,7 @@ TEST(pgmap, dump_object_stat_sum_0)
   unsigned col = 0;
   ASSERT_EQ(stringify(si_t(sum.num_bytes)), tbl.get(0, col++));
   ASSERT_EQ(percentify(used_percent), tbl.get(0, col++));
-  ASSERT_EQ(stringify(si_t(avail)), tbl.get(0, col++));
+  ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
   ASSERT_EQ(stringify(sum.num_objects), tbl.get(0, col++));
   ASSERT_EQ(stringify(si_t(sum.num_objects_dirty)), tbl.get(0, col++));
   ASSERT_EQ(stringify(si_t(sum.num_rd)), tbl.get(0, col++));
@@ -210,7 +210,7 @@ TEST(pgmap, dump_object_stat_sum_1)
   unsigned col = 0;
   ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
   ASSERT_EQ(percentify(0), tbl.get(0, col++));
-  ASSERT_EQ(stringify(si_t(avail)), tbl.get(0, col++));
+  ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
   ASSERT_EQ(stringify(0), tbl.get(0, col++));
   ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
   ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
@@ -239,6 +239,6 @@ TEST(pgmap, dump_object_stat_sum_2)
   unsigned col = 0;
   ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
   ASSERT_EQ(percentify(0), tbl.get(0, col++));
-  ASSERT_EQ(stringify(si_t(avail)), tbl.get(0, col++));
+  ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
   ASSERT_EQ(stringify(0), tbl.get(0, col++));
 }
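
These expectation changes pin down the corrected "ceph df" math: the MAX AVAIL a pool reports is the raw free space divided by the pool's replication factor, because each logical byte stored consumes pool.size raw bytes. For example:

  // avail = 300 GiB of raw space, replicated pool with size = 3
  // reported MAX AVAIL = avail / pool.size = 100 GiB of user-visible capacity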
index bf29f87c4e55fda6d174835da9f36b3ade15f24e..91d78f0aa83a625e72c43f9d83f525a113e4f169 100644 (file)
@@ -2392,16 +2392,25 @@ TEST_F(PGLogTestRebuildMissing, MissingNotInLog) {
 }
 
 
-class PGLogMergeDupsTest : public ::testing::Test, protected PGLog {
+class PGLogMergeDupsTest : protected PGLog, public StoreTestFixture {
 
 public:
 
-  PGLogMergeDupsTest() : PGLog(g_ceph_context) { }
+  PGLogMergeDupsTest() : PGLog(g_ceph_context), StoreTestFixture("memstore") { }
 
-  void SetUp() override { }
+  void SetUp() override {
+    StoreTestFixture::SetUp();
+    ObjectStore::Sequencer osr(__func__);
+    ObjectStore::Transaction t;
+    test_coll = coll_t(spg_t(pg_t(1, 1)));
+    t.create_collection(test_coll, 0);
+    store->apply_transaction(&osr, std::move(t));
+  }
 
   void TearDown() override {
+    test_disk_roundtrip();
     clear();
+    StoreTestFixture::TearDown();
   }
 
   static pg_log_dup_t create_dup_entry(uint a, uint b) {
@@ -2437,11 +2446,13 @@ public:
 
   void add_dups(uint a, uint b) {
     log.dups.push_back(create_dup_entry(a, b));
+    write_from_dups = MIN(write_from_dups, log.dups.back().version);
   }
 
   void add_dups(const std::vector<pg_log_dup_t>& l) {
     for (auto& i : l) {
       log.dups.push_back(i);
+      write_from_dups = MIN(write_from_dups, log.dups.back().version);
     }
   }
 
@@ -2466,6 +2477,36 @@ public:
       EXPECT_EQ(1u, log.dup_index.count(i.reqid));
     }
   }
+
+  void test_disk_roundtrip() {
+    ObjectStore::Sequencer osr(__func__);
+    ObjectStore::Transaction t;
+    hobject_t hoid;
+    hoid.pool = 1;
+    hoid.oid = "log";
+    ghobject_t log_oid(hoid);
+    map<string, bufferlist> km;
+    write_log_and_missing(t, &km, test_coll, log_oid, false);
+    if (!km.empty()) {
+      t.omap_setkeys(test_coll, log_oid, km);
+    }
+    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+
+    auto orig_dups = log.dups;
+    clear();
+    ostringstream err;
+    read_log_and_missing(store.get(), test_coll, test_coll, log_oid,
+                        pg_info_t(), false, err, false);
+    ASSERT_EQ(orig_dups.size(), log.dups.size());
+    ASSERT_EQ(orig_dups, log.dups);
+    auto dups_it = log.dups.begin();
+    for (auto orig_dup : orig_dups) {
+      ASSERT_EQ(orig_dup, *dups_it);
+      ++dups_it;
+    }
+  }
+
+  coll_t test_coll;
 };
 
 TEST_F(PGLogMergeDupsTest, OtherEmpty) {
@@ -2657,14 +2698,7 @@ struct PGLogTrimTest :
   public PGLogTestBase,
   public PGLog::IndexedLog
 {
-  std::list<hobject_t*> test_hobjects;
-  CephContext *cct;
-
-  void SetUp() override {
-    cct = (new CephContext(CEPH_ENTITY_TYPE_OSD))->get();
-
-    hobject_t::generate_test_instances(test_hobjects);
-  }
+  CephContext *cct = g_ceph_context;
 
   void SetUp(unsigned min_entries, unsigned max_entries, unsigned dup_track) {
     constexpr size_t size = 10;
@@ -2680,35 +2714,10 @@ struct PGLogTrimTest :
     cct->_conf->set_val_or_die("osd_min_pg_log_entries", min_entries_s);
     cct->_conf->set_val_or_die("osd_max_pg_log_entries", max_entries_s);
     cct->_conf->set_val_or_die("osd_pg_log_dups_tracked", dup_track_s);
-}
-
-  void TearDown() override {
-    while (!test_hobjects.empty()) {
-      delete test_hobjects.front();
-      test_hobjects.pop_front();
-    }
-
-    cct->put();
   }
 }; // struct PGLogTrimTest
 
 
-# if 0
-TEST_F(PGLogTest, Trim1) {
-  TestCase t;
-
-  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
-  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
-  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 155), mk_evt(15, 150)));
-  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
-  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(26, 160)));
-  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(31, 171)));
-
-  t.setup();
-}
-#endif
-
-
 TEST_F(PGLogTrimTest, TestMakingCephContext)
 {
   SetUp(1, 2, 5);
@@ -2736,11 +2745,11 @@ TEST_F(PGLogTrimTest, TestPartialTrim)
 
   std::set<eversion_t> trimmed;
   std::set<std::string> trimmed_dups;
-  bool dirty_dups = false;
+  eversion_t write_from_dups = eversion_t::max();
 
-  log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+  log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &write_from_dups);
 
-  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(eversion_t(15, 150), write_from_dups);
   EXPECT_EQ(3u, log.log.size());
   EXPECT_EQ(3u, trimmed.size());
   EXPECT_EQ(2u, log.dups.size());
@@ -2750,11 +2759,11 @@ TEST_F(PGLogTrimTest, TestPartialTrim)
 
   std::set<eversion_t> trimmed2;
   std::set<std::string> trimmed_dups2;
-  bool dirty_dups2 = false;
-  
-  log.trim(cct, mk_evt(20, 164), &trimmed2, &trimmed_dups2, &dirty_dups2);
+  eversion_t write_from_dups2 = eversion_t::max();
 
-  EXPECT_EQ(true, dirty_dups2);
+  log.trim(cct, mk_evt(20, 164), &trimmed2, &trimmed_dups2, &write_from_dups2);
+
+  EXPECT_EQ(eversion_t(19, 160), write_from_dups2);
   EXPECT_EQ(2u, log.log.size());
   EXPECT_EQ(1u, trimmed2.size());
   EXPECT_EQ(2u, log.dups.size());
@@ -2776,11 +2785,11 @@ TEST_F(PGLogTrimTest, TestTrimNoTrimmed) {
   log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
   log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
 
-  bool dirty_dups = false;
+  eversion_t write_from_dups = eversion_t::max();
 
-  log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+  log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &write_from_dups);
 
-  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(eversion_t(15, 150), write_from_dups);
   EXPECT_EQ(3u, log.log.size());
   EXPECT_EQ(2u, log.dups.size());
 }
@@ -2803,11 +2812,11 @@ TEST_F(PGLogTrimTest, TestTrimNoDups)
 
   std::set<eversion_t> trimmed;
   std::set<std::string> trimmed_dups;
-  bool dirty_dups = false;
+  eversion_t write_from_dups = eversion_t::max();
 
-  log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+  log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &write_from_dups);
 
-  EXPECT_FALSE(dirty_dups);
+  EXPECT_EQ(eversion_t::max(), write_from_dups);
   EXPECT_EQ(3u, log.log.size());
   EXPECT_EQ(3u, trimmed.size());
   EXPECT_EQ(0u, log.dups.size());
@@ -2831,11 +2840,11 @@ TEST_F(PGLogTrimTest, TestNoTrim)
 
   std::set<eversion_t> trimmed;
   std::set<std::string> trimmed_dups;
-  bool dirty_dups = false;
+  eversion_t write_from_dups = eversion_t::max();
 
-  log.trim(cct, mk_evt(9, 99), &trimmed, &trimmed_dups, &dirty_dups);
+  log.trim(cct, mk_evt(9, 99), &trimmed, &trimmed_dups, &write_from_dups);
 
-  EXPECT_FALSE(dirty_dups);
+  EXPECT_EQ(eversion_t::max(), write_from_dups);
   EXPECT_EQ(6u, log.log.size());
   EXPECT_EQ(0u, trimmed.size());
   EXPECT_EQ(0u, log.dups.size());
@@ -2859,11 +2868,11 @@ TEST_F(PGLogTrimTest, TestTrimAll)
 
   std::set<eversion_t> trimmed;
   std::set<std::string> trimmed_dups;
-  bool dirty_dups = false;
+  eversion_t write_from_dups = eversion_t::max();
 
-  log.trim(cct, mk_evt(22, 180), &trimmed, &trimmed_dups, &dirty_dups);
+  log.trim(cct, mk_evt(22, 180), &trimmed, &trimmed_dups, &write_from_dups);
 
-  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(eversion_t(15, 150), write_from_dups);
   EXPECT_EQ(0u, log.log.size());
   EXPECT_EQ(6u, trimmed.size());
   EXPECT_EQ(5u, log.dups.size());
@@ -2893,11 +2902,11 @@ TEST_F(PGLogTrimTest, TestGetRequest) {
   log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166),
                       osd_reqid_t(client, 8, 6)));
 
-  bool dirty_dups = false;
+  eversion_t write_from_dups = eversion_t::max();
 
-  log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+  log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &write_from_dups);
 
-  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(eversion_t(15, 150), write_from_dups);
   EXPECT_EQ(3u, log.log.size());
   EXPECT_EQ(2u, log.dups.size());
 
@@ -2980,6 +2989,7 @@ TEST_F(PGLogTest, _merge_object_divergent_entries) {
   }
 }
 
+
 // Local Variables:
 // compile-command: "cd ../.. ; make unittest_pglog ; ./unittest_pglog --log-to-stderr=true  --debug-osd=20 # --gtest_filter=*.* "
 // End:
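
The tests above track a signature change in PGLog trimming: the bool
out-parameter dirty_dups is replaced by an eversion_t out-parameter. The
caller seeds it with eversion_t::max(); trim() lowers it to the version of
the earliest entry it newly appends to the dup index, and leaves it at max()
when no dups were added. That lets callers persist only the dup keys from
that version onward instead of treating the whole dup list as dirty.
Caller-side sketch, reusing the helpers from these tests:

    eversion_t write_from_dups = eversion_t::max();
    log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &write_from_dups);
    if (write_from_dups != eversion_t::max()) {
      // entries at or after this version were moved into log.dups;
      // only those dup keys need to be (re)written to the store
    }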
index 2f458e6ad6411ddd5713193a3e2b356329cae104..2141a00f7a01d3d9df43240a5dca228b155056ba 100644 (file)
@@ -623,8 +623,8 @@ TEST_F(TestMockLeaderWatcher, Break) {
   EXPECT_EQ(0, _rados->conf_set("rbd_mirror_leader_max_missed_heartbeats",
                                 "1"));
   CephContext *cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
-  int max_acquire_attempts =
-    cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break;
+  int max_acquire_attempts = cct->_conf->get_val<int64_t>(
+    "rbd_mirror_leader_max_acquire_attempts_before_break");
 
   MockManagedLock mock_managed_lock;
   MockMirrorStatusWatcher mock_mirror_status_watcher;
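
This is the first of many call sites in this commit converted from the
generated md_config_t member fields to typed get_val<T>() lookups; the
options involved moved to the new option schema, which drops their struct
members. The recurring before/after shape (sketch, option name taken from
the hunk above; the template argument matches the option's declared type,
which is why these call sites spell out int64_t rather than int):

    // before: cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break
    int max_attempts = cct->_conf->get_val<int64_t>(
        "rbd_mirror_leader_max_acquire_attempts_before_break");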
index 721b5c5cb24cc2e4fcd1fe155cd66af094e7a81e..0109a02138db9ae358ee7d874808328878206fec 100644 (file)
@@ -930,3 +930,45 @@ def test_bucket_sync_disable_enable():
 
     for bucket_name in buckets:
         zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name)
+
+def test_encrypted_object_sync():
+    zonegroup = realm.master_zonegroup()
+    zonegroup_conns = ZonegroupConns(zonegroup)
+
+    (zone1, zone2) = zonegroup_conns.rw_zones[0:2]
+
+    # create a bucket on the first zone
+    bucket_name = gen_bucket_name()
+    log.info('create bucket zone=%s name=%s', zone1.name, bucket_name)
+    bucket = zone1.conn.create_bucket(bucket_name)
+
+    # upload an object with sse-c encryption
+    sse_c_headers = {
+        'x-amz-server-side-encryption-customer-algorithm': 'AES256',
+        'x-amz-server-side-encryption-customer-key': 'pO3upElrwuEXSoFwCfnZPdSsmt/xWeFa0N9KgDijwVs=',
+        'x-amz-server-side-encryption-customer-key-md5': 'DWygnHRtgiJ77HCm+1rvHw=='
+    }
+    key = bucket.new_key('testobj-sse-c')
+    data = 'A'*512
+    key.set_contents_from_string(data, headers=sse_c_headers)
+
+    # upload an object with sse-kms encryption
+    sse_kms_headers = {
+        'x-amz-server-side-encryption': 'aws:kms',
+        # testkey-1 must be present in 'rgw crypt s3 kms encryption keys' (vstart.sh adds this)
+        'x-amz-server-side-encryption-aws-kms-key-id': 'testkey-1',
+    }
+    key = bucket.new_key('testobj-sse-kms')
+    key.set_contents_from_string(data, headers=sse_kms_headers)
+
+    # wait for the bucket metadata and data to sync
+    zonegroup_meta_checkpoint(zonegroup)
+    zone_bucket_checkpoint(zone2.zone, zone1.zone, bucket_name)
+
+    # read the encrypted objects from the second zone
+    bucket2 = get_bucket(zone2, bucket_name)
+    key = bucket2.get_key('testobj-sse-c', headers=sse_c_headers)
+    eq(data, key.get_contents_as_string(headers=sse_c_headers))
+
+    key = bucket2.get_key('testobj-sse-kms')
+    eq(data, key.get_contents_as_string())
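
The new test exercises both server-side encryption modes across a multisite
sync: for SSE-C the same customer-key and key-MD5 headers must be supplied
again on the GET from the second zone, since the key itself is never stored
with the object; for SSE-KMS the read needs no headers because the stored key
id ('testkey-1') is resolved through the 'rgw crypt s3 kms encryption keys'
setting noted in the comment. In both cases the assertion is that the
replicated ciphertext decrypts back to the original 512-byte payload.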
index 13224f759291a76be11ba20025e3dc475c46a160..397303deaafbc24b68e4b0bb4b002a4f635d014b 100644 (file)
@@ -479,12 +479,12 @@ int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
   if (!divergent.empty()) {
     assert(missing.get_items().empty());
     PGLog::write_log_and_missing_wo_missing(
-      t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true, true);
+      t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true);
   } else {
     pg_missing_tracker_t tmissing(missing);
     bool rebuilt_missing_set_with_deletes = missing.may_include_deletes;
     PGLog::write_log_and_missing(
-      t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true, true,
+      t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true,
       &rebuilt_missing_set_with_deletes);
   }
   t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
@@ -2286,9 +2286,13 @@ int dup(string srcpath, ObjectStore *src, string dstpath, ObjectStore *dst)
       ObjectStore::Transaction t;
       int bits = src->collection_bits(cid);
       if (bits < 0) {
-       cerr << "cannot get bit count for collection " << cid << ": "
-            << cpp_strerror(bits) << std::endl;
-       goto out;
+        if (src->get_type() == "filestore" && cid.is_meta()) {
+          bits = 0;
+        } else {
+          cerr << "cannot get bit count for collection " << cid << ": "
+               << cpp_strerror(bits) << std::endl;
+          goto out;
+        }
       }
       t.create_collection(cid, bits);
       dst->apply_transaction(&osr, std::move(t));
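
The dup (copy-between-objectstores) path used to abort whenever a collection
reported no split-bit count. On filestore the meta collection legitimately
has none, so that one case now falls back to creating the destination
collection with 0 bits; every other failure still prints the errno and bails
out through the existing error path.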
index c6c9264fd06a1501f3ed1fb02e808e6a75e1985c..42b42a6fb37a99dc27b6b14c067abc048f7f40ee 100644 (file)
@@ -332,7 +332,7 @@ void add_no_progress_option(boost::program_options::options_description *opt) {
 
 void add_format_options(boost::program_options::options_description *opt) {
   opt->add_options()
-    (FORMAT.c_str(), po::value<Format>(), "output format [plain, json, or xml]")
+    (FORMAT.c_str(), po::value<Format>(), "output format (plain, json, or xml) [default: plain]")
     (PRETTY_FORMAT.c_str(), po::bool_switch(),
      "pretty formatting (json and xml)");
 }
index db248e34a1b723072c71abefb9426bb413d102cf..549cf9dd8338486d201002ce51baf3a3f1fe89e5 100644 (file)
@@ -84,7 +84,7 @@ int read_string(int fd, unsigned max, std::string *out) {
 int extract_spec(const std::string &spec, std::string *pool_name,
                  std::string *image_name, std::string *snap_name,
                  SpecValidation spec_validation) {
-  if (!g_ceph_context->_conf->rbd_validate_names) {
+  if (!g_ceph_context->_conf->get_val<bool>("rbd_validate_names")) {
     spec_validation = SPEC_VALIDATION_NONE;
   }
 
@@ -184,7 +184,7 @@ std::string get_positional_argument(const po::variables_map &vm, size_t index) {
 }
 
 std::string get_default_pool_name() {
-  return g_ceph_context->_conf->rbd_default_pool;
+  return g_ceph_context->_conf->get_val<std::string>("rbd_default_pool");
 }
 
 std::string get_pool_name(const po::variables_map &vm, size_t *arg_index) {
@@ -831,7 +831,13 @@ int get_formatter(const po::variables_map &vm,
       std::cerr << "rbd: --pretty-format only works when --format "
                 << "is json or xml" << std::endl;
       return -EINVAL;
+    } else if (*formatter != nullptr && !pretty) {
+      formatter->get()->enable_line_break();
     }
+  } else if (vm[at::PRETTY_FORMAT].as<bool>()) {
+    std::cerr << "rbd: --pretty-format only works when --format "
+              << "is json or xml" << std::endl;
+    return -EINVAL;
   }
   return 0;
 }
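
Two behavioural fixes in get_formatter(): --pretty-format without a json/xml
--format (including no --format at all, the previously silent case) is now
rejected with -EINVAL, and plain json/xml output gets line breaks enabled so
successive records are not run together. The resulting decision table, as
read from the hunk above:

    --format    --pretty-format    result
    (none)      no                 plain text output
    (none)      yes                error, -EINVAL
    plain       yes                error, -EINVAL
    json/xml    no                 formatter with line breaks enabled
    json/xml    yes                pretty-printed formatter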
index 3a38a052fb3d2eac107f8a6ddd5db9b639116c22..9934e81dd21fe43e4ab053ab0d0c580814b53bde 100644 (file)
@@ -283,7 +283,7 @@ int execute(const po::variables_map &vm) {
                     from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
                     formatter.get());
   if (r < 0) {
-    std::cerr << "du failed: " << cpp_strerror(r) << std::endl;
+    std::cerr << "rbd: du failed: " << cpp_strerror(r) << std::endl;
     return r;
   }
   return 0;
index 708663a43104f3cc0cb3c9fdf2a4936449e8b0cd..bfda6980598a6190c8ec07caf88fde771b26c0ce 100644 (file)
@@ -177,8 +177,8 @@ int do_export_diff_fd(librbd::Image& image, const char *fromsnapname,
     }
   }
   ExportDiffContext edc(&image, fd, info.size,
-                        g_conf->rbd_concurrent_management_ops, no_progress,
-                       export_format);
+                        g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+                        no_progress, export_format);
   r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
                           &C_ExportDiff::export_diff_cb, (void *)&edc);
   if (r < 0) {
@@ -509,7 +509,7 @@ static int do_export(librbd::Image& image, const char *path, bool no_progress, i
     fd = STDOUT_FILENO;
     max_concurrent_ops = 1;
   } else {
-    max_concurrent_ops = g_conf->rbd_concurrent_management_ops;
+    max_concurrent_ops = g_conf->get_val<int64_t>("rbd_concurrent_management_ops");
     fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
     if (fd < 0) {
       return -errno;
index d6dec85f6a99b9c14cab4ee2db58ce94baed211a..c0ae0b8ef1f5dba46d2baa3df89475db5e5263af 100644 (file)
@@ -33,52 +33,66 @@ int get_key(const po::variables_map &vm, std::string *key) {
   return 0;
 }
 
+const uint32_t MAX_KEYS = 64;
+
 } // anonymous namespace
 
 static int do_metadata_list(librbd::Image& image, Formatter *f)
 {
-  std::map<std::string, bufferlist> pairs;
   int r;
   TextTable tbl;
 
-  r = image.metadata_list("", 0, &pairs);
-  if (r < 0) {
-    std::cerr << "failed to list metadata of image : " << cpp_strerror(r)
-              << std::endl;
-    return r;
-  }
-
-  if (f) {
-    f->open_object_section("metadatas");
-  } else {
-    tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
-  }
+  size_t count = 0;
+  std::string last_key;
+  bool more_results = true;
+  while (more_results) {
+    std::map<std::string, bufferlist> pairs;
+    r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+    if (r < 0) {
+      std::cerr << "failed to list metadata of image : " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
 
-  if (!pairs.empty()) {
-    bool one = (pairs.size() == 1);
+    more_results = (pairs.size() == MAX_KEYS);
+    if (!pairs.empty()) {
+      if (count == 0) {
+        if (f) {
+          f->open_object_section("metadatas");
+        } else {
+          tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
+          tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+        }
+      }
 
-    if (!f) {
-      std::cout << "There " << (one ? "is " : "are ") << pairs.size()
-           << " metadata" << (one ? "" : "s") << " on this image.\n";
-    }
+      last_key = pairs.rbegin()->first;
+      count += pairs.size();
 
-    for (std::map<std::string, bufferlist>::iterator it = pairs.begin();
-         it != pairs.end(); ++it) {
-      std::string val(it->second.c_str(), it->second.length());
-      if (f) {
-        f->dump_string(it->first.c_str(), val.c_str());
-      } else {
-        tbl << it->first << val.c_str() << TextTable::endrow;
+      for (auto kv : pairs) {
+        std::string val(kv.second.c_str(), kv.second.length());
+        if (f) {
+          f->dump_string(kv.first.c_str(), val.c_str());
+        } else {
+          tbl << kv.first << val << TextTable::endrow;
+        }
       }
     }
-    if (!f)
-      std::cout << tbl;
   }
 
-  if (f) {
-    f->close_section();
-    f->flush(std::cout);
+  if (f == nullptr) {
+    bool single = (count == 1);
+    std::cout << "There " << (single ? "is" : "are") << " " << count << " "
+              << (single ? "metadatum" : "metadata") << " on this image"
+              << (count == 0 ? "." : ":") << std::endl;
+  }
+
+  if (count > 0) {
+    if (f) {
+      f->close_section();
+      f->flush(std::cout);
+    } else {
+      std::cout << std::endl << tbl;
+    }
   }
   return 0;
 }
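
do_metadata_list() previously fetched every key with a single
metadata_list("", 0, ...) call, which on images with many metadata entries
can run into per-request omap limits; the rework pages through the keys
MAX_KEYS (64) at a time. The generic pagination pattern, as a sketch using
the names from the hunks above:

    std::string last_key;                    // empty: start before the first key
    bool more = true;
    while (more) {
      std::map<std::string, bufferlist> page;
      int r = image.metadata_list(last_key, MAX_KEYS, &page);
      if (r < 0)
        return r;
      more = (page.size() == MAX_KEYS);      // a short page means we are done
      if (!page.empty())
        last_key = page.rbegin()->first;     // resume after the last key seen
      // ... render the page ...
    }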
index 0f31d810394bee1a8071142ca62605657265d63d..3c717855270574547b92b5b300aa2b0d37590cac 100644 (file)
@@ -37,7 +37,9 @@ struct ImportDiffContext {
   ImportDiffContext(librbd::Image *image, int fd, size_t size, bool no_progress)
     : image(image), fd(fd), size(size), pc("Importing image diff", no_progress),
       throttle((fd == STDIN_FILENO) ? 1 :
-               g_conf->rbd_concurrent_management_ops, false), last_offset(0) {
+                  g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+               false),
+      last_offset(0) {
   }
 
   void update_size(size_t new_size)
@@ -653,7 +655,7 @@ static int do_import_v1(int fd, librbd::Image &image, uint64_t size,
     throttle.reset(new SimpleThrottle(1, false));
   } else {
     throttle.reset(new SimpleThrottle(
-                     g_conf->rbd_concurrent_management_ops, false));
+      g_conf->get_val<int64_t>("rbd_concurrent_management_ops"), false));
   }
 
   reqlen = min<uint64_t>(reqlen, size);
@@ -746,7 +748,7 @@ static int do_import(librados::Rados &rados, librbd::RBD &rbd,
 
   uint64_t order;
   if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
-    order = g_conf->rbd_default_order;
+    order = g_conf->get_val<int64_t>("rbd_default_order");
   }
 
   // try to fill whole imgblklen blocks for sparsification
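
One detail these import conversions preserve: when the source is a pipe
(fd == STDIN_FILENO) the throttle is pinned to a single in-flight request,
since stdin can only be consumed sequentially; only file-backed imports fan
out to rbd_concurrent_management_ops.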
index 1f1951fc8842539b269ca3d2b00d80d4bff87623..4f58cc27d83c87dabc7219a8529a782f96dbc3a5 100644 (file)
@@ -414,7 +414,8 @@ int execute_map(const po::variables_map &vm) {
   }
 
   // parse default options first so they can be overwritten by cli options
-  char *default_map_options = strdup(g_conf->rbd_default_map_options.c_str());
+  char *default_map_options = strdup(g_conf->get_val<std::string>(
+    "rbd_default_map_options").c_str());
   BOOST_SCOPE_EXIT( (default_map_options) ) {
     free(default_map_options);
   } BOOST_SCOPE_EXIT_END;
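
A subtlety worth flagging on this one: get_val<std::string>() returns by
value, so the strdup() has to happen inside the same full expression, while
the temporary string is still alive. Isolated sketch of the pattern:

    char *opts = strdup(g_conf->get_val<std::string>(
        "rbd_default_map_options").c_str());   // copy before the temporary dies
    // ... tokenize opts in place ...
    free(opts);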
index 4c4babe99489d57a057046cc358603da7e55ad0e..fbac3e106559190dd7d1aa6641e0c4e02c198cc3 100644 (file)
@@ -290,7 +290,9 @@ int execute(const po::variables_map &vm) {
     return r;
   }
 
-  r = do_list(pool_name, vm["long"].as<bool>(), g_conf->rbd_concurrent_management_ops, formatter.get());
+  r = do_list(pool_name, vm["long"].as<bool>(),
+              g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+              formatter.get());
   if (r < 0) {
     std::cerr << "rbd: list: " << cpp_strerror(r) << std::endl;
     return r;
index 0090a42060b9716886e73eb57b482ab2ba64e4fc..4314b1ed6b46ddc3c8b8944a359414d24dda0422 100644 (file)
@@ -459,7 +459,8 @@ public:
       m_factory(std::bind(ImageRequestAllocator<RequestT>(),
                           std::ref(m_io_ctx), std::ref(m_throttle),
                           std::placeholders::_1, std::forward<Args>(args)...)),
-      m_throttle(g_conf->rbd_concurrent_management_ops, true) {
+      m_throttle(g_conf->get_val<int64_t>("rbd_concurrent_management_ops"),
+                 true) {
   }
 
   int execute() {
index 07de6e93c38cf575d41cd2a3b3f4fd86d0f17aca..333d11e06e494c8e5bfd0953b62ed548ecaa13d0 100644 (file)
@@ -147,7 +147,8 @@ ImageDeleter<I>::ImageDeleter(ContextWQ *work_queue, SafeTimer *timer,
     m_failed_timer_lock(timer_lock),
     m_asok_hook(new ImageDeleterAdminSocketHook<I>(g_ceph_context, this))
 {
-  set_failed_timer_interval(g_ceph_context->_conf->rbd_mirror_delete_retry_interval);
+  set_failed_timer_interval(g_ceph_context->_conf->get_val<double>(
+    "rbd_mirror_delete_retry_interval"));
   m_image_deleter_thread.create("image_deleter");
 }
 
index a84199968a253657c44be60dc252ba242bfa6a3d..2bb31b4b11dabc857ae61efbef2968390294cfcf 100644 (file)
@@ -526,8 +526,10 @@ void ImageReplayer<I>::bootstrap() {
 
   CephContext *cct = static_cast<CephContext *>(m_local->cct());
   journal::Settings settings;
-  settings.commit_interval = cct->_conf->rbd_mirror_journal_commit_age;
-  settings.max_fetch_bytes = cct->_conf->rbd_mirror_journal_max_fetch_bytes;
+  settings.commit_interval = cct->_conf->get_val<double>(
+    "rbd_mirror_journal_commit_age");
+  settings.max_fetch_bytes = cct->_conf->get_val<uint64_t>(
+    "rbd_mirror_journal_max_fetch_bytes");
 
   m_remote_journaler = new Journaler(m_threads->work_queue,
                                      m_threads->timer,
@@ -717,7 +719,8 @@ void ImageReplayer<I>::handle_start_replay(int r) {
 
   {
     CephContext *cct = static_cast<CephContext *>(m_local->cct());
-    double poll_seconds = cct->_conf->rbd_mirror_journal_poll_age;
+    double poll_seconds = cct->_conf->get_val<double>(
+      "rbd_mirror_journal_poll_age");
 
     Mutex::Locker locker(m_lock);
     m_replay_handler = new ReplayHandler<I>(this);
index e5d08cea5f48e8203e450878e2946a836ad44d49..dfa96ed4d3e463f3a21faca658958aa54398e99f 100644 (file)
@@ -31,8 +31,8 @@ template <typename I>
 ImageSyncThrottler<I>::ImageSyncThrottler()
   : m_lock(librbd::util::unique_lock_name("rbd::mirror::ImageSyncThrottler",
                                           this)),
-    m_max_concurrent_syncs(
-      g_ceph_context->_conf->rbd_mirror_concurrent_image_syncs) {
+    m_max_concurrent_syncs(g_ceph_context->_conf->get_val<uint64_t>(
+      "rbd_mirror_concurrent_image_syncs")) {
   dout(20) << "max_concurrent_syncs=" << m_max_concurrent_syncs << dendl;
   g_ceph_context->_conf->add_observer(this);
 }
@@ -205,7 +205,7 @@ template <typename I>
 void ImageSyncThrottler<I>::handle_conf_change(const struct md_config_t *conf,
                                       const set<string> &changed) {
   if (changed.count("rbd_mirror_concurrent_image_syncs")) {
-    set_max_concurrent_syncs(conf->rbd_mirror_concurrent_image_syncs);
+    set_max_concurrent_syncs(conf->get_val<uint64_t>("rbd_mirror_concurrent_image_syncs"));
   }
 }
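
The throttler reads the option once at construction and again on every
change, through the md_config_obs_t observer it registers with
add_observer(this). Assumed shape of the two observer hooks involved
(sketch):

    const char **get_tracked_conf_keys() const override {
      static const char *keys[] = {"rbd_mirror_concurrent_image_syncs", nullptr};
      return keys;
    }

    void handle_conf_change(const struct md_config_t *conf,
                            const std::set<std::string> &changed) override {
      if (changed.count("rbd_mirror_concurrent_image_syncs"))
        set_max_concurrent_syncs(
          conf->get_val<uint64_t>("rbd_mirror_concurrent_image_syncs"));
    }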
 
index 52e60605c758eac5dffa87f4422052f7e902a4e8..8b7fca17ac16e923387fc07a3efda64a2ffedba9 100644 (file)
@@ -483,7 +483,8 @@ void InstanceReplayer<I>::schedule_image_state_check_task() {
       queue_start_image_replayers();
     });
 
-  int after = g_ceph_context->_conf->rbd_mirror_image_state_check_interval;
+  int after = g_ceph_context->_conf->get_val<int64_t>(
+    "rbd_mirror_image_state_check_interval");
 
   dout(20) << "scheduling image state check after " << after << " sec (task "
            << m_image_state_check_task << ")" << dendl;
index 0c82c3007fc73f4af29ba3a5dd27d536799a3572..7f94976ea1c60d8ce7cad511505ab95a316f25af 100644 (file)
@@ -326,7 +326,7 @@ InstanceWatcher<I>::InstanceWatcher(librados::IoCtx &io_ctx,
     m_lock(unique_lock_name("rbd::mirror::InstanceWatcher::m_lock", this)),
     m_instance_lock(librbd::ManagedLock<I>::create(
       m_ioctx, m_work_queue, m_oid, this, librbd::managed_lock::EXCLUSIVE, true,
-      m_cct->_conf->rbd_blacklist_expire_seconds)) {
+      m_cct->_conf->get_val<int64_t>("rbd_blacklist_expire_seconds"))) {
 }
 
 template <typename I>
index 302700e51bb618d850553f382489b713acc1d419..56feb760ae3fb6d63df431c188b2dcc63f7cb0d8 100644 (file)
@@ -228,9 +228,9 @@ void Instances<I>::schedule_remove_task(Instance &instance) {
 
   cancel_remove_task(instance);
 
-  int after = m_cct->_conf->rbd_mirror_leader_heartbeat_interval *
-    (1 + m_cct->_conf->rbd_mirror_leader_max_missed_heartbeats +
-     m_cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break);
+  int after = m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_heartbeat_interval") *
+    (1 + m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_max_missed_heartbeats") +
+     m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_max_acquire_attempts_before_break"));
 
   instance.timer_task = new FunctionContext(
     [this, &instance](int r) {
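
The delay formula itself is unchanged, only the accessors are: one heartbeat
interval for each tolerated missed heartbeat, plus one per acquire attempt
before the lock is broken, plus one. Assuming the shipped defaults (5 s
interval, 2 missed heartbeats, 3 acquire attempts), that works out to

    after = 5 * (1 + 2 + 3) = 30 seconds

before a dead instance's state is removed.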
index 9e99d6b6f6e8beb96c2b1b68f2fef9c83716c874..46f555252f0e48e0c7e6dcdfc61479058fc74066 100644 (file)
@@ -33,7 +33,8 @@ LeaderWatcher<I>::LeaderWatcher(Threads<I> *threads, librados::IoCtx &io_ctx,
     m_lock("rbd::mirror::LeaderWatcher " + io_ctx.get_pool_name()),
     m_notifier_id(librados::Rados(io_ctx).get_instance_id()),
     m_leader_lock(new LeaderLock(m_ioctx, m_work_queue, m_oid, this, true,
-                                 m_cct->_conf->rbd_blacklist_expire_seconds)) {
+                                 m_cct->_conf->get_val<int64_t>(
+                                   "rbd_blacklist_expire_seconds"))) {
 }
 
 template <typename I>
@@ -370,7 +371,8 @@ void LeaderWatcher<I>::schedule_timer_task(const std::string &name,
       m_timer_gate->timer_callback = timer_callback;
     });
 
-  int after = delay_factor * m_cct->_conf->rbd_mirror_leader_heartbeat_interval;
+  int after = delay_factor * m_cct->_conf->get_val<int64_t>(
+    "rbd_mirror_leader_heartbeat_interval");
 
   dout(20) << "scheduling " << name << " after " << after << " sec (task "
            << m_timer_task << ")" << dendl;
@@ -567,8 +569,8 @@ void LeaderWatcher<I>::handle_get_locker(int r,
     }
   }
 
-  if (m_acquire_attempts >=
-        m_cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break) {
+  if (m_acquire_attempts >= m_cct->_conf->get_val<int64_t>(
+        "rbd_mirror_leader_max_acquire_attempts_before_break")) {
     dout(0) << "breaking leader lock after " << m_acquire_attempts << " "
             << "failed attempts to acquire" << dendl;
     break_leader_lock();
@@ -604,7 +606,7 @@ void LeaderWatcher<I>::schedule_acquire_leader_lock(uint32_t delay_factor) {
 
   schedule_timer_task("acquire leader lock",
                       delay_factor *
-                        m_cct->_conf->rbd_mirror_leader_max_missed_heartbeats,
+                        m_cct->_conf->get_val<int64_t>("rbd_mirror_leader_max_missed_heartbeats"),
                       false, &LeaderWatcher<I>::acquire_leader_lock, false);
 }
 
index b4509d5c465d5cb725aa9d2e3ab4a4a1c51bd9fc..61dc9bf2d89c7615ac4a148949ebc24a8bb9904c 100644 (file)
@@ -264,7 +264,7 @@ void Mirror::run()
     }
     m_cond.WaitInterval(
       m_lock,
-      utime_t(m_cct->_conf->rbd_mirror_pool_replayers_refresh_interval, 0));
+      utime_t(m_cct->_conf->get_val<int64_t>("rbd_mirror_pool_replayers_refresh_interval"), 0));
   }
 
   // stop all pool replayers in parallel
index 6c3b228dd58aa55a8e2e090ca42f44542a2f6810..8d03e878f16c80a3421c90742fdf31b363767a69 100644 (file)
@@ -513,10 +513,10 @@ void PoolReplayer::print_status(Formatter *f, stringstream *ss)
 
   f->dump_string("local_cluster_admin_socket",
                  reinterpret_cast<CephContext *>(m_local_io_ctx.cct())->_conf->
-                     admin_socket);
+                     get_val<std::string>("admin_socket"));
   f->dump_string("remote_cluster_admin_socket",
                  reinterpret_cast<CephContext *>(m_remote_io_ctx.cct())->_conf->
-                     admin_socket);
+                     get_val<std::string>("admin_socket"));
 
   f->open_object_section("sync_throttler");
   m_instance_watcher->print_sync_status(f, ss);
index 8c22440a006718c6d9c56afdc93282f1c05c2b9d..45a6fddd6fdd568140910fbfd01ba45f9d19a7d0 100644 (file)
@@ -12,11 +12,13 @@ namespace mirror {
 template <typename I>
 Threads<I>::Threads(CephContext *cct) : timer_lock("Threads::timer_lock") {
   thread_pool = new ThreadPool(cct, "Journaler::thread_pool", "tp_journal",
-                               cct->_conf->rbd_op_threads, "rbd_op_threads");
+                               cct->_conf->get_val<int64_t>("rbd_op_threads"),
+                               "rbd_op_threads");
   thread_pool->start();
 
   work_queue = new ContextWQ("Journaler::work_queue",
-                             cct->_conf->rbd_op_thread_timeout, thread_pool);
+                             cct->_conf->get_val<int64_t>("rbd_op_thread_timeout"),
+                             thread_pool);
 
   timer = new SafeTimer(cct, timer_lock, true);
   timer->init();
index dbc2560acd0254188d9809f459d4599d017ad0f6..6278d01015558868bf4dad1ba1b3cc98428cdee1 100644 (file)
@@ -39,7 +39,8 @@ ImageCopyRequest<I>::ImageCopyRequest(I *local_image_ctx, I *remote_image_ctx,
     m_progress_ctx(progress_ctx),
     m_lock(unique_lock_name("ImageCopyRequest::m_lock", this)),
     m_updating_sync_point(false), m_update_sync_ctx(nullptr),
-    m_update_sync_point_interval(m_local_image_ctx->cct->_conf->rbd_mirror_sync_point_update_age),
+    m_update_sync_point_interval(m_local_image_ctx->cct->_conf->template get_val<double>(
+      "rbd_mirror_sync_point_update_age")),
     m_client_meta_copy(*client_meta) {
   assert(!m_client_meta_copy.sync_points.empty());
 }
@@ -142,7 +143,7 @@ void ImageCopyRequest<I>::send_object_copies() {
   bool complete;
   {
     Mutex::Locker locker(m_lock);
-    for (int i = 0; i < cct->_conf->rbd_concurrent_management_ops; ++i) {
+    for (int i = 0; i < cct->_conf->get_val<int64_t>("rbd_concurrent_management_ops"); ++i) {
       send_next_object_copy();
       if (m_ret_val < 0 && m_current_ops == 0) {
         break;
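
A C++ wrinkle in this particular conversion: m_local_image_ctx has the
template-parameter type I, so _conf is reached through a dependent
expression and the member-template call needs the template keyword as a
disambiguator. Reduced to its essentials (hypothetical sketch):

    template <typename I>
    double sync_point_update_age(I *image_ctx) {
      // without "template", the "<" would parse as a comparison
      return image_ctx->cct->_conf->template get_val<double>(
          "rbd_mirror_sync_point_update_age");
    }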
index a3d3577310ccfe78e83157f0df03147936f14d0d..bf7477f4d394fd225e3e732714ad251ae3041926 100644 (file)
@@ -233,7 +233,7 @@ private:
     if (ret == -EINVAL) {
       // if shrinking an image, a pagecache writeback might reference
       // extents outside of the range of the new image extents
-      dout(5) << __func__ << ": masking IO out-of-bounds error" << dendl;
+      dout(0) << __func__ << ": masking IO out-of-bounds error" << dendl;
       ctx->data.clear();
       ret = 0;
     }
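
Moving this message from debug level 5 to 0 means it is emitted even at
default log settings, so the (harmless) masking of pagecache writeback
beyond a shrunken image's new size is now visible without raising debug
levels.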
@@ -924,14 +924,26 @@ static int do_list_mapped_devices()
 
 static int parse_args(vector<const char*>& args, std::ostream *err_msg, Config *cfg)
 {
-  std::vector<const char*>::iterator i;
-  std::ostringstream err;
+  std::string conf_file_list;
+  std::string cluster;
+  CephInitParameters iparams = ceph_argparse_early_args(
+          args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
 
   md_config_t config;
-  config.parse_config_files(nullptr, nullptr, 0);
+  config.name = iparams.name;
+  config.cluster = cluster;
+
+  if (!conf_file_list.empty()) {
+    config.parse_config_files(conf_file_list.c_str(), nullptr, 0);
+  } else {
+    config.parse_config_files(nullptr, nullptr, 0);
+  }
   config.parse_env();
   config.parse_argv(args);
-  cfg->poolname = config.rbd_default_pool;
+  cfg->poolname = config.get_val<std::string>("rbd_default_pool");
+
+  std::vector<const char*>::iterator i;
+  std::ostringstream err;
 
   for (i = args.begin(); i != args.end(); ) {
     if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {