git.proxmox.com Git - ceph.git/commitdiff
update sources to 12.2.7
author Alwin Antreich <a.antreich@proxmox.com>
Wed, 18 Jul 2018 07:51:46 +0000 (09:51 +0200)
committer Alwin Antreich <a.antreich@proxmox.com>
Wed, 18 Jul 2018 07:51:46 +0000 (09:51 +0200)
Signed-off-by: Alwin Antreich <a.antreich@proxmox.com>
452 files changed:
Makefile
ceph/CMakeLists.txt
ceph/alpine/APKBUILD
ceph/ceph.spec
ceph/ceph.spec.in
ceph/debian/ceph-osd.postinst
ceph/debian/changelog
ceph/doc/_templates/smarttoc.html
ceph/doc/ceph-volume/index.rst
ceph/doc/ceph-volume/lvm/activate.rst
ceph/doc/ceph-volume/lvm/encryption.rst
ceph/doc/ceph-volume/lvm/list.rst
ceph/doc/ceph-volume/lvm/prepare.rst
ceph/doc/ceph-volume/simple/index.rst
ceph/doc/cephfs/administration.rst
ceph/doc/dev/rbd-export.rst
ceph/doc/dev/testing.rst [new file with mode: 0644]
ceph/doc/man/8/ceph-fuse.rst
ceph/doc/man/8/ceph-volume.rst
ceph/doc/man/8/radosgw-admin.rst
ceph/doc/mgr/balancer.rst [new file with mode: 0644]
ceph/doc/mgr/index.rst
ceph/doc/mgr/influx.rst
ceph/doc/mgr/prometheus.rst
ceph/doc/rados/configuration/auth-config-ref.rst
ceph/doc/rados/deployment/ceph-deploy-mds.rst
ceph/doc/rados/operations/add-or-rm-osds.rst
ceph/doc/rados/operations/pools.rst
ceph/doc/radosgw/frontends.rst
ceph/doc/radosgw/keystone.rst
ceph/doc/radosgw/multitenancy.rst
ceph/qa/cephfs/clusters/3-mds.yaml
ceph/qa/cephfs/clusters/9-mds.yaml
ceph/qa/distros/all/rhel_7.5.yaml [new file with mode: 0644]
ceph/qa/distros/all/ubuntu_18.04.yaml [new file with mode: 0644]
ceph/qa/overrides/more-active-recovery.yaml [new file with mode: 0644]
ceph/qa/standalone/ceph-helpers.sh
ceph/qa/standalone/erasure-code/test-erasure-code.sh
ceph/qa/standalone/erasure-code/test-erasure-eio.sh
ceph/qa/standalone/osd/ec-error-rollforward.sh [new file with mode: 0755]
ceph/qa/standalone/osd/osd-rep-recov-eio.sh
ceph/qa/standalone/osd/repro_long_log.sh
ceph/qa/standalone/scrub/osd-recovery-scrub.sh
ceph/qa/standalone/scrub/osd-scrub-repair.sh
ceph/qa/standalone/scrub/osd-scrub-snaps.sh
ceph/qa/standalone/scrub/osd-scrub-test.sh
ceph/qa/suites/ceph-deploy/basic/tasks/ceph-admin-commands.yaml
ceph/qa/suites/ceph-deploy/ceph-volume/cluster/4node.yaml
ceph/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
ceph/qa/suites/fs/basic_functional/tasks/client-recovery.yaml
ceph/qa/suites/fs/bugs/client_trim_caps/% [new file with mode: 0644]
ceph/qa/suites/fs/bugs/client_trim_caps/begin.yaml [new symlink]
ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml [new file with mode: 0644]
ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml [new symlink]
ceph/qa/suites/fs/bugs/client_trim_caps/overrides/+ [new file with mode: 0644]
ceph/qa/suites/fs/bugs/client_trim_caps/overrides/debug.yaml [new symlink]
ceph/qa/suites/fs/bugs/client_trim_caps/overrides/frag_enable.yaml [new symlink]
ceph/qa/suites/fs/bugs/client_trim_caps/overrides/no_client_pidfile.yaml [new symlink]
ceph/qa/suites/fs/bugs/client_trim_caps/overrides/whitelist_health.yaml [new symlink]
ceph/qa/suites/fs/bugs/client_trim_caps/overrides/whitelist_wrongly_marked_down.yaml [new symlink]
ceph/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml [new file with mode: 0644]
ceph/qa/suites/kcephfs/recovery/tasks/client-recovery.yaml
ceph/qa/suites/powercycle/osd/tasks/rados_api_tests.yaml
ceph/qa/suites/rados/basic-luminous/scrub_test.yaml
ceph/qa/suites/rados/basic/tasks/rados_api_tests.yaml
ceph/qa/suites/rados/basic/tasks/repair_test.yaml
ceph/qa/suites/rados/monthrash/ceph.yaml
ceph/qa/suites/rados/multimon/tasks/mon_clock_no_skews.yaml
ceph/qa/suites/rados/objectstore/objectstore.yaml
ceph/qa/suites/rados/singleton-bluestore/all/cephtool.yaml
ceph/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml
ceph/qa/suites/rados/singleton-nomsgr/all/large-omap-object-warnings.yaml [new file with mode: 0644]
ceph/qa/suites/rados/singleton/all/divergent_priors.yaml
ceph/qa/suites/rados/singleton/all/divergent_priors2.yaml
ceph/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml
ceph/qa/suites/rados/singleton/all/osd-recovery.yaml
ceph/qa/suites/rados/singleton/all/pg-removal-interruption.yaml
ceph/qa/suites/rados/singleton/all/random-eio.yaml
ceph/qa/suites/rados/singleton/all/thrash-eio.yaml
ceph/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml
ceph/qa/suites/rados/thrash-erasure-code-big/recovery-overrides [new symlink]
ceph/qa/suites/rados/thrash-erasure-code-isa/recovery-overrides [new symlink]
ceph/qa/suites/rados/thrash-erasure-code-overwrites/recovery-overrides [new symlink]
ceph/qa/suites/rados/thrash-erasure-code-shec/recovery-overrides [new symlink]
ceph/qa/suites/rados/thrash-erasure-code/recovery-overrides [new symlink]
ceph/qa/suites/rados/thrash/2-recovery-overrides/$ [new file with mode: 0644]
ceph/qa/suites/rados/thrash/2-recovery-overrides/default.yaml [new file with mode: 0644]
ceph/qa/suites/rados/thrash/2-recovery-overrides/more-active-recovery.yaml [new symlink]
ceph/qa/suites/rados/thrash/workloads/rados_api_tests.yaml
ceph/qa/suites/rados/verify/tasks/rados_api_tests.yaml
ceph/qa/suites/rbd/basic/msgr-failures/many.yaml [deleted file]
ceph/qa/suites/rbd/openstack/base/install.yaml
ceph/qa/suites/rbd/thrash/thrashers/cache.yaml
ceph/qa/suites/smoke/basic/tasks/rados_api_tests.yaml
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/% [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/openstack.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/start.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/1-install/luminous-client-x.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/2-workload/rbd_api_tests.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/2-workload/rbd_cli_import_export.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/centos_7.4.yaml [new symlink]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/rhel_7.5.yaml [new symlink]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/ubuntu_16.04.yaml [new symlink]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/ubuntu_18.04.yaml [new symlink]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/% [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/openstack.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/start.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/1-install/luminous-client-x.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/2-features/defaults.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/2-features/layering.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/3-workload/rbd_notification_tests.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/centos_7.4.yaml [new symlink]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/rhel_7.5.yaml [new symlink]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/ubuntu_16.04.yaml [new symlink]
ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/ubuntu_18.04.yaml [new symlink]
ceph/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml
ceph/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml
ceph/qa/suites/upgrade/luminous-p2p/% [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/supported [new symlink]
ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml
ceph/qa/suites/upgrade/luminous-x/point-to-point-x/% [deleted file]
ceph/qa/suites/upgrade/luminous-x/point-to-point-x/distros/centos_latest.yaml [deleted symlink]
ceph/qa/suites/upgrade/luminous-x/point-to-point-x/distros/ubuntu_latest.yaml [deleted symlink]
ceph/qa/suites/upgrade/luminous-x/point-to-point-x/point-to-point-upgrade.yaml [deleted file]
ceph/qa/tasks/ceph.py
ceph/qa/tasks/ceph_deploy.py
ceph/qa/tasks/ceph_manager.py
ceph/qa/tasks/cephfs/filesystem.py
ceph/qa/tasks/cephfs/fuse_mount.py
ceph/qa/tasks/cephfs/kernel_mount.py
ceph/qa/tasks/cephfs/test_client_recovery.py
ceph/qa/tasks/cephfs/test_exports.py
ceph/qa/tasks/cephfs/test_strays.py
ceph/qa/tasks/cephfs/test_volume_client.py
ceph/qa/tasks/filestore_idempotent.py
ceph/qa/tasks/osd_max_pg_per_osd.py
ceph/qa/tasks/radosgw_admin_rest.py
ceph/qa/workunits/ceph-tests/ceph-admin-commands.sh
ceph/qa/workunits/fs/misc/rstats.sh [new file with mode: 0755]
ceph/qa/workunits/mon/crush_ops.sh
ceph/qa/workunits/rados/test_envlibrados_for_rocksdb.sh
ceph/qa/workunits/rados/test_large_omap_detection.py [new file with mode: 0755]
ceph/qa/workunits/rados/test_pool_access.sh
ceph/qa/workunits/rbd/import_export.sh
ceph/qa/workunits/rbd/permissions.sh
ceph/qa/workunits/rbd/rbd_mirror.sh
ceph/qa/workunits/rest/test_mgr_rest_api.py
ceph/selinux/ceph.te
ceph/src/.git_version
ceph/src/CMakeLists.txt
ceph/src/arch/arm.c
ceph/src/arch/arm.h
ceph/src/auth/Auth.h
ceph/src/auth/AuthAuthorizeHandler.h
ceph/src/auth/cephx/CephxAuthorizeHandler.cc
ceph/src/auth/cephx/CephxAuthorizeHandler.h
ceph/src/auth/cephx/CephxProtocol.cc
ceph/src/auth/cephx/CephxProtocol.h
ceph/src/auth/cephx/CephxServiceHandler.cc
ceph/src/auth/cephx/CephxSessionHandler.cc
ceph/src/auth/none/AuthNoneAuthorizeHandler.cc
ceph/src/auth/none/AuthNoneAuthorizeHandler.h
ceph/src/auth/none/AuthNoneProtocol.h
ceph/src/auth/unknown/AuthUnknownAuthorizeHandler.cc
ceph/src/auth/unknown/AuthUnknownAuthorizeHandler.h
ceph/src/ceph-disk/ceph_disk/main.py
ceph/src/ceph-volume/ceph_volume/__init__.py
ceph/src/ceph-volume/ceph_volume/api/lvm.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/create.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/prepare.py
ceph/src/ceph-volume/ceph_volume/tests/conftest.py
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-plain/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/dmcrypt-luks/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/dmcrypt-plain/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/dmcrypt-luks/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/dmcrypt-plain/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/dmcrypt-luks/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/dmcrypt-plain/group_vars/all
ceph/src/ceph-volume/ceph_volume/tests/util/test_prepare.py
ceph/src/ceph-volume/ceph_volume/util/prepare.py
ceph/src/ceph-volume/ceph_volume/util/system.py
ceph/src/ceph_fuse.cc
ceph/src/client/Client.cc
ceph/src/client/Client.h
ceph/src/client/Inode.cc
ceph/src/client/Inode.h
ceph/src/client/MetaRequest.h
ceph/src/client/UserPerm.h
ceph/src/client/fuse_ll.cc
ceph/src/cls/rgw/cls_rgw.cc
ceph/src/common/DecayCounter.cc
ceph/src/common/DecayCounter.h
ceph/src/common/Preforker.h
ceph/src/common/bounded_key_counter.h
ceph/src/common/code_environment.cc
ceph/src/common/compat.cc [new file with mode: 0644]
ceph/src/common/crc32c_aarch64.c
ceph/src/common/legacy_config_opts.h
ceph/src/common/obj_bencher.cc
ceph/src/common/options.cc
ceph/src/crush/CrushCompiler.cc
ceph/src/crush/CrushTreeDumper.h
ceph/src/crush/CrushWrapper.cc
ceph/src/crush/CrushWrapper.h
ceph/src/crush/builder.c
ceph/src/crush/crush.h
ceph/src/crush/mapper.c
ceph/src/global/global_init.cc
ceph/src/include/ceph_features.h
ceph/src/include/ceph_fs.h
ceph/src/include/cephfs/libcephfs.h
ceph/src/include/compat.h
ceph/src/include/interval_set.h
ceph/src/include/msgr.h
ceph/src/java/CMakeLists.txt
ceph/src/libcephfs.cc
ceph/src/librados/IoCtxImpl.cc
ceph/src/librbd/ImageCtx.cc
ceph/src/librbd/ImageCtx.h
ceph/src/librbd/ImageWatcher.cc
ceph/src/librbd/Watcher.cc
ceph/src/librbd/Watcher.h
ceph/src/librbd/journal/Replay.cc
ceph/src/librbd/journal/Replay.h
ceph/src/log/Log.cc
ceph/src/mds/Beacon.cc
ceph/src/mds/CDir.cc
ceph/src/mds/CDir.h
ceph/src/mds/CInode.cc
ceph/src/mds/CInode.h
ceph/src/mds/Capability.h
ceph/src/mds/FSMap.cc
ceph/src/mds/FSMap.h
ceph/src/mds/Locker.cc
ceph/src/mds/MDBalancer.cc
ceph/src/mds/MDBalancer.h
ceph/src/mds/MDCache.cc
ceph/src/mds/MDCache.h
ceph/src/mds/MDSCacheObject.h
ceph/src/mds/MDSDaemon.cc
ceph/src/mds/MDSDaemon.h
ceph/src/mds/MDSRank.cc
ceph/src/mds/MDSRank.h
ceph/src/mds/Migrator.cc
ceph/src/mds/Migrator.h
ceph/src/mds/Mutation.h
ceph/src/mds/PurgeQueue.cc
ceph/src/mds/Server.cc
ceph/src/mds/Server.h
ceph/src/mds/SessionMap.h
ceph/src/mds/journal.cc
ceph/src/mds/mdstypes.cc
ceph/src/mds/mdstypes.h
ceph/src/messages/MClientCaps.h
ceph/src/messages/MHeartbeat.h
ceph/src/messages/MMDSMap.h
ceph/src/messages/MOSDMap.h
ceph/src/messages/MOSDRepScrub.h
ceph/src/messages/MOSDRepScrubMap.h
ceph/src/mgr/ActivePyModules.cc
ceph/src/mgr/DaemonServer.cc
ceph/src/mgr/DaemonServer.h
ceph/src/mgr/DaemonState.cc
ceph/src/mgr/DaemonState.h
ceph/src/mon/AuthMonitor.cc
ceph/src/mon/CMakeLists.txt
ceph/src/mon/ConfigKeyService.cc
ceph/src/mon/LogMonitor.cc
ceph/src/mon/MDSMonitor.cc
ceph/src/mon/MDSMonitor.h
ceph/src/mon/MgrMonitor.cc
ceph/src/mon/MonCommands.h
ceph/src/mon/Monitor.cc
ceph/src/mon/Monitor.h
ceph/src/mon/OSDMonitor.cc
ceph/src/mon/OSDMonitor.h
ceph/src/mon/PGMap.cc
ceph/src/mon/Paxos.h
ceph/src/mon/PaxosFSMap.h [new file with mode: 0644]
ceph/src/msg/Dispatcher.h
ceph/src/msg/Messenger.h
ceph/src/msg/async/AsyncConnection.cc
ceph/src/msg/async/AsyncConnection.h
ceph/src/msg/async/AsyncMessenger.h
ceph/src/msg/simple/Pipe.cc
ceph/src/msg/simple/SimpleMessenger.cc
ceph/src/msg/simple/SimpleMessenger.h
ceph/src/os/ObjectStore.h
ceph/src/os/bluestore/BlueStore.cc
ceph/src/os/bluestore/BlueStore.h
ceph/src/os/bluestore/KernelDevice.cc
ceph/src/os/bluestore/StupidAllocator.cc
ceph/src/os/bluestore/aio.cc
ceph/src/os/bluestore/bluefs_types.h
ceph/src/os/bluestore/bluestore_types.cc
ceph/src/os/bluestore/bluestore_types.h
ceph/src/os/filestore/FileJournal.cc
ceph/src/os/filestore/FileStore.cc
ceph/src/os/filestore/FileStore.h
ceph/src/osd/ECBackend.cc
ceph/src/osd/ECBackend.h
ceph/src/osd/OSD.cc
ceph/src/osd/OSD.h
ceph/src/osd/OSDMap.cc
ceph/src/osd/OSDMap.h
ceph/src/osd/PG.cc
ceph/src/osd/PG.h
ceph/src/osd/PGBackend.cc
ceph/src/osd/PGBackend.h
ceph/src/osd/PGTransaction.h
ceph/src/osd/PrimaryLogPG.cc
ceph/src/osd/PrimaryLogPG.h
ceph/src/osd/ReplicatedBackend.cc
ceph/src/osd/ReplicatedBackend.h
ceph/src/osd/SnapMapper.cc
ceph/src/osd/osd_types.cc
ceph/src/osd/osd_types.h
ceph/src/osdc/Journaler.cc
ceph/src/osdc/Journaler.h
ceph/src/osdc/ObjectCacher.cc
ceph/src/osdc/ObjectCacher.h
ceph/src/osdc/Objecter.cc
ceph/src/pybind/ceph_volume_client.py
ceph/src/pybind/mgr/influx/module.py
ceph/src/pybind/mgr/mgr_module.py
ceph/src/pybind/mgr/prometheus/module.py
ceph/src/pybind/mgr/restful/common.py
ceph/src/pybind/mgr/restful/module.py
ceph/src/pybind/rados/rados.pyx
ceph/src/pybind/rbd/rbd.pyx
ceph/src/rgw/rgw_acl_swift.cc
ceph/src/rgw/rgw_acl_swift.h
ceph/src/rgw/rgw_admin.cc
ceph/src/rgw/rgw_asio_frontend.cc
ceph/src/rgw/rgw_auth.cc
ceph/src/rgw/rgw_auth.h
ceph/src/rgw/rgw_auth_registry.h
ceph/src/rgw/rgw_auth_s3.cc
ceph/src/rgw/rgw_auth_s3.h
ceph/src/rgw/rgw_cache.cc
ceph/src/rgw/rgw_cache.h
ceph/src/rgw/rgw_civetweb.cc
ceph/src/rgw/rgw_civetweb_frontend.cc
ceph/src/rgw/rgw_common.h
ceph/src/rgw/rgw_coroutine.cc
ceph/src/rgw/rgw_cr_rados.cc
ceph/src/rgw/rgw_cr_rados.h
ceph/src/rgw/rgw_data_sync.cc
ceph/src/rgw/rgw_data_sync.h
ceph/src/rgw/rgw_file.cc
ceph/src/rgw/rgw_file.h
ceph/src/rgw/rgw_frontend.cc
ceph/src/rgw/rgw_frontend.h
ceph/src/rgw/rgw_http_client.cc
ceph/src/rgw/rgw_iam_policy.cc
ceph/src/rgw/rgw_iam_policy.h
ceph/src/rgw/rgw_lc_s3.cc
ceph/src/rgw/rgw_main.cc
ceph/src/rgw/rgw_op.cc
ceph/src/rgw/rgw_op.h
ceph/src/rgw/rgw_period_puller.cc
ceph/src/rgw/rgw_rados.cc
ceph/src/rgw/rgw_rados.h
ceph/src/rgw/rgw_realm_reloader.cc
ceph/src/rgw/rgw_rest.cc
ceph/src/rgw/rgw_rest_log.cc
ceph/src/rgw/rgw_rest_s3.cc
ceph/src/rgw/rgw_rest_s3.h
ceph/src/rgw/rgw_rest_swift.cc
ceph/src/rgw/rgw_rest_user.cc
ceph/src/rgw/rgw_swift_auth.h
ceph/src/rgw/rgw_sync.cc
ceph/src/rgw/rgw_sync_module_es.cc
ceph/src/rgw/rgw_torrent.cc
ceph/src/rgw/rgw_user.cc
ceph/src/script/build-integration-branch [new file with mode: 0755]
ceph/src/test/CMakeLists.txt
ceph/src/test/cli/crushtool/choose-args.crush
ceph/src/test/cli/crushtool/choose-args.t
ceph/src/test/cli/radosgw-admin/help.t
ceph/src/test/crush/CrushWrapper.cc
ceph/src/test/fs/CMakeLists.txt
ceph/src/test/fs/test_trim_caps.cc [new file with mode: 0644]
ceph/src/test/libcephfs/test.cc
ceph/src/test/librados/aio.cc
ceph/src/test/librados/list.cc
ceph/src/test/librados/test_common.cc
ceph/src/test/librados/test_common.h
ceph/src/test/librbd/journal/test_mock_Replay.cc
ceph/src/test/librbd/mock/MockImageCtx.h
ceph/src/test/messenger/simple_dispatcher.h
ceph/src/test/messenger/xio_dispatcher.h
ceph/src/test/msgr/perf_msgr_client.cc
ceph/src/test/msgr/perf_msgr_server.cc
ceph/src/test/msgr/test_msgr.cc
ceph/src/test/objectstore/DeterministicOpSequence.cc
ceph/src/test/objectstore/DeterministicOpSequence.h
ceph/src/test/objectstore/FileStoreDiff.cc
ceph/src/test/objectstore/TestObjectStoreState.h
ceph/src/test/objectstore/run_seed_to.sh
ceph/src/test/objectstore/run_seed_to_range.sh
ceph/src/test/pybind/test_rados.py
ceph/src/test/rbd_mirror/image_replayer/test_mock_BootstrapRequest.cc
ceph/src/test/rbd_mirror/image_replayer/test_mock_PrepareLocalImageRequest.cc
ceph/src/test/rbd_mirror/test_ImageReplayer.cc
ceph/src/test/rbd_mirror/test_mock_ImageReplayer.cc
ceph/src/test/rgw/rgw_multi/tests.py
ceph/src/test/rgw/test_rgw_iam_policy.cc
ceph/src/tools/cephfs/Resetter.cc
ceph/src/tools/rbd/Utils.h
ceph/src/tools/rbd/action/Export.cc
ceph/src/tools/rbd/action/Import.cc
ceph/src/tools/rbd_mirror/ImageReplayer.cc
ceph/src/tools/rbd_mirror/ImageReplayer.h
ceph/src/tools/rbd_mirror/ImageSync.cc
ceph/src/tools/rbd_mirror/InstanceWatcher.cc
ceph/src/tools/rbd_mirror/InstanceWatcher.h
ceph/src/tools/rbd_mirror/PoolReplayer.cc
ceph/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
ceph/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
ceph/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc
ceph/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
ceph/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc
ceph/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h
ceph/src/tools/rbd_nbd/rbd-nbd.cc

index 3c24519835127dfcdde09a5b6c618032b1120d5e..9bc9fbe4684c7bf2a3d6cb6d8470e7740e4a5e14 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
-RELEASE=5.1
+RELEASE=5.2
 
 PACKAGE=ceph
-VER=12.2.5
+VER=12.2.7
 DEBREL=pve1
 
 SRCDIR=ceph
index aa90ba65dac3a9a333275b71752cb6d83fdd2ee7..2e2df1f6e75f1fabe235d01a4116123eeff0500e 100644 (file)
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 2.8.11)
 
 project(ceph)
-set(VERSION 12.2.5)
+set(VERSION 12.2.7)
 
 if(POLICY CMP0046)
   # Tweak policies (this one disables "missing" dependency warning)
index d280aacfb336ac4c54fa3e304df6f5ed20dcb542..90553c49eee8a5318ba725418c563805eb5ccd60 100644 (file)
@@ -1,7 +1,7 @@
 # Contributor: John Coyle <dx9err@gmail.com>
 # Maintainer: John Coyle <dx9err@gmail.com>
 pkgname=ceph
-pkgver=12.2.5
+pkgver=12.2.7
 pkgrel=0
 pkgdesc="Ceph is a distributed object store and file system"
 pkgusers="ceph"
@@ -63,7 +63,7 @@ makedepends="
        xmlstarlet
        yasm
 "
-source="ceph-12.2.5.tar.bz2"
+source="ceph-12.2.7.tar.bz2"
 subpackages="
        $pkgname-base
        $pkgname-common
@@ -116,7 +116,7 @@ _sysconfdir=/etc
 _udevrulesdir=/etc/udev/rules.d
 _python_sitelib=/usr/lib/python2.7/site-packages
 
-builddir=$srcdir/ceph-12.2.5
+builddir=$srcdir/ceph-12.2.7
 
 build() {
        export CEPH_BUILD_VIRTUALENV=$builddir
index bbb86d4525ae2fa059c0980390b2b1e881865398..08ccdb6c2182ec3a92606cd46433e1b74b32a388 100644 (file)
@@ -61,7 +61,7 @@
 # main package definition
 #################################################################################
 Name:          ceph
-Version:       12.2.5
+Version:       12.2.7
 Release:       0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch:         2
@@ -77,7 +77,7 @@ License:      LGPL-2.1 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and
 Group:         System/Filesystems
 %endif
 URL:           http://ceph.com/
-Source0:       http://ceph.com/download/ceph-12.2.5.tar.bz2
+Source0:       http://ceph.com/download/ceph-12.2.7.tar.bz2
 %if 0%{?suse_version}
 %if 0%{?is_opensuse}
 ExclusiveArch:  x86_64 aarch64 ppc64 ppc64le
@@ -779,7 +779,7 @@ python-rbd, python-rgw or python-cephfs instead.
 # common
 #################################################################################
 %prep
-%autosetup -p1 -n ceph-12.2.5
+%autosetup -p1 -n ceph-12.2.7
 
 %build
 %if 0%{with cephfs_java}
@@ -1447,6 +1447,8 @@ fi
 %else
     /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
 %endif
+# work around https://tracker.ceph.com/issues/24903
+chown -h ceph:ceph /var/lib/ceph/osd/*/block* > /dev/null 2>&1 || :
 
 %preun osd
 %if 0%{?suse_version}
index 2f89a9bc1a1388404b6925eb78af3c28fbd358e4..05e3e330ef1ec22d8a16902ca27b212e1fa8dd42 100644 (file)
@@ -1447,6 +1447,8 @@ fi
 %else
     /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
 %endif
+# work around https://tracker.ceph.com/issues/24903
+chown -h ceph:ceph /var/lib/ceph/osd/*/block* > /dev/null 2>&1 || :
 
 %preun osd
 %if 0%{?suse_version}
index 5e44548fe826177d9c78c589f0611207d334d7e5..8acfc0743112af3d771ee91287e8e5e1781b08bc 100644 (file)
@@ -25,6 +25,8 @@ case "$1" in
     configure)
        [ -x /etc/init.d/procps ] && invoke-rc.d procps restart || :
        [ -x /sbin/start ] && start ceph-osd-all || :
+       # work around https://tracker.ceph.com/issues/24903
+       chown -h ceph:ceph /var/lib/ceph/osd/*/block* > /dev/null 2>&1 || :
     ;;
     abort-upgrade|abort-remove|abort-deconfigure)
        :
index c3a8efa2da5e58ac970cf193a8269bc52fbcd31f..a95f9dc266356812a3f6cdec3cf5fa86ede6025d 100644 (file)
@@ -1,3 +1,15 @@
+ceph (12.2.7-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com>  Mon, 16 Jul 2018 16:00:29 +0000
+
+ceph (12.2.6-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com>  Mon, 09 Jul 2018 16:18:46 +0000
+
 ceph (12.2.5-1) stable; urgency=medium
 
   * New upstream release
index 8ec60cdf454dacf2d8f6d9400c081014f7e7e2dd..7a3fd3f2b4255b38b8832b0ce319b1b3e22f3ea8 100644 (file)
@@ -10,7 +10,7 @@
 
 #}
 <h3><a href="{{ pathto(master_doc) }}">{{ _('Table Of Contents') }}</a></h3>
-{{ toctree(maxdepth=-1) }}
+{{ toctree(maxdepth=-1, includehidden=True) }}
 
 <!-- ugly kludge to make genindex look like it's part of the toc -->
 <ul style="margin-top: -10px"><li class="toctree-l1"><a class="reference internal" href="{{ pathto('genindex') }}">Index</a></li></ul>
index 2ec462ebea3a57a2e052cb88bf0b10b85305c17b..387596d613bf72d4827ecaaeb7b89c80009ff26b 100644 (file)
@@ -22,7 +22,13 @@ Migrating
 ---------
 Starting on Ceph version 12.2.2, ``ceph-disk`` is deprecated. Deprecation
 warnings will show up that will link to this page. It is strongly suggested
-that users start consuming ``ceph-volume``.
+that users start consuming ``ceph-volume``. There are two paths for migrating:
+
+#. Keep OSDs deployed with ``ceph-disk``: The :ref:`ceph-volume-simple` command
+   provides a way to take over the management while disabling ``ceph-disk``
+   triggers.
+#. Redeploy existing OSDs with ``ceph-volume``: This is covered in depth in
+   :ref:`rados-replacing-an-osd` (a sketch of this path follows below).
 
 New deployments
 ^^^^^^^^^^^^^^^
index 66bed5a82e9b654f6d960e99dbae1476dced8f3c..e757f5266e5ae10226892ccb50d86dfebf0ff015 100644 (file)
@@ -22,6 +22,18 @@ need to be supplied. For example::
 .. note:: The UUID is stored in the ``fsid`` file in the OSD path, which is
           generated when :ref:`ceph-volume-lvm-prepare` is used.
 
+Activating all OSDs
+-------------------
+It is possible to activate all existing OSDs at once by using the ``--all``
+flag. For example::
+
+    ceph-volume lvm activate --all
+
+This call will inspect all the OSDs created by ceph-volume that are inactive
+and will activate them one by one. If any of the OSDs are already running, it
+will report them in the command output and skip them, making it safe to rerun
+(idempotent).
+
 requiring uuids
 ^^^^^^^^^^^^^^^
 The :term:`OSD uuid` is being required as an extra step to ensure that the
@@ -29,6 +41,13 @@ right OSD is being activated. It is entirely possible that a previous OSD with
 the same id exists and would end up activating the incorrect one.
 
 
+dmcrypt
+^^^^^^^
+If the OSD was prepared with dmcrypt by ceph-volume, there is no need to
+specify ``--dmcrypt`` on the command line again (that flag is not available for
+the ``activate`` subcommand). An encrypted OSD will be automatically detected.
+
+
 Discovery
 ---------
 With either existing OSDs or new ones being activated, a *discovery* process is
@@ -63,9 +82,11 @@ The systemd unit will look for the matching OSD device, and by looking at its
 
 Existing OSDs
 -------------
-For exsiting OSDs that have been deployed with different tooling, the only way
-to port them over to the new mechanism is to prepare them again (losing data).
-See :ref:`ceph-volume-lvm-existing-osds` for details on how to proceed.
+Existing OSDs that have been deployed with ``ceph-disk`` need to be
+scanned and activated :ref:`using the simple sub-command <ceph-volume-simple>`.
+If different tooling was used, then the only way to port them over to the new
+mechanism is to prepare them again (losing data). See
+:ref:`ceph-volume-lvm-existing-osds` for details on how to proceed.
 
 Summary
 -------
index 02a447b5779210b9acb656762bb155bb29244404..6bdec7c44db355fe8723db402e2aa62726455666 100644 (file)
@@ -3,10 +3,11 @@
 Encryption
 ==========
 
-Logical volumes can be encrypted using ``dmcrypt``. Encryption can be done in
-different ways, specially with LVM. ``ceph-volume`` is somewhat opinionated
-with the way it sets up encryption with logical volumes so that the process is
-consistent and robust.
+Logical volumes can be encrypted using ``dmcrypt`` by specifying the
+``--dmcrypt`` flag when creating OSDs. Encryption can be done in different ways,
+especially with LVM. ``ceph-volume`` is somewhat opinionated with the way it
+sets up encryption with logical volumes so that the process is consistent and
+robust.
 
 In this case, ``ceph-volume lvm`` follows these constraints:
 
index 19e06000b8429497b72f140240757abb9eb226f2..718154b102194ecb5b4d611be100456ed50fdc54 100644 (file)
@@ -39,6 +39,7 @@ one with a physical device may look similar to::
           data uuid                 SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
           journal device            /dev/journals/journal1
           data device               /dev/test_group/data-lv2
+          devices                   /dev/sda
 
       [data]    /dev/test_group/data-lv2
 
@@ -50,6 +51,7 @@ one with a physical device may look similar to::
           data uuid                 SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
           journal device            /dev/journals/journal1
           data device               /dev/test_group/data-lv2
+          devices                   /dev/sdb
 
     ====== osd.0 =======
 
@@ -63,11 +65,18 @@ one with a physical device may look similar to::
           data uuid                 TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00
           journal device            /dev/sdd1
           data device               /dev/test_group/data-lv1
+          devices                   /dev/sdc
 
       [journal]    /dev/sdd1
 
           PARTUUID                  cd72bd28-002a-48da-bdf6-d5b993e84f3f
 
+
+For logical volumes the ``devices`` key is populated with the physical devices
+associated with the logical volume. Since LVM allows multiple physical devices
+to be part of a logical volume, the value will be comma separated when using
+``pretty``, but an array when using ``json``.
+
 .. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
           as a ``ceph.osd_id`` tag. For more information on lvm tag conventions
           see :ref:`ceph-volume-lvm-tag-api`
@@ -96,6 +105,7 @@ can be listed in the following way::
           data uuid                 SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
           journal device            /dev/journals/journal1
           data device               /dev/test_group/data-lv2
+          devices                   /dev/sdc
 
 
 .. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
@@ -134,6 +144,7 @@ output (note how tags aren't modified)::
     {
         "0": [
             {
+                "devices": ["/dev/sda"],
                 "lv_name": "data-lv1",
                 "lv_path": "/dev/test_group/data-lv1",
                 "lv_tags": "ceph.cluster_fsid=ce454d91-d748-4751-a318-ff7f7aa18ffd,ceph.data_device=/dev/test_group/data-lv1,ceph.data_uuid=TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00,ceph.journal_device=/dev/sdd1,ceph.journal_uuid=cd72bd28-002a-48da-bdf6-d5b993e84f3f,ceph.osd_fsid=943949f0-ce37-47ca-a33c-3413d46ee9ec,ceph.osd_id=0,ceph.type=data",
index de29a782a1070bcbbd84ae52f735682876b66160..700d1216a55b65cb2f67b2d0668437359758086a 100644 (file)
@@ -42,6 +42,10 @@ The API call looks like::
 
     ceph-volume prepare --filestore --data data --journal journal
 
+For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
+
+    ceph-volume lvm prepare --filestore --dmcrypt --data volume_group/lv_name --journal journal
+
 There is flexibility to use a raw device or partition as well for ``--data``
 that will be converted to a logical volume. This is not ideal in all situations
 since ``ceph-volume`` is just going to create a unique volume group and
@@ -148,6 +152,9 @@ A raw device can be specified in the same way::
 
     ceph-volume lvm prepare --bluestore --data /path/to/device
 
+For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
+
+    ceph-volume lvm prepare --bluestore --dmcrypt --data vg/lv
 
 If a ``block.db`` or a ``block.wal`` is needed (they are optional for
 bluestore) they can be specified with ``--block.db`` and ``--block.wal``
index 6f2534a738e363658faa12b1766e30a4aea074e7..315dea99a106152b1b036871191cabf7cbf670a4 100644 (file)
@@ -17,3 +17,16 @@ Implements the functionality needed to manage OSDs from the ``simple`` subcomman
 By *taking over* management, it disables all ``ceph-disk`` systemd units used
 to trigger devices at startup, relying on basic (customizable) JSON
 configuration and systemd for starting up OSDs.
+
+This process involves two steps:
+
+#. :ref:`Scan <ceph-volume-simple-scan>` the running OSD or the data device
+#. :ref:`Activate <ceph-volume-simple-activate>` the scanned OSD
+
+The scanning will infer everything that ``ceph-volume`` needs to start the OSD,
+so that when activation is needed, the OSD can start normally without
+interference from ``ceph-disk``.
+
+As part of the activation process, the systemd units for ``ceph-disk`` in
+charge of reacting to ``udev`` events are linked to ``/dev/null`` so that they
+are fully inactive.
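As a hedged illustration of the two steps, assuming a ceph-disk OSD with id
``0`` mounted at ``/var/lib/ceph/osd/ceph-0`` (the id and fsid below are
placeholders)::

    # 1. capture everything needed to start the OSD into a JSON file
    ceph-volume simple scan /var/lib/ceph/osd/ceph-0
    # 2. enable systemd units for the OSD and disable the ceph-disk triggers
    ceph-volume simple activate 0 6cc43680-4f6e-4feb-92ff-9c7ba204120e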
index e9d9195557d19cae0865985f2d09304855676c07..71a0e4075d10fb00c5db9255aa8cc22c8d1d6593 100644 (file)
@@ -74,6 +74,22 @@ appear to be eg. exabytes in size, causing load on the MDS as it tries
 to enumerate the objects during operations like stats or deletes.
 
 
+Taking the cluster down
+-----------------------
+
+Taking a CephFS cluster down is done by reducing the number of ranks to 1,
+setting the cluster_down flag, and then failing the last rank. For example:
+
+::
+    ceph fs set <fs_name> max_mds 1
+    ceph mds deactivate <fs_name>:1 # rank 2 of 2
+    ceph status # wait for rank 1 to finish stopping
+    ceph fs set <fs_name> cluster_down true
+    ceph mds fail <fs_name>:0
+
+Setting the ``cluster_down`` flag prevents standbys from taking over the failed
+rank.
+
 Daemons
 -------
 
index 0d34d611b64f416979e045d2c074179140104e8b..2edb637f6ef15539103842c0deff46b99388519c 100644 (file)
@@ -65,6 +65,14 @@ Image Stripe count
 - le64: length of appending data (8)
 - le64: image striping count
 
+ImageMeta Key and Value
+-----------------------
+
+- u8: 'M'
+- le64: length of appending data (length of key + length of value + 4 * 2)
+- string: image-meta key
+- string: image-meta value
+
 Final Record
 ~~~~~~~~~~~~
 
@@ -75,9 +83,21 @@ End
 
 
 Diffs records
-~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~
+
 Record the all snapshots and the HEAD in this section. 
 
+Snap Protection status
+----------------------
+
+Record the snapshot's protection status if `--export-format=2`.
+- u8: 'p'
+- le64: length of appending data (8)
+- u8: snap protection status (0 for false, 1 for true)
+
+Others
+------
+
 - le64: number of diffs
 - Diffs ...
 
diff --git a/ceph/doc/dev/testing.rst b/ceph/doc/dev/testing.rst
new file mode 100644 (file)
index 0000000..1d99848
--- /dev/null
@@ -0,0 +1,40 @@
+Testing notes
+=============
+
+
+build-integration-branch
+------------------------
+
+Setup
+^^^^^
+
+#. Create a github token at `<https://github.com/settings/tokens>`_
+   and put it in ``~/.github_token``.  Note that only the
+   ``public_repo`` under the ``repo`` section needs to be checked.
+
+#. Create a ceph repo label `wip-yourname-testing` if you don't
+   already have one at `<https://github.com/ceph/ceph/labels>`_.
+
+#. Create the ``ci`` remote::
+
+     git remote add ci git@github.com:ceph/ceph-ci
+
+Using
+^^^^^
+
+#. Tag some subset of `needs-qa` commits with your label (usually `wip-yourname-testing`).
+
+#. Create the integration branch::
+
+     git checkout master
+     git pull
+     ../src/script/build-integration-branch wip-yourname-testing
+
+#. Smoke test::
+
+     make && ctest -j12
+
+#. Push to ceph-ci::
+
+     git push ci $(git rev-parse --abbrev-ref HEAD)
+
index cede60e5f41768777254763c403d3220a68f79b6..6c88a5a8f1a0bb4ee5f07138a8e77195bca35159 100644 (file)
@@ -9,7 +9,7 @@
 Synopsis
 ========
 
-| **ceph-fuse** [ -m *monaddr*:*port* ] *mountpoint* [ *fuse options* ]
+| **ceph-fuse** [-n *client.username*] [ -m *monaddr*:*port* ] *mountpoint* [ *fuse options* ]
 
 
 Description
@@ -32,9 +32,13 @@ Options
 
 Any options not recognized by ceph-fuse will be passed on to libfuse.
 
+.. option:: -o opt,[opt...]
+
+   mount options.
+
 .. option:: -d
 
-   Detach from console and daemonize after startup.
+   Run in foreground, send all log output to stderr and enable FUSE debugging (-o debug).
 
 .. option:: -c ceph.conf, --conf=ceph.conf
 
@@ -45,10 +49,17 @@ Any options not recognized by ceph-fuse will be passed on to libfuse.
 
    Connect to specified monitor (instead of looking through ceph.conf).
 
-.. option:: -r root_directory
+.. option:: --client_mountpoint/-r root_directory
 
    Use root_directory as the mounted root, rather than the full Ceph tree.
 
+.. option:: -f
+
+   Foreground: do not daemonize after startup and do not generate a pid file.
+
+.. option:: -s
+
+   Disable multi-threaded operation.
 
 Availability
 ============
index d38cb624ef588860f8ea8439905d8de9906ac711..50a2e52803928780463110ea6b71c6c895a10b80 100644 (file)
@@ -63,6 +63,10 @@ Optional Arguments:
 * [--no-systemd] Skip creating and enabling systemd units and starting of OSD
   services
 
+Multiple OSDs can be activated at once by using the (idempotent) ``--all`` flag::
+
+    ceph-volume lvm activate --all
+
 
 **prepare**
 Prepares a logical volume to be used as an OSD and journal using a ``filestore``
@@ -90,6 +94,10 @@ Required arguments:
 
 * --data                A logical group name or a path to a logical volume
 
+For encrypting an OSD, the ``--dmcrypt`` flag must be added when preparing
+(also supported in the ``create`` sub-command).
+
+
 **create**
 Wraps the two-step process to provision a new osd (calling ``prepare`` first
 and then ``activate``) into a single one. The reason to prefer ``prepare`` and
index 4aaeb392285e5d083b17de783a2be0dde68ff340..e4ddb152e147166e5d7bedefaf7d67452c3d5f16 100644 (file)
@@ -306,9 +306,13 @@ Options
 
 .. option:: --shard-id=<shard-id>
 
-       Optional for mdlog list. Required for ``mdlog trim``,
+       Optional for mdlog list, data sync status. Required for ``mdlog trim``,
        ``replica mdlog get/delete``, ``replica datalog get/delete``.
 
+.. option:: --max-entries=<entries>
+
+       Optional for listing operations to specify the max entries.
+
 .. option:: --auth-uid=auid
 
    The librados auid.
diff --git a/ceph/doc/mgr/balancer.rst b/ceph/doc/mgr/balancer.rst
new file mode 100644 (file)
index 0000000..191c455
--- /dev/null
@@ -0,0 +1,146 @@
+Balancer plugin
+===============
+
+The *balancer* plugin can optimize the placement of PGs across OSDs in
+order to achieve a balanced distribution, either automatically or in a
+supervised fashion.
+
+Enabling
+--------
+
+The *balancer* module is enabled with::
+
+  ceph mgr module enable balancer
+
+(It is enabled by default.)
+
+Status
+------
+
+The current status of the balancer can be checked at any time with::
+
+  ceph balancer status
+
+
+Automatic balancing
+-------------------
+
+The automatic balancing can be enabled, using the default settings, with::
+
+  ceph balancer on
+
+The balancer can be turned back off again with::
+
+  ceph balancer off
+
+This will use the ``crush-compat`` mode, which is backward compatible
+with older clients, and will make small changes to the data
+distribution over time to ensure that OSDs are equally utilized.
+
+
+Throttling
+----------
+
+No adjustments will be made to the PG distribution if the cluster is
+degraded (e.g., because an OSD has failed and the system has not yet
+healed itself).
+
+When the cluster is healthy, the balancer will throttle its changes
+such that the percentage of PGs that are misplaced (i.e., that need to
+be moved) is below a threshold of (by default) 5%.  The
+``max_misplaced`` threshold can be adjusted with::
+
+  ceph config-key set mgr/balancer/max_misplaced .07   # 7%
+
+
+Modes
+-----
+
+There are currently two supported balancer modes:
+
+#. **crush-compat**.  The CRUSH compat mode uses the compat weight-set
+   feature (introduced in Luminous) to manage an alternative set of
+   weights for devices in the CRUSH hierarchy.  The normal weights
+   should remain set to the size of the device to reflect the target
+   amount of data that we want to store on the device.  The balancer
+   then optimizes the weight-set values, adjusting them up or down in
+   small increments, in order to achieve a distribution that matches
+   the target distribution as closely as possible.  (Because PG
+   placement is a pseudorandom process, there is a natural amount of
+   variation in the placement; by optimizing the weights we
+   counter-act that natural variation.)
+
+   Notably, this mode is *fully backwards compatible* with older
+   clients: when an OSDMap and CRUSH map is shared with older clients,
+   we present the optimized weights as the "real" weights.
+
+   The primary restriction of this mode is that the balancer cannot
+   handle multiple CRUSH hierarchies with different placement rules if
+   the subtrees of the hierarchy share any OSDs.  (This is normally
+   not the case, and is generally not a recommended configuration
+   because it is hard to manage the space utilization on the shared
+   OSDs.)
+
+#. **upmap**.  Starting with Luminous, the OSDMap can store explicit
+   mappings for individual OSDs as exceptions to the normal CRUSH
+   placement calculation.  These `upmap` entries provide fine-grained
+   control over the PG mapping.  This CRUSH mode will optimize the
+   placement of individual PGs in order to achieve a balanced
+   distribution.  In most cases, this distribution is "perfect," with
+   an equal number of PGs on each OSD (+/-1 PG, since they might not
+   divide evenly).
+
+   Note that using upmap requires that all clients be Luminous or newer.
+
+The default mode is ``crush-compat``.  The mode can be adjusted with::
+
+  ceph balancer mode upmap
+
+or::
+
+  ceph balancer mode crush-compat
+
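Since ``upmap`` requires Luminous or newer clients, a reasonable precaution
before switching to that mode is to forbid older clients; a hedged example
(this command has been available since Luminous)::

    ceph osd set-require-min-compat-client luminous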
+Supervised optimization
+-----------------------
+
+The balancer operation is broken into a few distinct phases:
+
+#. building a *plan*
+#. evaluating the quality of the data distribution, either for the current PG distribution, or the PG distribution that would result after executing a *plan*
+#. executing the *plan*
+
+To evaluate and score the current distribution::
+
+  ceph balancer eval
+
+You can also evaluate the distribution for a single pool with::
+
+  ceph balancer eval <pool-name>
+
+Greater detail for the evaluation can be seen with::
+
+  ceph balancer eval-verbose ...
+  
+The balancer can generate a plan, using the currently configured mode, with::
+
+  ceph balancer optimize <plan-name>
+
+The name is provided by the user and can be any useful identifying string.  The contents of a plan can be seen with::
+
+  ceph balancer show <plan-name>
+
+Old plans can be discarded with::
+
+  ceph balancer rm <plan-name>
+
+Currently recorded plans are shown as part of the status command::
+
+  ceph balancer status
+
+The quality of the distribution that would result after executing a plan can be calculated with::
+
+  ceph balancer eval <plan-name>
+
+Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with::
+
+  ceph balancer execute <plan-name>
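Putting the supervised phases together, an end-to-end session might look like
this (the plan name ``myplan`` is arbitrary; all commands are described above)::

    ceph balancer eval                # score the current distribution
    ceph balancer optimize myplan     # build a plan with the configured mode
    ceph balancer show myplan         # inspect the proposed changes
    ceph balancer eval myplan         # a lower score means an improvement
    ceph balancer execute myplan      # apply the plan
    ceph balancer rm myplan           # discard it afterwards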
index 53844ba24ed2c2e1ae265f7315f7734fe4af9c22..30f9516060b23cb5c093c42e8e9df7fdfe3e4aea 100644 (file)
@@ -27,6 +27,7 @@ sensible.
 
     Installation and Configuration <administrator>
     Writing plugins <plugins>
+    Balancer plugin <balancer>
     Dashboard plugin <dashboard>
     Local pool plugin <localpool>
     RESTful plugin <restful>
index 37aa5cd63434d9c0ac73ee4625ed089c31685f1e..066c958a0e727e581bc595d13e0e3a906be5b330 100644 (file)
@@ -53,7 +53,8 @@ Additional optional configuration settings are:
 :interval: Time between reports to InfluxDB.  Default 5 seconds.
 :database: InfluxDB database name.  Default "ceph".  You will need to create this database and grant write privileges to the configured username or the username must have admin privileges to create it.  
 :port: InfluxDB server port.  Default 8086
-    
+:ssl: Use https connection for InfluxDB server. Use "true" or "false". Default false
+:verify_ssl: Verify https cert for InfluxDB server. Use "true" or "false". Default true
 
 ---------
 Debugging 
index 5bae6a9845f821beb7c0e3661c96cf9ad5f194fd..eb869d11872b526ff650617b69d46cb805fa0d37 100644 (file)
@@ -59,13 +59,13 @@ Pools have a ``ceph_pool_metadata`` field like this:
 
 ::
 
-    ceph_pool_metadata{pool_id="2",name="cephfs_metadata_a"} 0.0
+    ceph_pool_metadata{pool_id="2",name="cephfs_metadata_a"} 1.0
 
 OSDs have a ``ceph_osd_metadata`` field like this:
 
 ::
 
-    ceph_osd_metadata{cluster_addr="172.21.9.34:6802/19096",device_class="ssd",id="0",public_addr="172.21.9.34:6801/19096",weight="1.0"} 0.0
+    ceph_osd_metadata{cluster_addr="172.21.9.34:6802/19096",device_class="ssd",ceph_daemon="osd.0",public_addr="172.21.9.34:6801/19096",weight="1.0"} 1.0
 
 
 Correlating drive statistics with node_exporter
@@ -79,34 +79,46 @@ drive statistics, special series are output like this:
 
 ::
 
-    ceph_disk_occupation{ceph_daemon="osd.0",device="sdd",instance="myhost",job="ceph"}
+    ceph_disk_occupation{ceph_daemon="osd.0",device="sdd", exported_instance="myhost"}
 
-To use this to get disk statistics by OSD ID, use the ``and on`` syntax
-in your prometheus query like this:
+To use this to get disk statistics by OSD ID, use either the ``and`` operator or
+the ``*`` operator in your prometheus query. All metadata metrics (like
+``ceph_disk_occupation``) have the value 1 so they act neutral with ``*``.
+Using ``*`` allows you to use ``group_left`` and ``group_right`` grouping
+modifiers, so that the resulting metric has additional labels from one side of the query.
+
+See the
+`prometheus documentation`__ for more information about constructing queries.
+
+__ https://prometheus.io/docs/prometheus/latest/querying/basics
+
+The goal is to run a query like
 
 ::
 
     rate(node_disk_bytes_written[30s]) and on (device,instance) ceph_disk_occupation{ceph_daemon="osd.0"}
 
-See the prometheus documentation for more information about constructing
-queries.
+Out of the box the above query will not return any metrics since the ``instance`` labels of
+both metrics don't match. The ``instance`` label of ``ceph_disk_occupation``
+will be the currently active MGR node.
 
-Note that for this mechanism to work, Ceph and node_exporter must agree
-about the values of the ``instance`` label.  See the following section
-for guidance about to to set up Prometheus in a way that sets
-``instance`` properly.
+The following two sections outline two approaches to remedy this.
 
-Configuring Prometheus server
-=============================
+Use label_replace
+=================
+
+The ``label_replace`` function (see the
+`label_replace documentation <https://prometheus.io/docs/prometheus/latest/querying/functions/#label_replace>`_)
+can add a label to, or alter a label of, a metric within a query.
 
-See the prometheus documentation for full details of how to add
-scrape endpoints: the notes
-in this section are tips on how to configure Prometheus to capture
-the Ceph statistics in the most usefully-labelled form.
+To correlate an OSD with its disk's write rate, the following query can be used:
+
+::
 
-This configuration is necessary because Ceph is reporting metrics
-from many hosts and services via a single endpoint, and some
-metrics that relate to no physical host (such as pool statistics).
+    label_replace(rate(node_disk_bytes_written[30s]), "exported_instance", "$1", "instance", "(.*):.*") and on (device,exported_instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+
+Configuring Prometheus server
+=============================
 
 honor_labels
 ------------
@@ -115,27 +127,18 @@ To enable Ceph to output properly-labelled data relating to any host,
 use the ``honor_labels`` setting when adding the ceph-mgr endpoints
 to your prometheus configuration.
 
-Without this setting, any ``instance`` labels that Ceph outputs, such
-as those in ``ceph_disk_occupation`` series, will be overridden
-by Prometheus.
-
-Ceph instance label
--------------------
+This allows Ceph to export the proper ``instance`` label without prometheus
+overwriting it. Without this setting, Prometheus applies an ``instance`` label
+that includes the hostname and port of the endpoint that the series came from.
+Because Ceph clusters have multiple manager daemons, this results in an
+``instance`` label that changes spuriously when the active manager daemon
+changes.
 
-By default, Prometheus applies an ``instance`` label that includes
-the hostname and port of the endpoint that the series game from.  Because
-Ceph clusters have multiple manager daemons, this results in an ``instance``
-label that changes spuriously when the active manager daemon changes.
-
-Set a custom ``instance`` label in your Prometheus target configuration: 
-you might wish to set it to the hostname of your first monitor, or something
-completely arbitrary like "ceph_cluster".
-
-node_exporter instance labels
+node_exporter hostname labels
 -----------------------------
 
 Set your ``instance`` labels to match what appears in Ceph's OSD metadata
-in the ``hostname`` field.  This is generally the short hostname of the node.
+in the ``instance`` field.  This is generally the short hostname of the node.
 
 This is only necessary if you want to correlate Ceph stats with host stats,
 but you may find it useful to do it in all cases in case you want to do
@@ -145,7 +148,8 @@ Example configuration
 ---------------------
 
 This example shows a single node configuration running ceph-mgr and
-node_exporter on a server called ``senta04``.
+node_exporter on a server called ``senta04``. Note that this requires adding the
+appropriate instance label to every ``node_exporter`` target individually.
 
 This is just an example: there are other ways to configure prometheus
 scrape targets and label rewrite rules.
@@ -180,9 +184,7 @@ ceph_targets.yml
     [
         {
             "targets": [ "senta04.mydomain.com:9283" ],
-            "labels": {
-                "instance": "ceph_cluster"
-            }
+            "labels": {}
         }
     ]
 
index eb14fa43de18d11e43727e9ea770e1dbed34e70e..1d72a395d778924f3bed7639c8736b6fc528da90 100644 (file)
@@ -291,18 +291,16 @@ You can override these locations, but it is not recommended.
 Signatures
 ----------
 
-In Ceph Bobtail and subsequent versions, we prefer that Ceph authenticate all
-ongoing messages between the entities using the session key set up for that
-initial authentication. However, Argonaut and earlier Ceph daemons do not know
-how to perform ongoing message authentication. To maintain backward
-compatibility (e.g., running both Botbail and Argonaut daemons in the same
-cluster), message signing is **off** by default. If you are running Bobtail or
-later daemons exclusively, configure Ceph to require signatures.
+Ceph performs a signature check that provides some limited protection
+against messages being tampered with in flight (e.g., by a "man in the
+middle" attack).
 
 Like other parts of Ceph authentication, Ceph provides fine-grained control so
 you can enable/disable signatures for service messages between the client and
 Ceph, and you can enable/disable signatures for messages between Ceph daemons.
 
+Note that even with signatures enabled, data is not encrypted in
+flight.
 
 ``cephx require signatures``
 
@@ -310,6 +308,10 @@ Ceph, and you can enable/disable signatures for messages between Ceph daemons.
               traffic between the Ceph Client and the Ceph Storage Cluster, and 
               between daemons comprising the Ceph Storage Cluster. 
 
+             Ceph Argonaut and Linux kernel versions prior to 3.19 do
+             not support signatures; if such clients are in use, this
+             option can be turned off to allow them to connect.
+
 :Type: Boolean
 :Required: No
 :Default: ``false``
@@ -338,7 +340,7 @@ Ceph, and you can enable/disable signatures for messages between Ceph daemons.
 ``cephx sign messages``
 
 :Description: If the Ceph version supports message signing, Ceph will sign
-              all messages so they cannot be spoofed.
+              all messages so they are more difficult to spoof.
 
 :Type: Boolean
 :Default: ``true``
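As a hedged illustration, requiring signatures is a one-line ``ceph.conf``
change, safe only once no Argonaut-era daemons or pre-3.19 kernel clients need
to connect::

    [global]
    cephx require signatures = true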
index d2afaec3d46b1a5fe3b4ddae7cef8cb27c148c0a..aee5242af335d8e33555a43a298c6e7cb7efe0c4 100644 (file)
@@ -6,10 +6,6 @@ With ``ceph-deploy``, adding and removing metadata servers is a simple task. You
 just add or remove one or more metadata servers on the command line with one
 command.
 
-.. important:: You must deploy at least one metadata server to use CephFS.
-    There is experimental support for running multiple metadata servers.
-    Do not run multiple active metadata servers in production.
-
 See `MDS Config Reference`_ for details on configuring metadata servers.
 
 
index 59ce4c71380fca03e9a81eddf2688bb08be638b4..8ad28315a3bddf2e51fcd0f20a88abb224cce4db 100644 (file)
@@ -164,6 +164,7 @@ weight).
  Note that this practice will no longer be necessary in Bobtail and
  subsequent releases.
 
+.. _rados-replacing-an-osd:
 
 Replacing an OSD
 ----------------
index 8dcccd63deb566bcb31fa41d52de0a2d74435de2..ae569f92a918d40e1c2924d442630cf6c2396fe0 100644 (file)
@@ -750,6 +750,13 @@ You may get values for the following keys:
 :Type: Double
 
 
+``allow_ec_overwrites``
+
+:Description: see allow_ec_overwrites_
+
+:Type: Boolean
+
+
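A hedged example of reading and setting this flag on an erasure-coded pool
(``ecpool`` is a placeholder pool name)::

    ceph osd pool get ecpool allow_ec_overwrites
    ceph osd pool set ecpool allow_ec_overwrites true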
 Set the Number of Object Replicas
 =================================
 
index cdc5c277d55dd4c0703884e05aa3318ed1f797a1..ff6323ee454b345439cae2f9856af67e6209eeeb 100644 (file)
@@ -20,12 +20,25 @@ Options
 
 ``port``
 
-:Description: Sets the listening port number.
+:Description: Sets the listening port number. Can be specified multiple
+              times as in ``port=80 port=8000``.
 
 :Type: Integer
 :Default: ``80``
 
 
+``endpoint``
+
+:Description: Sets the listening address in the form ``address[:port]``,
+              where the address is an IPv4 address string in dotted decimal
+              form, or an IPv6 address in hexadecimal notation. The
+              optional port defaults to 80. Can be specified multiple times
+              as in ``endpoint=::1 endpoint=192.168.0.100:8000``.
+
+:Type: String
+:Default: None
+
+
 Civetweb
 ========
 
@@ -43,7 +56,8 @@ Options
 :Description: Sets the listening port number. For SSL-enabled ports, add an
               ``s`` suffix like ``443s``. To bind a specific IPv4 or IPv6
               address, use the form ``address:port``. Multiple endpoints
-              can be separated by ``+`` as in ``127.0.0.1:8000+443s``.
+              can either be separated by ``+`` as in ``127.0.0.1:8000+443s``,
+              or by providing multiple options as in ``port=8000 port=443s``.
 
 :Type: String
 :Default: ``7480``
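As a hedged ``ceph.conf`` illustration of the multi-endpoint form described
above (the RGW instance name is a placeholder)::

    [client.rgw.gateway]
    rgw frontends = civetweb port=127.0.0.1:8000+443s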
index 398276c74a57873e9929b6efee9cd603d1e3aa7e..4220b75c53dd992e57e0c8d1368e66fc8c3e9321 100644 (file)
@@ -47,6 +47,13 @@ For a v3 version of the OpenStack Identity API you should replace
    rgw keystone admin domain = {keystone admin domain name}
    rgw keystone admin project = {keystone admin project name}
 
+For compatibility with previous versions of Ceph, it is also
+possible to set ``rgw keystone implicit tenants`` to either
+``s3`` or ``swift``. This has the effect of splitting the
+identity space so that the indicated protocol will only use
+implicit tenants, while the other protocol will never use
+them. Some older versions of Ceph only supported implicit
+tenants with Swift.
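A minimal sketch of the split described above (hypothetical ceph.conf
fragment; the section name is illustrative):

    [client.rgw.gateway]
    # S3 requests use implicit tenants; Swift requests never do
    rgw keystone implicit tenants = s3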
 
 Prior to Kilo
 -------------
index 95f22d7a1d9162e8d4c76b5acdd3dc0f725dc96c..b16f704b399ade5b9ff7bd32afc9e4d428e774b8 100644 (file)
@@ -95,6 +95,7 @@ Swift with Keystone
 TBD -- don't forget to explain the function of
        rgw keystone implicit tenants = true
        in commit e9259486decab52a362443d3fd3dec33b0ec654f
+       [ There is a description of this in keystone.rst ]
 
 Notes and known issues
 ----------------------
index 71f39af08954812c3151d356d2920cc7a385b7d9..c0d463a90d6120654dc6710ed03861d082e2c9d6 100644 (file)
@@ -1,7 +1,7 @@
 roles:
 - [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
 - [mon.b, mgr.x, mds.b, mds.c, osd.4, osd.5, osd.6, osd.7]
-- [client.0]
+- [client.0, client.1]
 openstack:
 - volumes: # attached to each instance
     count: 4
index 86be381ee6e1f32d85abee57687695bf29a0f4b9..0bf240272bc9942c087c2281480e8a433c91b316 100644 (file)
@@ -1,7 +1,7 @@
 roles:
 - [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2, osd.3]
 - [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.4, osd.5, osd.6, osd.7]
-- [client.0]
+- [client.0, client.1]
 openstack:
 - volumes: # attached to each instance
     count: 4
diff --git a/ceph/qa/distros/all/rhel_7.5.yaml b/ceph/qa/distros/all/rhel_7.5.yaml
new file mode 100644 (file)
index 0000000..e5aaf3d
--- /dev/null
@@ -0,0 +1,2 @@
+os_type: rhel
+os_version: "7.5"
diff --git a/ceph/qa/distros/all/ubuntu_18.04.yaml b/ceph/qa/distros/all/ubuntu_18.04.yaml
new file mode 100644 (file)
index 0000000..4d44648
--- /dev/null
@@ -0,0 +1,2 @@
+os_type: ubuntu
+os_version: "18.04"
diff --git a/ceph/qa/overrides/more-active-recovery.yaml b/ceph/qa/overrides/more-active-recovery.yaml
new file mode 100644 (file)
index 0000000..bfe86e4
--- /dev/null
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        osd_recovery_max_active: 10
+        osd_recovery_max_single_start: 10
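Suites opt into this fragment through symlinks, as the ``2-recovery-overrides``
entries later in this commit do; schematically:

    ln -s ../../../../overrides/more-active-recovery.yaml \
        ceph/qa/suites/rados/thrash/2-recovery-overrides/more-active-recovery.yaml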
index 6a8eebff4b4475fcfd1167220a16682e7ba92ddd..b9dd86bf2030d1246ade886d417534f3623841b0 100755 (executable)
@@ -514,6 +514,11 @@ function create_pool() {
     sleep 1
 }
 
+function delete_pool() {
+    local poolname=$1
+    ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
+}
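A typical call pattern for the new helper inside a standalone test (sketch;
the pool name and pg counts are illustrative):

    create_pool mypool 1 1 || return 1
    # ... exercise the pool ...
    delete_pool mypool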
+
 #######################################################################
 
 function run_mgr() {
index 6dd5833ad04486db57759fcf2519f0932adeebe6..452161dbd2a76a090e5f6e6a49290f6b69425e55 100755 (executable)
@@ -62,12 +62,6 @@ function create_erasure_coded_pool() {
     wait_for_clean || return 1
 }
 
-function delete_pool() {
-    local poolname=$1
-
-    ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
-}
-
 function rados_put_get() {
     local dir=$1
     local poolname=$2
index 5dccebb3ba78f06c22fe8afb195fc340e789acd5..32bef54ef45fd6dae563cfc4c4fc9ddc14abc264 100755 (executable)
@@ -80,9 +80,8 @@ function create_erasure_coded_pool() {
     wait_for_clean || return 1
 }
 
-function delete_pool() {
+function delete_erasure_coded_pool() {
     local poolname=$1
-
     ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
     ceph osd erasure-code-profile rm myprofile
 }
@@ -260,7 +259,7 @@ function TEST_rados_get_subread_eio_shard_0() {
     # inject eio on primary OSD (0) and replica OSD (1)
     local shard_id=0
     rados_put_get_data eio $dir $shard_id || return 1
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 function TEST_rados_get_subread_eio_shard_1() {
@@ -272,7 +271,7 @@ function TEST_rados_get_subread_eio_shard_1() {
     # inject eio into replicas OSD (1) and OSD (2)
     local shard_id=1
     rados_put_get_data eio $dir $shard_id || return 1
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 # We don't remove the object from the primary because
@@ -287,7 +286,7 @@ function TEST_rados_get_subread_missing() {
     # inject remove into replicas OSD (1) and OSD (2)
     local shard_id=1
     rados_put_get_data remove $dir $shard_id || return 1
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 #
@@ -309,7 +308,7 @@ function TEST_rados_get_bad_size_shard_0() {
     rados_get_data_bad_size $dir $shard_id 10 || return 1
     rados_get_data_bad_size $dir $shard_id 0 || return 1
     rados_get_data_bad_size $dir $shard_id 256 add || return 1
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 function TEST_rados_get_bad_size_shard_1() {
@@ -323,7 +322,7 @@ function TEST_rados_get_bad_size_shard_1() {
     rados_get_data_bad_size $dir $shard_id 10 || return 1
     rados_get_data_bad_size $dir $shard_id 0 || return 1
     rados_get_data_bad_size $dir $shard_id 256 add || return 1
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 function TEST_rados_get_with_subreadall_eio_shard_0() {
@@ -337,7 +336,7 @@ function TEST_rados_get_with_subreadall_eio_shard_0() {
     # inject eio on primary OSD (0)
     rados_put_get_data eio $dir $shard_id recovery || return 1
 
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 function TEST_rados_get_with_subreadall_eio_shard_1() {
@@ -351,11 +350,11 @@ function TEST_rados_get_with_subreadall_eio_shard_1() {
     # inject eio on replica OSD (1)
     rados_put_get_data eio $dir $shard_id recovery || return 1
 
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 # Test recovery when the first k copies aren't all available
-function TEST_ec_recovery_errors() {
+function TEST_ec_single_recovery_error() {
     local dir=$1
     local objname=myobject
 
@@ -377,7 +376,102 @@ function TEST_ec_recovery_errors() {
     # Cluster should recover this object
     wait_for_clean || return 1
 
-    delete_pool $poolname
+    rados_get $dir $poolname myobject || return 1
+
+    delete_erasure_coded_pool $poolname
+}
+
+# Test recovery when repeated reads are needed due to EIO
+function TEST_ec_recovery_multiple_errors() {
+    local dir=$1
+    local objname=myobject
+
+    setup_osds 9 || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 4 4 || return 1
+
+    rados_put $dir $poolname $objname || return 1
+    inject_eio ec data $poolname $objname $dir 0 || return 1
+    # The first read tries shards 0,1,2; when shard 0 returns EIO, shard 3
+    # is tried as well. Make that fail too, to test multiple-EIO handling.
+    inject_eio ec data $poolname $objname $dir 3 || return 1
+    inject_eio ec data $poolname $objname $dir 4 || return 1
+
+    local -a initial_osds=($(get_osds $poolname $objname))
+    local last_osd=${initial_osds[-1]}
+    # Kill OSD
+    kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1
+    ceph osd down ${last_osd} || return 1
+    ceph osd out ${last_osd} || return 1
+
+    # Cluster should recover this object
+    wait_for_clean || return 1
+
+    rados_get $dir $poolname myobject || return 1
+
+    delete_erasure_coded_pool $poolname
+}
+
+# Test recovery when there's only one shard to recover, but multiple
+# objects are being recovered in one RecoveryOp
+function TEST_ec_recovery_multiple_objects() {
+    local dir=$1
+    local objname=myobject
+
+    ORIG_ARGS=$CEPH_ARGS
+    CEPH_ARGS+=' --osd-recovery-max-single-start 3 --osd-recovery-max-active 3 '
+    setup_osds 7 || return 1
+    CEPH_ARGS=$ORIG_ARGS
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 3 2 || return 1
+
+    rados_put $dir $poolname test1
+    rados_put $dir $poolname test2
+    rados_put $dir $poolname test3
+
+    ceph osd out 0 || return 1
+
+    # Cluster should recover these objects all at once
+    wait_for_clean || return 1
+
+    rados_get $dir $poolname test1
+    rados_get $dir $poolname test2
+    rados_get $dir $poolname test3
+
+    delete_erasure_coded_pool $poolname
+}
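The ORIG_ARGS save/restore idiom used in these tests scopes option overrides
to daemon startup only; in outline:

    ORIG_ARGS=$CEPH_ARGS
    CEPH_ARGS+=' --osd-recovery-max-single-start 3 --osd-recovery-max-active 3 '
    setup_osds 7 || return 1   # OSDs start with the extra options
    CEPH_ARGS=$ORIG_ARGS       # later ceph/rados invocations run without them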
+
+# Test multi-object recovery when the one missing shard gets EIO
+function TEST_ec_recovery_multiple_objects_eio() {
+    local dir=$1
+    local objname=myobject
+
+    ORIG_ARGS=$CEPH_ARGS
+    CEPH_ARGS+=' --osd-recovery-max-single-start 3 --osd-recovery-max-active 3 '
+    setup_osds 7 || return 1
+    CEPH_ARGS=$ORIG_ARGS
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 3 2 || return 1
+
+    rados_put $dir $poolname test1
+    rados_put $dir $poolname test2
+    rados_put $dir $poolname test3
+
+    # can't read from this shard anymore
+    inject_eio ec data $poolname $objname $dir 0 || return 1
+    ceph osd out 0 || return 1
+
+    # Cluster should recover these objects all at once
+    wait_for_clean || return 1
+
+    rados_get $dir $poolname test1
+    rados_get $dir $poolname test2
+    rados_get $dir $poolname test3
+
+    delete_erasure_coded_pool $poolname
 }
 
 # Test backfill with unfound object
@@ -388,9 +482,10 @@ function TEST_ec_backfill_unfound() {
     # Must be between 1 and $lastobj
     local testobj=obj250
 
-    export CEPH_ARGS
+    ORIG_ARGS=$CEPH_ARGS
     CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10'
     setup_osds 5 || return 1
+    CEPH_ARGS=$ORIG_ARGS
 
     local poolname=pool-jerasure
     create_erasure_coded_pool $poolname 3 2 || return 1
@@ -455,7 +550,7 @@ function TEST_ec_backfill_unfound() {
 
     rm -f ${dir}/ORIGINAL ${dir}/CHECK
 
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 # Test recovery with unfound object
@@ -466,7 +561,11 @@ function TEST_ec_recovery_unfound() {
     # Must be between 1 and $lastobj
     local testobj=obj75
 
+    ORIG_ARGS=$CEPH_ARGS
+    CEPH_ARGS+=' --osd-recovery-max-single-start 3 --osd-recovery-max-active 3 '
+    CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10'
     setup_osds 5 || return 1
+    CEPH_ARGS=$ORIG_ARGS
 
     local poolname=pool-jerasure
     create_erasure_coded_pool $poolname 3 2 || return 1
@@ -531,7 +630,7 @@ function TEST_ec_recovery_unfound() {
 
     rm -f ${dir}/ORIGINAL ${dir}/CHECK
 
-    delete_pool $poolname
+    delete_erasure_coded_pool $poolname
 }
 
 main test-erasure-eio "$@"
diff --git a/ceph/qa/standalone/osd/ec-error-rollforward.sh b/ceph/qa/standalone/osd/ec-error-rollforward.sh
new file mode 100755 (executable)
index 0000000..e3a6480
--- /dev/null
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    # TODO: choose the port programmatically; until then it must be unique
+    export CEPH_MON="127.0.0.1:7132" # git grep '\<7132\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON --osd-objectstore filestore"
+    export margin=10
+    export objects=200
+    export poolname=test
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_ec_error_rollforward() {
+    local dir=$1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+    run_osd $dir 3 || return 1
+
+    ceph osd erasure-code-profile set ec-profile m=2 k=2 crush-failure-domain=osd
+    ceph osd pool create ec 1 1 erasure ec-profile
+
+    rados -p ec put foo /etc/passwd
+
+    kill -STOP `cat $dir/osd.2.pid`
+
+    rados -p ec rm foo &
+    sleep 1
+    rados -p ec rm a &
+    rados -p ec rm b &
+    rados -p ec rm c &
+    sleep 1
+    kill -9 `cat $dir/osd.?.pid`
+    kill %1 %2 %3 %4
+
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+    run_osd $dir 3 || return 1
+
+    wait_for_clean || return 1
+}
+
+main ec-error-rollforward "$@"
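Like the other standalone scripts touched by this commit, the new test would
typically be run through the QA wrapper (a sketch mirroring the
compile-command comments elsewhere in this diff):

    cd build
    ../qa/run-standalone.sh ec-error-rollforward.sh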
index d0fe538363c80cf87fc3a4bc7b3fe15b87ec45dc..3a478566320be5c691c9fa4022c61c91f1412ed1 100755 (executable)
@@ -57,12 +57,6 @@ function get_state() {
         jq -r ".[] | select(.pgid==\"$pgid\") | .$sname"
 }
 
-function delete_pool() {
-    local poolname=$1
-
-    ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
-}
-
 function rados_put() {
     local dir=$1
     local poolname=$2
index 1f4ba29bdfbc444d0880c68c70fcf7884825bf43..93201a2a50c60ed2ed1abd8dfc4ac14db82de2cf 100755 (executable)
@@ -138,10 +138,11 @@ function TEST_trim_max_entries()
     rados -p test rm foo
     test_log_size $PGID 3
     rados -p test rm foo
-    test_log_size $PGID 4
-
+    test_log_size $PGID 3
     rados -p test rm foo
-    test_log_size $PGID 2
+    test_log_size $PGID 3
+    rados -p test rm foo
+    test_log_size $PGID 3
 }
 
 main repro-long-log "$@"
index 1061dc77effd9cee976f50ecefc185ac624a89e2..a4ddd6b34762522aea01036562d67db52a6050ed 100755 (executable)
@@ -127,3 +127,4 @@ main osd-recovery-scrub "$@"
 # Local Variables:
 # compile-command: "cd build ; make -j4 && \
 #    ../qa/run-standalone.sh osd-recovery-scrub.sh"
+# End:
index 52f171ff1e9093b999099e870523fd897a1780b4..a3732ba325e6102ec3187ae9b6b45dcd35fba86f 100755 (executable)
@@ -56,6 +56,7 @@ function run() {
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
+    CEPH_ARGS+="--osd-skip-data-digest=false "
 
     local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
     for func in $funcs ; do
@@ -2719,8 +2720,7 @@ function corrupt_scrub_erasure() {
             "lost": 0,
             "flags": [
               "dirty",
-              "data_digest",
-              "omap_digest"
+              "data_digest"
             ],
             "truncate_seq": 0,
             "truncate_size": 0,
@@ -2771,8 +2771,7 @@ function corrupt_scrub_erasure() {
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -2846,8 +2845,7 @@ function corrupt_scrub_erasure() {
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -2953,8 +2951,7 @@ function corrupt_scrub_erasure() {
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3011,8 +3008,7 @@ function corrupt_scrub_erasure() {
             "lost": 0,
             "flags": [
               "dirty",
-              "data_digest",
-              "omap_digest"
+              "data_digest"
             ],
             "truncate_seq": 0,
             "truncate_size": 0,
@@ -3063,8 +3059,7 @@ function corrupt_scrub_erasure() {
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3122,8 +3117,7 @@ function corrupt_scrub_erasure() {
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3218,8 +3212,7 @@ function corrupt_scrub_erasure() {
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3375,8 +3368,7 @@ EOF
             "lost": 0,
             "flags": [
               "dirty",
-              "data_digest",
-              "omap_digest"
+              "data_digest"
             ],
             "truncate_seq": 0,
             "truncate_size": 0,
@@ -3430,8 +3422,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3510,8 +3501,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3623,8 +3613,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3685,8 +3674,7 @@ EOF
             "lost": 0,
             "flags": [
               "dirty",
-              "data_digest",
-              "omap_digest"
+              "data_digest"
             ],
             "truncate_seq": 0,
             "truncate_size": 0,
@@ -3739,8 +3727,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3803,8 +3790,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -3900,8 +3886,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -4038,8 +4023,7 @@ EOF
             "lost": 0,
             "flags": [
               "dirty",
-              "data_digest",
-              "omap_digest"
+              "data_digest"
             ],
             "truncate_seq": 0,
             "truncate_size": 0,
@@ -4093,8 +4077,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -4174,8 +4157,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -4250,8 +4232,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -4363,8 +4344,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -4423,8 +4403,7 @@ EOF
             "lost": 0,
             "flags": [
               "dirty",
-              "data_digest",
-              "omap_digest"
+              "data_digest"
             ],
             "truncate_seq": 0,
             "truncate_size": 0,
@@ -4478,8 +4457,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -4543,8 +4521,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -4642,8 +4619,7 @@ EOF
         "lost": 0,
         "flags": [
           "dirty",
-          "data_digest",
-          "omap_digest"
+          "data_digest"
         ],
         "truncate_seq": 0,
         "truncate_size": 0,
@@ -5134,6 +5110,6 @@ EOF
 main osd-scrub-repair "$@"
 
 # Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-#    test/osd/osd-scrub-repair.sh # TEST_corrupt_and_repair_replicated"
+# compile-command: "cd build ; make -j4 && \
+#    ../qa/run-standalone.sh osd-scrub-repair.sh"
 # End:
index cf6f67b6dbe2d1b6d3694b940d182a8de738ea0a..a83cfe75c9a8ee49daf15e722e3e18f8bbfcc629 100755 (executable)
@@ -20,6 +20,9 @@ source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
 # Set to "yes" in order to ignore diff errors and save results to update test
 getjson="no"
 
+jqfilter='.inconsistents'
+sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
+
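Hoisting ``jqfilter`` and ``sortkeys`` to globals supports the
normalize-and-diff idiom used throughout this file; schematically:

    rados list-inconsistent-snapset $pgid > $dir/json
    jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
    diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson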
 function run() {
     local dir=$1
     shift
@@ -37,33 +40,12 @@ function run() {
     done
 }
 
-function TEST_scrub_snaps() {
+function create_scenario() {
     local dir=$1
-    local poolname=test
-    local OBJS=15
-    local OSDS=1
-
-    TESTDATA="testdata.$$"
+    local poolname=$2
+    local TESTDATA=$3
+    local osd=$4
 
-    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
-    run_mgr $dir x || return 1
-    for osd in $(seq 0 $(expr $OSDS - 1))
-    do
-      run_osd $dir $osd || return 1
-    done
-
-    # Create a pool with a single pg
-    create_pool $poolname 1 1
-    wait_for_clean || return 1
-    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
-
-    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
-    for i in `seq 1 $OBJS`
-    do
-        rados -p $poolname put obj${i} $TESTDATA
-    done
-
-    local primary=$(get_primary $poolname obj1)
     SNAP=1
     rados -p $poolname mksnap snap${SNAP}
     dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
@@ -107,60 +89,91 @@ function TEST_scrub_snaps() {
 
     # Don't need to use ceph_objectstore_tool() function because osd stopped
 
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj1)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" --force remove
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj1)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" --force remove
 
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":2)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":2)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove
 
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":1)"
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":1)"
     OBJ5SAVE="$JSON"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove
 
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj5 | grep \"snapid\":4)"
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":4)"
     dd if=/dev/urandom of=$TESTDATA bs=256 count=18
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA
 
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj3)"
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj3)"
     dd if=/dev/urandom of=$TESTDATA bs=256 count=15
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA
 
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --op list obj4 | grep \"snapid\":7)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" remove
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj4 | grep \"snapid\":7)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove
 
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj2)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" rm-attr snapset
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj2)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" rm-attr snapset
 
     # Create a clone which isn't in snapset and doesn't have object info
     JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
     dd if=/dev/urandom of=$TESTDATA bs=256 count=7
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-bytes $TESTDATA
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA
 
-    rm -f $TESTDATA
-
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj6)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj7)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset corrupt
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj8)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset seq
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj9)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clone_size
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj10)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clone_overlap
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj11)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset clones
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj12)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset head
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj13)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset snaps
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj14)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" clear-snapset size
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj6)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj7)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset corrupt
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj8)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset seq
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj9)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_size
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj10)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_overlap
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj11)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clones
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj12)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset head
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj13)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset snaps
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj14)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset size
 
     echo "garbage" > $dir/bad
-    JSON="$(ceph-objectstore-tool --data-path $dir/${primary} --head --op list obj15)"
-    ceph-objectstore-tool --data-path $dir/${primary} "$JSON" set-attr snapset $dir/bad
+    JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj15)"
+    ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-attr snapset $dir/bad
     rm -f $dir/bad
+}
+
+function TEST_scrub_snaps() {
+    local dir=$1
+    local poolname=test
+    local OBJS=15
+    local OSDS=1
+
+    TESTDATA="testdata.$$"
+
+    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
+    run_mgr $dir x || return 1
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+      run_osd $dir $osd || return 1
+    done
+
+    # Create a pool with a single pg
+    create_pool $poolname 1 1
+    wait_for_clean || return 1
+    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+    for i in `seq 1 $OBJS`
+    do
+        rados -p $poolname put obj${i} $TESTDATA
+    done
+
+    local primary=$(get_primary $poolname obj1)
+
+    create_scenario $dir $poolname $TESTDATA $primary
+
+    rm -f $TESTDATA
 
     for osd in $(seq 0 $(expr $OSDS - 1))
     do
@@ -172,7 +185,8 @@ function TEST_scrub_snaps() {
         cat $dir/osd.0.log
         return 1
     fi
-    grep 'log_channel' $dir/osd.0.log
+
+    test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" = "2" || return 1
 
     rados list-inconsistent-pg $poolname > $dir/json || return 1
     # Check pg count
@@ -180,10 +194,22 @@ function TEST_scrub_snaps() {
     # Check pgid
     test $(jq -r '.[0]' $dir/json) = $pgid || return 1
 
-    rados list-inconsistent-snapset $pgid > $dir/json || return 1
+    rados list-inconsistent-obj $pgid > $dir/json || return 1
 
-    local jqfilter='.inconsistents'
-    local sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
+    # The injected snapshot errors with a single copy pool doesn't
+    # see object errors because all the issues are detected by
+    # comparing copies.
+    jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
+{
+    "epoch": 17,
+    "inconsistents": []
+}
+EOF
+
+    jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
+    diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+
+    rados list-inconsistent-snapset $pgid > $dir/json || return 1
 
     jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
 {
@@ -676,6 +702,13 @@ EOF
       jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
     fi
 
+    pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
+    pids=""
+    for pidfile in ${pidfiles}
+    do
+        pids+="$(cat $pidfile) "
+    done
+
     for i in `seq 1 7`
     do
         rados -p $poolname rmsnap snap$i
@@ -683,14 +716,14 @@ EOF
 
     ERRORS=0
 
-    pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
-    pid=$(cat $pidfile)
-    if ! kill -0 $pid
-    then
-        echo "OSD crash occurred"
-        tail -100 $dir/osd.0.log
-        ERRORS=$(expr $ERRORS + 1)
-    fi
+    for pid in $pids
+    do
+        if ! kill -0 $pid
+        then
+            echo "OSD Crash occurred"
+            ERRORS=$(expr $ERRORS + 1)
+        fi
+    done
 
     kill_daemons $dir || return 1
 
@@ -739,8 +772,505 @@ EOF
     return 0
 }
 
+function _scrub_snaps_multi() {
+    local dir=$1
+    local poolname=test
+    local OBJS=15
+    local OSDS=2
+    local which=$2
+
+    TESTDATA="testdata.$$"
+
+    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
+    run_mgr $dir x || return 1
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+      run_osd $dir $osd || return 1
+    done
+
+    # Create a pool with a single pg
+    create_pool $poolname 1 1
+    wait_for_clean || return 1
+    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+    for i in `seq 1 $OBJS`
+    do
+        rados -p $poolname put obj${i} $TESTDATA
+    done
+
+    local primary=$(get_primary $poolname obj1)
+    local replica=$(get_not_primary $poolname obj1)
+
+    eval create_scenario $dir $poolname $TESTDATA \$$which
+
+    rm -f $TESTDATA
+
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+      run_osd $dir $osd || return 1
+    done
+
+    local pgid="${poolid}.0"
+    if ! pg_scrub "$pgid" ; then
+        cat $dir/osd.0.log
+        return 1
+    fi
+
+    test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" -gt "3" || return 1
+    test "$(grep "_scan_snaps start" $dir/osd.${replica}.log | wc -l)" -gt "3" || return 1
+
+    rados list-inconsistent-pg $poolname > $dir/json || return 1
+    # Check pg count
+    test $(jq '. | length' $dir/json) = "1" || return 1
+    # Check pgid
+    test $(jq -r '.[0]' $dir/json) = $pgid || return 1
+
+    rados list-inconsistent-obj $pgid --format=json-pretty
+
+    rados list-inconsistent-snapset $pgid > $dir/json || return 1
+
+    # Since all of the snapshots on the primary are consistent, there are no errors here
+    if [ $which = "replica" ];
+    then
+        scruberrors="21"
+        jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
+{
+    "epoch": 23,
+    "inconsistents": []
+}
+EOF
+
+    else
+        scruberrors="33"
+        jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
+{
+    "epoch": 23,
+    "inconsistents": [
+        {
+            "name": "obj10",
+            "nspace": "",
+            "locator": "",
+            "snap": 1,
+            "errors": [
+                "size_mismatch"
+            ]
+        },
+        {
+            "name": "obj11",
+            "nspace": "",
+            "locator": "",
+            "snap": 1,
+            "errors": [
+                "headless"
+            ]
+        },
+        {
+            "name": "obj14",
+            "nspace": "",
+            "locator": "",
+            "snap": 1,
+            "errors": [
+                "size_mismatch"
+            ]
+        },
+        {
+            "name": "obj6",
+            "nspace": "",
+            "locator": "",
+            "snap": 1,
+            "errors": [
+                "headless"
+            ]
+        },
+        {
+            "name": "obj7",
+            "nspace": "",
+            "locator": "",
+            "snap": 1,
+            "errors": [
+                "headless"
+            ]
+        },
+        {
+            "name": "obj9",
+            "nspace": "",
+            "locator": "",
+            "snap": 1,
+            "errors": [
+                "size_mismatch"
+            ]
+        },
+        {
+            "name": "obj5",
+            "nspace": "",
+            "locator": "",
+            "snap": 7,
+            "errors": [
+                "info_missing",
+                "headless"
+            ]
+        },
+        {
+            "name": "obj10",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 1,
+                "snap_context": {
+                    "seq": 1,
+                    "snaps": [
+                        1
+                    ]
+                },
+                "clones": [
+                    {
+                        "snap": 1,
+                        "size": 1032,
+                        "overlap": "????",
+                        "snaps": [
+                            1
+                        ]
+                    }
+                ]
+            },
+            "errors": []
+        },
+        {
+            "name": "obj11",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 1,
+                "snap_context": {
+                    "seq": 1,
+                    "snaps": [
+                        1
+                    ]
+                },
+                "clones": []
+            },
+            "errors": [
+                "extra_clones"
+            ],
+            "extra clones": [
+                1
+            ]
+        },
+        {
+           "errors": [
+             "head_mismatch"
+           ],
+           "locator": "",
+           "name": "obj12",
+           "nspace": "",
+           "snap": "head",
+           "snapset": {
+             "clones": [
+               {
+                 "overlap": "[]",
+                 "size": 1032,
+                 "snap": 1,
+                 "snaps": [
+                   1
+                 ]
+               }
+             ],
+             "head_exists": 0,
+              "snap_context": {
+              "seq": 1,
+              "snaps": [
+                1
+              ]
+            }
+          }
+        },
+        {
+            "name": "obj14",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 1,
+                "snap_context": {
+                    "seq": 1,
+                    "snaps": [
+                        1
+                    ]
+                },
+                "clones": [
+                    {
+                        "snap": 1,
+                        "size": 1033,
+                        "overlap": "[]",
+                        "snaps": [
+                            1
+                        ]
+                    }
+                ]
+            },
+            "errors": []
+        },
+        {
+            "name": "obj5",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 1,
+                "snap_context": {
+                    "seq": 6,
+                    "snaps": [
+                        6,
+                        5,
+                        4,
+                        3,
+                        2,
+                        1
+                    ]
+                },
+                "clones": [
+                    {
+                        "snap": 1,
+                        "size": 1032,
+                        "overlap": "[]",
+                        "snaps": [
+                            1
+                        ]
+                    },
+                    {
+                        "snap": 2,
+                        "size": 256,
+                        "overlap": "[]",
+                        "snaps": [
+                            2
+                        ]
+                    },
+                    {
+                        "snap": 4,
+                        "size": 512,
+                        "overlap": "[]",
+                        "snaps": [
+                            4,
+                            3
+                        ]
+                    },
+                    {
+                        "snap": 6,
+                        "size": 1024,
+                        "overlap": "[]",
+                        "snaps": [
+                            6,
+                            5
+                        ]
+                    }
+                ]
+            },
+            "errors": [
+                "extra_clones"
+            ],
+            "extra clones": [
+                7
+            ]
+        },
+        {
+            "name": "obj6",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 1,
+                "snap_context": {
+                    "seq": 1,
+                    "snaps": [
+                        1
+                    ]
+                },
+                "clones": []
+            },
+            "errors": [
+                "extra_clones"
+            ],
+            "extra clones": [
+                1
+            ]
+        },
+        {
+            "name": "obj7",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 0,
+                "snap_context": {
+                    "seq": 0,
+                    "snaps": []
+                },
+                "clones": []
+            },
+            "errors": [
+                "head_mismatch",
+                "extra_clones"
+            ],
+            "extra clones": [
+                1
+            ]
+        },
+        {
+            "name": "obj8",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 1,
+                "snap_context": {
+                    "seq": 0,
+                    "snaps": [
+                        1
+                    ]
+                },
+                "clones": [
+                    {
+                        "snap": 1,
+                        "size": 1032,
+                        "overlap": "[]",
+                        "snaps": [
+                            1
+                        ]
+                    }
+                ]
+            },
+            "errors": [
+                "snapset_error"
+            ]
+        },
+        {
+            "name": "obj9",
+            "nspace": "",
+            "locator": "",
+            "snap": "head",
+            "snapset": {
+                "head_exists": 1,
+                "snap_context": {
+                    "seq": 1,
+                    "snaps": [
+                        1
+                    ]
+                },
+                "clones": [
+                    {
+                        "snap": 1,
+                        "size": "????",
+                        "overlap": "[]",
+                        "snaps": [
+                            1
+                        ]
+                    }
+                ]
+            },
+            "errors": []
+        }
+    ]
+}
+EOF
+    fi
+
+    jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
+    diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+    if test $getjson = "yes"
+    then
+        jq '.' $dir/json > save1.json
+    fi
+
+    if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
+    then
+      jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
+    fi
+
+    pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
+    pids=""
+    for pidfile in ${pidfiles}
+    do
+        pids+="$(cat $pidfile) "
+    done
+
+    # When removing snapshots with a corrupt replica, the OSD crashes.
+    # See http://tracker.ceph.com/issues/23875
+    if [ $which = "primary" ];
+    then
+        for i in `seq 1 7`
+        do
+            rados -p $poolname rmsnap snap$i
+        done
+    fi
+
+    ERRORS=0
+
+    for pid in $pids
+    do
+        if ! kill -0 $pid
+        then
+            echo "OSD Crash occurred"
+            ERRORS=$(expr $ERRORS + 1)
+        fi
+    done
+
+    kill_daemons $dir || return 1
+
+    declare -a err_strings
+    err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj4:7"
+    err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1]: soid .*:::obj3:head size 3840 != size 768 from auth oi"
+    err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj5:1"
+    err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj5:2"
+    err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1]: soid .*:::obj5:4 size 4608 != size 512 from auth oi"
+    err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid .*:::obj5:7: failed to pick suitable object info"
+    err_strings[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] missing .*:::obj1:head"
+    err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub ${scruberrors} errors"
+
+    for err_string in "${err_strings[@]}"
+    do
+        if ! grep "$err_string" $dir/osd.${primary}.log > /dev/null;
+        then
+            echo "Missing log message '$err_string'"
+            ERRORS=$(expr $ERRORS + 1)
+        fi
+    done
+
+    if [ $ERRORS != "0" ];
+    then
+        echo "TEST FAILED WITH $ERRORS ERRORS"
+        return 1
+    fi
+
+    echo "TEST PASSED"
+    return 0
+}
+
+function TEST_scrub_snaps_replica() {
+    local dir=$1
+    ORIG_ARGS=$CEPH_ARGS
+    CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=3"
+    _scrub_snaps_multi $dir replica
+    err=$?
+    CEPH_ARGS=$ORIG_ARGS
+    return $err
+}
+
+function TEST_scrub_snaps_primary() {
+    local dir=$1
+    ORIG_ARGS=$CEPH_ARGS
+    CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=3"
+    _scrub_snaps_multi $dir primary
+    err=$?
+    CEPH_ARGS=$ORIG_ARGS
+    return $err
+}
+
 main osd-scrub-snaps "$@"
 
 # Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-#    test/osd/osd-scrub-snaps.sh"
+# compile-command: "cd build ; make -j4 && \
+#    ../qa/run-standalone.sh osd-scrub-snaps.sh"
+# End:
index e6c93684a2254d09df6651804c9517dd61f3da73..b73906af958c1aeefd742f4d0eea5933e7b4038e 100755 (executable)
@@ -109,5 +109,6 @@ function TEST_scrub_test() {
 main osd-scrub-test "$@"
 
 # Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-#    qa/standalone/scrub/osd-scrub-test.sh"
+# compile-command: "cd build ; make -j4 && \
+#    ../qa/run-standalone.sh osd-scrub-test.sh"
+# End:
index fc4873ceed1a1ed9e9c0b66d35034649d72f037b..4e516ab41927fc058b46b8da9f95e9cbae90cc07 100644 (file)
@@ -1,3 +1,9 @@
+openstack:
+  - machine:
+      disk: 10
+    volumes:
+      count: 2
+      size: 20
 roles:
 - - mon.a
   - mgr.x
@@ -6,14 +12,6 @@ roles:
 - - osd.1
   - mon.b
   - client.0
-openstack:
-  - machine:
-      disk: 10 # GB
-      ram: 2000 # MB
-      cpus: 1
-    volumes: # attached to each instance
-      count: 2
-      size: 10 # GB
 tasks:
 - ssh_keys:
 - print: "**** done ssh_keys"
index 837b1029bb2e8bdf365e04f8fcd05907becc9408..bf4a7f98695b6042519d55b9b899289a2d03c168 100644 (file)
@@ -2,6 +2,12 @@ overrides:
  ansible.cephlab: 
   vars: 
    quick_lvs_to_create: 4
+openstack:
+  - machine:
+      disk: 10
+    volumes:
+      count: 4
+      size: 20
 roles:
 - [mon.a, mgr.y, osd.0, osd.1]
 - [mon.b, osd.2, osd.3]
index f79784c0bed6358f747201c7799a493efeca873d..7fdee84b1f8963ab4994aba357419374c82ac824 100644 (file)
@@ -6,7 +6,7 @@ overrides:
       - Metadata damage detected
       - bad backtrace on inode
       - overall HEALTH_
-      - (MDS_TRIM)
+      - \(MDS_TRIM\)
     conf:
       mds:
         mds log max segments: 1
index f5e9a0b5cf46169a2a7a92dbe508941f25e6ad4b..a2f56299b221ab995bb74cce0c3d672e0dcee200 100644 (file)
@@ -10,5 +10,6 @@ overrides:
 
 tasks:
   - cephfs_test_runner:
+      fail_on_skip: false
       modules:
         - tasks.cephfs.test_client_recovery
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/% b/ceph/qa/suites/fs/bugs/client_trim_caps/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/begin.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/begin.yaml
new file mode 120000 (symlink)
index 0000000..3279455
--- /dev/null
@@ -0,0 +1 @@
+../../../../cephfs/begin.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml
new file mode 100644 (file)
index 0000000..12047bd
--- /dev/null
@@ -0,0 +1,9 @@
+roles:
+- [mon.a, mgr.x, osd.0, osd.1, osd.2, mds.a, mds.b, client.0]
+openstack:
+- volumes: # attached to each instance
+    count: 2
+    size: 10 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml
new file mode 120000 (symlink)
index 0000000..1728acc
--- /dev/null
@@ -0,0 +1 @@
+../../../../../cephfs/objectstore-ec/bluestore.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/+ b/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/debug.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/debug.yaml
new file mode 120000 (symlink)
index 0000000..4fdb9dd
--- /dev/null
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/frag_enable.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/frag_enable.yaml
new file mode 120000 (symlink)
index 0000000..9e0f15f
--- /dev/null
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/no_client_pidfile.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/no_client_pidfile.yaml
new file mode 120000 (symlink)
index 0000000..4626386
--- /dev/null
@@ -0,0 +1 @@
+../../../../../overrides/no_client_pidfile.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/whitelist_health.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/whitelist_health.yaml
new file mode 120000 (symlink)
index 0000000..42fa3ea
--- /dev/null
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/whitelist_wrongly_marked_down.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/overrides/whitelist_wrongly_marked_down.yaml
new file mode 120000 (symlink)
index 0000000..3728aac
--- /dev/null
@@ -0,0 +1 @@
+../../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
new file mode 100644 (file)
index 0000000..4106062
--- /dev/null
@@ -0,0 +1,20 @@
+# Note this test is unlikely to exercise the code as expected in the future:
+# "It's too tricky to arrange inodes in session->caps. we don't know if it
+# still works in the future." -Zheng
+
+overrides:
+  ceph:
+    log-whitelist:
+      - MDS cache is too large
+      - \(MDS_CACHE_OVERSIZED\)
+tasks:
+- exec:
+    mon.a:
+    - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
+    - "ceph tell mds.* config set mds_min_caps_per_client 1"
+- background_exec:
+    mon.a:
+    - "sleep 30 && ceph tell mds.* config set mds_cache_memory_limit 1"
+- exec:
+    client.0:
+    - ceph_test_trim_caps
index 72ce013fabfc9c4c71692c8189b5b20a02630980..725a259d24f3225504bee08611a34d48e3ce7282 100644 (file)
@@ -10,5 +10,6 @@ overrides:
 
 tasks:
   - cephfs_test_runner:
+      fail_on_skip: false
       modules:
         - tasks.cephfs.test_client_recovery
index 06f3f57388b9cb704cdfb12739d959c6efe6e617..52fc991c1c33fa80b6bf2113f5f2e2b3b5f277fd 100644 (file)
@@ -3,6 +3,7 @@ overrides:
     log-whitelist:
       - reached quota
       - \(POOL_APP_NOT_ENABLED\)
+      - \(PG_AVAILABILITY\)
 tasks:
 - ceph-fuse:
 - workunit:
index d87f5bfdd35b417931e57ae5838c649a0f457b9e..39821ad867271e0d08d729c344dc72e2c746c922 100644 (file)
@@ -16,13 +16,14 @@ overrides:
     - 'deep-scrub 1 missing, 0 inconsistent objects'
     - 'failed to pick suitable auth object'
     - overall HEALTH_
-    - (OSDMAP_FLAGS)
-    - (OSD_
-    - (PG_
-    - (OSD_SCRUB_ERRORS)
-    - (TOO_FEW_PGS)
+    - \(OSDMAP_FLAGS\)
+    - \(OSD_
+    - \(PG_
+    - \(OSD_SCRUB_ERRORS\)
+    - \(TOO_FEW_PGS\)
     conf:
       osd:
         osd deep scrub update digest min age: 0
+        osd skip data digest: false
 tasks:
 - scrub_test:
index 316119c9cc1396d3faccc44b3dd00b0940ce2cb0..6cebde391cdcbffdd106c9e7028426d053cd36af 100644 (file)
@@ -4,11 +4,12 @@ overrides:
     - reached quota
     - but it is still running
     - overall HEALTH_
-    - (POOL_FULL)
-    - (SMALLER_PGP_NUM)
-    - (CACHE_POOL_NO_HIT_SET)
-    - (CACHE_POOL_NEAR_FULL)
-    - (POOL_APP_NOT_ENABLED)
+    - \(POOL_FULL\)
+    - \(SMALLER_PGP_NUM\)
+    - \(CACHE_POOL_NO_HIT_SET\)
+    - \(CACHE_POOL_NEAR_FULL\)
+    - \(POOL_APP_NOT_ENABLED\)
+    - \(PG_AVAILABILITY\)
 tasks:
 - workunit:
     clients:
index 9afbc0417b008ac85b200bef250f0d7668cf98ea..da765b018a45139c6a3212a9eed71ee8b43a1804 100644 (file)
@@ -18,9 +18,9 @@ overrides:
       - attr name mismatch
       - Regular scrub request, deep-scrub details will be lost
       - overall HEALTH_
-      - (OSDMAP_FLAGS)
-      - (OSD_
-      - (PG_
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(PG_
     conf:
       osd:
         filestore debug inject read err: true
index 9c08e93f3848377e04a7864e5d32be237ecfbe5f..d7e09a6378fea071af8732b63667c6f4ac92f422 100644 (file)
@@ -6,7 +6,6 @@ overrides:
         paxos service trim min: 5
 # thrashing monitors may make mgr have trouble w/ its keepalive
     log-whitelist:
-      - daemon x is unresponsive
       - overall HEALTH_
       - \(MGR_DOWN\)
 tasks:
index ec761e2955ee24b86a177c7089067c85333129fa..a4cea8f3aedaf16c5c79a58005672115d4d31942 100644 (file)
@@ -6,6 +6,6 @@ tasks:
     - .*clock.*skew.*
     - clocks not synchronized
     - overall HEALTH_
-    - (MON_CLOCK_SKEW)
+    - \(MON_CLOCK_SKEW\)
 - mon_clock_skew_check:
     expect-skew: false
index b5442342b1872ca808b91b296a4625c6b60e32fa..d68270a8eb03107fb50b2ca2563e3e28ab4fd276 100644 (file)
@@ -8,5 +8,5 @@ tasks:
 - install:
 - exec:
     client.0:
-      - mkdir $TESTDIR/ostest && cd $TESTDIR/ostest && ulimit -c 0 && ulimit -Sn 4096 && ceph_test_objectstore --gtest_filter=-*/3
-      - rm -rf $TESTDIR/ostest
+      - mkdir $TESTDIR/archive/ostest && cd $TESTDIR/archive/ostest && ulimit -Sn 16384 && CEPH_ARGS="--no-log-to-stderr --log-file $TESTDIR/archive/ceph_test_objectstore.log --debug-filestore 20 --debug-bluestore 20" ceph_test_objectstore --gtest_filter=-*/3 --gtest_catch_exceptions=0
+      - rm -rf $TESTDIR/archive/ostest
index 77eed22d47070edea19fe501abb97a3415265188..b77aa7b7e185a6b68839083e0c63a37c41985837 100644 (file)
@@ -22,10 +22,20 @@ tasks:
     - failsafe engaged, dropping updates
     - failsafe disengaged, no longer dropping updates
     - overall HEALTH_
-    - (OSDMAP_FLAGS)
-    - (OSD_
-    - (PG_
-    - (SMALLER_PG_NUM)
+    - \(OSDMAP_FLAGS\)
+    - \(OSD_
+    - \(PG_
+    - \(SMALLER_PG_NUM\)
+    - \(CACHE_POOL_NO_HIT_SET\)
+    - erasure code profile property .ruleset-failure-domain. is no longer supported
+    - \(CACHE_POOL_NEAR_FULL\)
+    - \(FS_WITH_FAILED_MDS\)
+    - \(FS_DEGRADED\)
+    - \(POOL_BACKFILLFULL\)
+    - \(POOL_FULL\)
+    - \(SMALLER_PGP_NUM\)
+    - \(POOL_NEARFULL\)
+    - \(POOL_APP_NOT_ENABLED\)
 - workunit:
     clients:
       all:
index bbf330b0ba1f936c7493094458b7105ba3b0a959..dfe35bc442db590cde04fb3ea4cb54097cd6fb46 100644 (file)
@@ -6,10 +6,10 @@ overrides:
       - MDS in read-only mode
       - force file system read-only
       - overall HEALTH_
-      - (OSDMAP_FLAGS)
-      - (OSD_FULL)
-      - (MDS_READ_ONLY)
-      - (POOL_FULL)
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_FULL\)
+      - \(MDS_READ_ONLY\)
+      - \(POOL_FULL\)
 tasks:
 - install:
 - ceph:
diff --git a/ceph/qa/suites/rados/singleton-nomsgr/all/large-omap-object-warnings.yaml b/ceph/qa/suites/rados/singleton-nomsgr/all/large-omap-object-warnings.yaml
new file mode 100644 (file)
index 0000000..e00a93d
--- /dev/null
@@ -0,0 +1,22 @@
+roles:
+- [mon.a, mgr.x, osd.0, osd.1, client.0]
+overrides:
+  ceph:
+    log-whitelist:
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_FULL\)
+      - \(MDS_READ_ONLY\)
+      - large omap objects
+      - Large omap object found
+      - application not enabled
+    conf:
+      osd:
+        osd deep scrub large omap object value sum threshold: 8800000
+        osd deep scrub large omap object key threshold: 20000
+tasks:
+- install:
+- ceph:
+- workunit:
+    clients:
+      all:
+        - rados/test_large_omap_detection.py
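The conf overrides in this new suite correspond to plain ceph.conf settings;
expressed directly (sketch):

    [osd]
    osd deep scrub large omap object key threshold = 20000
    osd deep scrub large omap object value sum threshold = 8800000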
index 00fd13c8219d51a51e24b4dc29cbf5c2098208d9..743d73d4a5408472eaebbda6776c004e38991f40 100644 (file)
@@ -19,9 +19,6 @@ overrides:
       - \(PG_
       - \(OBJECT_
       - \(POOL_APP_NOT_ENABLED\)
-    conf:
-      osd:
-        debug osd: 5
 
 tasks:
 - install:
index 57237b5a2344ffd466c130ab48325372b55a6e85..2da2c4660cde774ae0ecc5fceff95f16cbe8f42a 100644 (file)
@@ -19,9 +19,6 @@ overrides:
       - \(PG_
       - \(OBJECT_
       - \(POOL_APP_NOT_ENABLED\)
-    conf:
-      osd:
-        debug osd: 5
 
 tasks:
 - install:
index ed5b216b24981c228f33f94cd7484d5672be8ac4..a63400be391a6412972992ae057a2da30672a8f0 100644 (file)
@@ -21,6 +21,7 @@ tasks:
       - \(OSD_
       - \(PG_
       - \(OBJECT_
+      - \(REQUEST_SLOW\)
     conf:
       osd:
         osd min pg log entries: 5
index 634e8843ea1060260bc673534b366259f9f76632..5479f79b7db489b206183a5df8db38a9cf9b0f0b 100644 (file)
@@ -21,6 +21,7 @@ tasks:
       - \(PG_
       - \(OBJECT_DEGRADED\)
       - \(SLOW_OPS\)
+      - \(REQUEST_SLOW\)
     conf:
       osd:
         osd min pg log entries: 5
index 10f18e2ea47362f079cdd1c448d5036e50b8dd7a..3ada5518f57b44031956f0f744082488fe4a0fbc 100644 (file)
@@ -16,9 +16,9 @@ tasks:
       - but it is still running
       - slow request
       - overall HEALTH_
-      - (OSDMAP_FLAGS)
-      - (OSD_
-      - (PG_
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(PG_
 - exec:
     client.0:
       - sudo ceph osd pool create foo 128 128
index 96b2208d1b2f4800ea1fb6ad2903bb6464660890..71e85649e96dfb614ea1568c4d41eaa0823199db 100644 (file)
@@ -19,8 +19,8 @@ tasks:
     - missing primary copy of
     - objects unfound and apparently lost
     - overall HEALTH_
-    - (POOL_APP_NOT_ENABLED)
-    - (PG_DEGRADED)
+    - \(POOL_APP_NOT_ENABLED\)
+    - \(PG_DEGRADED\)
 - full_sequential:
   - exec:
       client.0:
index cac3cb3b792df64a894fe9fa0a916a7cd0e93d92..e184d911da3d48e2812e0af338b8bf51ea504999 100644 (file)
@@ -25,11 +25,11 @@ tasks:
     - missing primary copy of
     - objects unfound and apparently lost
     - overall HEALTH_
-    - (OSDMAP_FLAGS)
-    - (REQUEST_SLOW)
-    - (PG_
+    - \(OSDMAP_FLAGS\)
+    - \(REQUEST_SLOW\)
+    - \(PG_
     - \(OBJECT_MISPLACED\)
-    - (OSD_
+    - \(OSD_
 - thrashosds:
     op_delay: 30
     clean_interval: 120
index 82c9b2d0c73be2edf41664d6854719b99aae58f8..c0b270758f803de48a6ef675a21a73e7e3b3cd0a 100644 (file)
@@ -19,7 +19,7 @@ tasks:
       - but it is still running
       - slow request
       - overall HEALTH_
-      - (CACHE_POOL_
+      - \(CACHE_POOL_
 - exec:
     client.0:
       - sudo ceph osd pool create base 4
diff --git a/ceph/qa/suites/rados/thrash-erasure-code-big/recovery-overrides b/ceph/qa/suites/rados/thrash-erasure-code-big/recovery-overrides
new file mode 120000 (symlink)
index 0000000..1957f2c
--- /dev/null
@@ -0,0 +1 @@
+../thrash/2-recovery-overrides
\ No newline at end of file
diff --git a/ceph/qa/suites/rados/thrash-erasure-code-isa/recovery-overrides b/ceph/qa/suites/rados/thrash-erasure-code-isa/recovery-overrides
new file mode 120000 (symlink)
index 0000000..1957f2c
--- /dev/null
@@ -0,0 +1 @@
+../thrash/2-recovery-overrides
\ No newline at end of file
diff --git a/ceph/qa/suites/rados/thrash-erasure-code-overwrites/recovery-overrides b/ceph/qa/suites/rados/thrash-erasure-code-overwrites/recovery-overrides
new file mode 120000 (symlink)
index 0000000..1957f2c
--- /dev/null
@@ -0,0 +1 @@
+../thrash/2-recovery-overrides
\ No newline at end of file
diff --git a/ceph/qa/suites/rados/thrash-erasure-code-shec/recovery-overrides b/ceph/qa/suites/rados/thrash-erasure-code-shec/recovery-overrides
new file mode 120000 (symlink)
index 0000000..1957f2c
--- /dev/null
@@ -0,0 +1 @@
+../thrash/2-recovery-overrides
\ No newline at end of file
diff --git a/ceph/qa/suites/rados/thrash-erasure-code/recovery-overrides b/ceph/qa/suites/rados/thrash-erasure-code/recovery-overrides
new file mode 120000 (symlink)
index 0000000..1957f2c
--- /dev/null
@@ -0,0 +1 @@
+../thrash/2-recovery-overrides
\ No newline at end of file
diff --git a/ceph/qa/suites/rados/thrash/2-recovery-overrides/$ b/ceph/qa/suites/rados/thrash/2-recovery-overrides/$
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/rados/thrash/2-recovery-overrides/default.yaml b/ceph/qa/suites/rados/thrash/2-recovery-overrides/default.yaml
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/rados/thrash/2-recovery-overrides/more-active-recovery.yaml b/ceph/qa/suites/rados/thrash/2-recovery-overrides/more-active-recovery.yaml
new file mode 120000 (symlink)
index 0000000..3aa7ce9
--- /dev/null
@@ -0,0 +1 @@
+../../../../overrides/more-active-recovery.yaml
\ No newline at end of file
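The new 2-recovery-overrides directories follow teuthology's suite-composition conventions: normally every YAML fragment in a directory becomes an axis of the job matrix, but a file named $ tells the scheduler to pick a single fragment at random, so each job here gets either the empty default.yaml or more-active-recovery.yaml. A rough sketch of that selection rule, under my reading of those conventions:

    import os
    import random

    def pick_fragments(dirpath):
        entries = sorted(f for f in os.listdir(dirpath) if f.endswith(".yaml"))
        if "$" in os.listdir(dirpath):
            return [random.choice(entries)]  # '$': one random fragment per job
        return entries                       # otherwise: every fragment (matrix axis)

    # e.g. pick_fragments("ceph/qa/suites/rados/thrash/2-recovery-overrides")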
index 23c705d1eb56fb5c6cf4f1dc6d643beded0e37ef..c5efb0cc04a993ea0015e244706bc9c6ac3e9638 100644 (file)
@@ -3,6 +3,7 @@ overrides:
     log-whitelist:
       - reached quota
       - \(POOL_APP_NOT_ENABLED\)
+      - \(PG_AVAILABILITY\)
     crush_tunables: hammer
     conf:
       client:
index 05b843ebd3eb46bd503b0789c9896fdcc0384459..4a06055b5195207cbab725ad1a1f276ab24f96fd 100644 (file)
@@ -9,6 +9,7 @@ overrides:
       - \(REQUEST_SLOW\)
       - \(CACHE_POOL_NEAR_FULL\)
       - \(POOL_APP_NOT_ENABLED\)
+      - \(PG_AVAILABILITY\)
     conf:
       client:
         debug ms: 1
diff --git a/ceph/qa/suites/rbd/basic/msgr-failures/many.yaml b/ceph/qa/suites/rbd/basic/msgr-failures/many.yaml
deleted file mode 100644 (file)
index 86f8dde..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-overrides:
-  ceph:
-    conf:
-      global:
-        ms inject socket failures: 500
index 90f80dcf2b851f5a4be1dadeb9180941830995aa..740d32c00cd1cf7b9761174963796786168492fd 100644 (file)
@@ -4,4 +4,4 @@ tasks:
 overrides:
   ceph:
     log-whitelist:
-      - (POOL_APP_NOT_ENABLED)
+      - \(POOL_APP_NOT_ENABLED\)
index e723e0929e0921fb01cc75ee333000024707b6ef..24956484165f6b171d1f78d844856af1a3d6afd2 100644 (file)
@@ -4,8 +4,8 @@ overrides:
       - but it is still running
       - objects unfound and apparently lost
       - overall HEALTH_
-      - (CACHE_POOL_NEAR_FULL)
-      - (CACHE_POOL_NO_HIT_SET)
+      - \(CACHE_POOL_NEAR_FULL\)
+      - \(CACHE_POOL_NO_HIT_SET\)
 tasks:
 - exec:
     client.0:
index d17f60db104c5c4e609e78cdf4b4f8f56a32e4f0..df28ed6a2c964beacf6a4a06982b0827beddd22d 100644 (file)
@@ -6,7 +6,7 @@ tasks:
     - reached quota
     - but it is still running
     - objects unfound and apparently lost
-    - (POOL_APP_NOT_ENABLED)
+    - \(POOL_APP_NOT_ENABLED\)
 - thrashosds:
     chance_pgnum_grow: 2
     chance_pgpnum_fix: 1
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/% b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/+ b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/openstack.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/openstack.yaml
new file mode 100644 (file)
index 0000000..b0f3b9b
--- /dev/null
@@ -0,0 +1,4 @@
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 30 # GB
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/start.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/0-cluster/start.yaml
new file mode 100644 (file)
index 0000000..dff144f
--- /dev/null
@@ -0,0 +1,15 @@
+roles:
+- - mon.a
+  - mon.b
+  - mon.c
+  - osd.0
+  - osd.1
+  - osd.2
+  - mgr.x
+- - client.0
+overrides:
+  ceph:
+    log-whitelist:
+    - failed to encode map
+    - CACHE_POOL_NO_HIT_SET
+    fs: xfs
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/1-install/luminous-client-x.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/1-install/luminous-client-x.yaml
new file mode 100644 (file)
index 0000000..3e190f0
--- /dev/null
@@ -0,0 +1,11 @@
+tasks:
+- install:
+    branch: luminous 
+    exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install luminous"
+upgrade_workload:
+  sequential:
+  - install.upgrade:
+      exclude_packages: ['ceph-test', 'ceph-test-dbg','libcephfs1']
+      client.0:
+  - print: "**** done install.upgrade to -x on client.0"
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/2-workload/rbd_api_tests.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/2-workload/rbd_api_tests.yaml
new file mode 100644 (file)
index 0000000..386c5e3
--- /dev/null
@@ -0,0 +1,21 @@
+tasks:
+- exec:
+    client.0:
+    - "cp $(which ceph_test_librbd_api) $TESTDIR/ceph_test_librbd_api"
+- sequential:
+  - upgrade_workload
+- ceph: 
+- print: "**** done ceph"
+- exec:
+    client.0:
+    - "cp --force $TESTDIR/ceph_test_librbd_api $(which ceph_test_librbd_api)"
+    - "rm -rf $TESTDIR/ceph_test_librbd_api"
+- print: "**** done reverting to luminous ceph_test_librbd_api"
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+      - rbd/test_librbd_api.sh
+    env:
+      RBD_FEATURES: "61"
+- print: "**** done rbd/test_librbd_api.sh"
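The cp dance above is the interesting part of this fragment: the luminous build of ceph_test_librbd_api is stashed in $TESTDIR before the upgrade, then copied back over the upgraded binary, so an old-client test binary runs against the upgraded cluster. A hedged Python sketch of the same preserve/upgrade/restore pattern (the stash path is illustrative):

    import shutil

    binary = shutil.which("ceph_test_librbd_api")   # pre-upgrade (luminous) build
    saved = "/tmp/ceph_test_librbd_api.luminous"    # hypothetical stash location

    shutil.copy2(binary, saved)   # 1. stash the old binary before the upgrade
    # ... package upgrade happens here ...
    shutil.copy2(saved, binary)   # 2. restore the old client binary for the test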
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/2-workload/rbd_cli_import_export.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/2-workload/rbd_cli_import_export.yaml
new file mode 100644 (file)
index 0000000..1467a04
--- /dev/null
@@ -0,0 +1,11 @@
+tasks:
+- sequential:
+  - upgrade_workload
+- ceph: 
+- print: "**** done ceph"
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+      - rbd/import_export.sh
+- print: "**** done rbd/import_export.sh"
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/centos_7.4.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/centos_7.4.yaml
new file mode 120000 (symlink)
index 0000000..dcc0017
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/centos_7.4.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/rhel_7.5.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/rhel_7.5.yaml
new file mode 120000 (symlink)
index 0000000..7213396
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/rhel_7.5.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/ubuntu_16.04.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/ubuntu_16.04.yaml
new file mode 120000 (symlink)
index 0000000..9dc1ea9
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/ubuntu_16.04.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/ubuntu_18.04.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/basic/supported/ubuntu_18.04.yaml
new file mode 120000 (symlink)
index 0000000..886e87f
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/ubuntu_18.04.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/% b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/+ b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/openstack.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/openstack.yaml
new file mode 100644 (file)
index 0000000..b0f3b9b
--- /dev/null
@@ -0,0 +1,4 @@
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 30 # GB
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/start.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/0-cluster/start.yaml
new file mode 100644 (file)
index 0000000..699811f
--- /dev/null
@@ -0,0 +1,15 @@
+roles:
+- - mon.a
+  - mon.b
+  - mon.c
+  - osd.0
+  - osd.1
+  - osd.2
+  - client.0
+  - mgr.x
+- - client.1
+overrides:
+  ceph:
+    log-whitelist:
+    - failed to encode map
+    fs: xfs
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/1-install/luminous-client-x.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/1-install/luminous-client-x.yaml
new file mode 100644 (file)
index 0000000..4a45535
--- /dev/null
@@ -0,0 +1,11 @@
+tasks:
+- install:
+    branch: luminous
+    exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install luminous"
+- install.upgrade:
+   exclude_packages: ['ceph-test', 'ceph-test-dbg','libcephfs1']
+   client.1:
+- print: "**** done install.upgrade to -x on client.0"

+- ceph:
+- print: "**** done ceph task"
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/2-features/defaults.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/2-features/defaults.yaml
new file mode 100644 (file)
index 0000000..dff6623
--- /dev/null
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      client:
+        rbd default features: 61
+
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/2-features/layering.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/2-features/layering.yaml
new file mode 100644 (file)
index 0000000..5613d01
--- /dev/null
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      client:
+        rbd default features: 1
+
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/3-workload/rbd_notification_tests.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/3-workload/rbd_notification_tests.yaml
new file mode 100644 (file)
index 0000000..17d2c17
--- /dev/null
@@ -0,0 +1,21 @@
+tasks:
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - rbd/notify_master.sh
+      client.1:
+        - rbd/notify_slave.sh
+    env:
+      RBD_FEATURES: "61"
+- print: "**** done rbd: old librbd -> new librbd"
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - rbd/notify_slave.sh
+      client.1:
+        - rbd/notify_master.sh
+    env:
+      RBD_FEATURES: "61"
+- print: "**** done rbd: new librbd -> old librbd"
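RBD_FEATURES: "61" here (and the rbd default features: 61 override earlier in this suite, versus 1 in the layering.yaml variant) is a bitmask. Decoding it with librbd's usual feature bit values, 61 = layering(1) + exclusive-lock(4) + object-map(8) + fast-diff(16) + deep-flatten(32), while 1 keeps only layering:

    # librbd feature bit values, as I understand them
    FEATURES = {
        "layering": 1, "striping": 2, "exclusive-lock": 4,
        "object-map": 8, "fast-diff": 16, "deep-flatten": 32, "journaling": 64,
    }

    def decode(mask):
        return [name for name, bit in FEATURES.items() if mask & bit]

    print(decode(61))  # ['layering', 'exclusive-lock', 'object-map', 'fast-diff', 'deep-flatten']
    print(decode(1))   # ['layering']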
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/centos_7.4.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/centos_7.4.yaml
new file mode 120000 (symlink)
index 0000000..dcc0017
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/centos_7.4.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/rhel_7.5.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/rhel_7.5.yaml
new file mode 120000 (symlink)
index 0000000..7213396
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/rhel_7.5.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/ubuntu_16.04.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/ubuntu_16.04.yaml
new file mode 120000 (symlink)
index 0000000..9dc1ea9
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/ubuntu_16.04.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/ubuntu_18.04.yaml b/ceph/qa/suites/upgrade/client-upgrade-luminous/luminous-client-x/rbd/supported/ubuntu_18.04.yaml
new file mode 120000 (symlink)
index 0000000..886e87f
--- /dev/null
@@ -0,0 +1 @@
+../../../../../../distros/all/ubuntu_18.04.yaml
\ No newline at end of file
index c64b2cded2947010e7680528d411d36a4cedf9db..48f1ce328b3701f15a63374c739e3bef60debf0b 100644 (file)
@@ -42,7 +42,6 @@ tasks:
       - \(PG_
       - Monitor daemon marked osd
       - Behind on trimming
-      - is unresponsive
     conf:
       global:
         mon warn on pool no app: false
index f5a883a3927df6aa26cf453afbb5c9c601e4ea71..4d10158b71ba98d7fa8a55f4e7c3122e7893fe32 100644 (file)
@@ -25,7 +25,7 @@ overrides:
     - scrub mismatch
     - ScrubResult
     - wrongly marked
-    - (POOL_APP_NOT_ENABLED)
+    - \(POOL_APP_NOT_ENABLED\)
     - overall HEALTH_
     conf:
       global:
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/% b/ceph/qa/suites/upgrade/luminous-p2p/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml
new file mode 100644 (file)
index 0000000..f660e2a
--- /dev/null
@@ -0,0 +1,163 @@
+meta:
+- desc: |
+   Run ceph on two nodes, using one of them as a client,
+   with a separate client-only node. 
+   Use xfs beneath the osds.
+   install ceph/luminous v12.2.2 point version
+   run workload and upgrade-sequence in parallel
+   install ceph/luminous v12.2.5 point version
+   run workload and upgrade-sequence in parallel
+   install ceph/luminous latest version
+   run workload and upgrade-sequence in parallel
+overrides:
+  ceph:
+    log-whitelist:
+    - reached quota
+    - scrub
+    - osd_map_max_advance
+    - wrongly marked
+    - FS_DEGRADED
+    - POOL_APP_NOT_ENABLED
+    - CACHE_POOL_NO_HIT_SET
+    - POOL_FULL
+    - SMALLER_PG
+    - pool\(s\) full
+    - OSD_DOWN
+    - missing hit_sets
+    - CACHE_POOL_NEAR_FULL
+    - PG_AVAILABILITY
+    - PG_DEGRADED
+    - application not enabled
+    fs: xfs
+    conf:
+      mon:
+        mon debug unsafe allow tier with nonempty snaps: true
+        mon warn on pool no app: false
+      osd:
+        osd map max advance: 1000
+        osd_class_load_list: "cephfs hello journal lock log numops rbd refcount 
+                              replica_log rgw sdk statelog timeindex user version"
+        osd_class_default_list: "cephfs hello journal lock log numops rbd refcount 
+                                 replica_log rgw sdk statelog timeindex user version"
+      client:
+        rgw_crypt_require_ssl: false
+        rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
+roles:
+- - mon.a
+  - mds.a
+  - osd.0
+  - osd.1
+  - osd.2
+  - mgr.x
+- - mon.b
+  - mon.c
+  - osd.3
+  - osd.4
+  - osd.5
+  - client.0
+- - client.1
+openstack:
+- volumes: # attached to each instance
+    count: 3
+    size: 30 # GB
+tasks:
+- print: "****  v12.2.2 about to install"
+- install:
+    tag: v12.2.2
+    # line below can be removed; it's left over from the jewel test
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
+- print: "**** done v12.2.2 install"
+- ceph:
+   fs: xfs
+   add_osds_to_crush: true
+- print: "**** done ceph xfs"
+- sequential:
+   - workload
+- print: "**** done workload v12.2.2"
+
+####  upgrade to v12.2.5
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+      tag: v12.2.5
+    mon.b:
+      tag: v12.2.5
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.5"
+####  upgrade to latest luminous
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+      branch: luminous
+    mon.b:
+      branch: luminous
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous branch"
+
+#######################
+workload:
+   sequential:
+   - workunit:
+       clients:
+         client.0:
+           - suites/blogbench.sh
+workload_luminous:
+   full_sequential:
+   - workunit:
+       tag: v12.2.2
+       clients:
+         client.1:
+         - rados/test.sh
+         - cls
+       env:
+         CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
+   - print: "**** done rados/test.sh &  cls workload_luminous"
+   - sequential:
+     - rgw: [client.0]
+     - print: "**** done rgw workload_luminous"
+     - s3tests:
+         client.0:
+           force-branch: ceph-luminous
+           rgw_server: client.0
+           scan_for_encryption_keys: false
+     - print: "**** done s3tests workload_luminous"
+upgrade-sequence_luminous:
+   sequential:
+   - print: "**** done branch: luminous install.upgrade"
+   - ceph.restart: [mds.a]
+   - sleep:
+       duration: 60
+   - ceph.restart: [osd.0]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.1]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.2]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.3]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.4]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.5]
+   - sleep:
+       duration: 60
+   - ceph.restart: [mon.a]
+   - sleep:
+       duration: 60
+   - ceph.restart: [mon.b]
+   - sleep:
+       duration: 60
+   - ceph.restart: [mon.c]
+   - sleep:
+       duration: 60
+   - print: "**** done ceph.restart all luminous branch mds/osd/mon"
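The upgrade-sequence above is deliberately ordered: the MDS restarts first, then each OSD with a 30-second settle (60 after the last one), then the monitors with 60-second gaps. Since the same shape appears in both upgrade legs of this file, here is a compact sketch that would emit the equivalent restart/sleep task list (illustrative only; the YAML spells it out by hand):

    def restart_sequence():
        plan = [("mds.a", 60)]
        plan += [("osd.%d" % i, 30) for i in range(5)]
        plan += [("osd.5", 60)]
        plan += [("mon.%s" % m, 60) for m in "abc"]
        tasks = []
        for daemon, pause in plan:
            tasks.append({"ceph.restart": [daemon]})     # restart one daemon
            tasks.append({"sleep": {"duration": pause}})  # let it settle
        return tasks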
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/supported b/ceph/qa/suites/upgrade/luminous-p2p/supported
new file mode 120000 (symlink)
index 0000000..dd0d7f1
--- /dev/null
@@ -0,0 +1 @@
+../../../distros/supported/
\ No newline at end of file
index 3684b1e0a0af721d37b9811ec9985a09de6b810f..f8732212dc2b8323d8a3e3f246666a831ed073dc 100644 (file)
@@ -25,7 +25,7 @@ overrides:
     - scrub mismatch
     - ScrubResult
     - wrongly marked
-    - (POOL_APP_NOT_ENABLED)
+    - \(POOL_APP_NOT_ENABLED\)
     - overall HEALTH_
     conf:
       global:
diff --git a/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/% b/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/%
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/distros/centos_latest.yaml b/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/distros/centos_latest.yaml
deleted file mode 120000 (symlink)
index b5973b9..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../distros/supported/centos_latest.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/distros/ubuntu_latest.yaml b/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/distros/ubuntu_latest.yaml
deleted file mode 120000 (symlink)
index cc5b15b..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../distros/supported/ubuntu_latest.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/point-to-point-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-x/point-to-point-x/point-to-point-upgrade.yaml
deleted file mode 100644 (file)
index 0a4ed9a..0000000
+++ /dev/null
@@ -1,227 +0,0 @@
-meta:
-- desc: |
-   Run ceph on two nodes, using one of them as a client,
-   with a separate client-only node. 
-   Use xfs beneath the osds.
-   install ceph/luminous v12.2.2 point version
-   run workload and upgrade-sequence in parallel
-   install ceph/luminous latest version
-   run workload and upgrade-sequence in parallel
-   install ceph/-x version (luminous or master/mimic)
-   run workload and upgrade-sequence in parallel
-overrides:
-  ceph:
-    log-whitelist:
-    - reached quota
-    - scrub
-    - osd_map_max_advance
-    - wrongly marked
-    fs: xfs
-    conf:
-      mon:
-        mon debug unsafe allow tier with nonempty snaps: true
-        mon warn on pool no app: false
-      osd:
-        osd map max advance: 1000
-        osd_class_load_list: "cephfs hello journal lock log numops rbd refcount 
-                              replica_log rgw sdk statelog timeindex user version"
-        osd_class_default_list: "cephfs hello journal lock log numops rbd refcount 
-                                 replica_log rgw sdk statelog timeindex user version"
-      client:
-        rgw_crypt_require_ssl: false
-        rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
-roles:
-- - mon.a
-  - mds.a
-  - osd.0
-  - osd.1
-  - osd.2
-  - mgr.x
-- - mon.b
-  - mon.c
-  - osd.3
-  - osd.4
-  - osd.5
-  - client.0
-- - client.1
-openstack:
-- volumes: # attached to each instance
-    count: 3
-    size: 30 # GB
-tasks:
-- print: "****  v12.2.2 about to install"
-- install:
-    tag: v12.2.2
-    # line below can be removed its from jewel test
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
-- print: "**** done v12.2.2 install"
-- ceph:
-   fs: xfs
-   add_osds_to_crush: true
-- print: "**** done ceph xfs"
-- sequential:
-   - workload
-- print: "**** done workload"
-- install.upgrade:
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-    mon.a:
-      branch: luminous
-    mon.b:
-      branch: luminous
-    # Note that client.a IS NOT upgraded at this point
-- parallel:
-   - workload_luminous
-   - upgrade-sequence_luminous
-- print: "**** done parallel luminous branch"
-- install.upgrade:
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-    client.1:
-      branch: luminous
-- print: "**** done branch: luminous install.upgrade on client.1"
-- install.upgrade:
-    mon.a:
-    mon.b:
-- print: "**** done branch: -x install.upgrade on mon.a and mon.b"
-- parallel:
-   - workload_x
-   - upgrade-sequence_x
-- print: "**** done parallel -x branch"
-- exec:
-    osd.0:
-      - ceph osd set-require-min-compat-client luminous
-# Run librados tests on the -x upgraded cluster
-- install.upgrade:
-    client.1:
-- workunit:
-    branch: luminous
-    clients:
-      client.1:
-      - rados/test.sh
-      - cls
-- print: "**** done final test on -x cluster"
-#######################
-workload:
-   sequential:
-   - workunit:
-       clients:
-         client.0:
-           - suites/blogbench.sh
-workload_luminous:
-   full_sequential:
-   - workunit:
-       tag: v12.2.2
-       clients:
-         client.1:
-         - rados/test.sh
-         - cls
-       env:
-         CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
-   - print: "**** done rados/test.sh &  cls workload_luminous"
-   - sequential:
-     - rgw: [client.0]
-     - print: "**** done rgw workload_luminous"
-     - s3tests:
-         client.0:
-           force-branch: ceph-luminous
-           rgw_server: client.0
-           scan_for_encryption_keys: false
-     - print: "**** done s3tests workload_luminous"
-upgrade-sequence_luminous:
-   sequential:
-   - print: "**** done branch: luminous install.upgrade"
-   - ceph.restart: [mds.a]
-   - sleep:
-       duration: 60
-   - ceph.restart: [osd.0]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.1]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.2]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.3]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.4]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.5]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.a]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.b]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.c]
-   - sleep:
-       duration: 60
-   - print: "**** done ceph.restart all luminous branch mds/osd/mon"
-workload_x:
-   sequential:
-   - workunit:
-       branch: luminous
-       clients:
-         client.1:
-         - rados/test-upgrade-to-mimic.sh
-         - cls
-   - print: "**** done rados/test-upgrade-to-mimic.sh &  cls workload_x NOT upgraded  client"
-   - workunit:
-       branch: luminous
-       clients:
-         client.0:
-         - rados/test-upgrade-to-mimic.sh
-         - cls
-   - print: "**** done rados/test.sh &  cls workload_x upgraded client"
-   - rgw: [client.1]
-   - print: "**** done rgw workload_x"
-   - s3tests:
-       client.1:
-         force-branch: ceph-luminous
-         rgw_server: client.1
-         scan_for_encryption_keys: false
-   - print: "**** done s3tests workload_x"
-upgrade-sequence_x:
-   sequential:
-   - ceph.restart: [mds.a]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.a]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.b]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.c]
-   - sleep:
-       duration: 60
-   - ceph.restart: [osd.0]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.1]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.2]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.3]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.4]
-   - sleep:
-       duration: 30
-   - ceph.restart:
-       daemons: [osd.5]
-       wait-for-healthy: false
-       wait-for-up-osds: true
-   - ceph.restart:
-       daemons: [mgr.x]
-       wait-for-healthy: false
-   - exec:
-       osd.0:
-         - ceph osd require-osd-release luminous
-   - ceph.healthy:
-   - print: "**** done ceph.restart all -x branch mds/osd/mon"
index 87cbbc1ff561748a842da97ed38814b6b65d0e76..f61047d98cd45ce389dfac11c9f0bcfda6142aa9 100644 (file)
@@ -1059,6 +1059,8 @@ def osd_scrub_pgs(ctx, config):
         # allow this to fail; in certain cases the OSD might not be up
         # at this point.  we will catch all pgs below.
         try:
+            manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
+                                    'osd_debug_deep_scrub_sleep', '0');
             manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
         except run.CommandFailedError:
             pass
index 7d65ca1c79d4fdb6f270303e5617485e5a5c347b..3d4c7c8d31da66ce06a858b6756fdec3fde3875d 100644 (file)
@@ -483,6 +483,19 @@ def build_ceph_cluster(ctx, config):
         elif not config.get('only_mon'):
             raise RuntimeError(
                 "The cluster is NOT operational due to insufficient OSDs")
+        # create rbd pool
+        ceph_admin.run(
+            args=[
+                'sudo', 'ceph', '--cluster', 'ceph',
+                'osd', 'pool', 'create', 'rbd', '128', '128'],
+            check_status=False)
+        ceph_admin.run(
+            args=[
+                'sudo', 'ceph', '--cluster', 'ceph',
+                'osd', 'pool', 'application', 'enable',
+                'rbd', 'rbd', '--yes-i-really-mean-it'
+                ],
+            check_status=False)
         yield
 
     except Exception:
index edbb2ae3ec6384c6812e8fa7255f6aca6c84a324..969cd6e23c5d1c7242a0f405df9a99672b982f06 100644 (file)
@@ -1437,7 +1437,7 @@ class CephManager:
         # both osd_mon_report_interval_min and mgr_stats_period are 5 seconds
         # by default, and take the faulty injection in ms into consideration,
         # 12 seconds are more than enough
-        delays = [1, 1, 2, 3, 5, 8, 13]
+        delays = [1, 1, 2, 3, 5, 8, 13, 0]
         @wraps(func)
         def wrapper(self, *args, **kwargs):
             exc = None
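The delays list drives a retry wrapper around cluster queries; appending a trailing 0 grants one extra, immediate final attempt without stretching the overall wait. A self-contained sketch of the pattern as I read it from the surrounding wrapper (the real decorator is pickier about which exceptions it retries):

    import time
    from functools import wraps

    DELAYS = (1, 1, 2, 3, 5, 8, 13, 0)  # sleep after each failed attempt

    def with_retries(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            exc = None
            for delay in DELAYS:
                try:
                    return func(*args, **kwargs)
                except Exception as e:   # illustrative catch-all
                    exc = e
                time.sleep(delay)
            raise exc
        return wrapper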
index 6cc1ea55889d78d2d8c176d7ba9c6bb8df19ca60..b3950441990866a76458a78e9dfc9a20a4068a32 100644 (file)
@@ -714,7 +714,17 @@ class Filesystem(MDSCluster):
 
         return result
 
-    def get_rank_names(self):
+    def get_rank(self, rank=0, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_rank(self.id, rank)
+
+    def get_ranks(self, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_ranks(self.id)
+
+    def get_rank_names(self, status=None):
         """
         Return MDS daemon names of those daemons holding a rank,
         sorted by rank.  This includes e.g. up:replay/reconnect
@@ -838,6 +848,10 @@ class Filesystem(MDSCluster):
 
         return self.json_asok(command, 'mds', mds_id)
 
+    def rank_asok(self, command, rank=0):
+        info = self.get_rank(rank=rank)
+        return self.json_asok(command, 'mds', info['name'])
+
     def read_cache(self, path, depth=None):
         cmd = ["dump", "tree", path]
         if depth is not None:
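get_rank, get_ranks, and rank_asok let tests address an MDS by rank rather than by daemon name: rank_asok resolves the rank to whichever daemon currently holds it and runs the admin-socket command there. Usage looks like this, with fs assumed to be a Filesystem instance (test_session_race later in this changeset is the real call site):

    # flip a config option on whichever MDS daemon currently holds rank 1
    fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "true"], rank=1)

    # or inspect the rank's mdsmap entry directly
    info = fs.get_rank(rank=1)
    print(info["name"], info["gid"])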
index 8d8410c69edce471954f347a10b0759964feaf99..b121680b0b39040ff31044c5f7a8f6776621222c 100644 (file)
@@ -261,7 +261,7 @@ class FuseMount(CephFSMount):
         assert not self.is_mounted()
         self._fuse_conn = None
 
-    def umount_wait(self, force=False, require_clean=False):
+    def umount_wait(self, force=False, require_clean=False, timeout=900):
         """
         :param force: Complete cleanly even if the MDS is offline
         """
@@ -282,7 +282,7 @@ class FuseMount(CephFSMount):
         try:
             if self.fuse_daemon:
                 # Permit a timeout, so that we do not block forever
-                run.wait([self.fuse_daemon], 900)
+                run.wait([self.fuse_daemon], timeout)
         except MaxWhileTries:
             log.error("process failed to terminate after unmount.  This probably"
                       "indicates a bug within ceph-fuse.")
@@ -407,9 +407,15 @@ print find_socket("{client_name}")
         """
         Look up the CephFS client ID for this mount
         """
-
         return self.admin_socket(['mds_sessions'])['id']
 
+    def get_client_pid(self):
+        """
+        return pid of ceph-fuse process
+        """
+        status = self.admin_socket(['status'])
+        return status['metadata']['pid']
+
     def get_osd_epoch(self):
         """
         Return 2-tuple of osd_epoch, osd_epoch_barrier
index bfa1ac679166bd90367b9982be2806e4d922bc73..d237f65fb33541091968fe6b8c6079229e7c33d9 100644 (file)
@@ -124,7 +124,7 @@ class KernelMount(CephFSMount):
     def cleanup(self):
         pass
 
-    def umount_wait(self, force=False, require_clean=False):
+    def umount_wait(self, force=False, require_clean=False, timeout=900):
         """
         Unlike the fuse client, the kernel client's umount is immediate
         """
index fd58c1427338e1cf1f234ec55361e33b56782730..829ca3d5c6a8fc29f0dc234da5eaf8ee7023f0c3 100644 (file)
@@ -11,8 +11,10 @@ import re
 import os
 
 from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+from tasks.cephfs.fuse_mount import FuseMount
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 from teuthology.packaging import get_package_version
+from unittest import SkipTest
 
 
 log = logging.getLogger(__name__)
@@ -472,3 +474,42 @@ class TestClientRecovery(CephFSTestCase):
         self.mount_b.mount()
         self.mount_b.wait_until_mounted()
         self.mount_b.run_shell(["ls", "subdir/childfile"])
+
+    def test_stale_renew(self):
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to handle signal STOP/CONT")
+
+        self.mount_a.run_shell(["mkdir", "testdir"])
+        self.mount_a.run_shell(["touch", "testdir/file1"])
+        # populate readdir cache
+        self.mount_a.run_shell(["ls", "testdir"])
+        self.mount_b.run_shell(["ls", "testdir"])
+
+        # check if readdir cache is effective
+        initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+        self.mount_b.run_shell(["ls", "testdir"])
+        current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+        self.assertEqual(current_readdirs, initial_readdirs);
+
+        mount_b_gid = self.mount_b.get_global_id()
+        mount_b_pid = self.mount_b.get_client_pid()
+        # stop ceph-fuse process of mount_b
+        self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])
+
+        self.assert_session_state(mount_b_gid, "open")
+        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        self.assert_session_state(mount_b_gid, "stale")
+
+        self.mount_a.run_shell(["touch", "testdir/file2"])
+
+        # resume ceph-fuse process of mount_b
+        self.mount_b.client_remote.run(args=["sudo", "kill", "-CONT", mount_b_pid])
+        # Is the new file visible from mount_b? (caps become invalid after session stale)
+        self.mount_b.run_shell(["ls", "testdir/file2"])
+
+    def test_unmount_for_evicted_client(self):
+        """Test if client hangs on unmount after evicting the client."""
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+        self.mount_a.umount_wait(require_clean=True, timeout=30)
index 913999db7733b7a835bc0e89b25883e66840d082..2c62313592e2b6817a0769be26d395e891a51c75 100644 (file)
@@ -7,6 +7,7 @@ log = logging.getLogger(__name__)
 
 class TestExports(CephFSTestCase):
     MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
 
     def _wait_subtrees(self, status, rank, test):
         timeout = 30
@@ -105,3 +106,42 @@ class TestExports(CephFSTestCase):
         self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/aa/bb', 0)])
         self.mount_a.run_shell(["mv", "aa", "a/b/"])
         self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/a/b/aa/bb', 0)])
+
+    def test_session_race(self):
+        """
+        Test session creation race.
+
+        See: https://tracker.ceph.com/issues/24072#change-113056
+        """
+
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+
+        status = self.fs.status()
+        rank1 = self.fs.get_rank(rank=1, status=status)
+        name1 = 'mds.'+rank1['name']
+
+        # Create a directory that is pre-exported to rank 1
+        self.mount_a.run_shell(["mkdir", "-p", "a/aa"])
+        self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/a', 1)])
+
+        # Now set the mds config to allow the race
+        self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "true"], rank=1)
+
+        # Now create another directory and try to export it
+        self.mount_b.run_shell(["mkdir", "-p", "b/bb"])
+        self.mount_b.setfattr("b", "ceph.dir.pin", "1")
+
+        time.sleep(5)
+
+        # Now turn off the race so that it doesn't wait again
+        self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "false"], rank=1)
+
+        # Now try to create a session with rank 1 by accessing a dir known to
+        # be there, if buggy, this should cause the rank 1 to crash:
+        self.mount_b.run_shell(["ls", "a"])
+
+        # Check if rank1 changed (standby tookover?)
+        new_rank1 = self.fs.get_rank(rank=1)
+        self.assertEqual(rank1['gid'], new_rank1['gid'])
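The test drives subtree export pinning through the ceph.dir.pin virtual xattr: writing a rank number pins that directory's subtree to the given MDS rank, which is what lets the test place /a and /b on rank 1 deterministically. Outside teuthology the same thing can be done with a plain setxattr (the mountpoint below is hypothetical):

    import os

    # pin the subtree under "a" to MDS rank 1; writing "-1" removes the pin
    os.setxattr("/mnt/cephfs/a", "ceph.dir.pin", b"1")
    print(os.getxattr("/mnt/cephfs/a", "ceph.dir.pin"))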
index b64f3e931dca692e6e3681cf2e270c04f3987c5d..f96994e886a754a5f1ba5b4f713d86ec0b50faa6 100644 (file)
@@ -948,76 +948,6 @@ class TestStrays(CephFSTestCase):
         self.mds_cluster.mds_restart()
         self.fs.wait_for_daemons()
 
-    def test_purge_queue_op_rate(self):
-        """
-        A busy purge queue is meant to aggregate operations sufficiently
-        that our RADOS ops to the metadata pool are not O(files).  Check
-        that that is so.
-        :return:
-        """
-
-        # For low rates of deletion, the rate of metadata ops actually
-        # will be o(files), so to see the desired behaviour we have to give
-        # the system a significant quantity, i.e. an order of magnitude
-        # more than the number of files it will purge at one time.
-
-        max_purge_files = 2
-
-        self.set_conf('mds', 'mds_bal_frag', 'false')
-        self.set_conf('mds', 'mds_max_purge_files', "%d" % max_purge_files)
-        self.fs.mds_fail_restart()
-        self.fs.wait_for_daemons()
-
-        phase_1_files = 256
-        phase_2_files = 512
-
-        self.mount_a.run_shell(["mkdir", "phase1"])
-        self.mount_a.create_n_files("phase1/file", phase_1_files)
-
-        self.mount_a.run_shell(["mkdir", "phase2"])
-        self.mount_a.create_n_files("phase2/file", phase_2_files)
-
-        def unlink_and_count_ops(path, expected_deletions):
-            initial_ops = self.get_stat("objecter", "op")
-            initial_pq_executed = self.get_stat("purge_queue", "pq_executed")
-
-            self.mount_a.run_shell(["rm", "-rf", path])
-
-            self._wait_for_counter(
-                "purge_queue", "pq_executed", initial_pq_executed + expected_deletions
-            )
-
-            final_ops = self.get_stat("objecter", "op")
-
-            # Calculation of the *overhead* operations, i.e. do not include
-            # the operations where we actually delete files.
-            return final_ops - initial_ops - expected_deletions
-
-        self.fs.mds_asok(['flush', 'journal'])
-        phase1_ops = unlink_and_count_ops("phase1/", phase_1_files + 1)
-
-        self.fs.mds_asok(['flush', 'journal'])
-        phase2_ops = unlink_and_count_ops("phase2/", phase_2_files + 1)
-
-        log.info("Phase 1: {0}".format(phase1_ops))
-        log.info("Phase 2: {0}".format(phase2_ops))
-
-        # The success criterion is that deleting double the number
-        # of files doesn't generate double the number of overhead ops
-        # -- this comparison is a rough approximation of that rule.
-        self.assertTrue(phase2_ops < phase1_ops * 1.25)
-
-        # Finally, check that our activity did include properly quiescing
-        # the queue (i.e. call to Journaler::write_head in the right place),
-        # by restarting the MDS and checking that it doesn't try re-executing
-        # any of the work we did.
-        self.fs.mds_asok(['flush', 'journal'])  # flush to ensure no strays
-                                                # hanging around
-        self.fs.mds_fail_restart()
-        self.fs.wait_for_daemons()
-        time.sleep(10)
-        self.assertEqual(self.get_stat("purge_queue", "pq_executed"), 0)
-
     def test_replicated_delete_speed(self):
         """
         That deletions of replicated metadata are not pathologically slow
index 0876af96efe0b985aabeda3abeb2afeb2279ad4c..9be7fc2fff5575dced0f2e722ab0bacbf9629d99 100644 (file)
@@ -283,21 +283,21 @@ vc.disconnect()
             # it has lost network, because there is nothing to tell it that its messages
             # are being dropped because its identity is gone)
             background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False)
-            time.sleep(10)  # Approximate check for 'stuck' as 'still running after 10s'
-            self.assertFalse(background.finished)
+            try:
+                background.wait()
+            except CommandFailedError:
+                # command failed with EBLACKLISTED?
+                if "transport endpoint shutdown" in background.stderr.getvalue():
+                    pass
+                else:
+                    raise
 
             # After deauthorisation, the client ID should be gone (this was the only
             # volume it was authorised for)
             self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()])
 
             # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined)
-            self.mounts[2].kill()
-            self.mounts[2].kill_cleanup()
-            try:
-                background.wait()
-            except CommandFailedError:
-                # We killed the mount out from under you
-                pass
+            self.mounts[2].umount_wait()
 
         self._volume_client_python(self.mount_b, dedent("""
             vp = VolumePath("{group_id}", "{volume_id}")
@@ -343,6 +343,19 @@ vc.disconnect()
             vc.delete_volume(vp, data_isolated=True)
             vc.purge_volume(vp, data_isolated=True)
             vc.purge_volume(vp, data_isolated=True)
+
+            vc.create_volume(vp, 10, namespace_isolated=False)
+            vc.create_volume(vp, 10, namespace_isolated=False)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.delete_volume(vp)
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+            vc.purge_volume(vp)
         """.format(
             group_id=group_id,
             volume_id=volume_id,
@@ -764,52 +777,63 @@ vc.disconnect()
         # auth ID belongs to, the auth ID's authorized access levels
         # for different volumes, versioning details, etc.
         expected_auth_metadata = {
-            u"version": 2,
-            u"compat_version": 1,
-            u"dirty": False,
-            u"tenant_id": u"tenant1",
-            u"volumes": {
-                u"groupid/volumeid": {
-                    u"dirty": False,
-                    u"access_level": u"rw",
+            "version": 2,
+            "compat_version": 1,
+            "dirty": False,
+            "tenant_id": u"tenant1",
+            "volumes": {
+                "groupid/volumeid": {
+                    "dirty": False,
+                    "access_level": u"rw",
                 }
             }
         }
 
         auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
             vp = VolumePath("{group_id}", "{volume_id}")
             auth_metadata = vc._auth_metadata_get("{auth_id}")
-            print auth_metadata
+            print(json.dumps(auth_metadata))
         """.format(
             group_id=group_id,
             volume_id=volume_id,
             auth_id=guestclient_1["auth_id"],
         )))
+        auth_metadata = json.loads(auth_metadata)
 
-        self.assertItemsEqual(str(expected_auth_metadata), auth_metadata)
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
 
         # Verify that the volume metadata file stores info about auth IDs
         # and their access levels to the volume, versioning details, etc.
         expected_vol_metadata = {
-            u"version": 2,
-            u"compat_version": 1,
-            u"auths": {
-                u"guest": {
-                    u"dirty": False,
-                    u"access_level": u"rw"
+            "version": 2,
+            "compat_version": 1,
+            "auths": {
+                "guest": {
+                    "dirty": False,
+                    "access_level": u"rw"
                 }
             }
         }
 
         vol_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
             vp = VolumePath("{group_id}", "{volume_id}")
             volume_metadata = vc._volume_metadata_get(vp)
-            print volume_metadata
+            print(json.dumps(volume_metadata))
         """.format(
             group_id=group_id,
             volume_id=volume_id,
         )))
-        self.assertItemsEqual(str(expected_vol_metadata), vol_metadata)
+        vol_metadata = json.loads(vol_metadata)
+
+        self.assertGreaterEqual(vol_metadata["version"], expected_vol_metadata["version"])
+        del expected_vol_metadata["version"]
+        del vol_metadata["version"]
+        self.assertEqual(expected_vol_metadata, vol_metadata)
 
         # Cannot authorize 'guestclient_2' to access the volume.
         # It uses auth ID 'guest', which has already been used by a
@@ -1014,3 +1038,43 @@ vc.disconnect()
         # Mount the volume in the guest using the auth ID to assert that the
         # auth caps are valid
         guest_mount.mount(mount_path=mount_path)
+
+    def test_volume_without_namespace_isolation(self):
+        """
+        That volume client can create volumes that do not have separate RADOS
+        namespace layouts.
+        """
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+
+        # Configure vc_mount as the handle for driving volumeclient
+        self._configure_vc_auth(vc_mount, "manila")
+
+        # Create a volume
+        volume_prefix = "/myprefix"
+        group_id = "grpid"
+        volume_id = "volid"
+        mount_path = self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10, namespace_isolated=False)
+            print create_result['mount_path']
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )), volume_prefix)
+
+        # The CephFS volume should be created
+        self.mounts[0].stat(os.path.join("myprefix", group_id, volume_id))
+        vol_namespace = self.mounts[0].getfattr(
+            os.path.join("myprefix", group_id, volume_id),
+            "ceph.dir.layout.pool_namespace")
+        assert not vol_namespace
+
+        self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )), volume_prefix)
index 4e2a228fc3b6b513bfaa546ef2bccc19ae47470e..35c1a80022f37cbffd151dfd60903d1e3a2110d4 100644 (file)
@@ -38,7 +38,9 @@ def task(ctx, config):
 
     dir = '%s/ceph.data/test.%s' % (testdir, client)
 
-    seed = str(int(random.uniform(1,100)))
+    seed = int(random.uniform(1,100))
+    start = 800 + random.randint(800,1200)
+    end = start + 150
 
     try:
         log.info('creating a working dir')
@@ -61,7 +63,7 @@ def task(ctx, config):
             args=[
                 'cd', dir,
                 run.Raw('&&'),
-                './run_seed_to_range.sh', seed, '50', '300',
+                './run_seed_to_range.sh', str(seed), str(start), str(end),
                 ],
             wait=False,
             check_status=False)
index b4e2aa4deed452f207c0acf0752d2423e0be84c4..03ea218f5690adba4c1e2fa3093b908ddd551cc0 100644 (file)
@@ -34,8 +34,9 @@ def test_create_from_mon(ctx, config):
     manager = ctx.managers['ceph']
     log.info('1. creating pool.a')
     pool_a = manager.create_pool_with_unique_name(pg_num)
-    manager.wait_for_clean()
-    assert manager.get_num_active_clean() == pg_num
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
 
     log.info('2. creating pool.b')
     pool_b = manager.create_pool_with_unique_name(pg_num)
@@ -81,8 +82,9 @@ def test_create_from_peer(ctx, config):
     manager = ctx.managers['ceph']
     log.info('1. creating pool.a')
     pool_a = manager.create_pool_with_unique_name(pg_num)
-    manager.wait_for_clean()
-    assert manager.get_num_active_clean() == pg_num
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
 
     log.info('2. creating pool.b')
     while True:
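Both call sites now wait for PG convergence and count PGs via pg_num_in_all_states instead of asserting on get_num_active_clean() directly. The helper itself is not shown in this hunk; one plausible shape, assuming pg_states is a list of pg dicts whose 'state' field is a '+'-joined string such as "active+clean":

    def pg_num_in_all_states(pgs, *states):
        # count PGs whose state string contains every requested state token
        return sum(1 for pg in pgs
                   if all(s in pg["state"].split("+") for s in states))

    pgs = [{"state": "active+clean"}, {"state": "active+recovering"}]
    print(pg_num_in_all_states(pgs, "active", "clean"))  # 1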
index 7bd72d19536c1854ec923ea3616a883dd53416c1..99a742f13b9e18efc2b9bf7d0e38c272227ba925 100644 (file)
@@ -254,6 +254,7 @@ def task(ctx, config):
     (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
     assert ret == 200
     assert out['suspended']
+    assert out['email'] == email
 
     # TESTCASE 're-enable','user','enable','suspended user','succeeds'
     (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
index 4d850c391bbe40d6c8dac28c545c592eac654282..30e74cce5b56da7c2ce6e355501521edfc558140 100755 (executable)
@@ -5,7 +5,6 @@ ceph -s
 #list pools
 rados lspools
 #list rbd images
-ceph osd pool create rbd 128 128
 rbd ls
 #check that the monitors work
 ceph osd set nodown
diff --git a/ceph/qa/workunits/fs/misc/rstats.sh b/ceph/qa/workunits/fs/misc/rstats.sh
new file mode 100755 (executable)
index 0000000..4c32edb
--- /dev/null
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+set -x
+
+timeout=30
+old_value=""
+new_value=""
+
+wait_until_changed() {
+       name=$1
+       wait=0
+       while [ $wait -lt $timeout ]; do
+               new_value=`getfattr --only-value -n ceph.dir.$name .`
+               [ $new_value == $old_value ] || return 0
+               sleep 1
+               wait=$(($wait + 1))
+       done
+       return 1
+}
+
+check_rctime() {
+       old_sec=$(echo $old_value | cut -d. -f1)
+       old_nsec=$(echo $old_value | cut -d. -f2)
+       new_sec=$(echo $new_value | cut -d. -f1)
+       new_nsec=$(echo $new_value | cut -d. -f2)
+       [ "$old_sec" -lt "$new_sec" ] && return 0
+       [ "$old_sec" -gt "$new_sec" ] && return 1
+       [ "$old_nsec" -lt "$new_nsec" ] && return 0
+       return 1
+}
+
+# sync(3) does not make ceph-fuse flush dirty caps, because fuse kernel module
+# does not notify ceph-fuse about it. Use fsync(3) instead.
+fsync_path() {
+       cmd="import os; fd=os.open(\"$1\", os.O_RDONLY); os.fsync(fd); os.close(fd)"
+       python -c "$cmd"
+}
+
+set -e
+
+mkdir -p rstats_testdir/d1/d2
+cd rstats_testdir
+
+# rfiles
+old_value=`getfattr --only-value -n ceph.dir.rfiles .`
+[ $old_value == 0 ] || false
+touch d1/d2/f1
+wait_until_changed rfiles
+[ $new_value == $(($old_value + 1)) ] || false
+
+# rsubdirs
+old_value=`getfattr --only-value -n ceph.dir.rsubdirs .`
+[ $old_value == 3 ] || false
+mkdir d1/d2/d3
+wait_until_changed rsubdirs
+[ $new_value == $(($old_value + 1)) ] || false
+
+# rbytes
+old_value=`getfattr --only-value -n ceph.dir.rbytes .`
+[ $old_value == 0 ] || false
+echo hello > d1/d2/f2
+fsync_path d1/d2/f2
+wait_until_changed rbytes
+[ $new_value == $(($old_value + 6)) ] || false
+
+#rctime
+old_value=`getfattr --only-value -n ceph.dir.rctime .`
+touch d1/d2/d3 # touch existing file
+fsync_path d1/d2/d3
+wait_until_changed rctime
+check_rctime
+
+old_value=`getfattr --only-value -n ceph.dir.rctime .`
+touch d1/d2/f3 # create new file
+wait_until_changed rctime
+check_rctime
+
+cd ..
+rm -rf rstats_testdir
+echo OK
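rstats.sh polls CephFS's recursive-statistics vxattrs (ceph.dir.rfiles, rsubdirs, rbytes, rctime) with getfattr until the MDS propagates an update; the fsync_path helper exists because, as the comment notes, sync(3) does not make ceph-fuse flush dirty caps. The same attributes can be read directly from Python, which is also what the script's fsync helper shells out to (the mountpoint is hypothetical):

    import os

    d = "/mnt/cephfs/rstats_testdir"  # hypothetical CephFS path
    for name in ("ceph.dir.rfiles", "ceph.dir.rsubdirs",
                 "ceph.dir.rbytes", "ceph.dir.rctime"):
        print(name, os.getxattr(d, name).decode())

    # flush dirty caps the way the script does: fsync(2) the file itself
    fd = os.open(d + "/d1/d2/f2", os.O_RDONLY)
    os.fsync(fd)
    os.close(fd)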
index 348811e7c4b24e9118292ed9ab7e756e2d3bb30e..2ae8202301ee2cc3df621e160c9169be17ef84ae 100755 (executable)
@@ -202,4 +202,15 @@ ceph osd pool rm cool cool --yes-i-really-really-mean-it
 ceph osd pool rm cold cold --yes-i-really-really-mean-it
 ceph osd crush weight-set rm-compat
 
+# weight set vs device classes vs move
+ceph osd crush weight-set create-compat
+ceph osd crush add-bucket fooo host
+ceph osd crush move fooo root=default
+ceph osd crush add-bucket barr rack
+ceph osd crush move barr root=default
+ceph osd crush move fooo rack=barr
+ceph osd crush rm fooo
+ceph osd crush rm barr
+ceph osd crush weight-set rm-compat
+
 echo OK
index 94580c234ca30ebd6eeaba37525a6aa7ca85f5db..c3c802d57006128c3643b04646c1b3c47c531c7c 100755 (executable)
@@ -71,7 +71,7 @@ git clone https://github.com/facebook/rocksdb.git --depth 1
 
 # compile code
 cd rocksdb
-make env_librados_test ROCKSDB_USE_LIBRADOS=1 -j8
+make env_librados_test ROCKSDB_USE_LIBRADOS=1 DISABLE_WARNING_AS_ERROR=1 -j8
 
 echo "Copy ceph.conf"
 # prepare ceph.conf
diff --git a/ceph/qa/workunits/rados/test_large_omap_detection.py b/ceph/qa/workunits/rados/test_large_omap_detection.py
new file mode 100755 (executable)
index 0000000..6a9e3f9
--- /dev/null
@@ -0,0 +1,130 @@
+#!/usr/bin/python
+# -*- mode:python -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Copyright (C) 2017 Red Hat <contact@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+import json
+import rados
+import shlex
+import subprocess
+import time
+
+def cleanup(cluster):
+    cluster.delete_pool('large-omap-test-pool')
+    cluster.shutdown()
+
+def init():
+    # For local testing
+    #cluster = rados.Rados(conffile='./ceph.conf')
+    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
+    cluster.connect()
+    print("\nCluster ID: " + cluster.get_fsid())
+    cluster.create_pool('large-omap-test-pool')
+    ioctx = cluster.open_ioctx('large-omap-test-pool')
+    ioctx.write_full('large-omap-test-object1', "Lorem ipsum")
+    op = ioctx.create_write_op()
+
+    keys = []
+    values = []
+    for x in range(20001):
+        keys.append(str(x))
+        values.append("X")
+
+    ioctx.set_omap(op, tuple(keys), tuple(values))
+    ioctx.operate_write_op(op, 'large-omap-test-object1', 0)
+    ioctx.release_write_op(op)
+
+    ioctx.write_full('large-omap-test-object2', "Lorem ipsum dolor")
+    op = ioctx.create_write_op()
+
+    buffer = ("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do "
+              "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut "
+              "enim ad minim veniam, quis nostrud exercitation ullamco laboris "
+              "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in "
+              "reprehenderit in voluptate velit esse cillum dolore eu fugiat "
+              "nulla pariatur. Excepteur sint occaecat cupidatat non proident, "
+              "sunt in culpa qui officia deserunt mollit anim id est laborum.")
+
+    keys = []
+    values = []
+    for x in xrange(20000):
+        keys.append(str(x))
+        values.append(buffer)
+
+    ioctx.set_omap(op, tuple(keys), tuple(values))
+    ioctx.operate_write_op(op, 'large-omap-test-object2', 0)
+    ioctx.release_write_op(op)
+    ioctx.close()
+    return cluster
+
+def get_deep_scrub_timestamp(pgid):
+    cmd = ['ceph', 'pg', 'dump', '--format=json-pretty']
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    out = proc.communicate()[0]
+    for stat in json.loads(out)['pg_stats']:
+        if stat['pgid'] == pgid:
+            return stat['last_deep_scrub_stamp']
+
+def wait_for_scrub():
+    osds = set()
+    pgs = dict()
+    cmd = ['ceph', 'osd', 'map', 'large-omap-test-pool',
+           'large-omap-test-object1', '--format=json-pretty']
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    out = proc.communicate()[0]
+    osds.add(json.loads(out)['acting_primary'])
+    pgs[json.loads(out)['pgid']] = get_deep_scrub_timestamp(json.loads(out)['pgid'])
+    cmd = ['ceph', 'osd', 'map', 'large-omap-test-pool',
+           'large-omap-test-object2', '--format=json-pretty']
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    out = proc.communicate()[0]
+    osds.add(json.loads(out)['acting_primary'])
+    pgs[json.loads(out)['pgid']] = get_deep_scrub_timestamp(json.loads(out)['pgid'])
+
+    for pg in pgs:
+        command = "ceph pg deep-scrub " + str(pg)
+        subprocess.check_call(shlex.split(command))
+
+    for pg in pgs:
+        RETRIES = 0
+        while RETRIES < 60 and pgs[pg] == get_deep_scrub_timestamp(pg):
+            time.sleep(10)
+            RETRIES += 1
+
+def check_health_output():
+    RETRIES = 0
+    result = 0
+    while RETRIES < 6 and result != 2:
+        result = 0
+        RETRIES += 1
+        output = subprocess.check_output(["ceph", "health", "detail"])
+        for line in output.splitlines():
+            result += int(line.find('2 large omap objects') != -1)
+        time.sleep(10)
+
+    if result != 2:
+        print("Error, got invalid output:")
+        print(output)
+        raise Exception
+
+def main():
+    cluster = init()
+    wait_for_scrub()
+    check_health_output()
+
+    cleanup(cluster)
+
+if __name__ == '__main__':
+    main()
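
The workunit above loads one object with 20,001 small omap keys and a second with 20,000 large values, deep-scrubs the owning PGs, and then expects "2 large omap objects" in ceph health detail; presumably the teuthology job lowers the large-omap warning thresholds so these counts trip it. As a rough, illustrative python-rados sketch (only the pool and object names come from the script; the rest is an assumption about a recent python-rados API), the key count on one object can be checked like this:

import rados

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
ioctx = cluster.open_ioctx('large-omap-test-pool')
with rados.ReadOpCtx() as read_op:
    # ask for up to 25000 key/value pairs, then count what comes back
    omap_iter, ret = ioctx.get_omap_vals(read_op, "", "", 25000)
    ioctx.operate_read_op(read_op, 'large-omap-test-object1')
    print(sum(1 for _ in omap_iter))  # expect 20001
ioctx.close()
cluster.shutdown()
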
index 8597b7147c88da8efc5395a17ed0e9ce3f295ade..69604bca826e667403ace40407597a0a11119120 100755 (executable)
 
 set -e
 
-expect_1()
+KEYRING=$(mktemp)
+trap cleanup EXIT ERR HUP INT QUIT
+
+cleanup() {
+    (ceph auth del client.mon_read || true) >/dev/null 2>&1
+    (ceph auth del client.mon_write || true) >/dev/null 2>&1
+
+    rm -f $KEYRING
+}
+
+expect_false()
 {
-  set -x
-  set +e
-  "$@"
-  if [ $? == 1 ]; then return 0; else return 1; fi
+       set -x
+       if "$@"; then return 1; else return 0; fi
+}
+
+create_pool_op() {
+  ID=$1
+  POOL=$2
+
+  cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+cluster.create_pool("${POOL}")
+EOF
 }
 
+delete_pool_op() {
+  ID=$1
+  POOL=$2
+
+  cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+cluster.delete_pool("${POOL}")
+EOF
+}
+
+create_pool_snap_op() {
+  ID=$1
+  POOL=$2
+  SNAP=$3
+
+  cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+ioctx = cluster.open_ioctx("${POOL}")
+
+ioctx.create_snap("${SNAP}")
+EOF
+}
+
+remove_pool_snap_op() {
+  ID=$1
+  POOL=$2
+  SNAP=$3
+
+  cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+ioctx = cluster.open_ioctx("${POOL}")
+
+ioctx.remove_snap("${SNAP}")
+EOF
+}
+
+test_pool_op()
+{
+    ceph auth get-or-create client.mon_read mon 'allow r' >> $KEYRING
+    ceph auth get-or-create client.mon_write mon 'allow *' >> $KEYRING
+
+    expect_false create_pool_op mon_read pool1
+    create_pool_op mon_write pool1
+
+    expect_false create_pool_snap_op mon_read pool1 snap1
+    create_pool_snap_op mon_write pool1 snap1
+
+    expect_false remove_pool_snap_op mon_read pool1 snap1
+    remove_pool_snap_op mon_write pool1 snap1
+
+    expect_false delete_pool_op mon_read pool1
+    delete_pool_op mon_write pool1
+}
 
 key=`ceph auth get-or-create-key client.poolaccess1 mon 'allow r' osd 'allow *'`
 rados --id poolaccess1 --key $key -p rbd ls
 
 key=`ceph auth get-or-create-key client.poolaccess2 mon 'allow r' osd 'allow * pool=nopool'`
-expect_1 rados --id poolaccess2 --key $key -p rbd ls
+expect_false rados --id poolaccess2 --key $key -p rbd ls
 
 key=`ceph auth get-or-create-key client.poolaccess3 mon 'allow r' osd 'allow rw pool=nopool'`
-expect_1 rados --id poolaccess3 --key $key -p rbd ls
+expect_false rados --id poolaccess3 --key $key -p rbd ls
+
+test_pool_op
 
 echo OK
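
The new helpers pipe a small Python program into the interpreter to prove that creating and deleting pools (and pool snapshots) requires mon write caps, while mon 'allow r' alone is rejected. A standalone sketch of the denial path, assuming the same client.mon_read identity and a CEPH_KEYRING pointing at its key; illustrative, not part of the commit:

import rados

cluster = rados.Rados(conffile="", rados_id="mon_read")
cluster.connect()
try:
    cluster.create_pool("pool1")  # mon caps are read-only, so this should fail
except rados.Error as e:
    print("denied as expected: {}".format(e))
finally:
    cluster.shutdown()
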
index c9ecb8b62523b6d609dba3efef305613af989ab3..3c85adc0f9515d09b5579b23e92283b59c13b8aa 100755 (executable)
@@ -79,10 +79,14 @@ if rbd help export | grep -q export-format; then
     dd if=/bin/dd of=${TMPDIR}/img bs=1k count=10 seek=100
     rbd import $RBD_CREATE_ARGS ${TMPDIR}/img testimg
     rbd snap create testimg@snap
+    rbd image-meta set testimg key1 value1
+    IMAGEMETA_BEFORE=`rbd image-meta list testimg`
     rbd export --export-format 2 testimg ${TMPDIR}/img_v2
     rbd import --export-format 2 ${TMPDIR}/img_v2 testimg_import
     rbd info testimg_import
     rbd info testimg_import@snap
+    IMAGEMETA_AFTER=`rbd image-meta list testimg_import`
+    [ "$IMAGEMETA_BEFORE" = "$IMAGEMETA_AFTER" ]
 
     # compare the contents between testimg and testimg_import
     rbd export testimg_import ${TMPDIR}/img_import
@@ -132,7 +136,7 @@ if rbd help export | grep -q export-format; then
     rbd import --stripe-count 1000 --stripe-unit 4096 ${TMPDIR}/img testimg
     rbd export --export-format 2 testimg ${TMPDIR}/img_v2
     rbd import --export-format 2 ${TMPDIR}/img_v2 testimg_import
-    rbd info testimg_import|grep "stripe unit"|awk '{print $3}'|grep 4096
+    rbd info testimg_import|grep "stripe unit"|grep -Ei '(4 KiB|4K|4096)'
     rbd info testimg_import|grep "stripe count"|awk '{print $3}'|grep 1000
 
     rm ${TMPDIR}/img_v2
@@ -158,7 +162,7 @@ dd if=/dev/urandom bs=1M count=1 of=${TMPDIR}/sparse2; truncate ${TMPDIR}/sparse
 # 1M sparse, 1M data
 rbd rm sparse1 || true
 rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -Ei '(2M|2048k)'
+rbd ls -l | grep sparse1 | grep -Ei '(2 MiB|2M|2048k)'
 [ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
 
 # export, compare contents and on-disk size
@@ -170,7 +174,7 @@ rbd rm sparse1
 # 1M data, 1M sparse
 rbd rm sparse2 || true
 rbd import $RBD_CREATE_ARGS --order 20 ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -Ei '(2M|2048k)'
+rbd ls -l | grep sparse2 | grep -Ei '(2 MiB|2M|2048k)'
 [ $tiered -eq 1 -o "$(objects sparse2)" = '0' ]
 rbd export sparse2 ${TMPDIR}/sparse2.out
 compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
@@ -181,7 +185,7 @@ rbd rm sparse2
 truncate ${TMPDIR}/sparse1 -s 10M
 # import from stdin just for fun, verify still sparse
 rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < ${TMPDIR}/sparse1
-rbd ls -l | grep sparse1 | grep -Ei '(10M|10240k)'
+rbd ls -l | grep sparse1 | grep -Ei '(10 MiB|10M|10240k)'
 [ $tiered -eq 1 -o "$(objects sparse1)" = '1' ]
 rbd export sparse1 ${TMPDIR}/sparse1.out
 compare_files_and_ondisk_sizes ${TMPDIR}/sparse1 ${TMPDIR}/sparse1.out
@@ -192,7 +196,7 @@ rbd rm sparse1
 dd if=/dev/urandom bs=2M count=1 of=${TMPDIR}/sparse2 oflag=append conv=notrunc
 # again from stdin
 rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < ${TMPDIR}/sparse2
-rbd ls -l | grep sparse2 | grep -Ei '(4M|4096k)'
+rbd ls -l | grep sparse2 | grep -Ei '(4 MiB|4M|4096k)'
 [ $tiered -eq 1 -o "$(objects sparse2)" = '0 2 3' ]
 rbd export sparse2 ${TMPDIR}/sparse2.out
 compare_files_and_ondisk_sizes ${TMPDIR}/sparse2 ${TMPDIR}/sparse2.out
index a435a67bb92cae908c4e19c6a04c28fe87df2a10..fdf91bfc6c3a8512e832e6877a8021965f75c356 100755 (executable)
@@ -23,11 +23,27 @@ recreate_pools() {
 delete_users() {
     (ceph auth del client.volumes || true) >/dev/null 2>&1
     (ceph auth del client.images || true) >/dev/null 2>&1
+
+    (ceph auth del client.snap_none || true) >/dev/null 2>&1
+    (ceph auth del client.snap_all || true) >/dev/null 2>&1
+    (ceph auth del client.snap_pool || true) >/dev/null 2>&1
+    (ceph auth del client.snap_profile_all || true) >/dev/null 2>&1
+    (ceph auth del client.snap_profile_pool || true) >/dev/null 2>&1
+
+    (ceph auth del client.mon_write || true) >/dev/null 2>&1
 }
 
 create_users() {
     ceph auth get-or-create client.volumes mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow r class-read pool images, allow rwx pool volumes' >> $KEYRING
     ceph auth get-or-create client.images mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool images' >> $KEYRING
+
+    ceph auth get-or-create client.snap_none mon 'allow r' >> $KEYRING
+    ceph auth get-or-create client.snap_all mon 'allow r' osd 'allow w' >> $KEYRING
+    ceph auth get-or-create client.snap_pool mon 'allow r' osd 'allow w pool=images' >> $KEYRING
+    ceph auth get-or-create client.snap_profile_all mon 'allow r' osd 'profile rbd' >> $KEYRING
+    ceph auth get-or-create client.snap_profile_pool mon 'allow r' osd 'profile rbd pool=images' >> $KEYRING
+
+    ceph auth get-or-create client.mon_write mon 'allow *' >> $KEYRING
 }
 
 expect() {
@@ -126,9 +142,83 @@ test_volumes_access() {
     rbd -k $KEYRING --id volumes rm volumes/child
 }
 
+create_self_managed_snapshot() {
+  ID=$1
+  POOL=$2
+
+  cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster = rados.Rados(conffile="", rados_id="${ID}")
+cluster.connect()
+ioctx = cluster.open_ioctx("${POOL}")
+
+snap_id = ioctx.create_self_managed_snap()
+print ("Created snap id {}".format(snap_id))
+EOF
+}
+
+remove_self_managed_snapshot() {
+  ID=$1
+  POOL=$2
+
+  cat << EOF | CEPH_KEYRING="$KEYRING" python
+import rados
+
+cluster1 = rados.Rados(conffile="", rados_id="mon_write")
+cluster1.connect()
+ioctx1 = cluster1.open_ioctx("${POOL}")
+
+snap_id = ioctx1.create_self_managed_snap()
+print ("Created snap id {}".format(snap_id))
+
+cluster2 = rados.Rados(conffile="", rados_id="${ID}")
+cluster2.connect()
+ioctx2 = cluster2.open_ioctx("${POOL}")
+
+ioctx2.remove_self_managed_snap(snap_id)
+print ("Removed snap id {}".format(snap_id))
+EOF
+}
+
+test_remove_self_managed_snapshots() {
+    # Ensure users cannot create self-managed snapshots w/o permissions
+    expect 1 create_self_managed_snapshot snap_none images
+    expect 1 create_self_managed_snapshot snap_none volumes
+
+    create_self_managed_snapshot snap_all images
+    create_self_managed_snapshot snap_all volumes
+
+    create_self_managed_snapshot snap_pool images
+    expect 1 create_self_managed_snapshot snap_pool volumes
+
+    create_self_managed_snapshot snap_profile_all images
+    create_self_managed_snapshot snap_profile_all volumes
+
+    create_self_managed_snapshot snap_profile_pool images
+    expect 1 create_self_managed_snapshot snap_profile_pool volumes
+
+    # Ensure users cannot delete self-managed snapshots w/o permissions
+    expect 1 remove_self_managed_snapshot snap_none images
+    expect 1 remove_self_managed_snapshot snap_none volumes
+
+    remove_self_managed_snapshot snap_all images
+    remove_self_managed_snapshot snap_all volumes
+
+    remove_self_managed_snapshot snap_pool images
+    expect 1 remove_self_managed_snapshot snap_pool volumes
+
+    remove_self_managed_snapshot snap_profile_all images
+    remove_self_managed_snapshot snap_profile_all volumes
+
+    remove_self_managed_snapshot snap_profile_pool images
+    expect 1 remove_self_managed_snapshot snap_profile_pool volumes
+}
+
 cleanup() {
     rm -f $KEYRING
 }
+
 KEYRING=$(mktemp)
 trap cleanup EXIT ERR HUP INT QUIT
 
@@ -141,6 +231,8 @@ test_images_access
 recreate_pools
 test_volumes_access
 
+test_remove_self_managed_snapshots
+
 delete_pools
 delete_users
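
Taken together, the new cases show that self-managed snapshot create/remove needs osd write caps, or the rbd profile, on the target pool; mon 'allow r' alone is rejected, and pool-restricted caps only work inside their pool. A condensed sketch of both outcomes for the pool-restricted profile, reusing the snap_profile_pool identity created above (illustrative only):

import rados

cluster = rados.Rados(conffile="", rados_id="snap_profile_pool")
cluster.connect()

permitted = cluster.open_ioctx("images")
snap_id = permitted.create_self_managed_snap()  # allowed: profile rbd pool=images
permitted.remove_self_managed_snap(snap_id)

denied = cluster.open_ioctx("volumes")
try:
    denied.create_self_managed_snap()  # expected to fail: caps limited to images
except rados.Error as e:
    print("denied as expected: {}".format(e))
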
 
index 3e7f844faeaf948e3bd73d1c15ec4bd1b52a26db..c2308c355af2404b8288d0abda7562ce38dc7dca 100755 (executable)
@@ -400,7 +400,7 @@ testlog " - rbd_mirroring_resync_after_disconnect config option"
 set_image_meta ${CLUSTER2} ${POOL} ${image} \
               conf_rbd_mirroring_resync_after_disconnect true
 wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
-image_id=$(get_image_id ${CLUSTER1} ${pool} ${image})
+image_id=$(get_image_id ${CLUSTER1} ${POOL} ${image})
 disconnect_image ${CLUSTER2} ${POOL} ${image}
 wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id}
 wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
index 7c4335c7b6559e427ae6fe14dc1b5cc6cfb0e874..871f5709399b7197138a48800ba2b8ed42a9b80f 100755 (executable)
@@ -72,6 +72,8 @@ screenplay = [
     ('get',    '/request?page=0', {}),
     ('delete', '/request', {}),
     ('get',    '/request', {}),
+    ('patch', '/pool/1', {'pg_num': 128}),
+    ('patch', '/pool/1', {'pgp_num': 128}),
 ]
 
 for method, endpoint, args in screenplay:
index 0a9349803b12831eb72b266d4e74d5ac38c3e98a..a56eb6a55abc9ace03e5ede6d97f3513a2e8f326 100644 (file)
@@ -12,6 +12,7 @@ require {
        class dir read;
        class file { getattr read open };
        class blk_file { getattr ioctl open read write };
+       class capability2 block_suspend;
 }
 
 ########################################
@@ -46,6 +47,7 @@ allow ceph_t self:process { signal_perms };
 allow ceph_t self:fifo_file rw_fifo_file_perms;
 allow ceph_t self:unix_stream_socket create_stream_socket_perms;
 allow ceph_t self:capability { setuid setgid dac_override };
+allow ceph_t self:capability2 block_suspend;
 
 manage_dirs_pattern(ceph_t, ceph_log_t, ceph_log_t)
 manage_files_pattern(ceph_t, ceph_log_t, ceph_log_t)
@@ -103,6 +105,7 @@ fstools_exec(ceph_t)
 nis_use_ypbind_uncond(ceph_t)
 storage_raw_rw_fixed_disk(ceph_t)
 files_manage_generic_locks(ceph_t)
+libs_exec_ldconfig(ceph_t)
 
 allow ceph_t sysfs_t:dir read;
 allow ceph_t sysfs_t:file { read getattr open };
index 823f28dde81adb907752fe579c6fec59b2f5b007..fa15181c222433798fc2f260e1554d4152dc32ef 100644 (file)
@@ -1,2 +1,2 @@
-cad919881333ac92274171586c827e01f554a70a
-v12.2.5
+3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5
+v12.2.7
index 7aa8a4392e1e2a7d1258fc1278adbcc2ee66208f..3d4baae39d86653eccddfedaae9e7e7e79c0e1de 100644 (file)
@@ -531,6 +531,7 @@ set(libcommon_files
   common/bit_str.cc
   osdc/Striper.cc
   osdc/Objecter.cc
+  common/compat.cc
   common/Graylog.cc
   common/fs_types.cc
   common/dns_resolve.cc
@@ -820,14 +821,15 @@ if (NOT WITH_SYSTEM_ROCKSDB)
     list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})
   endif(WITH_CCACHE AND CCACHE_FOUND)
 
+  list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_SNAPPY=${SNAPPY_FOUND})
+  list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_LZ4=${LZ4_FOUND})
+  list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_ZLIB=${ZLIB_FOUND})
+
   # SSE 4.2 is enabled by default in rocksdb's crc32c. For details refer to
   # rocksdb/util/crc32c.cc.
   list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_AR=${CMAKE_AR})
   list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
-
-  if (CMAKE_CXX_COMPILER_ID STREQUAL Clang)
-    list(APPEND ROCKSDB_CMAKE_ARGS -DFAIL_ON_WARNINGS=OFF)
-  endif()
+  list(APPEND ROCKSDB_CMAKE_ARGS -DFAIL_ON_WARNINGS=OFF)
 
   # we use an external project and copy the sources to bin directory to ensure
   # that object files are built outside of the source tree.
index a8562f59674161bd4a81bce81e997191429bf967..02f0107b76ba9473f16b3d6cef2a9f94d6436b05 100644 (file)
@@ -4,6 +4,7 @@
 /* flags we export */
 int ceph_arch_neon = 0;
 int ceph_arch_aarch64_crc32 = 0;
+int ceph_arch_aarch64_pmull = 0;
 
 #include <stdio.h>
 
@@ -11,48 +12,26 @@ int ceph_arch_aarch64_crc32 = 0;
 
 #include <elf.h>
 #include <link.h> // ElfW macro
+#include <sys/auxv.h>
 
 #if __arm__ || __aarch64__
 #include <asm/hwcap.h>
 #endif // __arm__
 
-static unsigned long get_auxval(unsigned long type)
-{
-       unsigned long result = 0;
-       FILE *f = fopen("/proc/self/auxv", "r");
-       if (f) {
-               ElfW(auxv_t) entry;
-               while (fread(&entry, sizeof(entry), 1, f) == 1) {
-                       if (entry.a_type == type) {
-                               result = entry.a_un.a_val;
-                               break;
-                       }
-               }
-               fclose(f);
-       }
-       return result;
-}
-
-static unsigned long get_hwcap(void)
-{
-       return get_auxval(AT_HWCAP);
-}
-
 #endif // __linux__
 
 int ceph_arch_arm_probe(void)
 {
-#if __arm__ && __linux__
-       ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
-#elif __aarch64__ && __linux__
-       ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
-# if defined(HAVE_ARMV8_CRC) && defined(HWCAP_CRC32)
-       ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
-# endif
-#else
-       if (0)
-               get_hwcap();  // make compiler shut up
+#if __linux__
+       unsigned long hwcap = getauxval(AT_HWCAP);
+#if __arm__
+       ceph_arch_neon = (hwcap & HWCAP_NEON) == HWCAP_NEON;
+#elif __aarch64__
+       ceph_arch_neon = (hwcap & HWCAP_ASIMD) == HWCAP_ASIMD;
+       ceph_arch_aarch64_crc32 = (hwcap & HWCAP_CRC32) == HWCAP_CRC32;
+       ceph_arch_aarch64_pmull = (hwcap & HWCAP_PMULL) == HWCAP_PMULL;
 #endif
+#endif // __linux__
        return 0;
 }
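
The probe now reads the HWCAP bits with glibc's getauxval(3) instead of hand-parsing /proc/self/auxv, and additionally records PMULL support. A rough Python equivalent of the same query; Linux only, and the HWCAP_ASIMD bit value is quoted from the aarch64 kernel headers as an assumption:

import ctypes

libc = ctypes.CDLL(None)
libc.getauxval.argtypes = [ctypes.c_ulong]
libc.getauxval.restype = ctypes.c_ulong

AT_HWCAP = 16          # from <sys/auxv.h>
HWCAP_ASIMD = 1 << 1   # aarch64 value from <asm/hwcap.h>

hwcap = libc.getauxval(AT_HWCAP)
print("ASIMD/NEON available:", bool(hwcap & HWCAP_ASIMD))
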
 
index 1659b2e94dec42cf6d1d9038f76dec7538d11863..dacc450b18f07af50072b0ebf6c815a6b47d43fe 100644 (file)
@@ -7,6 +7,7 @@ extern "C" {
 
 extern int ceph_arch_neon;  /* true if we have ARM NEON or ASIMD abilities */
 extern int ceph_arch_aarch64_crc32;  /* true if we have AArch64 CRC32/CRC32C abilities */
+extern int ceph_arch_aarch64_pmull;  /* true if we have AArch64 PMULL abilities */
 
 extern int ceph_arch_arm_probe(void);
 
index c5bda0ccaf49f11b763842870afc37898ccb0923..f40296e1a0375f2694503bcaa39b5c8d50f4c530 100644 (file)
@@ -136,6 +136,11 @@ struct AuthAuthorizer {
   explicit AuthAuthorizer(__u32 p) : protocol(p) {}
   virtual ~AuthAuthorizer() {}
   virtual bool verify_reply(bufferlist::iterator& reply) = 0;
+  virtual bool add_challenge(CephContext *cct, bufferlist& challenge) = 0;
+};
+
+struct AuthAuthorizerChallenge {
+  virtual ~AuthAuthorizerChallenge() {}
 };
 
 
index 2e81f14abde6462c14968419d8353760cbb588af..d824ed4e14387bf01715f098da85944fe506b5c6 100644 (file)
@@ -34,7 +34,9 @@ struct AuthAuthorizeHandler {
   virtual bool verify_authorizer(CephContext *cct, KeyStore *keys,
                                 bufferlist& authorizer_data, bufferlist& authorizer_reply,
                                  EntityName& entity_name, uint64_t& global_id,
-                                AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid = NULL) = 0;
+                                AuthCapsInfo& caps_info, CryptoKey& session_key,
+                                uint64_t *auid,
+                                std::unique_ptr<AuthAuthorizerChallenge> *challenge) = 0;
   virtual int authorizer_session_crypto() = 0;
 };
 
index d455b190a997eb291f99205983d10718381f81cb..1a6164a0f0ad21640449b0d3f9be5211ec6dd2ac 100644 (file)
@@ -6,9 +6,12 @@
 
 
 
-bool CephxAuthorizeHandler::verify_authorizer(CephContext *cct, KeyStore *keys,
-                                             bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                                              EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info, CryptoKey& session_key,  uint64_t *auid)
+bool CephxAuthorizeHandler::verify_authorizer(
+  CephContext *cct, KeyStore *keys,
+  bufferlist& authorizer_data, bufferlist& authorizer_reply,
+  EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info,
+  CryptoKey& session_key, uint64_t *auid,
+  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
   bufferlist::iterator iter = authorizer_data.begin();
 
@@ -19,7 +22,8 @@ bool CephxAuthorizeHandler::verify_authorizer(CephContext *cct, KeyStore *keys,
 
   CephXServiceTicketInfo auth_ticket_info;
 
-  bool isvalid = cephx_verify_authorizer(cct, keys, iter, auth_ticket_info, authorizer_reply);
+  bool isvalid = cephx_verify_authorizer(cct, keys, iter, auth_ticket_info, challenge,
+                                        authorizer_reply);
 
   if (isvalid) {
     caps_info = auth_ticket_info.ticket.caps;
index 7246b80c71da5312ac083c9ea6d5376201faebac..8fa40aa71275312d88077203ff7a1d405eccc485 100644 (file)
@@ -23,7 +23,8 @@ struct CephxAuthorizeHandler : public AuthAuthorizeHandler {
   bool verify_authorizer(CephContext *cct, KeyStore *keys,
                         bufferlist& authorizer_data, bufferlist& authorizer_reply,
                          EntityName& entity_name, uint64_t& global_id,
-                        AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid = NULL) override;
+                        AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid,
+                        std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
   int authorizer_session_crypto() override;
 };
 
index 5836a33bd531e7a31e662d6d93a5098150a4625b..cc5f44964c2adeed0b7f357ae2fd9af5f573ab16 100644 (file)
@@ -296,7 +296,7 @@ CephXAuthorizer *CephXTicketHandler::build_authorizer(uint64_t global_id) const
 {
   CephXAuthorizer *a = new CephXAuthorizer(cct);
   a->session_key = session_key;
-  a->nonce = ((uint64_t)rand() << 32) + rand();
+  get_random_bytes((char*)&a->nonce, sizeof(a->nonce));
 
   __u8 authorizer_v = 1;
   ::encode(authorizer_v, a->bl);
@@ -304,6 +304,7 @@ CephXAuthorizer *CephXTicketHandler::build_authorizer(uint64_t global_id) const
   ::encode(service_id, a->bl);
 
   ::encode(ticket, a->bl);
+  a->base_bl = a->bl;
 
   CephXAuthorize msg;
   msg.nonce = a->nonce;
@@ -390,7 +391,9 @@ bool cephx_decode_ticket(CephContext *cct, KeyStore *keys, uint32_t service_id,
  */
 bool cephx_verify_authorizer(CephContext *cct, KeyStore *keys,
                             bufferlist::iterator& indata,
-                            CephXServiceTicketInfo& ticket_info, bufferlist& reply_bl)
+                            CephXServiceTicketInfo& ticket_info,
+                            std::unique_ptr<AuthAuthorizerChallenge> *challenge,
+                            bufferlist& reply_bl)
 {
   __u8 authorizer_v;
   uint32_t service_id;
@@ -457,6 +460,30 @@ bool cephx_verify_authorizer(CephContext *cct, KeyStore *keys,
     return false;
   }
 
+  if (challenge) {
+    auto *c = static_cast<CephXAuthorizeChallenge*>(challenge->get());
+    if (!auth_msg.have_challenge || !c) {
+      c = new CephXAuthorizeChallenge;
+      challenge->reset(c);
+      get_random_bytes((char*)&c->server_challenge, sizeof(c->server_challenge));
+      ldout(cct,10) << __func__ << " adding server_challenge " << c->server_challenge
+                   << dendl;
+
+      encode_encrypt_enc_bl(cct, *c, ticket_info.session_key, reply_bl, error);
+      if (!error.empty()) {
+       ldout(cct, 10) << "verify_authorizer: encode_encrypt error: " << error << dendl;
+       return false;
+      }
+      return false;
+    }
+    ldout(cct, 10) << __func__ << " got server_challenge+1 "
+                  << auth_msg.server_challenge_plus_one
+                  << " expecting " << c->server_challenge + 1 << dendl;
+    if (c->server_challenge + 1 != auth_msg.server_challenge_plus_one) {
+      return false;
+    }
+  }
+
   /*
    * Reply authorizer:
    *  {timestamp + 1}^session_key
@@ -493,3 +520,31 @@ bool CephXAuthorizer::verify_reply(bufferlist::iterator& indata)
   return true;
 }
 
+bool CephXAuthorizer::add_challenge(CephContext *cct, bufferlist& challenge)
+{
+  bl = base_bl;
+
+  CephXAuthorize msg;
+  msg.nonce = nonce;
+
+  auto p = challenge.begin();
+  if (!p.end()) {
+    std::string error;
+    CephXAuthorizeChallenge ch;
+    decode_decrypt_enc_bl(cct, ch, session_key, challenge, error);
+    if (!error.empty()) {
+      ldout(cct, 0) << "failed to decrypt challenge (" << challenge.length() << " bytes): "
+                   << error << dendl;
+      return false;
+    }
+    msg.have_challenge = true;
+    msg.server_challenge_plus_one = ch.server_challenge + 1;
+  }
+
+  std::string error;
+  if (encode_encrypt(cct, msg, session_key, bl, error)) {
+    ldout(cct, 0) << __func__ << " failed to encrypt authorizer: " << error << dendl;
+    return false;
+  }
+  return true;
+}
index c82206989906155ba8400187ee87535dbc2ff627..b5ec897f35b9ab447e3d29f2ba1435adb5aad4c3 100644 (file)
@@ -273,12 +273,14 @@ private:
   CephContext *cct;
 public:
   uint64_t nonce;
+  bufferlist base_bl;
 
   explicit CephXAuthorizer(CephContext *cct_)
     : AuthAuthorizer(CEPH_AUTH_CEPHX), cct(cct_), nonce(0) {}
 
   bool build_authorizer();
   bool verify_reply(bufferlist::iterator& reply) override;
+  bool add_challenge(CephContext *cct, bufferlist& challenge) override;
 };
 
 
@@ -384,17 +386,41 @@ struct CephXServiceTicketInfo {
 };
 WRITE_CLASS_ENCODER(CephXServiceTicketInfo)
 
+struct CephXAuthorizeChallenge : public AuthAuthorizerChallenge {
+  uint64_t server_challenge;
+  void encode(bufferlist& bl) const {
+    __u8 struct_v = 1;
+    ::encode(struct_v, bl);
+    ::encode(server_challenge, bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    __u8 struct_v;
+    ::decode(struct_v, bl);
+    ::decode(server_challenge, bl);
+  }
+};
+WRITE_CLASS_ENCODER(CephXAuthorizeChallenge)
+
 struct CephXAuthorize {
   uint64_t nonce;
+  bool have_challenge = false;
+  uint64_t server_challenge_plus_one = 0;
   void encode(bufferlist& bl) const {
-    __u8 struct_v = 1;
+    __u8 struct_v = 2;
     ::encode(struct_v, bl);
     ::encode(nonce, bl);
+    ::encode(have_challenge, bl);
+    ::encode(server_challenge_plus_one, bl);
   }
   void decode(bufferlist::iterator& bl) {
     __u8 struct_v;
     ::decode(struct_v, bl);
     ::decode(nonce, bl);
+    if (struct_v >= 2) {
+      ::decode(have_challenge, bl);
+      ::decode(server_challenge_plus_one, bl);
+    }
+
   }
 };
 WRITE_CLASS_ENCODER(CephXAuthorize)
@@ -409,9 +435,12 @@ bool cephx_decode_ticket(CephContext *cct, KeyStore *keys,
 /*
  * Verify authorizer and generate reply authorizer
  */
-extern bool cephx_verify_authorizer(CephContext *cct, KeyStore *keys,
-                                   bufferlist::iterator& indata,
-                                   CephXServiceTicketInfo& ticket_info, bufferlist& reply_bl);
+extern bool cephx_verify_authorizer(
+  CephContext *cct, KeyStore *keys,
+  bufferlist::iterator& indata,
+  CephXServiceTicketInfo& ticket_info,
+  std::unique_ptr<AuthAuthorizerChallenge> *challenge,
+  bufferlist& reply_bl);
 
 
 
index 3184835a14dcf14c9dd90de5a21e6bf07dda316b..b06e0080ba8fe51ab23b1b678d679c734049a13b 100644 (file)
@@ -152,7 +152,9 @@ int CephxServiceHandler::handle_request(bufferlist::iterator& indata, bufferlist
 
       bufferlist tmp_bl;
       CephXServiceTicketInfo auth_ticket_info;
-      if (!cephx_verify_authorizer(cct, key_server, indata, auth_ticket_info, tmp_bl)) {
+      // note: no challenge here.
+      if (!cephx_verify_authorizer(cct, key_server, indata, auth_ticket_info, nullptr,
+                                  tmp_bl)) {
         ret = -EPERM;
        break;
       }
index 5694a22c0766eac87103ce0b157391b70578f5a2..989ba2d35fb5ad28519fb3d5ab1bf488b6b28e49 100644 (file)
@@ -29,33 +29,75 @@ int CephxSessionHandler::_calc_signature(Message *m, uint64_t *psig)
   const ceph_msg_header& header = m->get_header();
   const ceph_msg_footer& footer = m->get_footer();
 
-  // optimized signature calculation
-  // - avoid temporary allocated buffers from encode_encrypt[_enc_bl]
-  // - skip the leading 4 byte wrapper from encode_encrypt
-  struct {
-    __u8 v;
-    __le64 magic;
-    __le32 len;
-    __le32 header_crc;
-    __le32 front_crc;
-    __le32 middle_crc;
-    __le32 data_crc;
-  } __attribute__ ((packed)) sigblock = {
-    1, mswab(AUTH_ENC_MAGIC), mswab<uint32_t>(4*4),
-    mswab<uint32_t>(header.crc), mswab<uint32_t>(footer.front_crc),
-    mswab<uint32_t>(footer.middle_crc), mswab<uint32_t>(footer.data_crc)
-  };
-  bufferlist bl_plaintext;
-  bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock));
-
-  bufferlist bl_ciphertext;
-  if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
-    lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
-    return -1;
-  }
+  if (!HAVE_FEATURE(features, CEPHX_V2)) {
+    // legacy pre-mimic behavior for compatibility
+
+    // optimized signature calculation
+    // - avoid temporary allocated buffers from encode_encrypt[_enc_bl]
+    // - skip the leading 4 byte wrapper from encode_encrypt
+    struct {
+      __u8 v;
+      __le64 magic;
+      __le32 len;
+      __le32 header_crc;
+      __le32 front_crc;
+      __le32 middle_crc;
+      __le32 data_crc;
+    } __attribute__ ((packed)) sigblock = {
+      1, mswab(AUTH_ENC_MAGIC), mswab<uint32_t>(4*4),
+      mswab<uint32_t>(header.crc), mswab<uint32_t>(footer.front_crc),
+      mswab<uint32_t>(footer.middle_crc), mswab<uint32_t>(footer.data_crc)
+    };
+
+    bufferlist bl_plaintext;
+    bl_plaintext.append(buffer::create_static(sizeof(sigblock),
+                                             (char*)&sigblock));
+
+    bufferlist bl_ciphertext;
+    if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
+      lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
+      return -1;
+    }
 
-  bufferlist::iterator ci = bl_ciphertext.begin();
-  ::decode(*psig, ci);
+    bufferlist::iterator ci = bl_ciphertext.begin();
+    ::decode(*psig, ci);
+  } else {
+    // newer mimic+ signatures
+    struct {
+      __le32 header_crc;
+      __le32 front_crc;
+      __le32 front_len;
+      __le32 middle_crc;
+      __le32 middle_len;
+      __le32 data_crc;
+      __le32 data_len;
+      __le32 seq_lower_word;
+    } __attribute__ ((packed)) sigblock = {
+      mswab<uint32_t>(header.crc),
+      mswab<uint32_t>(footer.front_crc),
+      mswab<uint32_t>(header.front_len),
+      mswab<uint32_t>(footer.middle_crc),
+      mswab<uint32_t>(header.middle_len),
+      mswab<uint32_t>(footer.data_crc),
+      mswab<uint32_t>(header.data_len),
+      mswab<uint32_t>(header.seq)
+    };
+
+    bufferlist bl_plaintext;
+    bl_plaintext.append(buffer::create_static(sizeof(sigblock),
+                                             (char*)&sigblock));
+
+    bufferlist bl_ciphertext;
+    if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
+      lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
+      return -1;
+    }
+
+    struct enc {
+      __le64 a, b, c, d;
+    } *penc = reinterpret_cast<enc*>(bl_ciphertext.c_str());
+    *psig = penc->a ^ penc->b ^ penc->c ^ penc->d;
+  }
 
   ldout(cct, 10) << __func__ << " seq " << m->get_seq()
                 << " front_crc_ = " << footer.front_crc
index 8b55e9e9ba8b44eed8101fcde13635fb3ea00407..5767eacffd105ec0c9062fb645e675628886aaf4 100644 (file)
 
 #define dout_subsys ceph_subsys_auth
 
-bool AuthNoneAuthorizeHandler::verify_authorizer(CephContext *cct, KeyStore *keys,
-                                                bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                                                EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info, CryptoKey& session_key,
-uint64_t *auid)
+bool AuthNoneAuthorizeHandler::verify_authorizer(
+  CephContext *cct, KeyStore *keys,
+  bufferlist& authorizer_data, bufferlist& authorizer_reply,
+  EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info,
+  CryptoKey& session_key,
+  uint64_t *auid,
+  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
   bufferlist::iterator iter = authorizer_data.begin();
 
index b531cfb77360160614b3fe310aa6b2cd89beac17..0ce542bf678e2b061ea5fcb756f1df8de1a9701e 100644 (file)
@@ -23,7 +23,8 @@ struct AuthNoneAuthorizeHandler : public AuthAuthorizeHandler {
   bool verify_authorizer(CephContext *cct, KeyStore *keys,
                         bufferlist& authorizer_data, bufferlist& authorizer_reply,
                          EntityName& entity_name, uint64_t& global_id,
-                        AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid=NULL) override;
+                        AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid,
+                        std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
   int authorizer_session_crypto() override;
 };
 
index 8683c567416e7af4db397af4c765ea83cf3e7fe5..662fdb51648f643ae7c06e2ce3bde22cb01950cd 100644 (file)
@@ -17,6 +17,8 @@
 
 #include "auth/Auth.h"
 
+class CephContext;
+
 struct AuthNoneAuthorizer : public AuthAuthorizer {
   AuthNoneAuthorizer() : AuthAuthorizer(CEPH_AUTH_NONE) { }
   bool build_authorizer(const EntityName &ename, uint64_t global_id) {
@@ -27,6 +29,7 @@ struct AuthNoneAuthorizer : public AuthAuthorizer {
     return 0;
   }
   bool verify_reply(bufferlist::iterator& reply) override { return true; }
+  bool add_challenge(CephContext *cct, bufferlist& ch) override { return true; }
 };
 
 #endif
index 62cb638874e2bc9c1c2f2e9f96dbae599034bb82..90e00ef579a584201554e82f2061740cfd6b3aec 100644 (file)
 
 #include "AuthUnknownAuthorizeHandler.h"
 
-bool AuthUnknownAuthorizeHandler::verify_authorizer(CephContext *cct, KeyStore *keys,
-                                                bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                                                EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info, CryptoKey& session_key,
-uint64_t *auid)
+bool AuthUnknownAuthorizeHandler::verify_authorizer(
+  CephContext *cct, KeyStore *keys,
+  bufferlist& authorizer_data, bufferlist& authorizer_reply,
+  EntityName& entity_name, uint64_t& global_id, AuthCapsInfo& caps_info,
+  CryptoKey& session_key,
+  uint64_t *auid,
+  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
   // For unknown authorizers, there's nothing to verify.  They're "OK" by definition.  PLR
 
index 9795ebfe9bf66ba93d04824d40635d4b31754293..e052af5def7dff1a62710a8e3a4b9283f37483a8 100644 (file)
@@ -23,7 +23,8 @@ struct AuthUnknownAuthorizeHandler : public AuthAuthorizeHandler {
   bool verify_authorizer(CephContext *cct, KeyStore *keys,
                         bufferlist& authorizer_data, bufferlist& authorizer_reply,
                          EntityName& entity_name, uint64_t& global_id,
-                        AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid=NULL) override;
+                        AuthCapsInfo& caps_info, CryptoKey& session_key, uint64_t *auid,
+                        std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
   int authorizer_session_crypto() override;
 };
 
index 63faec6ba525fab98d9e1b8d3f5b1249fe3b79d5..0058f1ac2d977514d446ad92d2f12ae2becb3421 100644 (file)
@@ -1528,10 +1528,11 @@ def get_free_partition_index(dev):
 
 
 def check_journal_reqs(args):
+    log_file = "/var/log/ceph/$cluster-osd-check.log"
     _, _, allows_journal = command([
         'ceph-osd', '--check-allows-journal',
         '-i', '0',
-        '--log-file', '$run_dir/$cluster-osd-check.log',
+        '--log-file', log_file,
         '--cluster', args.cluster,
         '--setuser', get_ceph_user(),
         '--setgroup', get_ceph_group(),
@@ -1539,7 +1540,7 @@ def check_journal_reqs(args):
     _, _, wants_journal = command([
         'ceph-osd', '--check-wants-journal',
         '-i', '0',
-        '--log-file', '$run_dir/$cluster-osd-check.log',
+        '--log-file', log_file,
         '--cluster', args.cluster,
         '--setuser', get_ceph_user(),
         '--setgroup', get_ceph_group(),
@@ -1547,7 +1548,7 @@ def check_journal_reqs(args):
     _, _, needs_journal = command([
         'ceph-osd', '--check-needs-journal',
         '-i', '0',
-        '--log-file', '$run_dir/$cluster-osd-check.log',
+        '--log-file', log_file,
         '--cluster', args.cluster,
         '--setuser', get_ceph_user(),
         '--setgroup', get_ceph_group(),
index f5500015c35518dccc832b7c215f3b30d7faef0d..6550db415572af84ba9d61de72a29043a4a8fc9c 100644 (file)
@@ -1,5 +1,16 @@
 from collections import namedtuple
 
+
+class UnloadedConfig(object):
+    """
+    This class is used as the default value for conf.ceph so that if
+    a configuration file is not successfully loaded then it will give
+    a nice error message when values from the config are used.
+    """
+    def __getattr__(self, *a):
+        raise RuntimeError("No valid ceph configuration file was loaded.")
+
 conf = namedtuple('config', ['ceph', 'cluster', 'verbosity', 'path', 'log_path'])
+conf.ceph = UnloadedConfig()
 
 __version__ = "1.0.0"
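
With UnloadedConfig as the default, any attribute access on conf.ceph before a configuration file has been parsed fails immediately with a clear message instead of a confusing downstream error. A minimal repro; get_safe is just an example attribute name:

class UnloadedConfig(object):
    def __getattr__(self, *a):
        raise RuntimeError("No valid ceph configuration file was loaded.")

conf_ceph = UnloadedConfig()
try:
    conf_ceph.get_safe('global', 'fsid')  # any attribute lookup raises
except RuntimeError as e:
    print(e)
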
index 88f0d2d2af6dd658662428a22d470f5332c9cf7b..2f2bd17388f656159a8608eee1a3511693e04942 100644 (file)
@@ -220,7 +220,7 @@ def get_api_pvs():
           /dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
 
     """
-    fields = 'pv_name,pv_tags,pv_uuid,vg_name'
+    fields = 'pv_name,pv_tags,pv_uuid,vg_name,lv_uuid'
 
     stdout, stderr, returncode = process.call(
         ['pvs', '--no-heading', '--readonly', '--separator=";"', '-o', fields]
index 37ec6fb7267aa6e5eefeaa66af66b5687a47ec9b..5e9376e5dde48cb579aaaf9ffc7bcff13f6fe7e2 100644 (file)
@@ -162,10 +162,12 @@ def activate_bluestore(lvs, no_systemd=False):
         destination = os.path.join(osd_path, 'block.db')
         process.run(['ln', '-snf', db_device_path, destination])
         system.chown(db_device_path)
+        system.chown(destination)
     if wal_device_path:
         destination = os.path.join(osd_path, 'block.wal')
         process.run(['ln', '-snf', wal_device_path, destination])
         system.chown(wal_device_path)
+        system.chown(destination)
 
     if no_systemd is False:
         # enable the ceph-volume unit for this OSD
index 18c80657ec8d20a92872586fb3ce85c467de7b34..0f972f9d83f1f3cc16b2aca58b87f081bb106995 100644 (file)
@@ -31,7 +31,7 @@ class Create(object):
             # activate, which would never need to be rolled back.
             Activate([]).activate(args)
         except Exception:
-            logger.error('lvm activate was unable to complete, while creating the OSD')
+            logger.exception('lvm activate was unable to complete, while creating the OSD')
             logger.info('will rollback OSD ID creation')
             rollback_osd(args, osd_id)
             raise
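
Switching from logger.error to logger.exception means the full traceback of the caught exception is written to the log, not just the one-line message; the same change is made in prepare.py further down. A quick illustration of the difference:

import logging

logging.basicConfig()
logger = logging.getLogger(__name__)

try:
    raise RuntimeError("boom")
except Exception:
    # logger.error would record only this message; logger.exception
    # appends the active traceback to the log record as well
    logger.exception('lvm activate was unable to complete, while creating the OSD')
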
index d1d96d7adb5a66968fdd3875f8c300bfbcb68146..a84a39c182a512e4a6032e60be0afbf083dd1203 100644 (file)
@@ -49,6 +49,9 @@ def pretty_report(report):
                         value=value
                     )
                 )
+            output.append(
+                device_metadata_item_template.format(tag_name='devices', value=','.join(device['devices'])))
+
     print(''.join(output))
 
 
@@ -74,6 +77,30 @@ class List(object):
     def __init__(self, argv):
         self.argv = argv
 
+    @property
+    def pvs(self):
+        """
+        To avoid making an LVM API call for every single item being
+        reported, the result is fetched once and cached for all
+        subsequent calls.
+        """
+        if getattr(self, '_pvs', None) is not None:
+            return self._pvs
+        self._pvs = api.get_api_pvs()
+        return self._pvs
+
+    def match_devices(self, lv_uuid):
+        """
+        It is possible to have more than one PV reported *with the same name*;
+        to avoid incorrect or duplicate contents, the lv uuid is correlated to
+        the one on the physical device.
+        """
+        devices = []
+        for device in self.pvs:
+            if device.get('lv_uuid') == lv_uuid:
+                devices.append(device['pv_name'])
+        return devices
+
     @decorators.needs_root
     def list(self, args):
         # ensure everything is up to date before calling out
@@ -152,6 +179,7 @@ class List(object):
                 return self.full_report(lvs=lvs)
 
         if lv:
+
             try:
                 _id = lv.tags['ceph.osd_id']
             except KeyError:
@@ -159,9 +187,9 @@ class List(object):
                 return report
 
             report.setdefault(_id, [])
-            report[_id].append(
-                lv.as_dict()
-            )
+            lv_report = lv.as_dict()
+            lv_report['devices'] = self.match_devices(lv.lv_uuid)
+            report[_id].append(lv_report)
 
         else:
             # this has to be a journal/wal/db device (not a logical volume) so try
@@ -202,9 +230,9 @@ class List(object):
                 continue
 
             report.setdefault(_id, [])
-            report[_id].append(
-                lv.as_dict()
-            )
+            lv_report = lv.as_dict()
+            lv_report['devices'] = self.match_devices(lv.lv_uuid)
+            report[_id].append(lv_report)
 
             for device_type in ['journal', 'block', 'wal', 'db']:
                 device_uuid = lv.tags.get('ceph.%s_uuid' % device_type)
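
The cached pvs property means a single pvs(8) invocation per listing run, and match_devices correlates PVs to an LV by lv_uuid rather than by name, since more than one PV can be reported with the same name. The matching logic reduces to this, shown with toy data shaped like the unit tests below:

pvs = [
    {'pv_name': '/dev/sda1', 'lv_uuid': 'aaaa'},
    {'pv_name': '/dev/sdb1', 'lv_uuid': 'aaaa'},
    {'pv_name': '/dev/sdc1', 'lv_uuid': 'ffff'},
]

def match_devices(pvs, lv_uuid):
    return [p['pv_name'] for p in pvs if p.get('lv_uuid') == lv_uuid]

assert match_devices(pvs, 'aaaa') == ['/dev/sda1', '/dev/sdb1']
assert match_devices(pvs, 'zzzz') == []
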
index 17ac5e1b3942d6d77099a43b61b161fe00fad551..2369cb6f3a58d5d7e0a427e12cd8b7d136b88430 100644 (file)
@@ -215,7 +215,7 @@ class Prepare(object):
         try:
             self.prepare(args)
         except Exception:
-            logger.error('lvm prepare was unable to complete')
+            logger.exception('lvm prepare was unable to complete')
             logger.info('will rollback OSD ID creation')
             rollback_osd(args, self.osd_id)
             raise
index ba3395bfba3067673ca4b73691d1f6f60c81b54d..b1f858138ea615e306bf83916fca94ec653f16b1 100644 (file)
@@ -10,9 +10,15 @@ class Capture(object):
         self.a = a
         self.kw = kw
         self.calls = []
+        self.return_values = kw.get('return_values', False)
+        self.always_returns = kw.get('always_returns', False)
 
     def __call__(self, *a, **kw):
         self.calls.append({'args': a, 'kwargs': kw})
+        if self.always_returns:
+            return self.always_returns
+        if self.return_values:
+            return self.return_values.pop()
 
 
 class Factory(object):
@@ -41,7 +47,7 @@ def fake_run(monkeypatch):
 
 @pytest.fixture
 def fake_call(monkeypatch):
-    fake_call = Capture()
+    fake_call = Capture(always_returns=([], [], 0))
     monkeypatch.setattr('ceph_volume.process.call', fake_call)
     return fake_call
 
@@ -51,10 +57,12 @@ def stub_call(monkeypatch):
     """
     Monkeypatches process.call, so that a caller can add behavior to the response
     """
-    def apply(return_value):
-        monkeypatch.setattr(
-            'ceph_volume.process.call',
-            lambda *a, **kw: return_value)
+    def apply(return_values):
+        if isinstance(return_values, tuple):
+            return_values = [return_values]
+        stubbed_call = Capture(return_values=return_values)
+        monkeypatch.setattr('ceph_volume.process.call', stubbed_call)
+        return stubbed_call
 
     return apply
 
index b780ea2e99ccbbf0175c08ed94ed8d386f8a5e71..a49a3e9e6a08f44246ad06ce86e14380c4809b5e 100644 (file)
@@ -22,12 +22,16 @@ class TestPrettyReport(object):
         assert stdout == '\n'
 
     def test_type_and_path_are_reported(self, capsys):
-        lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+        lvm.listing.pretty_report({0: [
+            {'type': 'data', 'path': '/dev/sda1', 'devices': ['/dev/sda']}
+        ]})
         stdout, stderr = capsys.readouterr()
         assert '[data]    /dev/sda1' in stdout
 
     def test_osd_id_header_is_reported(self, capsys):
-        lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+        lvm.listing.pretty_report({0: [
+            {'type': 'data', 'path': '/dev/sda1', 'devices': ['/dev/sda']}
+        ]})
         stdout, stderr = capsys.readouterr()
         assert '====== osd.0 =======' in stdout
 
@@ -36,12 +40,20 @@ class TestPrettyReport(object):
             {0: [{
                 'type': 'data',
                 'path': '/dev/sda1',
-                'tags': {'ceph.osd_id': '0'}
+                'tags': {'ceph.osd_id': '0'},
+                'devices': ['/dev/sda'],
             }]}
         )
         stdout, stderr = capsys.readouterr()
         assert 'osd id' in stdout
 
+    def test_devices_are_comma_separated(self, capsys):
+        lvm.listing.pretty_report({0: [
+            {'type': 'data', 'path': '/dev/sda1', 'devices': ['/dev/sda', '/dev/sdb1']}
+        ]})
+        stdout, stderr = capsys.readouterr()
+        assert '/dev/sda,/dev/sdb1' in stdout
+
 
 class TestList(object):
 
@@ -155,22 +167,85 @@ class TestSingleReport(object):
         # ceph lvs are detected by looking into its tags
         tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
         lv = api.Volume(
-            lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+            lv_name='lv', vg_name='VolGroup',
+            lv_uuid='aaaa', lv_path='/dev/VolGroup/lv', lv_tags=tags
+        )
         volumes.append(lv)
         monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
         result = lvm.listing.List([]).single_report('VolGroup/lv')
         assert result['0'][0]['name'] == 'lv'
         assert result['0'][0]['lv_tags'] == tags
         assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+        assert result['0'][0]['devices'] == []
 
     def test_report_a_ceph_journal_device(self, volumes, monkeypatch):
         # ceph lvs are detected by looking into its tags
         tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data,ceph.journal_device=/dev/sda1'
         lv = api.Volume(
-            lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+            lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv',
+            lv_uuid='aaa', lv_tags=tags)
         volumes.append(lv)
         monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
         result = lvm.listing.List([]).single_report('/dev/sda1')
         assert result['0'][0]['tags'] == {'PARTUUID': 'x'}
         assert result['0'][0]['type'] == 'journal'
         assert result['0'][0]['path'] == '/dev/sda1'
+
+    def test_report_a_ceph_lv_with_devices(self, volumes, monkeypatch):
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+        lv = api.Volume(
+            lv_name='lv', vg_name='VolGroup',
+            lv_uuid='aaaa', lv_path='/dev/VolGroup/lv', lv_tags=tags
+        )
+        volumes.append(lv)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        listing = lvm.listing.List([])
+        listing._pvs = [
+            {'lv_uuid': 'aaaa', 'pv_name': '/dev/sda1', 'pv_tags': '', 'pv_uuid': ''},
+            {'lv_uuid': 'aaaa', 'pv_name': '/dev/sdb1', 'pv_tags': '', 'pv_uuid': ''},
+        ]
+        result = listing.single_report('VolGroup/lv')
+        assert result['0'][0]['name'] == 'lv'
+        assert result['0'][0]['lv_tags'] == tags
+        assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+        assert result['0'][0]['devices'] == ['/dev/sda1', '/dev/sdb1']
+
+    def test_report_a_ceph_lv_with_no_matching_devices(self, volumes, monkeypatch):
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+        lv = api.Volume(
+            lv_name='lv', vg_name='VolGroup',
+            lv_uuid='aaaa', lv_path='/dev/VolGroup/lv', lv_tags=tags
+        )
+        volumes.append(lv)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        listing = lvm.listing.List([])
+        listing._pvs = [
+            {'lv_uuid': 'ffff', 'pv_name': '/dev/sda1', 'pv_tags': '', 'pv_uuid': ''},
+            {'lv_uuid': 'ffff', 'pv_name': '/dev/sdb1', 'pv_tags': '', 'pv_uuid': ''},
+        ]
+        result = listing.single_report('VolGroup/lv')
+        assert result['0'][0]['name'] == 'lv'
+        assert result['0'][0]['lv_tags'] == tags
+        assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+        assert result['0'][0]['devices'] == []
+
+
+class TestListingPVs(object):
+
+    def setup(self):
+        self.default_pvs = [
+            {'lv_uuid': 'ffff', 'pv_name': '/dev/sda1', 'pv_tags': '', 'pv_uuid': ''},
+            {'lv_uuid': 'ffff', 'pv_name': '/dev/sdb1', 'pv_tags': '', 'pv_uuid': ''},
+        ]
+
+    def test_pvs_is_unset(self, monkeypatch):
+        monkeypatch.setattr(lvm.listing.api, 'get_api_pvs', lambda: self.default_pvs)
+        listing = lvm.listing.List([])
+        assert listing.pvs == self.default_pvs
+
+    def test_pvs_is_set(self, monkeypatch):
+        # keep it patched so that we can fail if this gets returned
+        monkeypatch.setattr(lvm.listing.api, 'get_api_pvs', lambda: self.default_pvs)
+        listing = lvm.listing.List([])
+        listing._pvs = []
+        assert listing.pvs == []
index 2050c599074427d0b7f9f5906be4af84e0b5bab2..37e87be317d1f64ad953d2dcc8fca9b133c87336 100644 (file)
@@ -10,7 +10,7 @@ osd_objectstore: "bluestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 lvm_volumes:
   - data: data-lv1
     data_vg: test_group
index 90ad04d4218254481a38852eb773d9d6f47c6fc8..6f36f4922e4bba4fb2bb6dca8491436b31f152f8 100644 (file)
@@ -11,7 +11,7 @@ osd_objectstore: "bluestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 lvm_volumes:
   - data: data-lv1
     data_vg: test_group
index 70c018aa7e3466b0b53edf8f90916c83b8de1296..bbdc1035d25dbd8adcf229cca4ad9f743f7277bd 100644 (file)
@@ -1,4 +1,3 @@
-
 - hosts: osds
   become: yes
   tasks:
@@ -8,9 +7,26 @@
         name: ceph-osd@2
         state: stopped
 
+    - name: stop ceph-osd@0 daemon
+      service:
+        name: ceph-osd@0
+        state: stopped
+
+- hosts: mons
+  become: yes
+  tasks:
+
     - name: destroy osd.2
       command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
 
+    - name: destroy osd.0
+      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    # osd.2 device
     - name: zap /dev/sdd1
       command: "ceph-volume lvm zap /dev/sdd1 --destroy"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: stop ceph-osd@0 daemon
-      service:
-        name: ceph-osd@0
-        state: stopped
-
-    - name: destroy osd.0
-      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+    # osd.0 lv
     - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
         name: ceph-osd@0
         state: stopped
 
+
+- hosts: mons
+  become: yes
+  tasks:
+
     - name: destroy osd.0
       command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
 
+
+- hosts: osds
+  become: yes
+  tasks:
+
+
     - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
index f90d233d8c577e092ba18a1f00fc68e286bf70b7..af647e911c34a5150a89ff9db4913d9ba2388540 100644 (file)
@@ -10,7 +10,7 @@ osd_objectstore: "filestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 # test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
 lvm_volumes:
   - data: data-lv1
index 871523a2363dd65893da42b77933bb295e2608df..8cd6c48c195c7977bc5dd7d43349a18d652b943b 100644 (file)
@@ -11,7 +11,7 @@ osd_objectstore: "filestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 # test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
 lvm_volumes:
   - data: data-lv1
index fc3e38c7aee46de532a7527513d652f5d547eebc..49f37d20aade92b4a861a15cce0121ced2aafcab 100644 (file)
@@ -8,9 +8,28 @@
         name: ceph-osd@2
         state: stopped
 
+    - name: stop ceph-osd@0 daemon
+      service:
+        name: ceph-osd@0
+        state: stopped
+
+
+- hosts: mons
+  become: yes
+  tasks:
+
     - name: destroy osd.2
       command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
 
+    - name: destroy osd.0
+      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    # osd.2 device
     - name: zap /dev/sdd1
       command: "ceph-volume lvm zap /dev/sdd1 --destroy"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: stop ceph-osd@0 daemon
-      service:
-        name: ceph-osd@0
-        state: stopped
-
-    - name: destroy osd.0
-      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+    # osd.0 lv
     - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
index ec1e11b83b04987388b2291d2b05d2f6cf4a0bbd..eb0ef32cd9fcf2f0d5335493c548e151376548a5 100644 (file)
@@ -8,9 +8,28 @@
         name: ceph-osd@2
         state: stopped
 
+    - name: stop ceph-osd@0 daemon
+      service:
+        name: ceph-osd@0
+        state: stopped
+
+
+- hosts: mons
+  become: yes
+  tasks:
+
     - name: destroy osd.2
       command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
 
+    - name: destroy osd.0
+      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    # osd.2 device
     - name: zap /dev/sdd1
       command: "ceph-volume lvm zap /dev/sdd1 --destroy"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: stop ceph-osd@0 daemon
-      service:
-        name: ceph-osd@0
-        state: stopped
-
-    - name: destroy osd.0
-      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+    # osd.0 device
     - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
index fc3e38c7aee46de532a7527513d652f5d547eebc..d37efe19dff4e1dba6db90e3cf8a784d08d3690a 100644 (file)
@@ -8,14 +8,34 @@
         name: ceph-osd@2
         state: stopped
 
+    - name: stop ceph-osd@0 daemon
+      service:
+        name: ceph-osd@0
+        state: stopped
+
+
+- hosts: mons
+  become: yes
+  tasks:
+
     - name: destroy osd.2
       command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
 
+    - name: destroy osd.0
+      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    # osd.2 device
     - name: zap /dev/sdd1
       command: "ceph-volume lvm zap /dev/sdd1 --destroy"
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # osd.2 journal
     - name: zap /dev/sdd2
       command: "ceph-volume lvm zap /dev/sdd2 --destroy"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: stop ceph-osd@0 daemon
-      service:
-        name: ceph-osd@0
-        state: stopped
-
-    - name: destroy osd.0
-      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+    # osd.0 data lv
     - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # osd.0 journal device
     - name: zap /dev/sdc1
       command: "ceph-volume lvm zap /dev/sdc1 --destroy"
       environment:
index 429b467f463c2138ab6a26f93b9c4e8faee4280d..5a2ff02a1a48c58cf64c691f3404f4aebe12054b 100644 (file)
@@ -20,6 +20,7 @@ deps=
   ansible==2.4.1
   testinfra==1.7.1
   pytest-xdist
+  notario>=0.0.13
 changedir=
   # plain/unencrypted
   centos7-filestore-create: {toxinidir}/centos7/filestore/create
@@ -39,6 +40,9 @@ changedir=
   centos7-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate
 commands=
   git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+  # XXX Ideally we should be able to consume the requirements for ceph-ansible directly,
+  # but the master branch doesn't pin dependencies, so we can't guarantee it will work correctly
+  #pip install -r {envdir}/tmp/ceph-ansible/requirements.txt
 
   vagrant up --no-provision {posargs:--provider=virtualbox}
   bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
index 2050c599074427d0b7f9f5906be4af84e0b5bab2..37e87be317d1f64ad953d2dcc8fca9b133c87336 100644 (file)
@@ -10,7 +10,7 @@ osd_objectstore: "bluestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 lvm_volumes:
   - data: data-lv1
     data_vg: test_group
index 90ad04d4218254481a38852eb773d9d6f47c6fc8..6f36f4922e4bba4fb2bb6dca8491436b31f152f8 100644 (file)
@@ -11,7 +11,7 @@ osd_objectstore: "bluestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 lvm_volumes:
   - data: data-lv1
     data_vg: test_group
index 9a60ab2e0ecfad9ad06ad4d5e047b42afccf7f54..f93ff2304e3cac1baf248842660f606f8172d466 100644 (file)
@@ -1,4 +1,3 @@
-
 - hosts: osds
   become: yes
   tasks:
@@ -8,33 +7,44 @@
         name: ceph-osd@2
         state: stopped
 
-    - name: destroy osd.2 
+    - name: stop ceph-osd@0 daemon
+      service:
+        name: ceph-osd@0
+        state: stopped
+
+- hosts: mons
+  become: yes
+  tasks:
+
+    - name: destroy osd.2
       command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
 
-    - name: zap /dev/sdd1 
+    - name: destroy osd.0
+      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    # osd.2 device
+    - name: zap /dev/sdd1
       command: "ceph-volume lvm zap /dev/sdd1 --destroy"
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: redeploy osd.2 using /dev/sdd1 
+    - name: redeploy osd.2 using /dev/sdd1
       command: "ceph-volume lvm create --bluestore --data /dev/sdd1 --osd-id 2"
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: stop ceph-osd@0 daemon
-      service:
-        name: ceph-osd@0
-        state: stopped
-
-    - name: destroy osd.0 
-      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
-    - name: zap test_group/data-lv1 
+    # osd.0 lv
+    - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: redeploy osd.0 using test_group/data-lv1 
+    - name: redeploy osd.0 using test_group/data-lv1
       command: "ceph-volume lvm create --bluestore --data test_group/data-lv1 --osd-id 0"
       environment:
         CEPH_VOLUME_DEBUG: 1
         name: ceph-osd@0
         state: stopped
 
+
+- hosts: mons
+  become: yes
+  tasks:
+
     - name: destroy osd.0
       command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
 
+
+- hosts: osds
+  become: yes
+  tasks:
+
     - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
index f90d233d8c577e092ba18a1f00fc68e286bf70b7..af647e911c34a5150a89ff9db4913d9ba2388540 100644 (file)
@@ -10,7 +10,7 @@ osd_objectstore: "filestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 # test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
 lvm_volumes:
   - data: data-lv1
index 871523a2363dd65893da42b77933bb295e2608df..8cd6c48c195c7977bc5dd7d43349a18d652b943b 100644 (file)
@@ -11,7 +11,7 @@ osd_objectstore: "filestore"
 osd_scenario: lvm
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 # test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
 lvm_volumes:
   - data: data-lv1
index fc3e38c7aee46de532a7527513d652f5d547eebc..49f37d20aade92b4a861a15cce0121ced2aafcab 100644 (file)
@@ -8,9 +8,28 @@
         name: ceph-osd@2
         state: stopped
 
+    - name: stop ceph-osd@0 daemon
+      service:
+        name: ceph-osd@0
+        state: stopped
+
+
+- hosts: mons
+  become: yes
+  tasks:
+
     - name: destroy osd.2
       command: "ceph osd destroy osd.2 --yes-i-really-mean-it"
 
+    - name: destroy osd.0
+      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    # osd.2 device
     - name: zap /dev/sdd1
       command: "ceph-volume lvm zap /dev/sdd1 --destroy"
       environment:
         CEPH_VOLUME_DEBUG: 1
 
-    - name: stop ceph-osd@0 daemon
-      service:
-        name: ceph-osd@0
-        state: stopped
-
-    - name: destroy osd.0
-      command: "ceph osd destroy osd.0 --yes-i-really-mean-it"
-
+    # osd.0 lv
     - name: zap test_group/data-lv1
       command: "ceph-volume lvm zap test_group/data-lv1"
       environment:
index 560c8b03b172b5c71a1efcd69802fc15742de72b..c265e783b07d6e20aa5940c57582bac69a153896 100644 (file)
@@ -9,7 +9,7 @@ journal_size: 100
 osd_objectstore: "bluestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 83d24e0a0135afbf30847448f6f1cd2967106a6e..885c2c82f4e5d0f154adbd28ca14744ed45d68d2 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "bluestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 674abc9390ce34b41a18da2d9903ce0d74707e5e..30bcf5be7c6f7317b926921f3c4f7e4ecd88328a 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "bluestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 8902bdda37e9632c77ca1abc81c435e0d07eaf7b..7ab573b07e9a334be010f3ed2d90b0c686e1636c 100644 (file)
@@ -9,7 +9,7 @@ journal_size: 100
 osd_objectstore: "filestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index c8ba0ba4c08b2d00b1df746eace86ab2449af35f..a27cfbad0c9fdceba1a280a73bfa8376637e334f 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "filestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 29e82024c7a7ead721f7cab446976585de5aaa5c..edac61b20650c0495ca9aadafa46d64cdd96d7af 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "filestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 55d8c81d0ec0e9a5de6b1983b7d168642711b056..d5e9c33ea6401f0f52d7cf5bfc3f0a1e854ae0f3 100644 (file)
@@ -21,6 +21,7 @@ deps=
   ansible==2.4.1
   testinfra==1.7.1
   pytest-xdist
+  notario>=0.0.13
 changedir=
   centos7-filestore-activate: {toxinidir}/centos7/filestore/activate
   centos7-bluestore-activate: {toxinidir}/centos7/bluestore/activate
@@ -36,6 +37,9 @@ changedir=
   centos7-filestore-dmcrypt_luks: {toxinidir}/centos7/filestore/dmcrypt-luks
 commands=
   git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+  # XXX Ideally we should be able to consume the requirements for ceph-ansible directly,
+  # but the master branch doesn't pin its dependencies, so we can't guarantee it will work correctly
+  #pip install -r {envdir}/tmp/ceph-ansible/requirements.txt
 
   vagrant up --no-provision {posargs:--provider=virtualbox}
   bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
index 560c8b03b172b5c71a1efcd69802fc15742de72b..c265e783b07d6e20aa5940c57582bac69a153896 100644 (file)
@@ -9,7 +9,7 @@ journal_size: 100
 osd_objectstore: "bluestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 83d24e0a0135afbf30847448f6f1cd2967106a6e..885c2c82f4e5d0f154adbd28ca14744ed45d68d2 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "bluestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 674abc9390ce34b41a18da2d9903ce0d74707e5e..30bcf5be7c6f7317b926921f3c4f7e4ecd88328a 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "bluestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 8902bdda37e9632c77ca1abc81c435e0d07eaf7b..7ab573b07e9a334be010f3ed2d90b0c686e1636c 100644 (file)
@@ -9,7 +9,7 @@ journal_size: 100
 osd_objectstore: "filestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index c8ba0ba4c08b2d00b1df746eace86ab2449af35f..a27cfbad0c9fdceba1a280a73bfa8376637e334f 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "filestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index 29e82024c7a7ead721f7cab446976585de5aaa5c..edac61b20650c0495ca9aadafa46d64cdd96d7af 100644 (file)
@@ -10,7 +10,7 @@ journal_size: 100
 osd_objectstore: "filestore"
 ceph_origin: 'repository'
 ceph_repository: 'dev'
-copy_admin_key: true
+copy_admin_key: false
 os_tuning_params:
   - { name: kernel.pid_max, value: 4194303 }
   - { name: fs.file-max, value: 26234859 }
index bf825ff460613c5daeb7bf224ad6dc3dd7b3e394..e702e052dea45a032f9b5662c6129cd1d51a8706 100644 (file)
@@ -262,3 +262,31 @@ class TestNormalizeFlags(object):
     def test_normalize_strings_duplicate_flags(self, flags):
         result = sorted(prepare._normalize_mount_flags(flags, extras=['discard','rw']).split(','))
         assert ','.join(result) == 'auto,discard,exec,rw'
+
+
+class TestMkfsBluestore(object):
+
+    def test_non_zero_exit_status(self, stub_call, monkeypatch):
+        conf.cluster = 'ceph'
+        monkeypatch.setattr('ceph_volume.util.prepare.system.chown', lambda x: True)
+        stub_call(([], [], 1))
+        with pytest.raises(RuntimeError) as error:
+            prepare.osd_mkfs_bluestore('1', 'asdf-1234', keyring='keyring')
+        assert "Command failed with exit code 1" in str(error)
+
+    def test_non_zero_exit_formats_command_correctly(self, stub_call, monkeypatch):
+        conf.cluster = 'ceph'
+        monkeypatch.setattr('ceph_volume.util.prepare.system.chown', lambda x: True)
+        stub_call(([], [], 1))
+        with pytest.raises(RuntimeError) as error:
+            prepare.osd_mkfs_bluestore('1', 'asdf-1234', keyring='keyring')
+        expected = ' '.join([
+            'ceph-osd',
+            '--cluster',
+            'ceph',
+            '--osd-objectstore', 'bluestore', '--mkfs',
+            '-i', '1', '--monmap', '/var/lib/ceph/osd/ceph-1/activate.monmap',
+            '--keyfile', '-', '--osd-data', '/var/lib/ceph/osd/ceph-1/',
+            '--osd-uuid', 'asdf-1234',
+            '--setuser', 'ceph', '--setgroup', 'ceph'])
+        assert expected in str(error)
index d02c570fec8251bf434a9867cb1dc18954c5fb34..d1cddf073a6978c22989c4d3b20d58043e4c677a 100644 (file)
@@ -322,7 +322,9 @@ def osd_mkfs_bluestore(osd_id, fsid, keyring=None, wal=False, db=False):
 
     command = base_command + supplementary_command
 
-    process.call(command, stdin=keyring, show_command=True)
+    _, _, returncode = process.call(command, stdin=keyring, show_command=True)
+    if returncode != 0:
+        raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command)))
 
 
 def osd_mkfs_filestore(osd_id, fsid):
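The hunk above turns a previously fire-and-forget call into one that fails loudly: run, inspect the return code, and raise with the fully formatted command line (which is exactly what the new tests assert on). A minimal Python sketch of the same pattern, with subprocess.run standing in for ceph_volume's process.call:

    import subprocess

    def run_or_raise(command):
        # Run the command and surface a non-zero exit status as an
        # exception that carries the exact command line that failed.
        proc = subprocess.run(command, capture_output=True, text=True)
        if proc.returncode != 0:
            raise RuntimeError(
                'Command failed with exit code %s: %s'
                % (proc.returncode, ' '.join(command)))
        return proc.stdout

    # A failing mkfs-style invocation now propagates instead of letting
    # activation continue against a half-made OSD.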
index 0ba46d362766b3acb48e83f349f5ebf821912844..b4b7d17c4c946648e17c1548a7f53d961a65b3ba 100644 (file)
@@ -87,6 +87,7 @@ def chown(path, recursive=True):
     """
     uid, gid = get_ceph_user_ids()
     if os.path.islink(path):
+        process.run(['chown', '-h', 'ceph:ceph', path])
         path = os.path.realpath(path)
     if recursive:
         process.run(['chown', '-R', 'ceph:ceph', path])
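Without -h, chown on a symlink only affects the target; the link itself keeps its old owner, which can trip later ownership checks. A sketch of the symlink-aware ordering introduced above, assuming the same ceph:ceph owner these helpers use:

    import os
    import subprocess

    def chown_ceph(path, recursive=True):
        if os.path.islink(path):
            # Change the owner of the link itself (-h), then resolve it
            # so the real file underneath is chowned as well.
            subprocess.run(['chown', '-h', 'ceph:ceph', path], check=True)
            path = os.path.realpath(path)
        flags = ['-R'] if recursive else []
        subprocess.run(['chown'] + flags + ['ceph:ceph', path], check=True)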
index 851425af097017b6d7d475bef3e53bf76fe1bdf1..77f4702d1539eb491baaeeb7f070fd30797ebe43 100644 (file)
@@ -93,6 +93,19 @@ int main(int argc, const char **argv, const char *envp[]) {
       filer_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
     } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
       usage();
+    } else if (ceph_argparse_flag(args, i, "-V", (char*)nullptr)) {
+      const char* tmpargv[] = {
+       "ceph-fuse",
+       "-V"
+      };
+
+      struct fuse_args fargs = FUSE_ARGS_INIT(2, (char**)tmpargv);
+      if (fuse_parse_cmdline(&fargs, nullptr, nullptr, nullptr) == -1) {
+       derr << "fuse_parse_cmdline failed." << dendl;
+      }
+      assert(fargs.allocated);
+      fuse_opt_free_args(&fargs);
+      exit(0);
     } else {
       ++i;
     }
@@ -254,9 +267,11 @@ int main(int argc, const char **argv, const char *envp[]) {
     r = client->mount(g_conf->client_mountpoint.c_str(), perms,
                      g_ceph_context->_conf->fuse_require_active_mds);
     if (r < 0) {
-      if (r == CEPH_FUSE_NO_MDS_UP)
+      if (r == CEPH_FUSE_NO_MDS_UP) {
         cerr << "ceph-fuse[" << getpid() << "]: probably no MDS server is up?" << std::endl;
-      cerr << "ceph-fuse[" << getpid() << "]: ceph mount failed with " << cpp_strerror(-r) << std::endl;
+      }
+      cerr << "ceph-fuse[" << getpid() << "]: ceph mount failed with " << cpp_strerror(-r) << std::endl;
+      r = EXIT_FAILURE;
       goto out_shutdown;
     }
 
index 1fd435839490a2d240a7a4d207d292aa45871559..41eff1b4b2a247e64b534a4646a8dfd96172a8df 100644 (file)
@@ -236,7 +236,6 @@ Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
     remount_cb(NULL),
     ino_invalidate_cb(NULL),
     dentry_invalidate_cb(NULL),
-    getgroups_cb(NULL),
     umask_cb(NULL),
     can_invalidate_dentries(false),
     async_ino_invalidator(m->cct),
@@ -902,8 +901,10 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from,
     add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
                   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
                   request_perms);
-    if (in->auth_cap && in->auth_cap->session == session)
+    if (in->auth_cap && in->auth_cap->session == session) {
       in->max_size = st->max_size;
+      in->rstat = st->rstat;
+    }
   } else
     in->snap_caps |= st->cap.caps;
 
@@ -2076,6 +2077,10 @@ void Client::handle_client_session(MClientSession *m)
     break;
 
   case CEPH_SESSION_STALE:
+    // invalidate session caps/leases
+    session->cap_gen++;
+    session->cap_ttl = ceph_clock_now();
+    session->cap_ttl -= 1;
     renew_caps(session);
     break;
 
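Bumping session->cap_gen on CEPH_SESSION_STALE is a lazy bulk invalidation: each cap remembers the generation it was issued under, so one counter increment marks every cap on the session stale without walking them. A sketch of the idea, with hypothetical names:

    class Session:
        def __init__(self):
            self.cap_gen = 0              # bumped to invalidate all caps at once

    class Cap:
        def __init__(self, session):
            self.session = session
            self.gen = session.cap_gen    # generation this cap was issued under

        def is_valid(self):
            # A cap issued under an older generation is implicitly stale.
            return self.gen == self.session.cap_gen

    session = Session()
    cap = Cap(session)
    session.cap_gen += 1          # e.g. on a stale-session message
    assert not cap.is_valid()     # stale without touching the cap itself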
@@ -3232,7 +3237,7 @@ void Client::cap_delay_requeue(Inode *in)
   ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
   in->hold_caps_until = ceph_clock_now();
   in->hold_caps_until += cct->_conf->client_caps_release_delay;
-  delayed_caps.push_back(&in->cap_item);
+  delayed_list.push_back(&in->delay_cap_item);
 }
 
 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
@@ -3762,7 +3767,7 @@ void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
   if (cct->_conf->client_oc) {
     vector<ObjectExtent> ls;
     Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
-    objectcacher->discard_set(&in->oset, ls);
+    objectcacher->discard_writeback(&in->oset, ls, nullptr);
   }
 
   _schedule_invalidate_callback(in, off, len);
@@ -3901,7 +3906,6 @@ void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id
     cap->session = mds_session;
     cap->inode = in;
     cap->gen = mds_session->cap_gen;
-    cap_list.push_back(&in->cap_item);
   }
 
   check_cap_issue(in, cap, issued);
@@ -3925,6 +3929,7 @@ void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id
   cap->seq = seq;
   cap->issue_seq = seq;
   cap->mseq = mseq;
+  cap->gen = mds_session->cap_gen;
   cap->latest_perms = cap_perms;
   ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
           << " from mds." << mds
@@ -4020,7 +4025,7 @@ void Client::remove_session_caps(MetaSession *s)
        in->flushing_cap_tids.clear();
       }
       in->flushing_caps = 0;
-      in->dirty_caps = 0;
+      in->mark_caps_clean();
       put_inode(in);
     }
   }
@@ -4085,16 +4090,17 @@ void Client::_invalidate_kernel_dcache()
   }
 }
 
-void Client::trim_caps(MetaSession *s, int max)
+void Client::trim_caps(MetaSession *s, uint64_t max)
 {
   mds_rank_t mds = s->mds_num;
-  int caps_size = s->caps.size();
+  size_t caps_size = s->caps.size();
   ldout(cct, 10) << "trim_caps mds." << mds << " max " << max 
     << " caps " << caps_size << dendl;
 
-  int trimmed = 0;
-  xlist<Cap*>::iterator p = s->caps.begin();
-  std::set<InodeRef> anchor; /* prevent put_inode from deleting all caps during traversal */
+  uint64_t trimmed = 0;
+  auto p = s->caps.begin();
+  std::set<Dentry *> to_trim; /* this prevents caps other than the one we're
+                               * looking at from being deleted during traversal. */
   while ((caps_size - trimmed) > max && !p.end()) {
     Cap *cap = *p;
     InodeRef in(cap->inode);
@@ -4109,8 +4115,7 @@ void Client::trim_caps(MetaSession *s, int max)
       // disposable non-auth cap
       if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
-       remove_cap(cap, true);
-        /* N.B. no need to push onto anchor, as we are only removing one cap */
+       cap = (remove_cap(cap, true), nullptr);
        trimmed++;
       }
     } else {
@@ -4127,9 +4132,8 @@ void Client::trim_caps(MetaSession *s, int max)
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
-          ldout(cct, 20) << " anchoring inode: " << in->ino << dendl;
-          anchor.insert(in);
-         trim_dentry(dn);
+          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
+          to_trim.insert(dn);
         } else {
           ldout(cct, 20) << "  not expirable: " << dn->name << dendl;
          all = false;
@@ -4141,8 +4145,11 @@ void Client::trim_caps(MetaSession *s, int max)
       }
     }
   }
-  ldout(cct, 20) << " clearing anchored inodes" << dendl;
-  anchor.clear();
+  ldout(cct, 20) << " trimming queued dentries: " << dendl;
+  for (const auto &dn : to_trim) {
+    trim_dentry(dn);
+  }
+  to_trim.clear();
 
   caps_size = s->caps.size();
   if (caps_size > max)
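The rewrite above replaces anchoring inodes during the walk with a collect-then-mutate pattern: dentries to expire are queued while iterating the session's cap list and trimmed only once iteration finishes, so trimming cannot drop other caps or invalidate the iterator mid-walk. The same shape in a small sketch:

    def trim(caps, should_trim):
        # Phase 1: iterate and only record what to remove; mutating the
        # container here could invalidate the iteration.
        to_trim = [cap for cap in caps if should_trim(cap)]
        # Phase 2: mutate after iteration is complete.
        for cap in to_trim:
            caps.remove(cap)
        return len(to_trim)

    caps = [1, 2, 3, 4]
    assert trim(caps, lambda c: c % 2 == 0) == 2 and caps == [1, 3]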
@@ -4159,15 +4166,6 @@ void Client::force_session_readonly(MetaSession *s)
   }
 }
 
-void Client::mark_caps_dirty(Inode *in, int caps)
-{
-  ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
-          << ccap_string(in->dirty_caps | caps) << dendl;
-  if (caps && !in->caps_dirty())
-    in->get();
-  in->dirty_caps |= caps;
-}
-
 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
 {
   MetaSession *session = in->auth_cap->session;
@@ -4186,7 +4184,7 @@ int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
   }
 
   in->flushing_caps |= flushing;
-  in->dirty_caps = 0;
+  in->mark_caps_clean();
  
   if (!in->flushing_cap_item.is_on_list())
     session->flushing_caps.push_back(&in->flushing_cap_item);
@@ -4222,20 +4220,20 @@ void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s,  MetaSe
 void Client::flush_caps_sync()
 {
   ldout(cct, 10) << __func__ << dendl;
-  xlist<Inode*>::iterator p = delayed_caps.begin();
+  xlist<Inode*>::iterator p = delayed_list.begin();
   while (!p.end()) {
     unsigned flags = CHECK_CAPS_NODELAY;
     Inode *in = *p;
 
     ++p;
-    delayed_caps.pop_front();
-    if (p.end() && cap_list.empty())
+    delayed_list.pop_front();
+    if (p.end() && dirty_list.empty())
       flags |= CHECK_CAPS_SYNCHRONOUS;
     check_caps(in, flags);
   }
 
   // other caps, too
-  p = cap_list.begin();
+  p = dirty_list.begin();
   while (!p.end()) {
     unsigned flags = CHECK_CAPS_NODELAY;
     Inode *in = *p;
@@ -5041,6 +5039,7 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(old_caps) << dendl;
   cap->seq = m->get_seq();
+  cap->gen = session->cap_gen;
 
   in->layout = m->get_layout();
 
@@ -5069,6 +5068,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
     ::decode(in->xattrs, p);
     in->xattr_version = m->head.xattr_version;
   }
+
+  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
+    in->dirstat.nfiles = m->get_nfiles();
+    in->dirstat.nsubdirs = m->get_nsubdirs();
+  }
+
   update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
                         m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
                         m->get_mtime(), m->get_atime(),
@@ -5104,17 +5109,16 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
     else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
       in->recall_deleg(true);
 
-    if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
-        && !_flush(in, new C_Client_FlushComplete(this, in))) {
+    if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
+       !_flush(in, new C_Client_FlushComplete(this, in))) {
       // waitin' for flush
-    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
+    } else if (revoked & CEPH_CAP_FILE_CACHE) {
       if (_release(in))
        check = true;
     } else {
       cap->wanted = 0; // don't let check_caps skip sending a response to MDS
       check = true;
     }
-
   } else if (old_caps == new_caps) {
     ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
   } else {
@@ -5149,63 +5153,6 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
   m->put();
 }
 
-int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
-{
-  // cppcheck-suppress variableScope
-  int sgid_count;
-  gid_t *sgid_buf;
-
-  if (getgroups_cb) {
-    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
-    if (sgid_count > 0) {
-      *sgids = sgid_buf;
-      return sgid_count;
-    }
-  }
-
-#if HAVE_GETGROUPLIST
-  struct passwd *pw;
-  pw = getpwuid(uid);
-  if (pw == NULL) {
-    ldout(cct, 3) << "getting user entry failed" << dendl;
-    return -errno;
-  }
-  //use PAM to get the group list
-  // initial number of group entries, defaults to posix standard of 16
-  // PAM implementations may provide more than 16 groups....
-  sgid_count = 16;
-  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
-  if (sgid_buf == NULL) {
-    ldout(cct, 3) << "allocating group memory failed" << dendl;
-    return -ENOMEM;
-  }
-
-  while (1) {
-#if defined(__APPLE__)
-    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
-#else
-    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
-#endif
-      // we need to resize the group list and try again
-      void *_realloc = NULL;
-      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
-       ldout(cct, 3) << "allocating group memory failed" << dendl;
-       free(sgid_buf);
-       return -ENOMEM;
-      }
-      sgid_buf = (gid_t*)_realloc;
-      continue;
-    }
-    // list was successfully retrieved
-    break;
-  }
-  *sgids = sgid_buf;
-  return sgid_count;
-#else
-  return 0;
-#endif
-}
-
 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
 {
   if (perms.uid() == 0)
@@ -6054,14 +6001,13 @@ void Client::tick()
   }
 
   // delayed caps
-  xlist<Inode*>::iterator p = delayed_caps.begin();
+  xlist<Inode*>::iterator p = delayed_list.begin();
   while (!p.end()) {
     Inode *in = *p;
     ++p;
     if (in->hold_caps_until > now)
       break;
-    delayed_caps.pop_front();
-    cap_list.push_back(&in->cap_item);
+    delayed_list.pop_front();
     check_caps(in, CHECK_CAPS_NODELAY);
   }
 
@@ -6720,11 +6666,11 @@ int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
     in->cap_dirtier_uid = perms.uid();
     in->cap_dirtier_gid = perms.gid();
     if (issued & CEPH_CAP_AUTH_EXCL)
-      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
     else if (issued & CEPH_CAP_FILE_EXCL)
-      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
     else if (issued & CEPH_CAP_XATTR_EXCL)
-      mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
     else
       mask |= CEPH_SETATTR_CTIME;
   }
@@ -6739,7 +6685,7 @@ int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
       in->cap_dirtier_uid = perms.uid();
       in->cap_dirtier_gid = perms.gid();
       in->uid = stx->stx_uid;
-      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_UID;
       kill_sguid = true;
       ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
@@ -6749,7 +6695,7 @@ int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
       in->cap_dirtier_uid = perms.uid();
       in->cap_dirtier_gid = perms.gid();
       in->gid = stx->stx_gid;
-      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_GID;
       kill_sguid = true;
       ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
@@ -6760,13 +6706,13 @@ int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
       in->cap_dirtier_uid = perms.uid();
       in->cap_dirtier_gid = perms.gid();
       in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
-      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_MODE;
       ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
     } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
       /* Must squash the any setuid/setgid bits with an ownership change */
       in->mode &= ~(S_ISUID|S_ISGID);
-      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
     }
 
     if (mask & CEPH_SETATTR_BTIME) {
@@ -6774,7 +6720,7 @@ int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
       in->cap_dirtier_uid = perms.uid();
       in->cap_dirtier_gid = perms.gid();
       in->btime = utime_t(stx->stx_btime);
-      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_BTIME;
       ldout(cct,10) << "changing btime to " << in->btime << dendl;
     }
@@ -6793,7 +6739,7 @@ int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
       in->cap_dirtier_uid = perms.uid();
       in->cap_dirtier_gid = perms.gid();
       in->time_warp_seq++;
-      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
+      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
       mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
     }
   }
@@ -7117,7 +7063,22 @@ int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_inf
   st->st_dev = in->snapid;
   st->st_mode = in->mode;
   st->st_rdev = in->rdev;
-  st->st_nlink = in->nlink;
+  if (in->is_dir()) {
+    switch (in->nlink) {
+      case 0:
+        st->st_nlink = 0; /* dir is unlinked */
+        break;
+      case 1:
+        st->st_nlink = 1 /* parent dentry */
+                       + 1 /* <dir>/. */
+                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
+        break;
+      default:
+        ceph_abort();
+    }
+  } else {
+    st->st_nlink = in->nlink;
+  }
   st->st_uid = in->uid;
   st->st_gid = in->gid;
   if (in->ctime > in->mtime) {
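The directory branch follows classic POSIX link-count arithmetic: one link from the parent's entry, one from the directory's own '.', and one from each subdirectory's '..'. A quick sketch of the computation being substituted for the raw nlink:

    def dir_nlink(nsubdirs, unlinked=False):
        if unlinked:
            return 0
        # parent dentry + <dir>/. + one <subdir>/.. per subdirectory
        return 1 + 1 + nsubdirs

    assert dir_nlink(0) == 2   # an empty directory has nlink 2
    assert dir_nlink(3) == 5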
@@ -7184,7 +7145,22 @@ void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
   }
 
   if (mask & CEPH_CAP_LINK_SHARED) {
-    stx->stx_nlink = in->nlink;
+    if (in->is_dir()) {
+      switch (in->nlink) {
+        case 0:
+          stx->stx_nlink = 0; /* dir is unlinked */
+          break;
+        case 1:
+          stx->stx_nlink = 1 /* parent dentry */
+                           + 1 /* <dir>/. */
+                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
+          break;
+        default:
+          ceph_abort();
+      }
+    } else {
+      stx->stx_nlink = in->nlink;
+    }
     stx->stx_mask |= CEPH_STATX_NLINK;
   }
 
@@ -7755,6 +7731,7 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
     else
       dirp->next_offset = dirp->offset_low();
     dirp->last_name = dn_name; // we successfully returned this one; update!
+    dirp->release_count = 0; // last_name no longer match cache index
     if (r > 0)
       return r;
   }
@@ -8872,7 +8849,7 @@ done:
     if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
       in->inline_data.clear();
       in->inline_version = CEPH_INLINE_NONE;
-      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
       check_caps(in, 0);
     } else
       r = uninline_ret;
@@ -9156,8 +9133,9 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
 
   // check quota
   uint64_t endoff = offset + size;
-  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
-                                                  f->actor_perms)) {
+  std::list<InodeRef> quota_roots;
+  if (endoff > in->size &&
+      is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
     return -EDQUOT;
   }
 
@@ -9332,9 +9310,9 @@ success:
   // extend file?
   if (totalwritten + offset > in->size) {
     in->size = totalwritten + offset;
-    mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+    in->mark_caps_dirty(CEPH_CAP_FILE_WR);
 
-    if (is_quota_bytes_approaching(in, f->actor_perms)) {
+    if (is_quota_bytes_approaching(in, quota_roots)) {
       check_caps(in, CHECK_CAPS_NODELAY);
     } else if (is_max_size_approaching(in)) {
       check_caps(in, 0);
@@ -9348,7 +9326,7 @@ success:
   // mtime
   in->mtime = ceph_clock_now();
   in->change_attr++;
-  mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+  in->mark_caps_dirty(CEPH_CAP_FILE_WR);
 
 done:
 
@@ -9363,7 +9341,7 @@ done:
     if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
       in->inline_data.clear();
       in->inline_version = CEPH_INLINE_NONE;
-      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
       check_caps(in, 0);
     } else
       r = uninline_ret;
@@ -9478,6 +9456,8 @@ int Client::_fsync(Inode *in, bool syncdataonly)
   } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
 
   if (!syncdataonly && !in->unsafe_ops.empty()) {
+    flush_mdlog_sync();
+
     MetaRequest *req = in->unsafe_ops.back();
     ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;
 
@@ -10058,7 +10038,6 @@ void Client::ll_register_callbacks(struct client_callback_args *args)
   ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
                 << " invalidate_ino_cb " << args->ino_cb
                 << " invalidate_dentry_cb " << args->dentry_cb
-                << " getgroups_cb" << args->getgroups_cb
                 << " switch_interrupt_cb " << args->switch_intr_cb
                 << " remount_cb " << args->remount_cb
                 << dendl;
@@ -10079,7 +10058,6 @@ void Client::ll_register_callbacks(struct client_callback_args *args)
     remount_cb = args->remount_cb;
     remount_finisher.start();
   }
-  getgroups_cb = args->getgroups_cb;
   umask_cb = args->umask_cb;
 }
 
@@ -10852,7 +10830,11 @@ int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
 
     // Do a force getattr to get the latest quota before returning
     // a value to userspace.
-    r = _getattr(in, 0, perms, true);
+    int flags = 0;
+    if (vxattr->flags & VXATTR_RSTAT) {
+      flags |= CEPH_STAT_RSTAT;
+    }
+    r = _getattr(in, flags, perms, true);
     if (r != 0) {
       // Error from getattr!
       return r;
@@ -11369,6 +11351,16 @@ size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
   readonly: true,                                              \
   hidden: false,                                               \
   exists_cb: NULL,                                             \
+  flags: 0,                                                     \
+}
+#define XATTR_NAME_CEPH2(_type, _name, _flags)                 \
+{                                                              \
+  name: CEPH_XATTR_NAME(_type, _name),                         \
+  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,     \
+  readonly: true,                                              \
+  hidden: false,                                               \
+  exists_cb: NULL,                                             \
+  flags: _flags,                                               \
 }
 #define XATTR_LAYOUT_FIELD(_type, _name, _field)               \
 {                                                              \
@@ -11377,6 +11369,7 @@ size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
   readonly: false,                                             \
   hidden: true,                                                        \
   exists_cb: &Client::_vxattrcb_layout_exists,                 \
+  flags: 0,                                                     \
 }
 #define XATTR_QUOTA_FIELD(_type, _name)                                \
 {                                                              \
@@ -11385,6 +11378,7 @@ size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
   readonly: false,                                             \
   hidden: true,                                                        \
   exists_cb: &Client::_vxattrcb_quota_exists,                  \
+  flags: 0,                                                     \
 }
 
 const Client::VXattr Client::_dir_vxattrs[] = {
@@ -11394,6 +11388,7 @@ const Client::VXattr Client::_dir_vxattrs[] = {
     readonly: false,
     hidden: true,
     exists_cb: &Client::_vxattrcb_layout_exists,
+    flags: 0,
   },
   XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
   XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
@@ -11403,17 +11398,18 @@ const Client::VXattr Client::_dir_vxattrs[] = {
   XATTR_NAME_CEPH(dir, entries),
   XATTR_NAME_CEPH(dir, files),
   XATTR_NAME_CEPH(dir, subdirs),
-  XATTR_NAME_CEPH(dir, rentries),
-  XATTR_NAME_CEPH(dir, rfiles),
-  XATTR_NAME_CEPH(dir, rsubdirs),
-  XATTR_NAME_CEPH(dir, rbytes),
-  XATTR_NAME_CEPH(dir, rctime),
+  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
+  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
+  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
+  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
+  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
   {
     name: "ceph.quota",
     getxattr_cb: &Client::_vxattrcb_quota,
     readonly: false,
     hidden: true,
     exists_cb: &Client::_vxattrcb_quota_exists,
+    flags: 0,
   },
   XATTR_QUOTA_FIELD(quota, max_bytes),
   XATTR_QUOTA_FIELD(quota, max_files),
@@ -11427,6 +11423,7 @@ const Client::VXattr Client::_file_vxattrs[] = {
     readonly: false,
     hidden: true,
     exists_cb: &Client::_vxattrcb_layout_exists,
+    flags: 0,
   },
   XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
   XATTR_LAYOUT_FIELD(file, layout, stripe_count),
@@ -12395,7 +12392,7 @@ int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
 {
   Mutex::Locker lock(client_lock);
 
-  inodeno_t ino = ll_get_inodeno(in);
+  inodeno_t ino = in->ino;
   uint32_t object_size = layout->object_size;
   uint32_t su = layout->stripe_unit;
   uint32_t stripe_count = layout->stripe_count;
@@ -12879,6 +12876,19 @@ int Client::ll_fsync(Fh *fh, bool syncdataonly)
   return r;
 }
 
+int Client::ll_sync_inode(Inode *in, bool syncdataonly)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
+  tout(cct) << "ll_sync_inode" << std::endl;
+  tout(cct) << (unsigned long)in << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return _fsync(in, syncdataonly);
+}
+
 #ifdef FALLOC_FL_PUNCH_HOLE
 
 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
@@ -12906,9 +12916,10 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
     return -EBADF;
 
   uint64_t size = offset + length;
+  std::list<InodeRef> quota_roots;
   if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
       size > in->size &&
-      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
+      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
     return -EDQUOT;
   }
 
@@ -12943,7 +12954,7 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
       }
       in->mtime = ceph_clock_now();
       in->change_attr++;
-      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
     } else {
       if (in->inline_version < CEPH_INLINE_NONE) {
         onuninline = new C_SafeCond(&uninline_flock,
@@ -12969,7 +12980,7 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
                  0, true, onfinish);
       in->mtime = ceph_clock_now();
       in->change_attr++;
-      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
 
       client_lock.Unlock();
       flock.Lock();
@@ -12985,9 +12996,9 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
       in->size = size;
       in->mtime = ceph_clock_now();
       in->change_attr++;
-      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
 
-      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
+      if (is_quota_bytes_approaching(in, quota_roots)) {
         check_caps(in, CHECK_CAPS_NODELAY);
       } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
@@ -13006,7 +13017,7 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
     if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
       in->inline_data.clear();
       in->inline_version = CEPH_INLINE_NONE;
-      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
       check_caps(in, 0);
     } else
       r = uninline_ret;
@@ -13473,6 +13484,7 @@ void Client::ms_handle_remote_reset(Connection *con)
 
        case MetaSession::STATE_OPEN:
          {
+           objecter->maybe_request_map(); /* to check if we are blacklisted */
            const md_config_t *conf = cct->_conf;
            if (conf->client_reconnect_stale) {
              ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
@@ -13615,32 +13627,34 @@ bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
 }
 
 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
-                                    const UserPerm& perms)
+                                    const UserPerm& perms,
+                                    std::list<InodeRef>* quota_roots)
 {
   return check_quota_condition(in, perms,
-      [&new_bytes](const Inode &in) {
+      [&new_bytes, quota_roots](const Inode &in) {
+       if (quota_roots)
+         quota_roots->emplace_back(const_cast<Inode*>(&in));
         return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
                > in.quota.max_bytes;
       });
 }
 
-bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
+bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
 {
-  return check_quota_condition(in, perms,
-      [](const Inode &in) {
-        if (in.quota.max_bytes) {
-          if (in.rstat.rbytes >= in.quota.max_bytes) {
-            return true;
-          }
-
-          assert(in.size >= in.reported_size);
-          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
-          const uint64_t size = in.size - in.reported_size;
-          return (space >> 4) < size;
-        } else {
-          return false;
-        }
-      });
+  assert(in->size >= in->reported_size);
+  const uint64_t size = in->size - in->reported_size;
+
+  for (auto& diri : quota_roots) {
+    if (diri->quota.max_bytes) {
+      if (diri->rstat.rbytes >= diri->quota.max_bytes)
+       return true;
+
+      uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
+      if ((space >> 4) < size)
+       return true;
+    }
+  }
+  return false;
 }
 
 enum {
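The quota refactor above gathers the chain of quota roots once, during the exceeded check, and hands that list to the approaching check instead of re-walking the ancestry a second time. A sketch of that handshake, with hypothetical inode attributes:

    def quota_exceeded(inode, new_bytes, quota_roots):
        # Walk up once, remembering every quota root seen on the way.
        node = inode
        while node is not None:
            if node.max_bytes:
                quota_roots.append(node)
                if node.rbytes + new_bytes > node.max_bytes:
                    return True
            node = node.parent
        return False

    def quota_approaching(inode, quota_roots):
        # Reuse the roots gathered above instead of walking again.
        size = inode.size - inode.reported_size
        for root in quota_roots:
            if root.rbytes >= root.max_bytes:
                return True
            if ((root.max_bytes - root.rbytes) >> 4) < size:
                return True
        return False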
@@ -13895,13 +13909,6 @@ void Client::handle_conf_change(const struct md_config_t *conf,
   }
 }
 
-void Client::init_groups(UserPerm *perms)
-{
-  gid_t *sgids;
-  int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
-  perms->init_gids(sgids, count);
-}
-
 void intrusive_ptr_add_ref(Inode *in)
 {
   in->get();
index 0d7cd787bad13ec4ee5c6256d1ec6c271441d0ce..9c076214ac76bcce47f066df3c60fd6da8040325 100644 (file)
@@ -94,7 +94,8 @@ class MDSCommandOp : public CommandOp
 };
 
 /* error code for ceph_fuse */
-#define CEPH_FUSE_NO_MDS_UP    -(1<<2) /* no mds up detected in ceph_fuse */
+#define CEPH_FUSE_NO_MDS_UP    -((1<<16)+0) /* no mds up detected in ceph_fuse */
+#define CEPH_FUSE_LAST         -((1<<16)+1) /* (unused) */
 
 // ============================================
 // types for my local metadata cache
@@ -135,7 +136,6 @@ typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
                                         vinodeno_t ino, string& name);
 typedef int (*client_remount_callback_t)(void *handle);
 
-typedef int (*client_getgroups_callback_t)(void *handle, gid_t **sgids);
 typedef void(*client_switch_interrupt_callback_t)(void *handle, void *data);
 typedef mode_t (*client_umask_callback_t)(void *handle);
 
@@ -148,7 +148,6 @@ struct client_callback_args {
   client_dentry_callback_t dentry_cb;
   client_switch_interrupt_callback_t switch_intr_cb;
   client_remount_callback_t remount_cb;
-  client_getgroups_callback_t getgroups_cb;
   client_umask_callback_t umask_cb;
 };
 
@@ -276,7 +275,6 @@ class Client : public Dispatcher, public md_config_obs_t {
   client_remount_callback_t remount_cb;
   client_ino_callback_t ino_invalidate_cb;
   client_dentry_callback_t dentry_invalidate_cb;
-  client_getgroups_callback_t getgroups_cb;
   client_umask_callback_t umask_cb;
   bool can_invalidate_dentries;
 
@@ -432,8 +430,8 @@ protected:
   Inode*                 root_ancestor;
   LRU                    lru;    // lru list of Dentry's in our local metadata cache.
 
-  // all inodes with caps sit on either cap_list or delayed_caps.
-  xlist<Inode*> delayed_caps, cap_list;
+  // dirty_list keeps all the dirty inodes before flushing.
+  xlist<Inode*> delayed_list, dirty_list;
   int num_flushing_caps;
   ceph::unordered_map<inodeno_t,SnapRealm*> snap_realms;
 
@@ -539,7 +537,7 @@ protected:
   void trim_cache(bool trim_kernel_dcache=false);
   void trim_cache_for_reconnect(MetaSession *s);
   void trim_dentry(Dentry *dn);
-  void trim_caps(MetaSession *s, int max);
+  void trim_caps(MetaSession *s, uint64_t max);
   void _invalidate_kernel_dcache();
   
   void dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected);
@@ -574,8 +572,9 @@ protected:
                             std::function<bool (const Inode &)> test);
   bool is_quota_files_exceeded(Inode *in, const UserPerm& perms);
   bool is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
-                              const UserPerm& perms);
-  bool is_quota_bytes_approaching(Inode *in, const UserPerm& perms);
+                              const UserPerm& perms,
+                              std::list<InodeRef>* quota_roots=nullptr);
+  bool is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots);
 
   std::map<std::pair<int64_t,std::string>, int> pool_perms;
   list<Cond*> waiting_for_pool_perm;
@@ -635,7 +634,6 @@ protected:
   void remove_cap(Cap *cap, bool queue_release);
   void remove_all_caps(Inode *in);
   void remove_session_caps(MetaSession *session);
-  void mark_caps_dirty(Inode *in, int caps);
   int mark_caps_flushing(Inode *in, ceph_tid_t *ptid);
   void adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s);
   void flush_caps_sync();
@@ -733,6 +731,7 @@ protected:
   void flush_mdlog_sync();
   void flush_mdlog(MetaSession *session);
   
+  xlist<Inode*> &get_dirty_list() { return dirty_list; }
   // ----------------------
   // fs ops.
 private:
@@ -863,9 +862,6 @@ private:
     MAY_READ = 4,
   };
 
-  void init_groups(UserPerm *groups);
-
-  int inode_permission(Inode *in, const UserPerm& perms, unsigned want);
   int xattr_permission(Inode *in, const char *name, unsigned want,
                       const UserPerm& perms);
   int may_setattr(Inode *in, struct ceph_statx *stx, int mask,
@@ -877,7 +873,6 @@ private:
   int may_hardlink(Inode *in, const UserPerm& perms);
 
   int _getattr_for_perm(Inode *in, const UserPerm& perms);
-  int _getgrouplist(gid_t **sgids, uid_t uid, gid_t gid);
 
   vinodeno_t _get_vino(Inode *in);
   inodeno_t _get_inodeno(Inode *in);
@@ -891,8 +886,12 @@ private:
          size_t (Client::*getxattr_cb)(Inode *in, char *val, size_t size);
          bool readonly, hidden;
          bool (Client::*exists_cb)(Inode *in);
+         int flags;
   };
 
+/* Flags for VXattr */
+#define VXATTR_RSTAT 0x1
+
   bool _vxattrcb_quota_exists(Inode *in);
   size_t _vxattrcb_quota(Inode *in, char *val, size_t size);
   size_t _vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size);
@@ -1126,15 +1125,13 @@ public:
   int mksnap(const char *path, const char *name, const UserPerm& perm);
   int rmsnap(const char *path, const char *name, const UserPerm& perm);
 
+  // Inode permission checking
+  int inode_permission(Inode *in, const UserPerm& perms, unsigned want);
+
   // expose caps
   int get_caps_issued(int fd);
   int get_caps_issued(const char *path, const UserPerm& perms);
 
-  // low-level interface v2
-  inodeno_t ll_get_inodeno(Inode *in) {
-    Mutex::Locker lock(client_lock);
-    return _get_inodeno(in);
-  }
   snapid_t ll_get_snapid(Inode *in);
   vinodeno_t ll_get_vino(Inode *in) {
     Mutex::Locker lock(client_lock);
@@ -1221,6 +1218,7 @@ public:
   loff_t ll_lseek(Fh *fh, loff_t offset, int whence);
   int ll_flush(Fh *fh);
   int ll_fsync(Fh *fh, bool syncdataonly);
+  int ll_sync_inode(Inode *in, bool syncdataonly);
   int ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length);
   int ll_release(Fh *fh);
   int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner);
index 2fadb2b5bab1823a9e83ef8c6e056203ab5daefb..be898cbd89dc928c87e3ac64c70c57f52fb988e8 100644 (file)
@@ -14,7 +14,8 @@
 
 Inode::~Inode()
 {
-  cap_item.remove_myself();
+  delay_cap_item.remove_myself();
+  dirty_cap_item.remove_myself(); 
   snaprealm_item.remove_myself();
 
   if (snapdir_parent) {
@@ -48,6 +49,7 @@ ostream& operator<<(ostream &out, const Inode &in)
       << " open=" << in.open_by_mode
       << " mode=" << oct << in.mode << dec
       << " size=" << in.size << "/" << in.max_size
+      << " nlink=" << in.nlink
       << " mtime=" << in.mtime
       << " caps=" << ccap_string(in.caps_issued());
   if (!in.caps.empty()) {
@@ -747,3 +749,32 @@ void Inode::unset_deleg(Fh *fh)
     }
   }
 }
+
+/**
+* mark_caps_dirty - mark some caps dirty
+* @caps: the dirty caps
+*
+* note that if there are no dirty or flushing caps beforehand, we need to pin this inode.
+* it will be unpinned by handle_cap_flush_ack once there are no more dirty or flushing caps.
+*/
+void Inode::mark_caps_dirty(int caps)
+{
+  lsubdout(client->cct, client, 10) << __func__ << " " << *this << " " << ccap_string(dirty_caps) << " -> "
+           << ccap_string(dirty_caps | caps) << dendl;
+  if (caps && !caps_dirty())
+    get();
+  dirty_caps |= caps;
+  client->get_dirty_list().push_back(&dirty_cap_item);
+}
+
+/**
+* mark_caps_clean - only clean the dirty_caps and caller should start flushing the dirty caps.
+*/
+void Inode::mark_caps_clean()
+{
+  lsubdout(client->cct, client, 10) << __func__ << " " << *this << dendl;
+  dirty_caps = 0;
+  dirty_cap_item.remove_myself();
+}
+
+
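The doc comment above describes a pin-on-first-dirty scheme: the inode takes a reference the moment it goes from clean to dirty, keeping it alive until the flush ack drops the last dirty/flushing state. Roughly, under assumed names:

    class Inode:
        def __init__(self, dirty_list):
            self.refs = 0
            self.dirty_caps = 0
            self.dirty_list = dirty_list

        def mark_caps_dirty(self, caps):
            if caps and not self.dirty_caps:
                self.refs += 1            # pin: first transition to dirty
            self.dirty_caps |= caps
            if self not in self.dirty_list:
                self.dirty_list.append(self)

        def mark_caps_clean(self):
            self.dirty_caps = 0
            if self in self.dirty_list:
                self.dirty_list.remove(self)
            # the matching unpin happens only when the flush is acked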
index 5242630d5d1a23b4954a6f4c27d0185d18a78ab7..614d84a306d05e1249bd7070beedd4e4fb335b0e 100644 (file)
@@ -177,7 +177,7 @@ struct Inode {
   int shared_gen, cache_gen;
   int snap_caps, snap_cap_refs;
   utime_t hold_caps_until;
-  xlist<Inode*>::item cap_item, flushing_cap_item;
+  xlist<Inode*>::item delay_cap_item, dirty_cap_item, flushing_cap_item;
 
   SnapRealm *snaprealm;
   xlist<Inode*>::item snaprealm_item;
@@ -247,7 +247,7 @@ struct Inode {
       cap_dirtier_uid(-1), cap_dirtier_gid(-1),
       dirty_caps(0), flushing_caps(0), shared_gen(0), cache_gen(0),
       snap_caps(0), snap_cap_refs(0),
-      cap_item(this), flushing_cap_item(this),
+      delay_cap_item(this), dirty_cap_item(this), flushing_cap_item(this),
       snaprealm(0), snaprealm_item(this),
       oset((void *)this, newlayout->pool_id, this->ino),
       reported_size(0), wanted_max_size(0), requested_max_size(0),
@@ -306,6 +306,8 @@ struct Inode {
   int set_deleg(Fh *fh, unsigned type, ceph_deleg_cb_t cb, void *priv);
   void unset_deleg(Fh *fh);
 
+  void mark_caps_dirty(int caps);
+  void mark_caps_clean();
 private:
   // how many opens for write on this Inode?
   long open_count_for_write()
index 56dc4d9730649e2e83892d55ef62775fdcc2489c..5243904131157b9cfc78b5c97d5b939e2c2f9dd5 100644 (file)
@@ -199,7 +199,7 @@ public:
   }
   bool auth_is_best() {
     if ((head.op & CEPH_MDS_OP_WRITE) || head.op == CEPH_MDS_OP_OPEN ||
-       head.op == CEPH_MDS_OP_READDIR) 
+       head.op == CEPH_MDS_OP_READDIR || send_to_auth)
       return true;
     return false;    
   }
index a2d6ccbe888d759ce966ba79bc41a695042f02b0..b73814ffad5e5b2e4647a8985f259b11a1b13346 100644 (file)
@@ -31,7 +31,7 @@ private:
     m_uid = b.m_uid;
     m_gid = b.m_gid;
     gid_count = b.gid_count;
-    if (gid_count) {
+    if (gid_count > 0) {
       gids = new gid_t[gid_count];
       alloced_gids = true;
       for (int i = 0; i < gid_count; ++i) {
@@ -79,8 +79,8 @@ public:
   void init_gids(gid_t* _gids, int count) {
     gids = _gids;
     gid_count = count;
+    alloced_gids = true;
   }
-  void take_gids() { alloced_gids = true; }
   void shallow_copy(const UserPerm& o) {
     m_uid = o.m_uid;
     m_gid = o.m_gid;
index d24ad5c3451a14a5211988510e313af7efd06589..018653f7e6d7d62d6a004995d8427e5efef0e80f 100644 (file)
@@ -127,20 +127,20 @@ static int getgroups(fuse_req_t req, gid_t **sgids)
   return -ENOSYS;
 }
 
-static int getgroups_cb(void *handle, gid_t **sgids)
+static void get_fuse_groups(UserPerm& perms, fuse_req_t req)
 {
-  CephFuse::Handle *cfuse = (CephFuse::Handle *) handle;
-  fuse_req_t req = cfuse->get_fuse_req();
-  return getgroups(req, sgids);
-}
+  if (g_conf->get_val<bool>("fuse_set_user_groups")) {
+    gid_t *gids = NULL;
+    int count = getgroups(req, &gids);
 
-#define GET_GROUPS(perms, req) {                               \
-  if (g_conf->get_val<bool>("fuse_set_user_groups")) { \
-    gid_t *gids = NULL;                                                \
-    int count = getgroups(req, &gids);                         \
-    perms.init_gids(gids, count);                              \
-    perms.take_gids();                                         \
-  } }
+    if (count > 0) {
+      perms.init_gids(gids, count);
+    } else if (count < 0) {
+      derr << __func__ << ": getgroups failed: " << cpp_strerror(-count)
+          << dendl;
+    }
+  }
+}
 
 
 static CephFuse::Handle *fuse_ll_req_prepare(fuse_req_t req)
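Replacing the GET_GROUPS macro with a real function also adds error reporting: a negative count from getgroups() is now logged instead of being handed to init_gids(). A sketch of the shape, using os.getgrouplist as a stand-in for the FUSE request lookup:

    import logging
    import os
    import pwd

    def get_groups(uid, gid):
        # Stand-in for fetching supplementary groups; the real code reads
        # them from the fuse_req_t rather than the password database.
        try:
            name = pwd.getpwuid(uid).pw_name
            return os.getgrouplist(name, gid)
        except KeyError as e:
            logging.error("getgroups failed: %s", e)
            return []

    perm_gids = get_groups(os.getuid(), os.getgid())  # applied only on success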
@@ -158,7 +158,7 @@ static void fuse_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
   Inode *i2, *i1 = cfuse->iget(parent); // see below
   int r;
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   memset(&fe, 0, sizeof(fe));
   r = cfuse->client->ll_lookup(i1, name, &fe.attr, &i2, perms);
@@ -191,7 +191,7 @@ static void fuse_ll_getattr(fuse_req_t req, fuse_ino_t ino,
   Inode *in = cfuse->iget(ino);
   struct stat stbuf;
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
   
   (void) fi; // XXX
 
@@ -213,7 +213,7 @@ static void fuse_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
   Inode *in = cfuse->iget(ino);
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int mask = 0;
   if (to_set & FUSE_SET_ATTR_MODE) mask |= CEPH_SETATTR_MODE;
@@ -250,7 +250,7 @@ static void fuse_ll_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
   Inode *in = cfuse->iget(ino);
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_setxattr(in, name, value, size, flags, perms);
   fuse_reply_err(req, -r);
@@ -265,7 +265,7 @@ static void fuse_ll_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
   Inode *in = cfuse->iget(ino);
   char buf[size];
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_listxattr(in, buf, size, perms);
   if (size == 0 && r >= 0)
@@ -290,7 +290,7 @@ static void fuse_ll_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
   Inode *in = cfuse->iget(ino);
   char buf[size];
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_getxattr(in, name, buf, size, perms);
   if (size == 0 && r >= 0)
@@ -310,7 +310,7 @@ static void fuse_ll_removexattr(fuse_req_t req, fuse_ino_t ino,
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
   Inode *in = cfuse->iget(ino);
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_removexattr(in, name, perms);
   fuse_reply_err(req, -r);
@@ -327,7 +327,7 @@ static void fuse_ll_opendir(fuse_req_t req, fuse_ino_t ino,
   void *dirp;
 
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_opendir(in, fi->flags, (dir_result_t **)&dirp,
                                    perms);
@@ -348,7 +348,7 @@ static void fuse_ll_readlink(fuse_req_t req, fuse_ino_t ino)
   Inode *in = cfuse->iget(ino);
   char buf[PATH_MAX + 1];  // leave room for a null terminator
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_readlink(in, buf, sizeof(buf) - 1, perms);
   if (r >= 0) {
@@ -369,7 +369,7 @@ static void fuse_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
   Inode *i2, *i1 = cfuse->iget(parent);
   struct fuse_entry_param fe;
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   memset(&fe, 0, sizeof(fe));
 
@@ -398,7 +398,7 @@ static void fuse_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
 
   memset(&fe, 0, sizeof(fe));
   UserPerm perm(ctx->uid, ctx->gid);
-  GET_GROUPS(perm, req);
+  get_fuse_groups(perm, req);
 #ifdef HAVE_SYS_SYNCFS
   if (cfuse->fino_snap(parent) == CEPH_SNAPDIR &&
       cfuse->client->cct->_conf->fuse_multithreaded &&
@@ -441,7 +441,7 @@ static void fuse_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
   Inode *in = cfuse->iget(parent);
   UserPerm perm(ctx->uid, ctx->gid);
-  GET_GROUPS(perm, req);
+  get_fuse_groups(perm, req);
 
   int r = cfuse->client->ll_unlink(in, name, perm);
   fuse_reply_err(req, -r);
@@ -455,7 +455,7 @@ static void fuse_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
   Inode *in = cfuse->iget(parent);
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_rmdir(in, name, perms);
   fuse_reply_err(req, -r);
@@ -471,7 +471,7 @@ static void fuse_ll_symlink(fuse_req_t req, const char *existing,
   Inode *i2, *i1 = cfuse->iget(parent);
   struct fuse_entry_param fe;
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   memset(&fe, 0, sizeof(fe));
 
@@ -497,7 +497,7 @@ static void fuse_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
   Inode *in = cfuse->iget(parent);
   Inode *nin = cfuse->iget(newparent);
   UserPerm perm(ctx->uid, ctx->gid);
-  GET_GROUPS(perm, req);
+  get_fuse_groups(perm, req);
 
   int r = cfuse->client->ll_rename(in, name, nin, newname, perm);
   fuse_reply_err(req, -r);
@@ -517,7 +517,7 @@ static void fuse_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent,
 
   memset(&fe, 0, sizeof(fe));
   UserPerm perm(ctx->uid, ctx->gid);
-  GET_GROUPS(perm, req);
+  get_fuse_groups(perm, req);
   
   /*
    * Note that we could successfully link, but then fail the subsequent
@@ -556,7 +556,7 @@ static void fuse_ll_open(fuse_req_t req, fuse_ino_t ino,
   Inode *in = cfuse->iget(ino);
   Fh *fh = NULL;
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_open(in, fi->flags, &fh, perms);
   if (r == 0) {
@@ -748,7 +748,15 @@ static void fuse_ll_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
 
 static void fuse_ll_access(fuse_req_t req, fuse_ino_t ino, int mask)
 {
-  fuse_reply_err(req, 0);
+  CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
+  const struct fuse_ctx *ctx = fuse_req_ctx(req);
+  Inode *in = cfuse->iget(ino);
+  UserPerm perms(ctx->uid, ctx->gid);
+  get_fuse_groups(perms, req);
+
+  int r = cfuse->client->inode_permission(in, perms, mask);
+  fuse_reply_err(req, -r);
+  cfuse->iput(in);
 }
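
With this hunk, ceph-fuse answers access(2) with a real permission check via Client::inode_permission() instead of unconditionally reporting success. For reference, a minimal standalone sketch of the contract being implemented: the mask is a bitwise OR of R_OK/W_OK/X_OK and the callback replies 0 or -errno. mode_allows() is a hypothetical helper that only looks at owner bits; the real check also consults uid, gid and the supplementary groups gathered above:

    #include <errno.h>    // EACCES
    #include <stdio.h>
    #include <unistd.h>   // R_OK, W_OK, X_OK

    /* Hypothetical helper (not the Client code): grant the requested
     * R_OK/W_OK/X_OK bits against the owner permission bits of a mode. */
    static int mode_allows(unsigned mode, int mask) {
      int granted = 0;
      if (mode & 0400) granted |= R_OK;
      if (mode & 0200) granted |= W_OK;
      if (mode & 0100) granted |= X_OK;
      return ((granted & mask) == mask) ? 0 : -EACCES;
    }

    int main(void) {
      printf("%d\n", mode_allows(0644, R_OK | W_OK));  // 0: allowed
      printf("%d\n", mode_allows(0444, W_OK));         // -EACCES: denied
      return 0;
    }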
 
 static void fuse_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name,
@@ -760,7 +768,7 @@ static void fuse_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name,
   struct fuse_entry_param fe;
   Fh *fh = NULL;
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   memset(&fe, 0, sizeof(fe));
 
@@ -791,7 +799,7 @@ static void fuse_ll_statfs(fuse_req_t req, fuse_ino_t ino)
   Inode *in = cfuse->iget(ino);
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
   UserPerm perms(ctx->uid, ctx->gid);
-  GET_GROUPS(perms, req);
+  get_fuse_groups(perms, req);
 
   int r = cfuse->client->ll_statfs(in, &stbuf, perms);
   if (r == 0)
@@ -1130,7 +1138,6 @@ int CephFuse::Handle::start()
 #if defined(__linux__)
     remount_cb: remount_cb,
 #endif
-    getgroups_cb: getgroups_cb,
 #if !defined(DARWIN)
     umask_cb: umask_cb,
 #endif
index dc9da596854ab1015dda9e2baed5feafe54acfe7..d7deb78f8b6feb2f347050ebb59ac47b137abc54 100644 (file)
@@ -2296,6 +2296,9 @@ static int list_plain_entries(cls_method_context_t hctx, const string& name, con
   for (iter = keys.begin(); iter != keys.end(); ++iter) {
     if (iter->first >= end_key) {
       /* past the end of plain namespace */
+      if (pmore) {
+       *pmore = false;
+      }
       return count;
     }
 
@@ -2317,6 +2320,10 @@ static int list_plain_entries(cls_method_context_t hctx, const string& name, con
     CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str());
 
     if (!name.empty() && e.key.name != name) {
+      /* we are skipping the rest of the entries */
+      if (pmore) {
+       *pmore = false;
+      }
       return count;
     }
 
@@ -2379,6 +2386,10 @@ static int list_instance_entries(cls_method_context_t hctx, const string& name,
     entry.data = iter->second;
 
     if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
+      /* we are skipping the rest of the entries */
+      if (pmore) {
+       *pmore = false;
+      }
       return count;
     }
 
@@ -2395,6 +2406,10 @@ static int list_instance_entries(cls_method_context_t hctx, const string& name,
     }
 
     if (!name.empty() && e.key.name != name) {
+      /* we are skipping the rest of the entries */
+      if (pmore) {
+       *pmore = false;
+      }
       return count;
     }
 
@@ -2456,6 +2471,10 @@ static int list_olh_entries(cls_method_context_t hctx, const string& name, const
     entry.data = iter->second;
 
     if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
+      /* we are skipping the rest of the entries */
+      if (pmore) {
+       *pmore = false;
+      }
       return count;
     }
 
@@ -2472,6 +2491,10 @@ static int list_olh_entries(cls_method_context_t hctx, const string& name, const
     }
 
     if (!name.empty() && e.key.name != name) {
+      /* we are skipping the rest of the entries */
+      if (pmore) {
+       *pmore = false;
+      }
       return count;
     }
 
@@ -2502,7 +2525,7 @@ static int rgw_bi_list_op(cls_method_context_t hctx, bufferlist *in, bufferlist
   int32_t max = (op.max < MAX_BI_LIST_ENTRIES ? op.max : MAX_BI_LIST_ENTRIES);
   string start_key = op.marker;
   bool more;
-  int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries, &more); 
+  int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries, &more);
   if (ret < 0) {
     CLS_LOG(0, "ERROR: %s(): list_plain_entries retured ret=%d", __func__, ret);
     return ret;
index 67a129ccd0965e470f9096772704122869dafb30..26a552d439c8e71b4356676e591525896dfe2665 100644 (file)
@@ -62,21 +62,23 @@ void DecayCounter::generate_test_instances(list<DecayCounter*>& ls)
 
 void DecayCounter::decay(utime_t now, const DecayRate &rate)
 {
-  utime_t el = now;
-  el -= last_decay;
+  if (now >= last_decay) {
+    double el = (double)(now - last_decay);
+    if (el >= 1.0) {
+      // calculate new value
+      double newval = (val+delta) * exp(el * rate.k);
+      if (newval < .01)
+       newval = 0.0;
 
-  if (el.sec() >= 1) {
-    // calculate new value
-    double newval = (val+delta) * exp((double)el * rate.k);
-    if (newval < .01)
-      newval = 0.0;
+      // calculate velocity approx
+      vel += (newval - val) * el;
+      vel *= exp(el * rate.k);
 
-    // calculate velocity approx
-    vel += (newval - val) * (double)el;
-    vel *= exp((double)el * rate.k);
-
-    val = newval;
-    delta = 0;
-    last_decay = now;
+      val = newval;
+      delta = 0;
+      last_decay = now;
+    }
+  } else {
+      last_decay = now;
   }
 }
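
Besides hoisting the elapsed-time math to double, the rewrite now tolerates a clock that steps backwards (now < last_decay) by simply resetting last_decay instead of computing a negative interval. The underlying model is plain exponential half-life decay; a standalone sketch, assuming rate.k = ln(0.5) / halflife, which is how DecayRate is conventionally seeded:

    #include <cmath>
    #include <cstdio>

    int main() {
      const double halflife = 5.0;                 // seconds
      const double k = std::log(0.5) / halflife;   // rate.k, negative
      const double start = 100.0;
      for (int el = 0; el <= 15; el += 5)          // elapsed seconds
        std::printf("t=%2d  val=%.2f\n", el, start * std::exp(el * k));
      return 0;                                    // 100.00, 50.00, 25.00, 12.50
    }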
index e45aed8d2f2a5f590859898a2928e7e188fdba7f..4addb1ce34e7864ae8769689766e3c6f0c8299cd 100644 (file)
@@ -86,15 +86,15 @@ public:
     return val+delta;
   }
 
-  double get_last() {
+  double get_last() const {
     return val;
   }
   
-  double get_last_vel() {
+  double get_last_vel() const {
     return vel;
   }
 
-  utime_t get_last_decay() {
+  utime_t get_last_decay() const {
     return last_decay; 
   }
 
index 9fe34e5ea0eb8fb39721cff507d51462475d85b2..f6671a5d302d6ef9125fd6745199385b2d018721 100644 (file)
@@ -107,11 +107,8 @@ public:
 
   int signal_exit(int r) {
     if (forked) {
-      // tell parent.  this shouldn't fail, but if it does, pass the
-      // error back to the parent.
-      int ret = safe_write(fd[1], &r, sizeof(r));
-      if (ret <= 0)
-       return ret;
+      /* If we get an error here, it's too late to do anything reasonable about it. */
+      (void)safe_write(fd[1], &r, sizeof(r));
     }
     return r;
   }
index e5aa52210df7e0da0a6d69ab16e5909eb863e397..0109a26ef1bcc36e69d58f83272f01d0af148bf2 100644 (file)
@@ -73,7 +73,11 @@ class BoundedKeyCounter {
   struct const_pointer_iterator : public map_type::const_iterator {
     const_pointer_iterator(typename map_type::const_iterator i)
       : map_type::const_iterator(i) {}
-    const value_type* operator*() const {
+
+    using value_type = typename map_type::const_iterator::value_type*;
+    using reference = const typename map_type::const_iterator::value_type*;
+
+    reference operator*() const {
       return &map_type::const_iterator::operator*();
     }
   };
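
The two added aliases keep std::iterator_traits honest for this wrapper: since operator*() yields a pointer into the map rather than a reference, value_type and reference must say so or trait-driven generic code deduces the wrong types. A toy illustration of the same pattern:

    #include <cstdio>
    #include <map>

    using map_t = std::map<int, int>;

    // Iterator whose operator* returns a pointer to the element; the
    // aliases make iterator_traits agree with that return type.
    struct ptr_iter : map_t::const_iterator {
      ptr_iter(map_t::const_iterator i) : map_t::const_iterator(i) {}
      using value_type = const map_t::const_iterator::value_type*;
      using reference  = const map_t::const_iterator::value_type*;
      reference operator*() const {
        return &map_t::const_iterator::operator*();
      }
    };

    int main() {
      map_t m{{1, 10}, {2, 20}};
      for (ptr_iter it = m.cbegin(); it != ptr_iter(m.cend()); ++it)
        std::printf("%d -> %d\n", (*it)->first, (*it)->second);
      return 0;
    }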
index 7b106d585fb4c14872d0f7fdbaf42d8bb12517bf..0aab05c764603872a524bc4a8413d85f3351d168 100644 (file)
 
 #include <iostream>
 
+#include "acconfig.h"
+
 #ifdef HAVE_SYS_PRCTL_H
 #include <sys/prctl.h>
 #endif
 
+#include <string.h>
+
 code_environment_t g_code_env = CODE_ENVIRONMENT_UTILITY;
 
 extern "C" const char *code_environment_to_str(enum code_environment_t e)
diff --git a/ceph/src/common/compat.cc b/ceph/src/common/compat.cc
new file mode 100644 (file)
index 0000000..18b7587
--- /dev/null
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sys/mount.h>
+#if defined(__linux__) 
+#include <sys/vfs.h>
+#endif
+
+#include "include/compat.h" 
+#include "common/safe_io.h"
+
+// The type-value for a ZFS FS in fstatfs.
+#define FS_ZFS_TYPE 0xde
+
+// On FreeBSD, fallocate on ZFS always fails, since it is considered
+// impossible to reserve space on a COW filesystem; posix_fallocate()
+// returns EINVAL. On Linux, glibc already emulates the reservation by
+// writing the range out manually. Even then there is no real guarantee
+// that the full range is allocated on disk, since it could be compressed;
+// preventing that would require filling the buffer with random data.
+int manual_fallocate(int fd, off_t offset, off_t len) {
+  int r = lseek(fd, offset, SEEK_SET);
+  if (r == -1)
+    return errno;
+  char data[1024*128];
+  // TODO: compressing filesystems would require random data
+  memset(data, 0x42, sizeof(data));
+  for (off_t off = 0; off < len; off += sizeof(data)) {
+    if (off + sizeof(data) > len)
+      r = safe_write(fd, data, len - off);
+    else
+      r = safe_write(fd, data, sizeof(data));
+    if (r == -1) {
+      return errno;
+    }
+  }
+  return 0;
+}
+
+int on_zfs(int basedir_fd) {
+  struct statfs basefs;
+  (void)fstatfs(basedir_fd, &basefs);
+  return (basefs.f_type == FS_ZFS_TYPE);
+}
+
+int ceph_posix_fallocate(int fd, off_t offset, off_t len) {
+  // Return 0 if ok, otherwise errno > 0
+
+#ifdef HAVE_POSIX_FALLOCATE
+  if (on_zfs(fd)) {
+    return manual_fallocate(fd, offset, len);
+  } else {
+    return posix_fallocate(fd, offset, len);
+  }
+#elif defined(__APPLE__)
+  fstore_t store;
+  store.fst_flags = F_ALLOCATECONTIG;
+  store.fst_posmode = F_PEOFPOSMODE;
+  store.fst_offset = offset;
+  store.fst_length = len;
+
+  int ret = fcntl(fd, F_PREALLOCATE, &store);
+  if (ret == -1) {
+    ret = errno;
+  }
+  return ret;
+#else
+  return manual_fallocate(fd, offset, len);
+#endif
+} 
+
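
A hypothetical caller for the new helper (reserve_journal() is illustrative only): like posix_fallocate(), ceph_posix_fallocate() reports failure through its return value (0 on success, a positive errno value otherwise) rather than by setting errno:

    #include <cstdio>
    #include <fcntl.h>
    #include <unistd.h>

    extern int ceph_posix_fallocate(int fd, off_t offset, off_t len);

    int reserve_journal(const char *path, off_t bytes) {
      int fd = open(path, O_CREAT | O_WRONLY, 0644);
      if (fd < 0)
        return -1;
      int err = ceph_posix_fallocate(fd, 0, bytes);  // 0 if ok, else errno > 0
      if (err)
        std::fprintf(stderr, "fallocate failed: errno=%d\n", err);
      close(fd);
      return err ? -1 : 0;
    }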
index e4fbf23a7602a73846a9dc13c8043a42e398bd98..d15736a0cd9d0ff1b3c004402e7d42142d1e80fe 100644 (file)
@@ -1,6 +1,7 @@
 #include "acconfig.h"
 #include "include/int_types.h"
 #include "common/crc32c_aarch64.h"
+#include "arch/arm.h"
 
 #ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
 /* Request crc extension capabilities from the assembler */
@@ -96,6 +97,7 @@ uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned
 
        if (buffer) {
 #ifdef HAVE_ARMV8_CRYPTO
+               if (ceph_arch_aarch64_pmull) {
 #ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
                /* Calculate reflected crc with PMULL Instruction */
                const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
@@ -183,7 +185,7 @@ uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned
 
                if(!(length += 1024))
                        return crc;
-
+               }
 #endif /* HAVE_ARMV8_CRYPTO */
                while ((length -= sizeof(uint64_t)) >= 0) {
                        CRC32CX(crc, *(uint64_t *)buffer);
@@ -203,6 +205,7 @@ uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned
                        CRC32CB(crc, *buffer);
        } else {
 #ifdef HAVE_ARMV8_CRYPTO
+               if (ceph_arch_aarch64_pmull) {
 #ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
                const poly64_t k1 = 0xe417f38a;
                uint64_t t0;
@@ -250,7 +253,7 @@ uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned
 
                if(!(length += 1024))
                        return crc;
-
+               }
 #endif /* HAVE_ARMV8_CRYPTO */
                while ((length -= sizeof(uint64_t)) >= 0)
                        CRC32CX(crc, 0);
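
The new ceph_arch_aarch64_pmull guards turn the PMULL folding path into a runtime decision, so a binary built with HAVE_ARMV8_CRYPTO no longer takes that path on ARMv8 cores that implement the CRC extension but not the crypto extension. A standalone sketch of the probe, assuming the flag is derived from the ELF auxiliary vector (HWCAP) as the arch/arm probe does on Linux:

    #include <cstdio>
    #if defined(__linux__) && defined(__aarch64__)
    #include <sys/auxv.h>
    #include <asm/hwcap.h>
    #endif

    static bool cpu_has_pmull() {
    #if defined(__linux__) && defined(__aarch64__)
      return getauxval(AT_HWCAP) & HWCAP_PMULL;   // probed once at startup
    #else
      return false;
    #endif
    }

    int main() {
      std::puts(cpu_has_pmull() ? "PMULL-folded CRC path"
                                : "scalar CRC32CX path");
      return 0;
    }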
index 0651b2070a33c70ad2c11cc73051e787c5381918..e2ac86444322c6c996dc7b3cb01db69257aca3c1 100644 (file)
@@ -331,9 +331,12 @@ OPTION(auth_service_required, OPT_STR)   // required by daemons of clients
 OPTION(auth_client_required, OPT_STR)     // what clients require of daemons
 OPTION(auth_supported, OPT_STR)               // deprecated; default value for above if they are not defined.
 OPTION(max_rotating_auth_attempts, OPT_INT)
-OPTION(cephx_require_signatures, OPT_BOOL) //  If true, don't talk to Cephx partners if they don't support message signing; off by default
+OPTION(cephx_require_signatures, OPT_BOOL)
 OPTION(cephx_cluster_require_signatures, OPT_BOOL)
 OPTION(cephx_service_require_signatures, OPT_BOOL)
+OPTION(cephx_require_version, OPT_INT)
+OPTION(cephx_cluster_require_version, OPT_INT)
+OPTION(cephx_service_require_version, OPT_INT)
 OPTION(cephx_sign_messages, OPT_BOOL)  // Default to signing session messages if supported
 OPTION(auth_mon_ticket_ttl, OPT_DOUBLE)
 OPTION(auth_service_ticket_ttl, OPT_DOUBLE)
@@ -769,6 +772,8 @@ OPTION(osd_max_scrubs, OPT_INT)
 OPTION(osd_scrub_during_recovery, OPT_BOOL) // Allow new scrubs to start while recovery is active on the OSD
 OPTION(osd_scrub_begin_hour, OPT_INT)
 OPTION(osd_scrub_end_hour, OPT_INT)
+OPTION(osd_scrub_begin_week_day, OPT_INT)
+OPTION(osd_scrub_end_week_day, OPT_INT)
 OPTION(osd_scrub_load_threshold, OPT_FLOAT)
 OPTION(osd_scrub_min_interval, OPT_FLOAT)    // if load is low
 OPTION(osd_scrub_max_interval, OPT_FLOAT)  // regardless of load
@@ -782,7 +787,12 @@ OPTION(osd_scrub_auto_repair_num_errors, OPT_U32)   // only auto-repair when num
 OPTION(osd_deep_scrub_interval, OPT_FLOAT) // once a week
 OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)
 OPTION(osd_deep_scrub_stride, OPT_INT)
+OPTION(osd_deep_scrub_keys, OPT_INT)
 OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT)   // objects must be this old (seconds) before we update the whole-object digest on scrub
+OPTION(osd_skip_data_digest, OPT_BOOL)
+OPTION(osd_distrust_data_digest, OPT_BOOL)
+OPTION(osd_deep_scrub_large_omap_object_key_threshold, OPT_U64)
+OPTION(osd_deep_scrub_large_omap_object_value_sum_threshold, OPT_U64)
 OPTION(osd_class_dir, OPT_STR) // where rados plugins are stored
 OPTION(osd_open_classes_on_start, OPT_BOOL)
 OPTION(osd_class_load_list, OPT_STR) // list of object classes allowed to be loaded (allow all: *)
@@ -818,7 +828,6 @@ OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE)
 OPTION(osd_debug_drop_ping_duration, OPT_INT)
 OPTION(osd_debug_op_order, OPT_BOOL)
 OPTION(osd_debug_verify_missing_on_start, OPT_BOOL)
-OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64)
 OPTION(osd_debug_verify_snaps, OPT_BOOL)
 OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL)
 OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL)
@@ -828,6 +837,7 @@ OPTION(osd_debug_misdirected_ops, OPT_BOOL)
 OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL)
 OPTION(osd_debug_random_push_read_error, OPT_DOUBLE)
 OPTION(osd_debug_verify_cached_snaps, OPT_BOOL)
+OPTION(osd_debug_deep_scrub_sleep, OPT_FLOAT)
 OPTION(osd_enable_op_tracker, OPT_BOOL) // enable/disable OSD op tracking
 OPTION(osd_num_op_tracker_shard, OPT_U32) // The number of shards for holding the ops
 OPTION(osd_op_history_size, OPT_U32)    // Max number of completed ops to track
@@ -1335,6 +1345,7 @@ OPTION(rgw_lc_max_objs, OPT_INT)
 OPTION(rgw_lc_debug_interval, OPT_INT)  // Debug run interval, in seconds
 OPTION(rgw_script_uri, OPT_STR) // alternative value for SCRIPT_URI if not set in request
 OPTION(rgw_request_uri, OPT_STR) // alternative value for REQUEST_URI if not set in request
+OPTION(rgw_ignore_get_invalid_range, OPT_BOOL) // treat invalid (e.g., negative) range requests as full
 OPTION(rgw_swift_url, OPT_STR)             // the swift url, being published by the internal swift auth
 OPTION(rgw_swift_url_prefix, OPT_STR) // entry point for which a url is considered a swift url
 OPTION(rgw_swift_auth_url, OPT_STR)        // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
@@ -1360,7 +1371,6 @@ OPTION(rgw_keystone_accepted_admin_roles, OPT_STR) // list of roles allowing an
 OPTION(rgw_keystone_token_cache_size, OPT_INT)  // max number of entries in keystone token cache
 OPTION(rgw_keystone_revocation_interval, OPT_INT)  // seconds between tokens revocation check
 OPTION(rgw_keystone_verify_ssl, OPT_BOOL) // should we try to verify keystone's ssl
-OPTION(rgw_keystone_implicit_tenants, OPT_BOOL)  // create new users in their own tenants of the same name
 OPTION(rgw_cross_domain_policy, OPT_STR)
 OPTION(rgw_healthcheck_disabling_path, OPT_STR) // path that existence causes the healthcheck to respond 503
 OPTION(rgw_s3_auth_use_rados, OPT_BOOL)  // should we try to use the internal credentials for s3?
index 4eaeba3471f7c5c4bef0b343796f5de0cf34d8b6..f80d09cee4a757c779c38052add02176d8868800 100644 (file)
@@ -592,7 +592,7 @@ int ObjBencher::write_bench(int secondsToRun,
     formatter->dump_format("min_iops", "%d", data.idata.min_iops);
     formatter->dump_format("average_latency", "%f", data.avg_latency);
     formatter->dump_format("stddev_latency", "%f", vec_stddev(data.history.latency));
-    formatter->dump_format("max_latency:", "%f", data.max_latency);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
     formatter->dump_format("min_latency", "%f", data.min_latency);
   }
   //write object size/number data for read benchmarks
index 3ea0af62af4e63dbff2065437d4e1599ab79b167..5c83f9527773e566eeb6d20e1d51f6656872ca77 100644 (file)
@@ -1479,14 +1479,26 @@ std::vector<Option> get_global_options() {
     .set_default(false)
     .set_description(""),
 
+    Option("cephx_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Cephx version required (1 = pre-mimic, 2 = mimic+)"),
+
     Option("cephx_cluster_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
     .set_description(""),
 
+    Option("cephx_cluster_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Cephx version required by the cluster from clients (1 = pre-mimic, 2 = mimic+)"),
+
     Option("cephx_service_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
     .set_description(""),
 
+    Option("cephx_service_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Cephx version required from ceph services (1 = pre-mimic, 2 = mimic+)"),
+
     Option("cephx_sign_messages", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
     .set_description(""),
@@ -1973,6 +1985,14 @@ std::vector<Option> get_global_options() {
     .set_default(8)
     .set_description(""),
 
+    Option("osd_skip_data_digest", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Do not store full-object checksums if the backend (bluestore) does its own checksums.  Do not ever turn this off if it has ever been turned on."),
+
+    Option("osd_distrust_data_digest", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Do not trust stored data_digest (due to previous bug or corruption)"),
+
     Option("osd_op_queue", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("wpq")
     .set_enum_allowed( { "wpq", "prioritized", "mclock_opclass", "mclock_client", "debug_random" } )
@@ -2485,75 +2505,118 @@ std::vector<Option> get_global_options() {
 
     Option("osd_max_scrubs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(1)
-    .set_description(""),
+    .set_description("Maximum concurrent scrubs on a single OSD"),
 
     Option("osd_scrub_during_recovery", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("Allow scrubbing when PGs on the OSD are undergoing recovery"),
 
     Option("osd_scrub_begin_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("Restrict scrubbing to this hour of the day or later")
+    .add_see_also("osd_scrub_end_hour"),
 
     Option("osd_scrub_end_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(24)
-    .set_description(""),
+    .set_description("Restrict scrubbing to hours of the day earlier than this")
+    .add_see_also("osd_scrub_begin_hour"),
+
+    Option("osd_scrub_begin_week_day", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Restrict scrubbing to this day of the week or later")
+    .set_long_description("0 or 7 = Sunday, 1 = Monday, etc.")
+    .add_see_also("osd_scrub_end_week_day"),
+
+    Option("osd_scrub_end_week_day", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(7)
+    .set_description("Restrict scrubbing to days of the week earlier than this")
+    .set_long_description("0 or 7 = Sunday, 1 = Monday, etc.")
+    .add_see_also("osd_scrub_begin_week_day"),
 
     Option("osd_scrub_load_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(0.5)
-    .set_description(""),
+    .set_description("Allow scrubbing when system load divided by number of CPUs is below this value"),
 
     Option("osd_scrub_min_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(1_day)
-    .set_description(""),
+    .set_description("Scrub each PG no more often than this interval")
+    .add_see_also("osd_scrub_max_interval"),
 
     Option("osd_scrub_max_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(7_day)
-    .set_description(""),
+    .set_description("Scrub each PG no less often than this interval")
+    .add_see_also("osd_scrub_min_interval"),
 
     Option("osd_scrub_interval_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(0.5)
-    .set_description(""),
+    .set_description("Ratio of scrub interval to randomly vary")
+    .set_long_description("This prevents a scrub 'stampede' by randomly varying the scrub intervals so that they are soon uniformly distributed over the week")
+    .add_see_also("osd_scrub_min_interval"),
 
-    Option("osd_scrub_backoff_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    Option("osd_scrub_backoff_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(.66)
-    .set_description(""),
+    .set_description("Backoff ratio after a failed scrub scheduling attempt"),
 
     Option("osd_scrub_chunk_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(5)
-    .set_description(""),
+    .set_description("Minimum number of objects to scrub in a single chunk")
+    .add_see_also("osd_scrub_chunk_max"),
 
     Option("osd_scrub_chunk_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(25)
-    .set_description(""),
+    .set_description("Maximum number of object to scrub in a single chunk")
+    .add_see_also("osd_scrub_chunk_min"),
 
     Option("osd_scrub_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("Duration to inject a delay during scrubbing"),
 
     Option("osd_scrub_auto_repair", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("Automatically repair damaged objects detected during scrub"),
 
     Option("osd_scrub_auto_repair_num_errors", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(5)
-    .set_description(""),
+    .set_description("Maximum number of detected errors to automatically repair")
+    .add_see_also("osd_scrub_auto_repair"),
+
+    Option("osd_scrub_max_preemptions", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Set the maximum number of times we will preempt a deep scrub due to a client operation before blocking client IO to complete the scrub"),
 
     Option("osd_deep_scrub_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(7_day)
-    .set_description(""),
+    .set_description("Deep scrub each PG (i.e., verify data checksums) at least this often"),
 
     Option("osd_deep_scrub_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(0.15)
-    .set_description(""),
+    .set_description("Ratio of deep scrub interval to randomly vary")
+    .set_long_description("This prevents a deep scrub 'stampede' by randomly varying the scrub intervals so that they are soon uniformly distributed over the week")
+    .add_see_also("osd_deep_scrub_interval"),
 
     Option("osd_deep_scrub_stride", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(524288)
-    .set_description(""),
+    .set_description("Number of bytes to read from an object at a time during deep scrub"),
+
+    Option("osd_deep_scrub_keys", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("Number of keys to read from an object at a time during deep scrub"),
 
     Option("osd_deep_scrub_update_digest_min_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(2_hr)
-    .set_description(""),
+    .set_description("Update overall object digest only if object was last modified longer ago than this"),
+
+    Option("osd_deep_scrub_large_omap_object_key_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2000000)
+    .set_description("Warn when we encounter an object with more omap keys than this")
+    .add_service("osd")
+    .add_see_also("osd_deep_scrub_large_omap_object_value_sum_threshold"),
+
+    Option("osd_deep_scrub_large_omap_object_value_sum_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1_G)
+    .set_description("Warn when we encounter an object with more omap key bytes than this")
+    .add_service("osd")
+    .add_see_also("osd_deep_scrub_large_omap_object_key_threshold"),
 
     Option("osd_class_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default(CEPH_LIBDIR "/rados-classes")
@@ -2625,7 +2688,7 @@ std::vector<Option> get_global_options() {
     .set_description(""),
 
     Option("osd_max_pg_per_osd_hard_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(2)
+    .set_default(3)
     .set_min(1)
     .set_description("Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'")
     .set_long_description("OSD will refuse to instantiate PG if the number of PG it serves exceeds this number.")
@@ -2706,10 +2769,6 @@ std::vector<Option> get_global_options() {
     .set_default(false)
     .set_description(""),
 
-    Option("osd_debug_scrub_chance_rewrite_digest", Option::TYPE_UINT, Option::LEVEL_DEV)
-    .set_default(0)
-    .set_description(""),
-
     Option("osd_debug_verify_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
     .set_description(""),
@@ -2746,6 +2805,10 @@ std::vector<Option> get_global_options() {
     .set_default(false)
     .set_description(""),
 
+    Option("osd_debug_deep_scrub_sleep", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Inject an expensive sleep during deep scrub IO to make it easier to induce preemption"),
+
     Option("osd_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
     .set_description(""),
@@ -2962,7 +3025,9 @@ std::vector<Option> get_global_options() {
     .set_description("The block size for index partitions. (0 = rocksdb default)"),
 
     Option("mon_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
-    .set_default("write_buffer_size=33554432,compression=kNoCompression")
+    .set_default("write_buffer_size=33554432,"
+                "compression=kNoCompression,"
+                "level_compaction_dynamic_level_bytes=true")
     .set_description(""),
 
     Option("osd_client_op_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
@@ -2983,11 +3048,11 @@ std::vector<Option> get_global_options() {
 
     Option("osd_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(5)
-    .set_description(""),
+    .set_description("Priority for scrub operations in work queue"),
 
     Option("osd_scrub_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(50<<20)
-    .set_description(""),
+    .set_description("Cost for scrub operations in work queue"),
 
     Option("osd_requested_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(120)
@@ -4488,6 +4553,12 @@ std::vector<Option> get_rgw_options() {
     .set_default("")
     .set_description(""),
 
+    Option("rgw_ignore_get_invalid_range", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Treat invalid (e.g., negative) range request as full")
+    .set_long_description("Treat invalid (e.g., negative) range request "
+                         "as request for the full object (AWS compatibility)"),
+
     Option("rgw_swift_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
     .set_description("Swift-auth storage URL")
@@ -4612,12 +4683,13 @@ std::vector<Option> get_rgw_options() {
     .set_default(true)
     .set_description("Should RGW verify the Keystone server SSL certificate."),
 
-    Option("rgw_keystone_implicit_tenants", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
-    .set_default(false)
+    Option("rgw_keystone_implicit_tenants", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("false")
+    .set_enum_allowed( { "false", "true", "swift", "s3", "both", "0", "1", "none" } )
     .set_description("RGW Keystone implicit tenants creation")
     .set_long_description(
         "Implicitly create new users in their own tenant with the same name when "
-        "authenticating via Keystone."),
+        "authenticating via Keystone.  Can be limited to s3 or swift only."),
 
     Option("rgw_cross_domain_policy", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("<allow-access-from domain=\"*\" secure=\"false\" />")
@@ -5456,7 +5528,9 @@ std::vector<Option> get_rgw_options() {
 
     Option("rgw_torrent_flag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description("Produce torrent function flag"),
+    .set_description("When true, uploaded objects will calculate and store "
+                     "a SHA256 hash of object data so the object can be "
+                     "retrieved as a torrent file"),
 
     Option("rgw_torrent_tracker", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
@@ -6346,106 +6420,111 @@ std::vector<Option> get_mds_options() {
     Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
      .set_default(0)
      .set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. Allows testing repair tools."),
+
+    Option("mds_inject_migrator_session_race", Option::TYPE_BOOL, Option::LEVEL_DEV)
+     .set_default(false),
   });
 }
 
 std::vector<Option> get_mds_client_options() {
   return std::vector<Option>({
-    Option("client_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("client_cache_size", Option::TYPE_INT, Option::LEVEL_BASIC)
     .set_default(16384)
-    .set_description(""),
+    .set_description("soft maximum number of directory entries in client cache"),
 
     Option("client_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(.75)
-    .set_description(""),
+    .set_description("mid-point of client cache LRU"),
 
-    Option("client_use_random_mds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    Option("client_use_random_mds", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
-    .set_description(""),
+    .set_description("issue new requests to a random active MDS"),
 
     Option("client_mount_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(300.0)
-    .set_description(""),
+    .set_description("timeout for mounting CephFS (seconds)"),
 
-    Option("client_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    Option("client_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(1.0)
-    .set_description(""),
+    .set_description("seconds between client upkeep ticks"),
 
-    Option("client_trace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    Option("client_trace", Option::TYPE_STR, Option::LEVEL_DEV)
     .set_default("")
-    .set_description(""),
+    .set_description("file containing trace of client operations"),
 
     Option("client_readahead_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(128*1024)
-    .set_description(""),
+    .set_description("minimum bytes to readahead in a file"),
 
     Option("client_readahead_max_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(0)
-    .set_description(""),
+    .set_description("maximum bytes to readahead in a file (zero is unlimited)"),
 
     Option("client_readahead_max_periods", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(4)
-    .set_description(""),
+    .set_description("maximum stripe periods to readahead in a file"),
 
     Option("client_reconnect_stale", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("reconnect when the session becomes stale"),
 
     Option("client_snapdir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default(".snap")
-    .set_description(""),
+    .set_description("pseudo directory for snapshot access to a directory"),
 
     Option("client_mountpoint", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("/")
-    .set_description(""),
+    .set_description("default mount-point"),
 
     Option("client_mount_uid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(-1)
-    .set_description(""),
+    .set_description("uid to mount as"),
 
     Option("client_mount_gid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(-1)
-    .set_description(""),
+    .set_description("gid to mount as"),
 
-    Option("client_notify_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    /* RADOS client option */
+    Option("client_notify_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
     .set_default(10)
     .set_description(""),
 
-    Option("osd_client_watch_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    /* RADOS client option */
+    Option("osd_client_watch_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
     .set_default(30)
     .set_description(""),
 
-    Option("client_caps_release_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("client_caps_release_delay", Option::TYPE_INT, Option::LEVEL_DEV)
     .set_default(5)
     .set_description(""),
 
     Option("client_quota_df", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("show quota usage for statfs (df)"),
 
     Option("client_oc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("enable object caching"),
 
     Option("client_oc_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(200_M)
-    .set_description(""),
+    .set_description("maximum size of object cache"),
 
     Option("client_oc_max_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(100_M)
-    .set_description(""),
+    .set_description("maximum size of dirty pages in object cache"),
 
     Option("client_oc_target_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(8_M)
-    .set_description(""),
+    .set_description("target size of dirty pages object cache"),
 
     Option("client_oc_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(5.0)
-    .set_description(""),
+    .set_description("maximum age of dirty pages in object cache (seconds)"),
 
     Option("client_oc_max_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(1000)
-    .set_description(""),
+    .set_description("maximum number of objects in cache"),
 
     Option("client_debug_getattr_caps", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
@@ -6459,7 +6538,7 @@ std::vector<Option> get_mds_client_options() {
     .set_default(0)
     .set_description(""),
 
-    Option("client_max_inline_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("client_max_inline_size", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(4_K)
     .set_description(""),
 
@@ -6473,19 +6552,20 @@ std::vector<Option> get_mds_client_options() {
 
     Option("client_metadata", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description(""),
+    .set_description("metadata key=value comma-delimited pairs appended to session metadata"),
 
     Option("client_acl_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description(""),
+    .set_description("ACL type to enforce (none or \"posix_acl\")"),
 
     Option("client_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("client-enforced permission checking"),
 
     Option("client_dirsize_rbytes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("set the directory size as the number of file bytes recursively used")
+    .set_long_description("This option enables a CephFS feature that stores the recursive directory size (the bytes used by files in the directory and its descendents) in the st_size field of the stat structure."),
 
     Option("fuse_use_invalidate_cb", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
@@ -6493,15 +6573,15 @@ std::vector<Option> get_mds_client_options() {
 
     Option("fuse_disable_pagecache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("disable page caching in the kernel for this FUSE mount"),
 
     Option("fuse_allow_other", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("pass allow_other to FUSE on mount"),
 
     Option("fuse_default_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
-    .set_description(""),
+    .set_description("pass default_permisions to FUSE on mount"),
 
     Option("fuse_big_writes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
@@ -6509,29 +6589,29 @@ std::vector<Option> get_mds_client_options() {
 
     Option("fuse_atomic_o_trunc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("pass atomic_o_trunc flag to FUSE on mount"),
 
-    Option("fuse_debug", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    Option("fuse_debug", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
     .set_description(""),
 
     Option("fuse_multithreaded", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("allow parallel processing through FUSE library"),
 
     Option("fuse_require_active_mds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("require active MDSs in the file system when mounting"),
 
     Option("fuse_syncfs_on_mksnap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("synchronize all local metadata/file changes after snapshot"),
 
     Option("fuse_set_user_groups", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
     .set_description("check for ceph-fuse to consider supplementary groups for permissions"),
 
-    Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
     .set_description(""),
 
@@ -6546,15 +6626,15 @@ std::vector<Option> get_mds_client_options() {
 
     Option("client_check_pool_perm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description(""),
+    .set_description("confirm access to inode's data pool/namespace described in file layout"),
 
-    Option("client_use_faked_inos", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    Option("client_use_faked_inos", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
     .set_description(""),
 
     Option("client_mds_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description(""),
+    .set_description("CephFS file system name to mount"),
   });
 }
 
index d3c5c17ca7fa58ef46771ae52eac0c2adbbc2da6..5d8415122495311faf5dfb84aee36fb474cafc9c 100644 (file)
@@ -250,8 +250,8 @@ int CrushCompiler::decompile_choose_arg(crush_choose_arg *arg,
   int r;
   out << "  {\n";
   out << "    bucket_id " << bucket_id << "\n";
-  if (arg->weight_set_size > 0) {
-    r = decompile_weight_set(arg->weight_set, arg->weight_set_size, out);
+  if (arg->weight_set_positions > 0) {
+    r = decompile_weight_set(arg->weight_set, arg->weight_set_positions, out);
     if (r < 0)
       return r;
   }
@@ -269,7 +269,7 @@ int CrushCompiler::decompile_choose_arg_map(crush_choose_arg_map arg_map,
 {
   for (__u32 i = 0; i < arg_map.size; i++) {
     if ((arg_map.args[i].ids_size == 0) &&
-        (arg_map.args[i].weight_set_size == 0))
+        (arg_map.args[i].weight_set_positions == 0))
       continue;
     int r = decompile_choose_arg(&arg_map.args[i], -1-i, out);
     if (r < 0)
@@ -951,14 +951,14 @@ int CrushCompiler::parse_weight_set_weights(iter_t const& i, int bucket_id, crus
 int CrushCompiler::parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg)
 {
   // -3 stands for the leading "weight_set" keyword and the enclosing [ ]
-  arg->weight_set_size = i->children.size() - 3;
-  arg->weight_set = (crush_weight_set *)calloc(arg->weight_set_size, sizeof(crush_weight_set));
+  arg->weight_set_positions = i->children.size() - 3;
+  arg->weight_set = (crush_weight_set *)calloc(arg->weight_set_positions, sizeof(crush_weight_set));
   __u32 pos = 0;
   for (iter_t p = i->children.begin(); p != i->children.end(); p++) {
     int r = 0;
     switch((int)p->value.id().to_long()) {
     case crush_grammar::_weight_set_weights:
-      if (pos < arg->weight_set_size) {
+      if (pos < arg->weight_set_positions) {
         r = parse_weight_set_weights(p, bucket_id, &arg->weight_set[pos]);
         pos++;
       } else {
index 6591b1bcdb40749f916fc21228c655114542c8eb..931037cfdae13cc28f76e9d913fdf36f34e8fed2 100644 (file)
@@ -208,7 +208,7 @@ namespace CrushTreeDumper {
        if (b &&
            bidx < (int)cmap.size &&
            cmap.args[bidx].weight_set &&
-           cmap.args[bidx].weight_set_size >= 1) {
+           cmap.args[bidx].weight_set_positions >= 1) {
          int bpos;
          for (bpos = 0;
               bpos < (int)cmap.args[bidx].weight_set[0].size &&
@@ -224,7 +224,7 @@ namespace CrushTreeDumper {
          }
          f->open_array_section(name.c_str());
          for (unsigned opos = 0;
-              opos < cmap.args[bidx].weight_set_size;
+              opos < cmap.args[bidx].weight_set_positions;
               ++opos) {
            float w = (float)cmap.args[bidx].weight_set[opos].weights[bpos] /
              (float)0x10000;
index 822f1d9c183be517da956c739048fe4ac2139f3b..b85c7409945366920e523fe71d206d4a4f0b55a7 100644 (file)
@@ -159,10 +159,10 @@ bool CrushWrapper::has_incompat_choose_args() const
   crush_choose_arg_map arg_map = choose_args.begin()->second;
   for (__u32 i = 0; i < arg_map.size; i++) {
     crush_choose_arg *arg = &arg_map.args[i];
-    if (arg->weight_set_size == 0 &&
+    if (arg->weight_set_positions == 0 &&
        arg->ids_size == 0)
        continue;
-    if (arg->weight_set_size != 1)
+    if (arg->weight_set_positions != 1)
       return true;
     if (arg->ids_size != 0)
       return true;
@@ -297,6 +297,19 @@ void CrushWrapper::find_takes(set<int> *roots) const
   }
 }
 
+void CrushWrapper::find_takes_by_rule(int rule, set<int> *roots) const
+{
+  if (rule < 0 || rule >= (int)crush->max_rules)
+    return;
+  crush_rule *r = crush->rules[rule];
+  if (!r)
+    return;
+  for (unsigned i = 0; i < r->len; i++) {
+    if (r->steps[i].op == CRUSH_RULE_TAKE)
+      roots->insert(r->steps[i].arg1);
+  }
+}
+
 void CrushWrapper::find_roots(set<int> *roots) const
 {
   for (int i = 0; i < crush->max_buckets; i++) {
@@ -344,6 +357,7 @@ bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool
     if (class_bucket.count(item) != 0)
       class_bucket.erase(item);
     class_remove_item(item);
+    update_choose_args(cct);
   }
   if ((item >= 0 || !unlink_only) && name_map.count(item)) {
     ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
@@ -386,9 +400,73 @@ int CrushWrapper::remove_root(int item)
   if (class_bucket.count(item) != 0)
     class_bucket.erase(item);
   class_remove_item(item);
+  update_choose_args(nullptr);
   return 0;
 }
 
+void CrushWrapper::update_choose_args(CephContext *cct)
+{
+  for (auto& i : choose_args) {
+    crush_choose_arg_map &arg_map = i.second;
+    unsigned positions = get_choose_args_positions(arg_map);
+    for (int j = 0; j < crush->max_buckets; ++j) {
+      crush_bucket *b = crush->buckets[j];
+      auto& carg = arg_map.args[j];
+      // strip out choose_args for any buckets that no longer exist
+      if (!b || b->alg != CRUSH_BUCKET_STRAW2) {
+       if (carg.ids) {
+         if (cct)
+           ldout(cct,0) << __func__ << " removing " << i.first << " bucket "
+                        << (-1-j) << " ids" << dendl;
+         free(carg.ids);
+         carg.ids = 0;
+         carg.ids_size = 0;
+       }
+       if (carg.weight_set) {
+         if (cct)
+           ldout(cct,0) << __func__ << " removing " << i.first << " bucket "
+                        << (-1-j) << " weight_sets" << dendl;
+         for (unsigned p = 0; p < carg.weight_set_positions; ++p) {
+           free(carg.weight_set[p].weights);
+         }
+         free(carg.weight_set);
+         carg.weight_set = 0;
+         carg.weight_set_positions = 0;
+       }
+       continue;
+      }
+      if (carg.weight_set_positions == 0) {
+       continue;       // skip it
+      }
+      if (carg.weight_set_positions != positions) {
+       if (cct)
+         lderr(cct) << __func__ << " " << i.first << " bucket "
+                    << (-1-j) << " positions " << carg.weight_set_positions
+                    << " -> " << positions << dendl;
+       continue;       // wth... skip!
+      }
+      // mis-sized weight_sets?  this shouldn't ever happen.
+      for (unsigned p = 0; p < positions; ++p) {
+       if (carg.weight_set[p].size != b->size) {
+         if (cct)
+           lderr(cct) << __func__ << " fixing " << i.first << " bucket "
+                      << (-1-j) << " position " << p
+                      << " size " << carg.weight_set[p].size << " -> "
+                      << b->size << dendl;
+         auto old_ws = carg.weight_set[p];
+         carg.weight_set[p].size = b->size;
+         carg.weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
+         auto max = std::min<unsigned>(old_ws.size, b->size);
+         for (unsigned k = 0; k < max; ++k) {
+           carg.weight_set[p].weights[k] = old_ws.weights[k];
+         }
+         free(old_ws.weights);
+       }
+      }
+    }
+  }
+}
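
The new update_choose_args() restores the invariant that choose_args entries describe only live straw2 buckets, with one weight vector per position and one weight per bucket item; mismatched vectors are reallocated in place and the overlapping prefix copied over, as the hunk shows. A toy statement of that invariant (illustrative types, not CrushWrapper):

    #include <cstdint>
    #include <vector>

    struct WeightSet { std::vector<uint32_t> weights; };   // one per position
    struct ChooseArg { std::vector<WeightSet> positions; };

    // Holds when every position carries exactly one weight per bucket item,
    // which is what update_choose_args() re-establishes after bucket removal.
    static bool consistent(const ChooseArg &carg, size_t bucket_size) {
      for (const auto &ws : carg.positions)
        if (ws.weights.size() != bucket_size)
          return false;
      return true;
    }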
+
 int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
 {
   ldout(cct, 5) << "remove_item " << item
@@ -770,6 +848,36 @@ int CrushWrapper::get_children(int id, list<int> *children)
   return b->size;
 }
 
+void CrushWrapper::get_children_of_type(int id,
+                                        int type,
+                                       set<int> *children,
+                                       bool exclude_shadow) const
+{
+  if (id >= 0) {
+    if (type == 0) {
+      // want leaf?
+      children->insert(id);
+    }
+    return;
+  }
+  auto b = get_bucket(id);
+  if (IS_ERR(b)) {
+    return;
+  }
+  if (b->type < type) {
+    // give up
+    return;
+  } else if (b->type == type) {
+    if (!is_shadow_item(b->id) || !exclude_shadow) {
+      children->insert(b->id);
+    }
+    return;
+  }
+  for (unsigned n = 0; n < b->size; n++) {
+    get_children_of_type(b->items[n], type, children, exclude_shadow);
+  }
+}
+
 int CrushWrapper::get_rule_failure_domain(int rule_id)
 {
   crush_rule *rule = get_rule(rule_id);
@@ -1362,15 +1470,33 @@ int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
   return -ENOENT;
 }
 
-int CrushWrapper::get_parent_of_type(int item, int type) const
+int CrushWrapper::get_parent_of_type(int item, int type, int rule) const
 {
-  do {
-    int r = get_immediate_parent_id(item, &item);
-    if (r < 0) {
-      return 0;
+  if (rule < 0) {
+    // no rule specified
+    do {
+      int r = get_immediate_parent_id(item, &item);
+      if (r < 0) {
+        return 0;
+      }
+    } while (get_bucket_type(item) != type);
+    return item;
+  }
+  set<int> roots;
+  find_takes_by_rule(rule, &roots);
+  for (auto root : roots) {
+    set<int> candidates;
+    get_children_of_type(root, type, &candidates, false);
+    for (auto candidate : candidates) {
+      if (subtree_contains(candidate, item)) {
+       // note that here we assume that no two different buckets
+       // from a single crush rule will share the same device,
+       // which should generally be true.
+        return candidate;
+      }
     }
-  } while (get_bucket_type(item) != type);
-  return item;
+  }
+  return 0; // not found
 }
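
The rule-aware branch above replaces the plain walk up the tree: it collects the roots the rule takes, gathers every bucket of the requested type beneath them, and returns the candidate whose subtree contains the item. A self-contained toy version of that lookup on a two-host tree:

    #include <cstdio>
    #include <map>
    #include <set>
    #include <vector>

    struct Node { int type; std::vector<int> children; };
    std::map<int, Node> tree;   // id -> node; devices have type 0

    static bool contains(int root, int item) {
      if (root == item) return true;
      for (int c : tree[root].children)
        if (contains(c, item)) return true;
      return false;
    }

    static void children_of_type(int root, int type, std::set<int> *out) {
      Node &n = tree[root];
      if (n.type == type) { out->insert(root); return; }
      if (n.type < type) return;             // below the requested level
      for (int c : n.children) children_of_type(c, type, out);
    }

    static int parent_of_type(const std::set<int> &roots, int type, int item) {
      for (int root : roots) {
        std::set<int> candidates;
        children_of_type(root, type, &candidates);
        for (int c : candidates)
          if (contains(c, item)) return c;   // first bucket owning the item
      }
      return 0;                              // not found
    }

    int main() {
      // root(-1, type 2) -> hosts(-2, -3, type 1) -> osds(0..3, type 0)
      tree[-1] = {2, {-2, -3}};
      tree[-2] = {1, {0, 1}};
      tree[-3] = {1, {2, 3}};
      tree[0] = tree[1] = tree[2] = tree[3] = {0, {}};
      std::printf("host of osd.2 = %d\n", parent_of_type({-1}, 1, 2)); // -3
      return 0;
    }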
 
 int CrushWrapper::rename_class(const string& srcname, const string& dstname)
@@ -1422,7 +1548,7 @@ int CrushWrapper::populate_classes(
   // accumulate weight values for each carg and bucket as we go. because it is
   // depth first, we will have the nested bucket weights we need when we
   // finish constructing the containing buckets.
-  map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> weights
+  map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> [bucket weight for each position]
   set<int> roots;
   find_nonshadow_roots(&roots);
   for (auto &r : roots) {
@@ -1717,7 +1843,7 @@ int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *buck
     for (auto &w : choose_args) {
       crush_choose_arg_map &arg_map = w.second;
       crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
-      for (__u32 j = 0; j < arg->weight_set_size; j++) {
+      for (__u32 j = 0; j < arg->weight_set_positions; j++) {
        crush_weight_set *weight_set = &arg->weight_set[j];
        weight_set->weights[position] = weight;
       }
@@ -1764,7 +1890,7 @@ int CrushWrapper::add_bucket(
       crush_choose_arg& carg = cmap.args[pos];
       carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
                                                  size);
-      carg.weight_set_size = positions;
+      carg.weight_set_positions = positions;
       for (int ppos = 0; ppos < positions; ++ppos) {
        carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
        carg.weight_set[ppos].size = size;
@@ -1787,7 +1913,7 @@ int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
   for (auto &w : choose_args) {
     crush_choose_arg_map &arg_map = w.second;
     crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
-    for (__u32 j = 0; j < arg->weight_set_size; j++) {
+    for (__u32 j = 0; j < arg->weight_set_positions; j++) {
       crush_weight_set *weight_set = &arg->weight_set[j];
       weight_set->weights = (__u32*)realloc(weight_set->weights,
                                            new_size * sizeof(__u32));
@@ -1820,7 +1946,7 @@ int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
   for (auto &w : choose_args) {
     crush_choose_arg_map &arg_map = w.second;
     crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
-    for (__u32 j = 0; j < arg->weight_set_size; j++) {
+    for (__u32 j = 0; j < arg->weight_set_positions; j++) {
       crush_weight_set *weight_set = &arg->weight_set[j];
       assert(weight_set->size - 1 == new_size);
       for (__u32 k = position; k < new_size; k++)
@@ -2018,21 +2144,23 @@ int CrushWrapper::device_class_clone(
     auto& o = cmap.args[-1-original_id];
     auto& n = cmap.args[-1-bno];
     n.ids_size = 0; // FIXME: implement me someday
-    n.weight_set_size = o.weight_set_size;
+    n.weight_set_positions = o.weight_set_positions;
     n.weight_set = (crush_weight_set*)calloc(
-      n.weight_set_size, sizeof(crush_weight_set));
-    for (size_t s = 0; s < n.weight_set_size; ++s) {
+      n.weight_set_positions, sizeof(crush_weight_set));
+    for (size_t s = 0; s < n.weight_set_positions; ++s) {
       n.weight_set[s].size = copy->size;
       n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
     }
-    for (size_t s = 0; s < n.weight_set_size; ++s) {
-      vector<int> bucket_weights(n.weight_set_size);
+    for (size_t s = 0; s < n.weight_set_positions; ++s) {
+      vector<int> bucket_weights(n.weight_set_positions);
       for (size_t i = 0; i < copy->size; ++i) {
        int item = copy->items[i];
        if (item >= 0) {
          n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
-       } else {
+       } else if ((*cmap_item_weight)[w.first].count(item)) {
          n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
+       } else {
+         n.weight_set[s].weights[i] = 0;
        }
        bucket_weights[s] += n.weight_set[s].weights[i];
       }
@@ -2230,7 +2358,7 @@ void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
       {
        __u32 *weights;
        if (encode_compat_choose_args &&
-           arg_map.args[i].weight_set_size > 0) {
+           arg_map.args[i].weight_set_positions > 0) {
          weights = arg_map.args[i].weight_set[0].weights;
        } else {
          weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
@@ -2292,7 +2420,7 @@ void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
       size = 0;
       for (__u32 i = 0; i < arg_map.size; i++) {
        crush_choose_arg *arg = &arg_map.args[i];
-       if (arg->weight_set_size == 0 &&
+       if (arg->weight_set_positions == 0 &&
            arg->ids_size == 0)
          continue;
        size++;
@@ -2300,12 +2428,12 @@ void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
       ::encode(size, bl);
       for (__u32 i = 0; i < arg_map.size; i++) {
        crush_choose_arg *arg = &arg_map.args[i];
-       if (arg->weight_set_size == 0 &&
+       if (arg->weight_set_positions == 0 &&
            arg->ids_size == 0)
          continue;
        ::encode(i, bl);
-       ::encode(arg->weight_set_size, bl);
-       for (__u32 j = 0; j < arg->weight_set_size; j++) {
+       ::encode(arg->weight_set_positions, bl);
+       for (__u32 j = 0; j < arg->weight_set_positions; j++) {
          crush_weight_set *weight_set = &arg->weight_set[j];
          ::encode(weight_set->size, bl);
          for (__u32 k = 0; k < weight_set->size; k++)
@@ -2433,11 +2561,11 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
          ::decode(bucket_index, blp);
          assert(bucket_index < arg_map.size);
          crush_choose_arg *arg = &arg_map.args[bucket_index];
-         ::decode(arg->weight_set_size, blp);
-         if (arg->weight_set_size) {
+         ::decode(arg->weight_set_positions, blp);
+         if (arg->weight_set_positions) {
            arg->weight_set = (crush_weight_set*)calloc(
-             arg->weight_set_size, sizeof(crush_weight_set));
-           for (__u32 k = 0; k < arg->weight_set_size; k++) {
+             arg->weight_set_positions, sizeof(crush_weight_set));
+           for (__u32 k = 0; k < arg->weight_set_positions; k++) {
              crush_weight_set *weight_set = &arg->weight_set[k];
              ::decode(weight_set->size, blp);
              weight_set->weights = (__u32*)calloc(
@@ -2457,6 +2585,7 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
        choose_args[choose_args_index] = arg_map;
       }
     }
+    update_choose_args(nullptr); // in case we decode a legacy "corrupted" map
     finalize();
   }
   catch (...) {
@@ -2750,15 +2879,15 @@ void CrushWrapper::dump_choose_args(Formatter *f) const
     f->open_array_section(stringify(c.first).c_str());
     for (__u32 i = 0; i < arg_map.size; i++) {
       crush_choose_arg *arg = &arg_map.args[i];
-      if (arg->weight_set_size == 0 &&
+      if (arg->weight_set_positions == 0 &&
          arg->ids_size == 0)
        continue;
       f->open_object_section("choose_args");
       int bucket_index = i;
       f->dump_int("bucket_id", -1-bucket_index);
-      if (arg->weight_set_size > 0) {
+      if (arg->weight_set_positions > 0) {
        f->open_array_section("weight_set");
-       for (__u32 j = 0; j < arg->weight_set_size; j++) {
+       for (__u32 j = 0; j < arg->weight_set_positions; j++) {
          f->open_array_section("weights");
          __u32 *weights = arg->weight_set[j].weights;
          __u32 size = arg->weight_set[j].size;
@@ -2926,7 +3055,7 @@ protected:
        if (b &&
            bidx < (int)cmap.size &&
            cmap.args[bidx].weight_set &&
-           cmap.args[bidx].weight_set_size >= 1) {
+           cmap.args[bidx].weight_set_positions >= 1) {
          int pos;
          for (pos = 0;
               pos < (int)cmap.args[bidx].weight_set[0].size &&
@@ -3111,6 +3240,10 @@ int CrushWrapper::_choose_type_stack(
                   << " w " << w << dendl;
     vector<int> o;
     auto tmpi = i;
+    if (i == orig.end()) {
+      ldout(cct, 10) << __func__ << " end of orig, break 0" << dendl;
+      break;
+    }
     for (auto from : w) {
       ldout(cct, 10) << " from " << from << dendl;
       // identify leaves under each choice.  we use this to check whether any of these
@@ -3154,6 +3287,7 @@ int CrushWrapper::_choose_type_stack(
              ldout(cct, 10) << __func__ << " pos " << pos << " replace "
                             << *i << " -> " << item << dendl;
              replaced = true;
+              assert(i != orig.end());
              ++i;
              break;
            }
@@ -3161,6 +3295,7 @@ int CrushWrapper::_choose_type_stack(
          if (!replaced) {
            ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
                           << dendl;
+            assert(i != orig.end());
            o.push_back(*i);
            ++i;
          }
@@ -3275,7 +3410,8 @@ int CrushWrapper::try_remap_rule(
        if (numrep <= 0)
          numrep += maxout;
        type_stack.push_back(make_pair(type, numrep));
-       type_stack.push_back(make_pair(0, 1));
+        if (type > 0)
+         type_stack.push_back(make_pair(0, 1));
        int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
                                   i, used, &w);
        if (r < 0)
@@ -3340,16 +3476,25 @@ int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
   }
   crush_choose_arg *carg = &cmap.args[bidx];
   if (carg->weight_set == NULL) {
-    if (ss)
-      *ss << "no weight-set for bucket " << b->id;
-    ldout(cct, 10) << __func__ << "  no weight_set for bucket " << b->id
-                  << dendl;
-    return 0;
+    // create a weight-set for this bucket and populate it with the
+    // bucket weights
+    unsigned positions = get_choose_args_positions(cmap);
+    carg->weight_set_positions = positions;
+    carg->weight_set = static_cast<crush_weight_set*>(
+      calloc(sizeof(crush_weight_set), positions));
+    for (unsigned p = 0; p < positions; ++p) {
+      carg->weight_set[p].size = b->size;
+      carg->weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
+      for (unsigned i = 0; i < b->size; ++i) {
+       carg->weight_set[p].weights[i] = crush_get_bucket_item_weight(b, i);
+      }
+    }
+    changed++;
   }
-  if (carg->weight_set_size != weight.size()) {
+  if (carg->weight_set_positions != weight.size()) {
     if (ss)
-      *ss << "weight_set_size != " << weight.size() << " for bucket " << b->id;
-    ldout(cct, 10) << __func__ << "  weight_set_size != " << weight.size()
+      *ss << "weight_set_positions != " << weight.size() << " for bucket " << b->id;
+    ldout(cct, 10) << __func__ << "  weight_set_positions != " << weight.size()
                   << " for bucket " << b->id << dendl;
     return 0;
   }
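
A minimal sketch of the new fallback path above, which seeds a missing choose-args weight-set from the bucket's canonical item weights instead of returning early. The `Bucket`/`WeightSet` types here are hypothetical stand-ins for `crush_bucket` and `crush_weight_set`, kept standalone for illustration:

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-ins for the crush C structs, illustration only.
    struct Bucket { std::vector<uint32_t> item_weights; };
    struct WeightSet { std::vector<uint32_t> weights; };

    // One weight-set row per position, each seeded from the canonical
    // bucket weights -- what the patched code does via calloc() and
    // crush_get_bucket_item_weight() when carg->weight_set is NULL.
    std::vector<WeightSet> seed_weight_sets(const Bucket& b, unsigned positions) {
      std::vector<WeightSet> ws(positions);
      for (unsigned p = 0; p < positions; ++p)
        ws[p].weights = b.item_weights;  // start from canonical weights
      return ws;
    }
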
index 64e63760c7545019f5acc36b53318b4454025e8d..b18e4f8e6061a1e8b9168e973d2d1d3dc65d8a2e 100644 (file)
@@ -73,12 +73,12 @@ public:
   std::map<int64_t, crush_choose_arg_map> choose_args;
 
 private:
-  struct crush_map *crush;
+  struct crush_map *crush = nullptr;
 
   bool have_uniform_rules = false;
 
   /* reverse maps */
-  mutable bool have_rmaps;
+  mutable bool have_rmaps = false;
   mutable std::map<string, int> type_rmap, name_rmap, rule_name_rmap;
   void build_rmaps() const {
     if (have_rmaps) return;
@@ -97,7 +97,7 @@ public:
   CrushWrapper(const CrushWrapper& other);
   const CrushWrapper& operator=(const CrushWrapper& other);
 
-  CrushWrapper() : crush(0), have_rmaps(false) {
+  CrushWrapper() {
     create();
   }
   ~CrushWrapper() {
@@ -586,6 +586,7 @@ public:
    * Note that these may not be parentless roots.
    */
   void find_takes(set<int> *roots) const;
+  void find_takes_by_rule(int rule, set<int> *roots) const;
 
   /**
    * find tree roots
@@ -681,9 +682,10 @@ public:
 
   /**
    * return ancestor of the given type, or 0 if none
+   * can pass in a specific crush **rule** to return ancestor from that rule only
    * (parent is always a bucket and thus <0)
    */
-  int get_parent_of_type(int id, int type) const;
+  int get_parent_of_type(int id, int type, int rule = -1) const;
 
   /**
    * get the fully qualified location of a device by successively finding
@@ -726,6 +728,10 @@ public:
    * @return number of items, or error
    */
   int get_children(int id, list<int> *children);
+  void get_children_of_type(int id,
+                            int type,
+                           set<int> *children,
+                           bool exclude_shadow = true) const;
 
   /**
     * get failure-domain type of a specific crush rule
@@ -1385,7 +1391,7 @@ public:
   void destroy_choose_args(crush_choose_arg_map arg_map) {
     for (__u32 i = 0; i < arg_map.size; i++) {
       crush_choose_arg *arg = &arg_map.args[i];
-      for (__u32 j = 0; j < arg->weight_set_size; j++) {
+      for (__u32 j = 0; j < arg->weight_set_positions; j++) {
        crush_weight_set *weight_set = &arg->weight_set[j];
        free(weight_set->weights);
       }
@@ -1412,9 +1418,9 @@ public:
       carg.ids_size = 0;
       if (b && b->alg == CRUSH_BUCKET_STRAW2) {
        crush_bucket_straw2 *sb = (crush_bucket_straw2*)b;
-       carg.weight_set_size = positions;
+       carg.weight_set_positions = positions;
        carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
-                                                   carg.weight_set_size);
+                                                   carg.weight_set_positions);
        // initialize with canonical weights
        for (int pos = 0; pos < positions; ++pos) {
          carg.weight_set[pos].size = b->size;
@@ -1425,7 +1431,7 @@ public:
        }
       } else {
        carg.weight_set = NULL;
-       carg.weight_set_size = 0;
+       carg.weight_set_positions = 0;
       }
     }
   }
@@ -1444,6 +1450,9 @@ public:
     choose_args.clear();
   }
 
+  // remove choose_args for buckets that no longer exist, create them for new buckets
+  void update_choose_args(CephContext *cct);
+
   // adjust choose_args_map weight, preserving the hierarchical summation
   // property.  used by callers optimizing layouts by tweaking weights.
   int _choose_args_adjust_item_weight_in_bucket(
@@ -1473,8 +1482,8 @@ public:
   int get_choose_args_positions(crush_choose_arg_map cmap) {
     // infer positions from other buckets
     for (unsigned j = 0; j < cmap.size; ++j) {
-      if (cmap.args[j].weight_set_size) {
-       return cmap.args[j].weight_set_size;
+      if (cmap.args[j].weight_set_positions) {
+       return cmap.args[j].weight_set_positions;
       }
     }
     return 1;
index f492a27d3819b98125978e3c02c156f75c480737..be8b5e24cebcac4fc24988d5b0fc35dfc1e114c4 100644 (file)
@@ -1447,7 +1447,7 @@ struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_p
       weights += bucket->h.size;
     }
     arg[b].weight_set = weight_set;
-    arg[b].weight_set_size = num_positions;
+    arg[b].weight_set_positions = num_positions;
     weight_set += position;
 
     memcpy(ids, bucket->h.items, sizeof(__s32) * bucket->h.size);
index e5bdba2d791e0638817d5d1da52a187f400caaf2..8438898a4a6bc4a12ead8a13038fcbb350d72cfe 100644 (file)
@@ -258,7 +258,7 @@ struct crush_weight_set {
  * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
  * replacement weights found at __weight_set[N]__ are used instead of
  * the weights from __item_weights__. If __N__ is greater than
- * __weight_set_size__, the weights found at __weight_set_size-1__ are
+ * __weight_set_positions__, the weights found at __weight_set_positions-1__ are
  * used instead. For instance if __weight_set__ is:
  *
  *    [ [ 0x10000, 0x20000 ],   // position 0
@@ -274,7 +274,7 @@ struct crush_choose_arg {
   __s32 *ids;                           /*!< values to use instead of items */
   __u32 ids_size;                       /*!< size of the __ids__ array */
   struct crush_weight_set *weight_set;  /*!< weight replacements for a given position */
-  __u32 weight_set_size;                /*!< size of the __weight_set__ array */
+  __u32 weight_set_positions;           /*!< size of the __weight_set__ array */
 };
 
 /** @ingroup API
index e15039ed944b9b880647e73d8cafd38d9f6c4a46..76bc86d528e05f00fd31c7ffd7efca5be565a154 100644 (file)
@@ -306,8 +306,8 @@ static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bu
   if ((arg == NULL) ||
       (arg->weight_set == NULL))
     return bucket->item_weights;
-  if (position >= arg->weight_set_size)
-    position = arg->weight_set_size - 1;
+  if (position >= arg->weight_set_positions)
+    position = arg->weight_set_positions - 1;
   return arg->weight_set[position].weights;
 }
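
Both this mapper.c hunk and the crush.h comment earlier describe the same lookup rule: replica position N uses `weight_set[N]`, and positions past the end reuse the last row. A small hedged sketch of that clamp with a standard container (the real code also falls back to `bucket->item_weights` when no weight-set exists; that case is assumed handled by the caller here):

    #include <cstdint>
    #include <vector>

    // Clamp a replica position to the available weight-set rows, as
    // get_choose_arg_weights() does; assumes weight_set is non-empty.
    const std::vector<uint32_t>& weights_for_position(
        const std::vector<std::vector<uint32_t>>& weight_set,
        size_t position) {
      if (position >= weight_set.size())
        position = weight_set.size() - 1;  // reuse the last position's row
      return weight_set[position];
    }
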
 
index 147f294a350ee775956f1de134634224f854526f..7d0abb622a372f7afa6548c80f247d25463f91d7 100644 (file)
@@ -414,13 +414,6 @@ void global_init_postfork_start(CephContext *cct)
         << err << dendl;
     exit(1);
   }
-  VOID_TEMP_FAILURE_RETRY(close(STDOUT_FILENO));
-  if (open("/dev/null", O_RDONLY) < 0) {
-    int err = errno;
-    derr << "global_init_daemonize: open(/dev/null) failed: error "
-        << err << dendl;
-    exit(1);
-  }
 
   const md_config_t *conf = cct->_conf;
   if (pidfile_write(conf) < 0)
@@ -435,8 +428,8 @@ void global_init_postfork_start(CephContext *cct)
 
 void global_init_postfork_finish(CephContext *cct)
 {
-  /* We only close stderr once the caller decides the daemonization
-   * process is finished.  This way we can allow error messages to be
+  /* We only close stdout+stderr once the caller decides the daemonization
+   * process is finished.  This way we can allow error or other messages to be
    * propagated in a manner that the user is able to see.
    */
   if (!(cct->get_init_flags() & CINIT_FLAG_NO_CLOSE_STDERR)) {
@@ -447,6 +440,15 @@ void global_init_postfork_finish(CephContext *cct)
       exit(1);
     }
   }
+
+  VOID_TEMP_FAILURE_RETRY(close(STDOUT_FILENO));
+  if (open("/dev/null", O_RDONLY) < 0) {
+    int err = errno;
+    derr << "global_init_daemonize: open(/dev/null) failed: error "
+        << err << dendl;
+    exit(1);
+  }
+
   ldout(cct, 1) << "finished global_init_daemonize" << dendl;
 }
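
The net effect of the two global_init hunks is an ordering change: stdout now stays attached (like stderr) through `global_init_postfork_start`, and both are pointed at /dev/null only in `global_init_postfork_finish`, once the daemon has finished starting up. A generic, hedged illustration of that pattern follows -- this is not the ceph API, and the patch itself closes and reopens the fd rather than using dup2():

    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>

    // Keep stdio attached during early startup so failures stay visible,
    // and silence it only once initialization has finished.
    static void redirect_to_devnull(int fd) {
      int null_fd = open("/dev/null", fd == STDIN_FILENO ? O_RDONLY : O_WRONLY);
      if (null_fd >= 0) {
        dup2(null_fd, fd);
        close(null_fd);
      }
    }

    int main() {
      fprintf(stderr, "starting up...\n");   // early errors reach the user
      // ... initialization that may still fail loudly ...
      redirect_to_devnull(STDOUT_FILENO);    // analogous to postfork_finish
      redirect_to_devnull(STDERR_FILENO);
      return 0;
    }
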
 
index 19fab5f303a56715fc13f1cad680da02a7119c13..bea6f49fb160760e13d12644f6652573809ce353 100755 (executable)
@@ -163,8 +163,8 @@ DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
 DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
 DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
 DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2)             // *do not share this bit*
 
-DEFINE_CEPH_FEATURE(61, 1, RESERVED2)          // unused, but slow down!
 DEFINE_CEPH_FEATURE(62, 1, RESERVED)           // do not use; used as a sentinel
 DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
 
@@ -230,6 +230,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
         CEPH_FEATURE_RESEND_ON_SPLIT |         \
         CEPH_FEATURE_RADOS_BACKOFF |           \
         CEPH_FEATURE_OSD_RECOVERY_DELETES | \
+        CEPH_FEATURE_CEPHX_V2 | \
         0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
@@ -254,7 +255,6 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 static inline void ____build_time_check_for_reserved_bits(void) {
        CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL &
                            (CEPH_FEATURE_RESERVED |
-                            CEPH_FEATURE_RESERVED2 |
                             DEPRECATED_CEPH_FEATURE_RESERVED_BROKEN)) == 0);
 }
 
index 4ddfda5be7b74eab727eadc513a8ca23f8bffa2d..9c73c5cd9dcaa972d595a08c5288ec31165cc8e7 100644 (file)
@@ -733,6 +733,7 @@ int ceph_flags_to_mode(int flags);
                                 CEPH_CAP_XATTR_SHARED)
 #define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
                                   CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT        CEPH_CAP_FILE_WREXTEND
 
 #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |                    \
                              CEPH_CAP_LINK_SHARED |                    \
index 0a48c80c9c569074a98b5c3d2d14576cb0102aa3..b6c0f706a419dd8209e9c4bcbd0e9d02f8288c14 100644 (file)
@@ -1473,6 +1473,8 @@ int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle,
                 int64_t off, uint64_t len, char* buf);
 int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh,
                  int syncdataonly);
+int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in,
+                 int syncdataonly);
 int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle,
                  int64_t off, uint64_t len, const char *data);
 int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
index d4120478c88ead29a932704e25c5bd8a33849a13..eb384c2e137985a8078b82bbcc0a1a961c849006 100644 (file)
@@ -13,6 +13,7 @@
 #define CEPH_COMPAT_H
 
 #include "acconfig.h"
+#include <sys/types.h>
 
 #if defined(__linux__)
 #define PROCPREFIX
     0; })
 #endif
 
+int ceph_posix_fallocate(int fd, off_t offset, off_t len);
+
 #endif /* !CEPH_COMPAT_H */
index 09475e52f15014f91d04bd8d5ff8ab424bf6af38..185285d6775f1efc820b3fdf9d4ebab24d820d7d 100644 (file)
@@ -517,8 +517,8 @@ class interval_set {
     erase(val, 1);
   }
 
-  void erase(T start, T len,
-    std::function<bool(T, T)> post_process = {}) {
+  void erase(T start, T len, 
+    std::function<bool(T, T)> claim = {}) {
     typename Map::iterator p = find_inc_m(start);
 
     _size -= len;
@@ -530,13 +530,22 @@ class interval_set {
     T before = start - p->first;
     assert(p->second >= before+len);
     T after = p->second - before - len;
-    if (before && (post_process ? post_process(p->first, before) : true)) {
-      p->second = before;        // shorten bit before
+    if (before) {
+      if (claim && claim(p->first, before)) {
+       _size -= before;
+       m.erase(p);
+      } else {
+       p->second = before;        // shorten bit before
+      }
     } else {
       m.erase(p);
     }
-    if (after && (post_process ? post_process(start + len, after) : true)) {
-      m[start + len] = after;
+    if (after) {
+      if (claim && claim(start + len, after)) {
+       _size -= after;
+      } else {
+       m[start + len] = after;
+      }
     }
   }
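
The interval_set hunk above changes the callback contract: the optional `claim` predicate is consulted for each fragment that survives the erase (the piece before the hole and the piece after), and a fragment it returns true for is dropped from the set, with `_size` reduced accordingly, rather than kept. A minimal sketch of those semantics over a plain std::map, assuming (as the real assert does) that the erased range sits inside a single interval:

    #include <cstdint>
    #include <functional>
    #include <iterator>
    #include <map>

    using T = uint64_t;

    // Erase [start, start+len) from an interval map (offset -> length);
    // `claim` may take ownership of either leftover fragment, in which
    // case it is dropped and subtracted from `size`, mirroring the hunk.
    void erase_with_claim(std::map<T, T>& m, T& size, T start, T len,
                          std::function<bool(T, T)> claim = {}) {
      auto p = std::prev(m.upper_bound(start));  // interval containing start
      T off    = p->first;
      T before = start - off;
      T after  = p->second - before - len;
      size -= len;
      m.erase(p);
      if (before) {
        if (claim && claim(off, before))
          size -= before;                 // left fragment claimed: drop it
        else
          m[off] = before;                // keep the piece before the hole
      }
      if (after) {
        if (claim && claim(start + len, after))
          size -= after;                  // right fragment claimed: drop it
        else
          m[start + len] = after;         // keep the piece after the hole
      }
    }
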
 
index 1953eb28b40b44d2515259049c86f36c632ec08b..62582b0bca7b7405871b00cc296d6b4a1465f86d 100644 (file)
@@ -93,7 +93,7 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
 #define CEPH_MSGR_TAG_KEEPALIVE2     14
 #define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15  /* keepalive reply */
-
+#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16  /* ceph v2 doing server challenge */
 
 /*
  * connection negotiation
index 5276bc2c1ddb5d28b407f6aa523aa5ece7e82106..70ed8e6d839309ac9fffda33be80c529d2a8afee 100644 (file)
@@ -15,25 +15,41 @@ set(java_srcs
   java/com/ceph/fs/CephStat.java
   java/com/ceph/fs/CephStatVFS.java)
 
-# note: for the -source 1.5 builds, we add
+# note: for the -source 1.7 builds, we add
 #   -Xlint:-options
 # to get rid of the warning
-#   warning: [options] bootstrap class path not set in conjunction with -source 1.5
+#   warning: [options] bootstrap class path not set in conjunction with -source 1.7
 # as per
 #   https://blogs.oracle.com/darcy/entry/bootclasspath_older_source
-set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.5" "-target" "1.5" "-Xlint:-options")
-add_jar(libcephfs ${java_srcs})
+set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.7" "-target" "1.7" "-Xlint:-options")
+set(jni_header_dir "${CMAKE_CURRENT_BINARY_DIR}/native")
+if(Java_VERSION VERSION_LESS 1.8)
+  add_jar(libcephfs ${java_srcs})
+  get_property(libcephfs_jar TARGET libcephfs PROPERTY JAR_FILE)
+  set(java_h native/com_ceph_fs_CephMount.h)
+  add_custom_command(
+    OUTPUT ${java_h}
+    COMMAND ${Java_JAVAH_EXECUTABLE} -classpath ${libcephfs_jar} -jni -o ${java_h} com.ceph.fs.CephMount
+    COMMENT "Building C header files from classes...")
+  add_custom_target(jni-header
+    DEPENDS ${java_h})
+  add_dependencies(jni-header libcephfs)
+else()
+  if(CMAKE_VERSION VERSION_LESS 3.11)
+    set(CMAKE_JAVA_COMPILE_FLAGS ${CMAKE_JAVA_COMPILE_FLAGS} "-h" ${jni_header_dir})
+    add_jar(libcephfs ${java_srcs})
+    add_custom_target(
+      jni-header
+      DEPENDS libcephfs)
+    add_dependencies(jni-header libcephfs)
+  else()
+    add_jar(libcephfs ${java_srcs}
+      GENERATE_NATIVE_HEADERS jni-header
+      DESTINATION ${jni_header_dir})
+  endif()
+  get_property(libcephfs_jar TARGET libcephfs PROPERTY JAR_FILE)
+endif()
 install_jar(libcephfs share/java)
-get_property(libcephfs_jar TARGET libcephfs PROPERTY JAR_FILE)
-
-set(java_h native/com_ceph_fs_CephMount.h)
-add_custom_command(
-  OUTPUT ${java_h}
-  COMMAND ${Java_JAVAH_EXECUTABLE} -classpath ${libcephfs_jar} -jni -o ${CMAKE_CURRENT_BINARY_DIR}/${java_h} com.ceph.fs.CephMount)
-add_custom_target(
-  jni-header
-  DEPENDS ${java_h})
-add_dependencies(jni-header libcephfs)
 
 find_jar(JUNIT_JAR
   NAMES junit4 junit
index cb7a0131782f90d6c74980c6bcac613b9584a6ed..f3fa5a97b2efb9c80d6fb079ec916153e19d9966 100644 (file)
@@ -1539,6 +1539,12 @@ extern "C" int ceph_ll_fsync(class ceph_mount_info *cmount,
   return (cmount->get_client()->ll_fsync(fh, syncdataonly));
 }
 
+extern "C" int ceph_ll_sync_inode(class ceph_mount_info *cmount,
+                            Inode *in, int syncdataonly)
+{
+  return (cmount->get_client()->ll_sync_inode(in, syncdataonly));
+}
+
 extern "C" off_t ceph_ll_lseek(class ceph_mount_info *cmount,
                                Fh *fh, off_t offset, int whence)
 {
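
`ceph_ll_sync_inode` (declared in the cephfs.h hunk earlier, implemented just above) flushes a specific inode via `Client::ll_sync_inode` without requiring an open `Fh`. A hedged usage sketch; obtaining `cmount` and `in` through the usual libcephfs mount and lookup calls is assumed:

    #include <cephfs/libcephfs.h>

    // Sync one inode back to the cluster; syncdataonly == 0 also flushes
    // metadata. Assumes `cmount` is mounted and `in` came from a lookup.
    int sync_inode_example(struct ceph_mount_info *cmount, struct Inode *in) {
      return ceph_ll_sync_inode(cmount, in, /*syncdataonly=*/0);
    }
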
index bafdb948da8c3125cd8a02fda3872ac1b8799fa5..dbd0818da61567c159f640fadb2a3c1c8d080fa5 100644 (file)
@@ -138,7 +138,7 @@ struct C_aio_notify_Complete : public C_aio_linger_Complete {
   }
 
   void complete(int r) override {
-    // invoked by C_notify_Finish (or C_aio_notify_Ack on failure)
+    // invoked by C_notify_Finish
     lock.Lock();
     finished = true;
     complete_unlock(r);
@@ -175,10 +175,6 @@ struct C_aio_notify_Ack : public Context {
     ldout(cct, 10) << __func__ << " linger op " << oncomplete->linger_op << " "
                    << "acked (" << r << ")" << dendl;
     oncomplete->handle_ack(r);
-    if (r < 0) {
-      // on failure, we won't expect to see a notify_finish callback
-      onfinish->complete(r);
-    }
   }
 };
 
@@ -1832,6 +1828,7 @@ int librados::IoCtxImpl::notify(const object_t& oid, bufferlist& bl,
   Context *notify_finish = new C_notify_Finish(client->cct, &notify_finish_cond,
                                                objecter, linger_op, preply_bl,
                                                preply_buf, preply_buf_len);
+  (void) notify_finish;
 
   uint32_t timeout = notify_timeout;
   if (timeout_ms)
@@ -1863,7 +1860,7 @@ int librados::IoCtxImpl::notify(const object_t& oid, bufferlist& bl,
   } else {
     ldout(client->cct, 10) << __func__ << " failed to initiate notify, r = "
                           << r << dendl;
-    notify_finish->complete(r);
+    notify_finish_cond.wait();
   }
 
   objecter->linger_cancel(linger_op);
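
These librados hunks remove a double-completion: the ack callback no longer completes the finish context on error, and the synchronous `notify()` instead waits on `notify_finish_cond`, which `C_notify_Finish` signals exactly once. A small model of that "single completion path, waiter blocks for it" shape, using standard primitives rather than ceph's `Cond`:

    #include <condition_variable>
    #include <mutex>

    // One completion path signals; the failure path waits for that signal
    // instead of completing the context from a second place.
    struct NotifyWaiter {
      std::mutex m;
      std::condition_variable cv;
      bool finished = false;

      void on_notify_finish() {            // sole completion path
        std::lock_guard<std::mutex> l(m);
        finished = true;
        cv.notify_all();
      }

      void wait() {                        // what notify() now does on error
        std::unique_lock<std::mutex> l(m);
        cv.wait(l, [this] { return finished; });
      }
    };
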
index 5b277314f2a34714d3585a1d4a4204e78fb210f5..3f64a8003b05a95a489aa86f08c615a501b43cbc 100644 (file)
@@ -1147,6 +1147,10 @@ struct C_InvalidateCache : public Context {
     journal_policy = policy;
   }
 
+  bool ImageCtx::is_writeback_cache_enabled() const {
+    return (cache && cache_max_dirty > 0);
+  }
+
   void ImageCtx::get_thread_pool_instance(CephContext *cct,
                                           ThreadPool **thread_pool,
                                           ContextWQ **op_work_queue) {
index e73ca7cc82b5b1b06b1c50d98f57435c97b46cac..dc77af6d48d014c6a45accd6f3893e43470456f6 100644 (file)
@@ -331,6 +331,8 @@ namespace librbd {
     journal::Policy *get_journal_policy() const;
     void set_journal_policy(journal::Policy *policy);
 
+    bool is_writeback_cache_enabled() const;
+
     static void get_thread_pool_instance(CephContext *cct,
                                          ThreadPool **thread_pool,
                                          ContextWQ **op_work_queue);
index f8ffda4a75c624719b8c78997754411df33b90ab..daac18868f71d04f22126a1322f1dbf302b0cad0 100644 (file)
@@ -364,7 +364,7 @@ void ImageWatcher<I>::schedule_request_lock(bool use_timer, int timer_delay) {
          !m_image_ctx.exclusive_lock->is_lock_owner());
 
   RWLock::RLocker watch_locker(this->m_watch_lock);
-  if (this->m_watch_state == Watcher::WATCH_STATE_REGISTERED) {
+  if (this->is_registered(this->m_watch_lock)) {
     ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
 
     FunctionContext *ctx = new FunctionContext(
index 54a2246f12cff7ea82c41353141b30a0624de84a..6e31ad7c635929408ace2bd9a80c4a845b74ec12 100644 (file)
@@ -93,23 +93,23 @@ Watcher::Watcher(librados::IoCtx& ioctx, ContextWQ *work_queue,
     m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
     m_watch_lock(util::unique_lock_name("librbd::Watcher::m_watch_lock", this)),
     m_watch_handle(0), m_notifier(work_queue, ioctx, oid),
-    m_watch_state(WATCH_STATE_UNREGISTERED), m_watch_ctx(*this) {
+    m_watch_state(WATCH_STATE_IDLE), m_watch_ctx(*this) {
 }
 
 Watcher::~Watcher() {
   RWLock::RLocker l(m_watch_lock);
-  assert(m_watch_state != WATCH_STATE_REGISTERED);
+  assert(is_unregistered(m_watch_lock));
 }
 
 void Watcher::register_watch(Context *on_finish) {
   ldout(m_cct, 10) << dendl;
 
   RWLock::RLocker watch_locker(m_watch_lock);
-  assert(m_watch_state == WATCH_STATE_UNREGISTERED);
+  assert(is_unregistered(m_watch_lock));
   m_watch_state = WATCH_STATE_REGISTERING;
 
   librados::AioCompletion *aio_comp = create_rados_callback(
-                                         new C_RegisterWatch(this, on_finish));
+    new C_RegisterWatch(this, on_finish));
   int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx);
   assert(r == 0);
   aio_comp->release();
@@ -117,27 +117,35 @@ void Watcher::register_watch(Context *on_finish) {
 
 void Watcher::handle_register_watch(int r, Context *on_finish) {
   ldout(m_cct, 10) << "r=" << r << dendl;
+
+  bool watch_error = false;
   Context *unregister_watch_ctx = nullptr;
   {
     RWLock::WLocker watch_locker(m_watch_lock);
     assert(m_watch_state == WATCH_STATE_REGISTERING);
 
-    std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+    m_watch_state = WATCH_STATE_IDLE;
     if (r < 0) {
       lderr(m_cct) << "failed to register watch: " << cpp_strerror(r)
                    << dendl;
       m_watch_handle = 0;
-      m_watch_state = WATCH_STATE_UNREGISTERED;
-    } else if (r >= 0) {
-      m_watch_state = WATCH_STATE_REGISTERED;
+    }
+
+    if (m_unregister_watch_ctx != nullptr) {
+      std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+    } else if (r == 0 && m_watch_error) {
+      lderr(m_cct) << "re-registering watch after error" << dendl;
+      m_watch_state = WATCH_STATE_REWATCHING;
+      watch_error = true;
     }
   }
 
   on_finish->complete(r);
 
-  // wake up pending unregister request
   if (unregister_watch_ctx != nullptr) {
     unregister_watch_ctx->complete(0);
+  } else if (watch_error) {
+    rewatch();
   }
 }
 
@@ -146,8 +154,7 @@ void Watcher::unregister_watch(Context *on_finish) {
 
   {
     RWLock::WLocker watch_locker(m_watch_lock);
-    if (m_watch_state == WATCH_STATE_REGISTERING ||
-        m_watch_state == WATCH_STATE_REWATCHING) {
+    if (m_watch_state != WATCH_STATE_IDLE) {
       ldout(m_cct, 10) << "delaying unregister until register completed"
                        << dendl;
 
@@ -156,17 +163,13 @@ void Watcher::unregister_watch(Context *on_finish) {
           unregister_watch(on_finish);
         });
       return;
-    }
-
-    if (m_watch_state == WATCH_STATE_REGISTERED ||
-        m_watch_state == WATCH_STATE_ERROR) {
-      m_watch_state = WATCH_STATE_UNREGISTERED;
-
+    } else if (is_registered(m_watch_lock)) {
       librados::AioCompletion *aio_comp = create_rados_callback(
                         new C_UnwatchAndFlush(m_ioctx, on_finish));
       int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp);
       assert(r == 0);
       aio_comp->release();
+      m_watch_handle = 0;
       return;
     }
   }
@@ -208,8 +211,8 @@ std::string Watcher::get_oid() const {
 }
 
 void Watcher::set_oid(const string& oid) {
-  RWLock::WLocker l(m_watch_lock);
-  assert(m_watch_state == WATCH_STATE_UNREGISTERED);
+  RWLock::WLocker watch_locker(m_watch_lock);
+  assert(is_unregistered(m_watch_lock));
 
   m_oid = oid;
 }
@@ -217,9 +220,11 @@ void Watcher::set_oid(const string& oid) {
 void Watcher::handle_error(uint64_t handle, int err) {
   lderr(m_cct) << "handle=" << handle << ": " << cpp_strerror(err) << dendl;
 
-  RWLock::WLocker l(m_watch_lock);
-  if (m_watch_state == WATCH_STATE_REGISTERED) {
-    m_watch_state = WATCH_STATE_ERROR;
+  RWLock::WLocker watch_locker(m_watch_lock);
+  m_watch_error = true;
+
+  if (is_registered(m_watch_lock)) {
+    m_watch_state = WATCH_STATE_REWATCHING;
 
     FunctionContext *ctx = new FunctionContext(
         boost::bind(&Watcher::rewatch, this));
@@ -235,46 +240,93 @@ void Watcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
 void Watcher::rewatch() {
   ldout(m_cct, 10) << dendl;
 
-  RWLock::WLocker l(m_watch_lock);
-  if (m_watch_state != WATCH_STATE_ERROR) {
-    return;
+  Context *unregister_watch_ctx = nullptr;
+  {
+    RWLock::WLocker watch_locker(m_watch_lock);
+    assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+    if (m_unregister_watch_ctx != nullptr) {
+      m_watch_state = WATCH_STATE_IDLE;
+      std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+    } else {
+      m_watch_error = false;
+      auto ctx = create_context_callback<
+        Watcher, &Watcher::handle_rewatch>(this);
+      auto req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock,
+                                        &m_watch_ctx, &m_watch_handle, ctx);
+      req->send();
+      return;
+    }
   }
-  m_watch_state = WATCH_STATE_REWATCHING;
-
-  Context *ctx = create_context_callback<Watcher,
-                                         &Watcher::handle_rewatch>(this);
-  RewatchRequest *req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock,
-                                               &m_watch_ctx,
-                                               &m_watch_handle, ctx);
-  req->send();
+
+  unregister_watch_ctx->complete(0);
 }
 
 void Watcher::handle_rewatch(int r) {
   ldout(m_cct, 10) << "r=" << r << dendl;
 
-  WatchState next_watch_state = WATCH_STATE_REGISTERED;
-  if (r < 0) {
-    // only EBLACKLISTED or ENOENT can be returned
-    assert(r == -EBLACKLISTED || r == -ENOENT);
-    next_watch_state = WATCH_STATE_UNREGISTERED;
+  bool watch_error = false;
+  Context *unregister_watch_ctx = nullptr;
+  {
+    RWLock::WLocker watch_locker(m_watch_lock);
+    assert(m_watch_state == WATCH_STATE_REWATCHING);
+
+    if (m_unregister_watch_ctx != nullptr) {
+      ldout(m_cct, 10) << "image is closing, skip rewatch" << dendl;
+      m_watch_state = WATCH_STATE_IDLE;
+      std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+    } else if (r  == -EBLACKLISTED) {
+      lderr(m_cct) << "client blacklisted" << dendl;
+    } else if (r == -ENOENT) {
+      ldout(m_cct, 5) << "object does not exist" << dendl;
+    } else if (r < 0) {
+      lderr(m_cct) << "failed to rewatch: " << cpp_strerror(r) << dendl;
+      watch_error = true;
+    } else if (m_watch_error) {
+      lderr(m_cct) << "re-registering watch after error" << dendl;
+      watch_error = true;
+    }
   }
 
+  if (unregister_watch_ctx != nullptr) {
+    unregister_watch_ctx->complete(0);
+    return;
+  } else if (watch_error) {
+    rewatch();
+    return;
+  }
+
+  auto ctx = create_context_callback<
+    Watcher, &Watcher::handle_rewatch_callback>(this);
+  m_work_queue->queue(ctx, r);
+}
+
+void Watcher::handle_rewatch_callback(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+  handle_rewatch_complete(r);
+
+  bool watch_error = false;
   Context *unregister_watch_ctx = nullptr;
   {
     RWLock::WLocker watch_locker(m_watch_lock);
     assert(m_watch_state == WATCH_STATE_REWATCHING);
-    m_watch_state = next_watch_state;
 
-    std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
-
-    m_work_queue->queue(
-      create_context_callback<Watcher,
-                              &Watcher::handle_rewatch_complete>(this), r);
+    if (m_unregister_watch_ctx != nullptr) {
+      m_watch_state = WATCH_STATE_IDLE;
+      std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+    } else if (r  == -EBLACKLISTED || r == -ENOENT) {
+      m_watch_state = WATCH_STATE_IDLE;
+    } else if (m_watch_error) {
+      watch_error = true;
+    } else {
+      m_watch_state = WATCH_STATE_IDLE;
+    }
   }
 
-  // wake up pending unregister request
   if (unregister_watch_ctx != nullptr) {
     unregister_watch_ctx->complete(0);
+  } else if (watch_error) {
+    rewatch();
   }
 }
 
index 39009027d9ebc4756bd466b0d6f555e53614975b..518fbdd015e555ea6804e84331eca1c8c1a3bd1a 100644 (file)
@@ -54,19 +54,17 @@ public:
 
   bool is_registered() const {
     RWLock::RLocker locker(m_watch_lock);
-    return m_watch_state == WATCH_STATE_REGISTERED;
+    return is_registered(m_watch_lock);
   }
   bool is_unregistered() const {
     RWLock::RLocker locker(m_watch_lock);
-    return m_watch_state == WATCH_STATE_UNREGISTERED;
+    return is_unregistered(m_watch_lock);
   }
 
 protected:
   enum WatchState {
-    WATCH_STATE_UNREGISTERED,
+    WATCH_STATE_IDLE,
     WATCH_STATE_REGISTERING,
-    WATCH_STATE_REGISTERED,
-    WATCH_STATE_ERROR,
     WATCH_STATE_REWATCHING
   };
 
@@ -80,6 +78,13 @@ protected:
   WatchState m_watch_state;
   AsyncOpTracker m_async_op_tracker;
 
+  bool is_registered(const RWLock&) const {
+    return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle != 0);
+  }
+  bool is_unregistered(const RWLock&) const {
+    return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle == 0);
+  }
+
   void send_notify(bufferlist &payload,
                    watcher::NotifyResponse *response = nullptr,
                    Context *on_finish = nullptr);
@@ -155,12 +160,15 @@ private:
   WatchCtx m_watch_ctx;
   Context *m_unregister_watch_ctx = nullptr;
 
+  bool m_watch_error = false;
+
   uint32_t m_blocked_count = 0;
 
   void handle_register_watch(int r, Context *on_finish);
 
   void rewatch();
   void handle_rewatch(int r);
+  void handle_rewatch_callback(int r);
 
 };
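
With `WATCH_STATE_REGISTERED`, `UNREGISTERED`, and `ERROR` collapsed into `WATCH_STATE_IDLE`, the header above now derives registration from the state plus the watch handle: idle with a nonzero handle means registered, idle with a zero handle means unregistered, and any other state means a transition is in flight. A standalone sketch of that inference, with a hypothetical enum mirroring the patched header:

    #include <cstdint>

    // Registration is inferred from the (state, handle) pair instead of
    // dedicated REGISTERED/UNREGISTERED/ERROR states.
    enum class WatchState { IDLE, REGISTERING, REWATCHING };

    struct WatchProbe {
      WatchState state = WatchState::IDLE;
      uint64_t handle = 0;

      bool is_registered() const   { return state == WatchState::IDLE && handle != 0; }
      bool is_unregistered() const { return state == WatchState::IDLE && handle == 0; }
      bool in_transition() const   { return state != WatchState::IDLE; }
    };
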
 
index 21ee18c98d11e5aa4d654cea95f3a64749f2a6d5..5c0404de1fb953f16c188055d4fe2ac027561e06 100644 (file)
@@ -858,7 +858,8 @@ void Replay<I>::handle_event(const journal::UnknownEvent &event,
 
 template <typename I>
 void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe,
-                                           int r, std::set<int> &filters) {
+                                           int r, std::set<int> &filters,
+                                           bool writeback_cache_enabled) {
   Mutex::Locker locker(m_lock);
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << ": on_ready=" << on_ready << ", "
@@ -877,8 +878,23 @@ void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe,
     return;
   }
 
-  // will be completed after next flush operation completes
-  m_aio_modify_safe_contexts.insert(on_safe);
+  if (writeback_cache_enabled) {
+    // will be completed after next flush operation completes
+    m_aio_modify_safe_contexts.insert(on_safe);
+  } else {
+    // IO is safely stored on disk
+    assert(m_in_flight_aio_modify > 0);
+    --m_in_flight_aio_modify;
+
+    if (m_on_aio_ready != nullptr) {
+      ldout(cct, 10) << ": resuming paused AIO" << dendl;
+      m_on_aio_ready->complete(0);
+      m_on_aio_ready = nullptr;
+    }
+
+    ldout(cct, 20) << ": completing safe context: " << on_safe << dendl;
+    m_image_ctx.op_work_queue->queue(on_safe, 0);
+  }
 }
 
 template <typename I>
@@ -1057,13 +1073,18 @@ Replay<I>::create_aio_modify_completion(Context *on_ready,
   }
 
   ++m_in_flight_aio_modify;
-  m_aio_modify_unsafe_contexts.push_back(on_safe);
+
+  bool writeback_cache_enabled = m_image_ctx.is_writeback_cache_enabled();
+  if (writeback_cache_enabled) {
+    m_aio_modify_unsafe_contexts.push_back(on_safe);
+  }
 
   // FLUSH if we hit the low-water mark -- on_safe contexts are
   // completed by flushes-only so that we don't move the journal
   // commit position until safely on-disk
 
-  *flush_required = (m_aio_modify_unsafe_contexts.size() ==
+  *flush_required = (writeback_cache_enabled &&
+                     m_aio_modify_unsafe_contexts.size() ==
                        IN_FLIGHT_IO_LOW_WATER_MARK);
   if (*flush_required) {
     ldout(cct, 10) << ": hit AIO replay low-water mark: scheduling flush"
@@ -1086,7 +1107,8 @@ Replay<I>::create_aio_modify_completion(Context *on_ready,
   // event. when flushed, the completion of the next flush will fire the
   // on_safe callback
   auto aio_comp = io::AioCompletion::create_and_start<Context>(
-    new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters)),
+    new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters),
+                            writeback_cache_enabled),
     util::get_image_ctx(&m_image_ctx), aio_type);
   return aio_comp;
 }
index 4a4260cb9f00d8d30c5b464bb2c5620fa444f9cd..50ccfa96551e982638e2d86cdf0d0225e88544c0 100644 (file)
@@ -78,13 +78,17 @@ private:
     Context *on_ready;
     Context *on_safe;
     std::set<int> filters;
+    bool writeback_cache_enabled;
     C_AioModifyComplete(Replay *replay, Context *on_ready,
-                        Context *on_safe, std::set<int> &&filters)
+                        Context *on_safe, std::set<int> &&filters,
+                        bool writeback_cache_enabled)
       : replay(replay), on_ready(on_ready), on_safe(on_safe),
-        filters(std::move(filters)) {
+        filters(std::move(filters)),
+        writeback_cache_enabled(writeback_cache_enabled) {
     }
     void finish(int r) override {
-      replay->handle_aio_modify_complete(on_ready, on_safe, r, filters);
+      replay->handle_aio_modify_complete(on_ready, on_safe, r, filters,
+                                         writeback_cache_enabled);
     }
   };
 
@@ -177,7 +181,8 @@ private:
                     Context *on_safe);
 
   void handle_aio_modify_complete(Context *on_ready, Context *on_safe,
-                                  int r, std::set<int> &filters);
+                                  int r, std::set<int> &filters,
+                                  bool writeback_cache_enabled);
   void handle_aio_flush_complete(Context *on_flush_safe, Contexts &on_safe_ctxs,
                                  int r);
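
The Replay changes above gate deferred completion on the cache mode: with a writeback cache, `on_safe` contexts are parked until the next flush commits the journal position; without one, a completed AIO modify is already durable, so `on_safe` fires immediately and no low-water-mark flush is needed. A condensed, hypothetical model of that branch:

    #include <functional>
    #include <utility>
    #include <vector>

    // Park the safe-callback behind the next flush only when a writeback
    // cache may still hold dirty data; otherwise fire it right away.
    struct ReplayModel {
      bool writeback_cache_enabled = true;
      std::vector<std::function<void()>> parked_on_safe;  // fired on flush

      void on_aio_modify_complete(std::function<void()> on_safe) {
        if (writeback_cache_enabled)
          parked_on_safe.push_back(std::move(on_safe));   // wait for flush
        else
          on_safe();                                      // already on disk
      }

      void on_flush_complete() {
        for (auto& cb : parked_on_safe) cb();
        parked_on_safe.clear();
      }
    };
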
 
index 6af49b4c8cd56fd2a37ecb7f00b37fe215681193..5f7e23b4b048c0343972e53355463a864ea2a108 100644 (file)
@@ -433,13 +433,14 @@ void Log::start()
 
 void Log::stop()
 {
-  assert(is_started());
-  pthread_mutex_lock(&m_queue_mutex);
-  m_stop = true;
-  pthread_cond_signal(&m_cond_flusher);
-  pthread_cond_broadcast(&m_cond_loggers);
-  pthread_mutex_unlock(&m_queue_mutex);
-  join();
+  if (is_started()) {
+    pthread_mutex_lock(&m_queue_mutex);
+    m_stop = true;
+    pthread_cond_signal(&m_cond_flusher);
+    pthread_cond_broadcast(&m_cond_loggers);
+    pthread_mutex_unlock(&m_queue_mutex);
+    join();
+  }
 }
 
 void *Log::entry()
index ea59c57b827c43b5e6ce3a57ff5982b26a0ebbd1..e94390f65bf4fbb834ade342b9bcfbc85547058f 100644 (file)
@@ -142,6 +142,7 @@ void Beacon::handle_mds_beacon(MMDSBeacon *m)
     dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
             << " seq " << m->get_seq() << " dne" << dendl;
   }
+  m->put();
 }
 
 
index 1157e07dfe1e1b5fdd6aff0c112494310d13c469..f2396b31ba8b652db30d5444ce4295dc1bf833b0 100644 (file)
@@ -196,6 +196,7 @@ CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) :
   pop_nested(ceph_clock_now()),
   pop_auth_subtree(ceph_clock_now()),
   pop_auth_subtree_nested(ceph_clock_now()),
+  pop_lru_subdirs(member_offset(CInode, item_pop_lru)),
   num_dentries_nested(0), num_dentries_auth_subtree(0),
   num_dentries_auth_subtree_nested(0),
   dir_auth(CDIR_AUTH_DEFAULT)
@@ -386,7 +387,6 @@ CDentry* CDir::add_primary_dentry(boost::string_view dname, CInode *in,
   items[dn->key()] = dn;
 
   dn->get_linkage()->inode = in;
-  in->set_primary_parent(dn);
 
   link_inode_work(dn, in);
 
@@ -533,7 +533,6 @@ void CDir::link_primary_inode(CDentry *dn, CInode *in)
   assert(dn->get_linkage()->is_null());
 
   dn->get_linkage()->inode = in;
-  in->set_primary_parent(dn);
 
   link_inode_work(dn, in);
 
@@ -558,7 +557,7 @@ void CDir::link_primary_inode(CDentry *dn, CInode *in)
 void CDir::link_inode_work( CDentry *dn, CInode *in)
 {
   assert(dn->get_linkage()->get_inode() == in);
-  assert(in->get_parent_dn() == dn);
+  in->set_primary_parent(dn);
 
   // set inode version
   //in->inode.version = dn->get_version();
@@ -646,9 +645,11 @@ void CDir::unlink_inode_work( CDentry *dn )
     // unlink auth_pin count
     if (in->auth_pins + in->nested_auth_pins)
       dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins), 0 - in->auth_pins, NULL);
-    
+
     // detach inode
     in->remove_primary_parent(dn);
+    if (in->is_dir())
+      in->item_pop_lru.remove_myself();
     dn->get_linkage()->inode = 0;
   } else {
     assert(!dn->get_linkage()->is_null());
@@ -823,10 +824,13 @@ void CDir::steal_dentry(CDentry *dn)
     if (dn->get_linkage()->is_primary()) {
       CInode *in = dn->get_linkage()->get_inode();
       auto pi = in->get_projected_inode();
-      if (dn->get_linkage()->get_inode()->is_dir())
+      if (in->is_dir()) {
        fnode.fragstat.nsubdirs++;
-      else
+       if (in->item_pop_lru.is_on_list())
+         pop_lru_subdirs.push_back(&in->item_pop_lru);
+      } else {
        fnode.fragstat.nfiles++;
+      }
       fnode.rstat.rbytes += pi->accounted_rstat.rbytes;
       fnode.rstat.rfiles += pi->accounted_rstat.rfiles;
       fnode.rstat.rsubdirs += pi->accounted_rstat.rsubdirs;
@@ -1659,8 +1663,7 @@ CDentry *CDir::_load_dentry(
     bufferlist &bl,
     const int pos,
     const std::set<snapid_t> *snaps,
-    bool *force_dirty,
-    list<CInode*> *undef_inodes)
+    bool *force_dirty)
 {
   bufferlist::iterator q = bl.begin();
 
@@ -1712,10 +1715,16 @@ CDentry *CDir::_load_dentry(
     }
 
     if (dn) {
-      if (dn->get_linkage()->get_inode() == 0) {
-        dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
-      } else {
-        dout(12) << "_fetched  had dentry " << *dn << dendl;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+      if (committed_version == 0 &&
+         dnl->is_remote() &&
+         dn->is_dirty() &&
+         ino == dnl->get_remote_ino() &&
+         d_type == dnl->get_remote_d_type()) {
+       // see comment below
+       dout(10) << "_fetched  had underwater dentry " << *dn << ", marking clean" << dendl;
+       dn->mark_clean();
       }
     } else {
       // (remote) link
@@ -1748,15 +1757,35 @@ CDentry *CDir::_load_dentry(
 
     bool undef_inode = false;
     if (dn) {
-      CInode *in = dn->get_linkage()->get_inode();
-      if (in) {
-        dout(12) << "_fetched  had dentry " << *dn << dendl;
-        if (in->state_test(CInode::STATE_REJOINUNDEF)) {
-          undef_inodes->push_back(in);
-          undef_inode = true;
-        }
-      } else
-        dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+
+      if (dnl->is_primary()) {
+       CInode *in = dnl->get_inode();
+       if (in->state_test(CInode::STATE_REJOINUNDEF)) {
+         undef_inode = true;
+       } else if (committed_version == 0 &&
+                  dn->is_dirty() &&
+                  inode_data.inode.ino == in->ino() &&
+                  inode_data.inode.version == in->get_version()) {
+         /* clean underwater item?
+          * Underwater item is something that is dirty in our cache from
+          * journal replay, but was previously flushed to disk before the
+          * mds failed.
+          *
+          * We only do this if committed_version == 0. That implies either
+          * - this is a fetch right after a clean/empty CDir is created
+          *   (and has no effect, since the dn won't exist); or
+          * - this is a fetch after _recovery_, which is what we're worried
+          *   about.  Items that are marked dirty from the journal should be
+          *   marked clean if they appear on disk.
+          */
+         dout(10) << "_fetched  had underwater dentry " << *dn << ", marking clean" << dendl;
+         dn->mark_clean();
+         dout(10) << "_fetched  had underwater inode " << *dnl->get_inode() << ", marking clean" << dendl;
+         in->mark_clean();
+       }
+      }
     }
 
     if (!dn || undef_inode) {
@@ -1918,7 +1947,7 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
     try {
       dn = _load_dentry(
             p->first, dname, last, p->second, pos, snaps,
-            &force_dirty, &undef_inodes);
+            &force_dirty);
     } catch (const buffer::error &err) {
       cache->mds->clog->warn() << "Corrupt dentry '" << dname << "' in "
                                   "dir frag " << dirfrag() << ": "
@@ -1937,35 +1966,16 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
       continue;
     }
 
-    if (dn && (wanted_items.count(mempool::mds_co::string(boost::string_view(dname))) > 0 || !complete)) {
-      dout(10) << " touching wanted dn " << *dn << dendl;
-      inode->mdcache->touch_dentry(dn);
-    }
+    if (!dn)
+      continue;
 
-    /** clean underwater item?
-     * Underwater item is something that is dirty in our cache from
-     * journal replay, but was previously flushed to disk before the
-     * mds failed.
-     *
-     * We only do this is committed_version == 0. that implies either
-     * - this is a fetch after from a clean/empty CDir is created
-     *   (and has no effect, since the dn won't exist); or
-     * - this is a fetch after _recovery_, which is what we're worried 
-     *   about.  Items that are marked dirty from the journal should be
-     *   marked clean if they appear on disk.
-     */
-    if (committed_version == 0 &&     
-       dn &&
-       dn->get_version() <= got_fnode.version &&
-       dn->is_dirty()) {
-      dout(10) << "_fetched  had underwater dentry " << *dn << ", marking clean" << dendl;
-      dn->mark_clean();
+    CDentry::linkage_t *dnl = dn->get_linkage();
+    if (dnl->is_primary() && dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
+      undef_inodes.push_back(dnl->get_inode());
 
-      if (dn->get_linkage()->is_primary()) {
-       assert(dn->get_linkage()->get_inode()->get_version() <= got_fnode.version);
-       dout(10) << "_fetched  had underwater inode " << *dn->get_linkage()->get_inode() << ", marking clean" << dendl;
-       dn->get_linkage()->get_inode()->mark_clean();
-      }
+    if (!complete || wanted_items.count(mempool::mds_co::string(boost::string_view(dname))) > 0) {
+      dout(10) << " touching wanted dn " << *dn << dendl;
+      inode->mdcache->touch_dentry(dn);
     }
   }
 
@@ -3057,6 +3067,28 @@ void CDir::dump(Formatter *f) const
   MDSCacheObject::dump(f);
 }
 
+void CDir::dump_load(Formatter *f, utime_t now, const DecayRate& rate)
+{
+  f->dump_stream("path") << get_path();
+  f->dump_stream("dirfrag") << dirfrag();
+
+  f->open_object_section("pop_me");
+  pop_me.dump(f, now, rate);
+  f->close_section();
+
+  f->open_object_section("pop_nested");
+  pop_nested.dump(f, now, rate);
+  f->close_section();
+
+  f->open_object_section("pop_auth_subtree");
+  pop_auth_subtree.dump(f, now, rate);
+  f->close_section();
+
+  f->open_object_section("pop_auth_subtree_nested");
+  pop_auth_subtree_nested.dump(f, now, rate);
+  f->close_section();
+}
+
 /****** Scrub Stuff *******/
 
 void CDir::scrub_info_create() const
index 8de1867329df9aae00a736fc667b3ee69a241e2f..0ce4ac0ba0a015b83ee825dabfbf30e33a31ab7c 100644 (file)
@@ -362,6 +362,8 @@ protected:
 
   load_spread_t pop_spread;
 
+  elist<CInode*> pop_lru_subdirs;
+
   // and to provide density
   int num_dentries_nested;
   int num_dentries_auth_subtree;
@@ -605,8 +607,7 @@ protected:
       bufferlist &bl,
       int pos,
       const std::set<snapid_t> *snaps,
-      bool *force_dirty,
-      std::list<CInode*> *undef_inodes);
+      bool *force_dirty);
 
   /**
    * Mark this fragment as BADFRAG (common part of go_bad and go_bad_dentry)
@@ -747,6 +748,7 @@ public:
   ostream& print_db_line_prefix(ostream& out) override;
   void print(ostream& out) override;
   void dump(Formatter *f) const;
+  void dump_load(Formatter *f, utime_t now, const DecayRate& rate);
 };
 
 #endif
index 929e3252e48a1722110d46b558d3e07cc900f581..1d07d8756959572ace40a691460fb40860d2dc5a 100644 (file)
@@ -423,7 +423,7 @@ void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
   inode = front.inode;
 
   if (inode.is_backtrace_updated())
-    _mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
+    mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
 
   if (front.xattrs) {
     --num_projected_xattrs;
@@ -1236,7 +1236,7 @@ void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
   mdcache->fetch_backtrace(inode.ino, get_backtrace_pool(), *backtrace, fin);
 }
 
-void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
+void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
 {
   if (!state_test(STATE_DIRTYPARENT)) {
     dout(10) << "mark_dirty_parent" << dendl;
@@ -1283,7 +1283,7 @@ void CInode::verify_diri_backtrace(bufferlist &bl, int err)
     mds->clog->error() << "bad backtrace on directory inode " << ino();
     assert(!"bad backtrace" == (g_conf->mds_verify_backtrace > 1));
 
-    _mark_dirty_parent(mds->mdlog->get_current_segment(), false);
+    mark_dirty_parent(mds->mdlog->get_current_segment(), false);
     mds->mdlog->flush();
   }
 }
@@ -3442,6 +3442,8 @@ void CInode::encode_cap_message(MClientCaps *m, Capability *cap)
   m->ctime = i->ctime;
   m->change_attr = i->change_attr;
   m->time_warp_seq = i->time_warp_seq;
+  m->nfiles = i->dirstat.nfiles;
+  m->nsubdirs = i->dirstat.nsubdirs;
 
   if (cap->client_inline_version < i->inline_data.version) {
     m->inline_version = cap->client_inline_version = i->inline_data.version;
@@ -3681,7 +3683,7 @@ void CInode::decode_import(bufferlist::iterator& p,
   }
   if (is_dirty_parent()) {
     get(PIN_DIRTYPARENT);
-    _mark_dirty_parent(ls);
+    mark_dirty_parent(ls);
   }
 
   ::decode(pop, ceph_clock_now(), p);
@@ -3925,7 +3927,7 @@ next:
         in->make_path_string(path);
         in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
                                        << "(" << path << "), rewriting it";
-        in->_mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
+        in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
                            false);
         // Flag that we repaired this BT so that it won't go into damagetable
         results->backtrace.repaired = true;
index c735f43a2f011ea489c300002a190540462392cc..9b356d9f5c4c422eeb9f11a4c5d7119a84313ff9 100644 (file)
@@ -631,6 +631,7 @@ public:
   int auth_pin_freeze_allowance = 0;
 
   inode_load_vec_t pop;
+  elist<CInode*>::item item_pop_lru;
 
   // friends
   friend class Server;
@@ -762,7 +763,7 @@ protected:
    */
   int64_t get_backtrace_pool() const;
 public:
-  void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
+  void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
   void clear_dirty_parent();
   void verify_diri_backtrace(bufferlist &bl, int err);
   bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
index ffe88c242aee45d5280be04b0e35cb47d48604f7..e8e05dbd609eb359792ebfde24b2385cf0a1484c 100644 (file)
@@ -271,7 +271,7 @@ public:
   Export make_export() {
     return Export(cap_id, _wanted, issued(), pending(), client_follows, last_sent, mseq+1, last_issue_stamp);
   }
-  void merge(Export& other, bool auth_cap) {
+  void merge(const Export& other, bool auth_cap) {
     if (!is_stale()) {
       // issued + pending
       int newpending = other.pending | pending();
index d2ba81aeb09b03186cadf5d2c0319d3a82b2102e..504aa41f9b2e6b5909111e7c5bb69d24213ed72a 100644 (file)
@@ -230,6 +230,7 @@ void FSMap::create_filesystem(boost::string_view name,
                               uint64_t features)
 {
   auto fs = std::make_shared<Filesystem>();
+  fs->mds_map.epoch = epoch;
   fs->mds_map.fs_name = std::string(name);
   fs->mds_map.max_mds = 1;
   fs->mds_map.data_pools.push_back(data_pool);
index d1757f8601224240984444b984b9d945a2dfc12a..720a22f0be78e4a8f14d3ce06615e3955edfc55f 100644 (file)
@@ -103,6 +103,7 @@ protected:
 public:
 
   friend class MDSMonitor;
+  friend class PaxosFSMap;
 
   FSMap() 
     : epoch(0),
index 717779b8d6d6ab9378c69fd921486cee7ba9b538..f9858b573f5f660f306c58127b2ad5c1c02c6dba 100644 (file)
@@ -17,6 +17,7 @@
 #include "MDSRank.h"
 #include "MDCache.h"
 #include "Locker.h"
+#include "MDBalancer.h"
 #include "CInode.h"
 #include "CDir.h"
 #include "CDentry.h"
@@ -1886,6 +1887,9 @@ void Locker::file_update_finish(CInode *in, MutationRef& mut, bool share_max, bo
   }
   issue_caps_set(need_issue);
 
+  utime_t now = ceph_clock_now();
+  mds->balancer->hit_inode(now, in, META_POP_IWR);
+
   // auth unpin after issuing caps
   mut->cleanup();
 }
@@ -2414,6 +2418,8 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
     pi.inode.rstat.rbytes = new_size;
     dout(10) << "check_inode_max_size mtime " << pi.inode.mtime << " -> " << new_mtime << dendl;
     pi.inode.mtime = new_mtime;
+    if (new_mtime > pi.inode.ctime)
+      pi.inode.ctime = pi.inode.rstat.rctime = new_mtime;
   }
 
   // use EOpen if the file is still open; otherwise, use EUpdate.
@@ -3131,7 +3137,7 @@ void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, CInode::m
   if (m->get_ctime() > pi->ctime) {
     dout(7) << "  ctime " << pi->ctime << " -> " << m->get_ctime()
            << " for " << *in << dendl;
-    pi->ctime = m->get_ctime();
+    pi->ctime = pi->rstat.rctime = m->get_ctime();
   }
 
   if ((features & CEPH_FEATURE_FS_CHANGE_ATTR) &&
index d8886ce96bd3255ef9a2bbc25968df336597ae50..83de7910648aaabd8cfbd71b012279f9623b4f8c 100644 (file)
@@ -164,8 +164,6 @@ void MDBalancer::tick()
   }
 
   // balance?
-  if (last_heartbeat == utime_t())
-    last_heartbeat = now;
   if (mds->get_nodeid() == 0 &&
       g_conf->mds_bal_interval > 0 &&
       (num_bal_times ||
@@ -229,7 +227,26 @@ mds_load_t MDBalancer::get_load(utime_t now)
     dout(20) << "get_load no root, no load" << dendl;
   }
 
-  load.req_rate = mds->get_req_rate();
+  uint64_t num_requests = mds->get_num_requests();
+  bool new_req_rate = false;
+  if (last_get_load != utime_t() &&
+      now > last_get_load &&
+      num_requests >= last_num_requests) {
+    utime_t el = now;
+    el -= last_get_load;
+    if (el.sec() >= 1) {
+      load.req_rate = (num_requests - last_num_requests) / (double)el;
+      new_req_rate = true;
+    }
+  }
+  if (!new_req_rate) {
+    auto p = mds_load.find(mds->get_nodeid());
+    if (p != mds_load.end())
+      load.req_rate = p->second.req_rate;
+  }
+  last_get_load = now;
+  last_num_requests = num_requests;
+
   load.queue_len = messenger->get_dispatch_queue_len();
 
   ifstream cpu(PROCPREFIX "/proc/loadavg");
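
The `get_load` hunk above computes the request rate locally: the delta in `mds->get_num_requests()` over the elapsed seconds since the previous sample, falling back to the rate last recorded in `mds_load` when less than a second has passed or the counter regressed. A hypothetical sampler condensing that arithmetic:

    #include <cstdint>

    // req_rate = (num_requests - last_num_requests) / elapsed_seconds,
    // recomputed only for windows of >= 1s with a monotonic counter;
    // otherwise the previously known rate is reused.
    struct ReqRateSampler {
      double last_time = 0.0;        // 0 means "no sample yet"
      uint64_t last_requests = 0;
      double last_rate = 0.0;

      double sample(double now, uint64_t num_requests) {
        if (last_time != 0.0 && now > last_time &&
            num_requests >= last_requests) {
          double el = now - last_time;
          if (el >= 1.0)
            last_rate = (num_requests - last_requests) / el;
        }
        last_time = now;
        last_requests = num_requests;
        return last_rate;
      }
    };
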
@@ -302,14 +319,15 @@ void MDBalancer::send_heartbeat()
 
   if (mds->get_nodeid() == 0) {
     beat_epoch++;
-   
     mds_load.clear();
   }
 
   // my load
   mds_load_t load = get_load(now);
-  map<mds_rank_t, mds_load_t>::value_type val(mds->get_nodeid(), load);
-  mds_load.insert(val);
+  mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
+  mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
+
+  mds_load[mds->get_nodeid()] = load;
 
   // import_map -- how much do i import from whom
   map<mds_rank_t, float> import_map;
@@ -350,8 +368,6 @@ void MDBalancer::send_heartbeat()
 /* This function DOES put the passed message before returning */
 void MDBalancer::handle_heartbeat(MHeartbeat *m)
 {
-  typedef map<mds_rank_t, mds_load_t> mds_load_map_t;
-
   mds_rank_t who = mds_rank_t(m->get_source().num());
   dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
 
@@ -380,23 +396,22 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m)
   if (who == 0) {
     dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
     if (beat_epoch != m->get_beat()) {
+      beat_epoch = m->get_beat();
       mds_load.clear();
     }
-    beat_epoch = m->get_beat();
+
     send_heartbeat();
 
     mds->mdcache->show_subtrees();
-  }
-
-  {
-    // set mds_load[who]
-    mds_load_map_t::value_type val(who, m->get_load());
-    pair < mds_load_map_t::iterator, bool > rval (mds_load.insert(val));
-    if (!rval.second) {
-      rval.first->second = val.second;
+  } else if (mds->get_nodeid() == 0) {
+    if (beat_epoch != m->get_beat()) {
+      dout(10) << " old heartbeat epoch, ignoring" << dendl;
+      goto out;
     }
   }
-  mds_import_map[ who ] = m->get_import_map();
+
+  mds_load[who] = m->get_load();
+  mds_import_map[who] = m->get_import_map();
 
   {
     unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
@@ -421,26 +436,6 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m)
   m->put();
 }
 
-
-void MDBalancer::export_empties()
-{
-  dout(5) << "export_empties checking for empty imports" << dendl;
-
-  std::set<CDir *> subtrees;
-  mds->mdcache->get_fullauth_subtrees(subtrees);
-  for (auto &dir : subtrees) {
-    if (dir->is_freezing() || dir->is_frozen())
-      continue;
-
-    if (!dir->inode->is_base() &&
-       !dir->inode->is_stray() &&
-       dir->get_num_head_items() == 0)
-      mds->mdcache->migrator->export_empty_import(dir);
-  }
-}
-
-
-
 double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
                              mds_rank_t im, double& maxim)
 {
@@ -615,12 +610,12 @@ void MDBalancer::prep_rebalance(int beat)
              << dendl;
     }
 
+    mds_meta_load.clear();
+
     double total_load = 0.0;
     multimap<double,mds_rank_t> load_map;
     for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
-      map<mds_rank_t, mds_load_t>::value_type val(i, mds_load_t(ceph_clock_now()));
-      std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
-      mds_load_t &load(r.first->second);
+      mds_load_t& load = mds_load.at(i);
 
       double l = load.mds_load() * load_fac;
       mds_meta_load[i] = l;
@@ -645,13 +640,18 @@ void MDBalancer::prep_rebalance(int beat)
            << dendl;
 
     // under or over?
-    if (my_load < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
+    for (auto p : load_map) {
+      if (p.first < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
+       dout(5) << " mds." << p.second << " is underloaded or barely overloaded." << dendl;
+       mds_last_epoch_under_map[p.second] = beat_epoch;
+      }
+    }
+
+    int last_epoch_under = mds_last_epoch_under_map[whoami];
+    if (last_epoch_under == beat_epoch) {
       dout(5) << "  i am underloaded or barely overloaded, doing nothing." << dendl;
-      last_epoch_under = beat_epoch;
-      mds->mdcache->show_subtrees();
       return;
     }
-
     // am i over long enough?
     if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
       dout(5) << "  i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
@@ -675,9 +675,12 @@ void MDBalancer::prep_rebalance(int beat)
        importers.insert(pair<double,mds_rank_t>(it->first,it->second));
        importer_set.insert(it->second);
       } else {
-       dout(15) << "   mds." << it->second << " is exporter" << dendl;
-       exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
-       exporter_set.insert(it->second);
+       int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
+       if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
+         dout(15) << "   mds." << it->second << " is exporter" << dendl;
+         exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
+         exporter_set.insert(it->second);
+       }
       }
     }
 
@@ -764,12 +767,8 @@ int MDBalancer::mantle_prep_rebalance()
 
   /* fill in the metrics for each mds by grabbing load struct */
   vector < map<string, double> > metrics (cluster_size);
-  for (mds_rank_t i=mds_rank_t(0);
-       i < mds_rank_t(cluster_size);
-       i++) {
-    map<mds_rank_t, mds_load_t>::value_type val(i, mds_load_t(ceph_clock_now()));
-    std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
-    mds_load_t &load(r.first->second);
+  for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
+    mds_load_t& load = mds_load.at(i);
 
     metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
                   {"all.meta_load", load.all.meta_load()},
@@ -804,142 +803,159 @@ void MDBalancer::try_rebalance(balance_state_t& state)
   }
 
   // make a sorted list of my imports
-  map<double,CDir*>    import_pop_map;
-  multimap<mds_rank_t,CDir*>  import_from_map;
+  multimap<double, CDir*> import_pop_map;
+  multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
   set<CDir*> fullauthsubs;
 
   mds->mdcache->get_fullauth_subtrees(fullauthsubs);
-  for (set<CDir*>::iterator it = fullauthsubs.begin();
-       it != fullauthsubs.end();
-       ++it) {
-    CDir *im = *it;
-    if (im->get_inode()->is_stray()) continue;
+  for (auto dir : fullauthsubs) {
+    CInode *diri = dir->get_inode();
+    if (diri->is_mdsdir())
+      continue;
+    if (diri->get_export_pin(false) != MDS_RANK_NONE)
+      continue;
+    if (dir->is_freezing() || dir->is_frozen())
+      continue;  // export probably already in progress
 
-    double pop = im->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
+    mds_rank_t from = diri->authority().first;
+    double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
     if (g_conf->mds_bal_idle_threshold > 0 &&
        pop < g_conf->mds_bal_idle_threshold &&
-       im->inode != mds->mdcache->get_root() &&
-       im->inode->authority().first != mds->get_nodeid()) {
-      dout(5) << " exporting idle (" << pop << ") import " << *im
-             << " back to mds." << im->inode->authority().first
-             << dendl;
-      mds->mdcache->migrator->export_dir_nicely(im, im->inode->authority().first);
+       diri != mds->mdcache->get_root() &&
+       from != mds->get_nodeid()) {
+      dout(5) << " exporting idle (" << pop << ") import " << *dir
+             << " back to mds." << from << dendl;
+      mds->mdcache->migrator->export_dir_nicely(dir, from);
       continue;
     }
 
-    import_pop_map[ pop ] = im;
-    mds_rank_t from = im->inode->authority().first;
-    dout(15) << "  map: i imported " << *im << " from " << from << dendl;
-    import_from_map.insert(pair<mds_rank_t,CDir*>(from, im));
+    dout(15) << "  map: i imported " << *dir << " from " << from << dendl;
+    import_pop_map.insert(make_pair(pop, dir));
+    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
   }
 
-
-
   // do my exports!
-  set<CDir*> already_exporting;
+  map<mds_rank_t, double> export_pop_map;
 
   for (auto &it : state.targets) {
     mds_rank_t target = it.first;
     double amount = it.second;
 
-    if (amount < MIN_OFFLOAD) continue;
-    if (amount / target_load < .2) continue;
+    if (amount / target_load < .2)
+      continue;
+    if (amount < MIN_OFFLOAD)
+      continue;
 
     dout(5) << "want to send " << amount << " to mds." << target
       //<< " .. " << (*it).second << " * " << load_fac
            << " -> " << amount
            << dendl;//" .. fudge is " << fudge << dendl;
-    double have = 0.0;
 
+    double& have = export_pop_map[target];
 
     mds->mdcache->show_subtrees();
 
     // search imports from target
     if (import_from_map.count(target)) {
       dout(5) << " aha, looking through imports from target mds." << target << dendl;
-      pair<multimap<mds_rank_t,CDir*>::iterator, multimap<mds_rank_t,CDir*>::iterator> p =
-       import_from_map.equal_range(target);
-      while (p.first != p.second) {
-       CDir *dir = (*p.first).second;
+      for (auto p = import_from_map.equal_range(target);
+          p.first != p.second; ) {
+       CDir *dir = p.first->second.first;
+       double pop = p.first->second.second;
        dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
-       multimap<mds_rank_t,CDir*>::iterator plast = p.first++;
+       auto plast = p.first++;
 
-       if (dir->inode->is_base() ||
-           dir->inode->is_stray())
+       if (dir->inode->is_base())
          continue;
-       if (dir->is_freezing() || dir->is_frozen()) continue;  // export pbly already in progress
-       double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
       assert(dir->inode->authority().first == target);  // because that's how it was inserted into the map above
 
        if (pop <= amount-have) {
-         dout(5) << "reexporting " << *dir
-                 << " pop " << pop
+         dout(5) << "reexporting " << *dir << " pop " << pop
                  << " back to mds." << target << dendl;
          mds->mdcache->migrator->export_dir_nicely(dir, target);
          have += pop;
          import_from_map.erase(plast);
-         import_pop_map.erase(pop);
+         for (auto q = import_pop_map.equal_range(pop);
+              q.first != q.second; ) {
+           if (q.first->second == dir) {
+             import_pop_map.erase(q.first);
+             break;
+           }
+           q.first++;
+         }
        } else {
          dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
        }
-       if (amount-have < MIN_OFFLOAD) break;
+       if (amount-have < MIN_OFFLOAD)
+         break;
       }
     }
-    if (amount-have < MIN_OFFLOAD) {
+  }
+
+  // any other imports
+  for (auto &it : state.targets) {
+    mds_rank_t target = it.first;
+    double amount = it.second;
+
+    if (!export_pop_map.count(target))
+      continue;
+    double& have = export_pop_map[target];
+    if (amount-have < MIN_OFFLOAD)
       continue;
-    }
 
-    // any other imports
-    if (false)
-      for (map<double,CDir*>::iterator import = import_pop_map.begin();
-          import != import_pop_map.end();
-          import++) {
-       CDir *imp = (*import).second;
-       if (imp->inode->is_base() ||
-           imp->inode->is_stray())
-         continue;
+    for (auto p = import_pop_map.begin();
+        p != import_pop_map.end(); ) {
+      CDir *dir = p->second;
+      if (dir->inode->is_base()) {
+       ++p;
+       continue;
+      }
 
-       double pop = (*import).first;
-       if (pop < amount-have || pop < MIN_REEXPORT) {
-         dout(5) << "reexporting " << *imp
-                 << " pop " << pop
-                 << " back to mds." << imp->inode->authority()
-                 << dendl;
-         have += pop;
-         mds->mdcache->migrator->export_dir_nicely(imp, imp->inode->authority().first);
-       }
-       if (amount-have < MIN_OFFLOAD) break;
+      double pop = p->first;
+      if (pop <= amount-have && pop > MIN_REEXPORT) {
+       dout(0) << "reexporting " << *dir << " pop " << pop
+               << " to mds." << target << dendl;
+       have += pop;
+       mds->mdcache->migrator->export_dir_nicely(dir, target);
+       import_pop_map.erase(p++);
+      } else {
+       ++p;
       }
-    if (amount-have < MIN_OFFLOAD) {
-      //fudge = amount-have;
-      continue;
+      if (amount-have < MIN_OFFLOAD)
+       break;
     }
+  }
 
-    // okay, search for fragments of my workload
-    set<CDir*> candidates;
-    mds->mdcache->get_fullauth_subtrees(candidates);
+  set<CDir*> already_exporting;
+
+  for (auto &it : state.targets) {
+    mds_rank_t target = it.first;
+    double amount = it.second;
 
+    if (!export_pop_map.count(target))
+      continue;
+    double& have = export_pop_map[target];
+    if (amount-have < MIN_OFFLOAD)
+      continue;
+
+    // okay, search for fragments of my workload
     list<CDir*> exports;
 
-    for (set<CDir*>::iterator pot = candidates.begin();
-        pot != candidates.end();
-        ++pot) {
-      if ((*pot)->get_inode()->is_stray()) continue;
-      find_exports(*pot, amount, exports, have, already_exporting);
-      if (have > amount-MIN_OFFLOAD)
+    for (auto p = import_pop_map.rbegin();
+        p != import_pop_map.rend();
+        ++p) {
+      CDir *dir = p->second;
+      find_exports(dir, amount, exports, have, already_exporting);
+      if (amount-have < MIN_OFFLOAD)
        break;
     }
     //fudge = amount - have;
 
-    for (list<CDir*>::iterator it = exports.begin(); it != exports.end(); ++it) {
-      dout(5) << "   - exporting "
-              << (*it)->pop_auth_subtree
-              << " "
-              << (*it)->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
-              << " to mds." << target
-              << " " << **it
-              << dendl;
-      mds->mdcache->migrator->export_dir_nicely(*it, target);
+    for (auto dir : exports) {
+      dout(5) << "   - exporting " << dir->pop_auth_subtree
+             << " " << dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
+             << " to mds." << target << " " << *dir << dendl;
+      mds->mdcache->migrator->export_dir_nicely(dir, target);
     }
   }
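
For illustration: import_pop_map becomes a multimap because several subtrees can have exactly the same popularity; with the old map<double,CDir*> the second entry at an equal key was silently dropped, and import_pop_map.erase(pop) could delete the wrong directory. Erasing one specific entry from a multimap takes an equal_range scan, as the rewritten loops do. A minimal standalone sketch of the idiom:

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      std::multimap<double, std::string> import_pop;
      import_pop.insert({1.0, "dirA"});
      import_pop.insert({1.0, "dirB"});  // same popularity, both kept

      // erase only the entry for dirB
      for (auto r = import_pop.equal_range(1.0); r.first != r.second; ) {
        if (r.first->second == "dirB") {
          import_pop.erase(r.first);
          break;
        }
        ++r.first;
      }
      assert(import_pop.size() == 1 && import_pop.begin()->second == "dirA");
      return 0;
    }
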
 
@@ -953,9 +969,19 @@ void MDBalancer::find_exports(CDir *dir,
                               double& have,
                               set<CDir*>& already_exporting)
 {
+  utime_t now = ceph_clock_now();
+  if ((double)(now - rebalance_time) > 0.1) {
+    derr << " balancer run took too long" << dendl_impl;
+    have = amount;
+    return;
+  }
+
+  assert(dir->is_auth());
+
   double need = amount - have;
   if (need < amount * g_conf->mds_bal_min_start)
     return;   // good enough!
+
   double needmax = need * g_conf->mds_bal_need_max;
   double needmin = need * g_conf->mds_bal_need_min;
   double midchunk = need * g_conf->mds_bal_midchunk;
@@ -968,28 +994,40 @@ void MDBalancer::find_exports(CDir *dir,
   dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
 
   double subdir_sum = 0;
-  for (auto it = dir->begin(); it != dir->end(); ++it) {
-    CInode *in = it->second->get_linkage()->get_inode();
-    if (!in) continue;
-    if (!in->is_dir()) continue;
+  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
+       !it.end(); ) {
+    CInode *in = *it;
+    ++it;
+
+    assert(in->is_dir());
+    assert(in->get_parent_dir() == dir);
 
     list<CDir*> dfls;
-    in->get_dirfrags(dfls);
+    in->get_nested_dirfrags(dfls);
+
+    size_t num_idle_frags = 0;
     for (list<CDir*>::iterator p = dfls.begin();
         p != dfls.end();
         ++p) {
       CDir *subdir = *p;
-      if (!subdir->is_auth()) continue;
-      if (already_exporting.count(subdir)) continue;
+      if (already_exporting.count(subdir))
+       continue;
 
-      if (subdir->is_frozen()) continue;  // can't export this right now!
+      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
+      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
+      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
+         subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
+       continue;  // can't export this right now!
 
       // how popular?
       double pop = subdir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
       subdir_sum += pop;
       dout(15) << "   subdir pop " << pop << " " << *subdir << dendl;
 
-      if (pop < minchunk) continue;
+      if (pop < minchunk) {
+       num_idle_frags++;
+       continue;
+      }
 
       // lucky find?
       if (pop > needmin && pop < needmax) {
@@ -1007,6 +1045,8 @@ void MDBalancer::find_exports(CDir *dir,
       } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
     }
+    if (dfls.size() == num_idle_frags)
+      in->item_pop_lru.remove_myself();
   }
   dout(15) << "   sum " << subdir_sum << " / " << dir_pop << dendl;
 
@@ -1060,10 +1100,9 @@ void MDBalancer::find_exports(CDir *dir,
     if (have > needmin)
       return;
   }
-
 }
 
-void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who)
+void MDBalancer::hit_inode(const utime_t& now, CInode *in, int type, int who)
 {
   // hit inode
   in->pop.get(type).hit(now, mds->mdcache->decayrate);
@@ -1104,7 +1143,7 @@ void MDBalancer::maybe_fragment(CDir *dir, bool hot)
   }
 }
 
-void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amount)
+void MDBalancer::hit_dir(const utime_t& now, CDir *dir, int type, int who, double amount)
 {
   // hit me
   double v = dir->pop_me.get(type).hit(now, mds->mdcache->decayrate, amount);
@@ -1175,14 +1214,21 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun
   bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees
 
   while (true) {
+    CDir *pdir = dir->inode->get_parent_dir();
     dir->pop_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
     if (rd_adj != 0.0)
       dir->pop_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
 
     if (hit_subtree) {
       dir->pop_auth_subtree.get(type).hit(now, mds->mdcache->decayrate, amount);
+
       if (rd_adj != 0.0)
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
+
+      if (dir->is_subtree_root())
+       hit_subtree = false;                // end of auth domain, stop hitting auth counters.
+      else if (pdir)
+       pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
     }
 
     if (hit_subtree_nested) {
@@ -1190,12 +1236,8 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun
       if (rd_adj != 0.0)
        dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
     }
-
-    if (dir->is_subtree_root())
-      hit_subtree = false;                // end of auth domain, stop hitting auth counters.
-
-    if (dir->inode->get_parent_dn() == 0) break;
-    dir = dir->inode->get_parent_dn()->get_dir();
+    if (!pdir) break;
+    dir = pdir;
   }
 }
 
@@ -1235,9 +1277,141 @@ void MDBalancer::add_import(CDir *dir, utime_t now)
   }
 }
 
+void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc)
+{
+  DecayRate& rate = mds->mdcache->decayrate;
+
+  bool adjust_subtree_nest = dir->is_auth();
+  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
+  CDir *cur = dir;
+  while (true) {
+    if (inc) {
+      pdir->pop_nested.add(now, rate, dir->pop_nested);
+      if (adjust_subtree) {
+       pdir->pop_auth_subtree.add(now, rate, dir->pop_auth_subtree);
+       pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
+      }
+
+      if (adjust_subtree_nest)
+       pdir->pop_auth_subtree_nested.add(now, rate, dir->pop_auth_subtree_nested);
+    } else {
+      pdir->pop_nested.sub(now, rate, dir->pop_nested);
+      if (adjust_subtree)
+       pdir->pop_auth_subtree.sub(now, rate, dir->pop_auth_subtree);
+
+      if (adjust_subtree_nest)
+       pdir->pop_auth_subtree_nested.sub(now, rate, dir->pop_auth_subtree_nested);
+    }
+
+    if (pdir->is_subtree_root())
+      adjust_subtree = false;
+    cur = pdir;
+    pdir = pdir->inode->get_parent_dir();
+    if (!pdir) break;
+  }
+}
+
 void MDBalancer::handle_mds_failure(mds_rank_t who)
 {
   if (0 == who) {
-    last_epoch_under = 0;
+    mds_last_epoch_under_map.clear();
   }
 }
+
+int MDBalancer::dump_loads(Formatter *f)
+{
+  utime_t now = ceph_clock_now();
+  DecayRate& decayrate = mds->mdcache->decayrate;
+
+  list<CDir*> dfs;
+  if (mds->mdcache->get_root()) {
+    mds->mdcache->get_root()->get_dirfrags(dfs);
+  } else {
+    dout(5) << "dump_load no root" << dendl;
+  }
+
+  f->open_object_section("loads");
+
+  f->open_array_section("dirfrags");
+  while (!dfs.empty()) {
+    CDir *dir = dfs.front();
+    dfs.pop_front();
+
+    if (f) {
+      f->open_object_section("dir");
+      dir->dump_load(f, now, decayrate);
+      f->close_section();
+    }
+
+    for (auto it = dir->begin(); it != dir->end(); ++it) {
+      CInode *in = it->second->get_linkage()->get_inode();
+      if (!in || !in->is_dir())
+       continue;
+
+      list<CDir*> ls;
+      in->get_dirfrags(ls);
+      for (auto subdir : ls) {
+       if (subdir->pop_nested.meta_load() < .001)
+         continue;
+       dfs.push_back(subdir);
+      }
+    }
+  }
+  f->close_section();  // dirfrags array
+
+  f->open_object_section("mds_load");
+  {
+
+    auto dump_mds_load = [f, now](mds_load_t& load) {
+      f->dump_float("request_rate", load.req_rate);
+      f->dump_float("cache_hit_rate", load.cache_hit_rate);
+      f->dump_float("queue_length", load.queue_len);
+      f->dump_float("cpu_load", load.cpu_load_avg);
+      f->dump_float("mds_load", load.mds_load());
+
+      DecayRate rate; // no decay
+      f->open_object_section("auth_dirfrags");
+      load.auth.dump(f, now, rate);
+      f->close_section();
+      f->open_object_section("all_dirfrags");
+      load.all.dump(f, now, rate);
+      f->close_section();
+    };
+
+    for (auto p : mds_load) {
+      stringstream name;
+      name << "mds." << p.first;
+      f->open_object_section(name.str().c_str());
+      dump_mds_load(p.second);
+      f->close_section();
+    }
+  }
+  f->close_section(); // mds_load
+
+  f->open_object_section("mds_meta_load");
+  for (auto p : mds_meta_load) {
+    stringstream name;
+    name << "mds." << p.first;
+    f->dump_float(name.str().c_str(), p.second);
+  }
+  f->close_section(); // mds_meta_load
+
+  f->open_object_section("mds_import_map");
+  for (auto p : mds_import_map) {
+    stringstream name1;
+    name1 << "mds." << p.first;
+    f->open_array_section(name1.str().c_str());
+    for (auto q : p.second) {
+      f->open_object_section("from");
+      stringstream name2;
+      name2 << "mds." << q.first;
+      f->dump_float(name2.str().c_str(), q.second);
+      f->close_section();
+    }
+    f->close_section(); // mds.? array
+  }
+  f->close_section(); // mds_import_map
+
+  f->close_section(); // loads
+  return 0;
+}
index d23185b22f898e1ffe278178813bfc62b5683991..aeb78e414ce1aef8e68d6e3db40392341e41f59f 100644 (file)
@@ -38,14 +38,7 @@ class MDBalancer {
   friend class C_Bal_SendHeartbeat;
 public:
   MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) : 
-    mds(m),
-    messenger(msgr),
-    mon_client(monc),
-    beat_epoch(0),
-    last_epoch_under(0), my_load(0.0), target_load(0.0)
-    { }
-
-  mds_load_t get_load(utime_t);
+    mds(m), messenger(msgr), mon_client(monc) { }
 
   int proc_message(Message *m);
 
@@ -58,9 +51,10 @@ public:
 
   void subtract_export(CDir *ex, utime_t now);
   void add_import(CDir *im, utime_t now);
+  void adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc);
 
-  void hit_inode(utime_t now, CInode *in, int type, int who=-1);
-  void hit_dir(utime_t now, CDir *dir, int type, int who=-1, double amount=1.0);
+  void hit_inode(const utime_t& now, CInode *in, int type, int who=-1);
+  void hit_dir(const utime_t& now, CDir *dir, int type, int who=-1, double amount=1.0);
 
   void queue_split(const CDir *dir, bool fast);
   void queue_merge(CDir *dir);
@@ -75,6 +69,8 @@ public:
 
   void handle_mds_failure(mds_rank_t who);
 
+  int dump_loads(Formatter *f);
+
 private:
   typedef struct {
     std::map<mds_rank_t, double> targets;
@@ -89,7 +85,7 @@ private:
 
   void handle_export_pins(void);
 
-  void export_empties();
+  mds_load_t get_load(utime_t now);
   int localize_balancer();
   void send_heartbeat();
   void handle_heartbeat(MHeartbeat *m);
@@ -122,9 +118,8 @@ private:
   MDSRank *mds;
   Messenger *messenger;
   MonClient *mon_client;
-  int beat_epoch;
+  int beat_epoch = 0;
 
-  int last_epoch_under;
   string bal_code;
   string bal_version;
 
@@ -132,6 +127,9 @@ private:
   utime_t last_sample;
   utime_t rebalance_time; //ensure a consistent view of load for rebalance
 
+  utime_t last_get_load;
+  uint64_t last_num_requests = 0;
+
   // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
   // just as soon as a delayed context comes back and triggers it.
   // These sets just prevent us from spawning extra timer contexts for
@@ -142,9 +140,11 @@ private:
   map<mds_rank_t, mds_load_t>  mds_load;
   map<mds_rank_t, double>       mds_meta_load;
   map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
+  map<mds_rank_t, int> mds_last_epoch_under_map;
 
   // per-epoch state
-  double          my_load, target_load;
+  double my_load = 0;
+  double target_load = 0;
 };
 
 #endif
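
For illustration: the constructor initializer list is replaced by in-class default member initializers (beat_epoch = 0, my_load = 0, target_load = 0), so every constructor, present or future, picks up the defaults without repeating them. A minimal sketch:

    struct Example {
      int beat_epoch = 0;    // every constructor starts from these defaults
      double my_load = 0;
      double target_load = 0;
      Example() = default;   // no initializer list to keep in sync
    };
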
index e954c8912a57fbc6c470ff20d3626dac6556e3c7..887c1062779e8562bfa923e86229cb26da568723 100644 (file)
@@ -434,7 +434,10 @@ void MDCache::create_empty_hierarchy(MDSGather *gather)
   rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
   rootdir->commit(0, gather->new_sub());
 
-  root->store(gather->new_sub());
+  root->mark_clean();
+  root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
+  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
+  root->flush(gather->new_sub());
 }
 
 void MDCache::create_mydir_hierarchy(MDSGather *gather)
@@ -464,7 +467,7 @@ void MDCache::create_mydir_hierarchy(MDSGather *gather)
     straydir->mark_complete();
     straydir->mark_dirty(straydir->pre_dirty(), ls);
     straydir->commit(0, gather->new_sub());
-    stray->_mark_dirty_parent(ls, true);
+    stray->mark_dirty_parent(ls, true);
     stray->store_backtrace(gather->new_sub());
   }
 
@@ -613,6 +616,24 @@ void MDCache::open_mydir_inode(MDSInternalContextBase *c)
   gather.activate();
 }
 
+void MDCache::open_mydir_frag(MDSInternalContextBase *c)
+{
+  open_mydir_inode(
+      new MDSInternalContextWrapper(mds,
+       new FunctionContext([this, c](int r) {
+           if (r < 0) {
+             c->complete(r);
+             return;
+           }
+           CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
+           assert(mydir);
+           adjust_subtree_auth(mydir, mds->get_nodeid());
+           mydir->fetch(c);
+         })
+       )
+      );
+}
+
 void MDCache::open_root()
 {
   dout(10) << "open_root" << dendl;
@@ -809,7 +830,7 @@ void MDCache::list_subtrees(list<CDir*>& ls)
  * merge with parent and/or child subtrees, if is it appropriate.
  * merge can ONLY happen if both parent and child have unambiguous auth.
  */
-void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
+void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
 {
   dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;
@@ -864,7 +885,7 @@ void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
     root = dir;
 
     // adjust recursive pop counters
-    if (dir->is_auth()) {
+    if (adjust_pop && dir->is_auth()) {
       utime_t now = ceph_clock_now();
       CDir *p = dir->get_parent_dir();
       while (p) {
@@ -909,7 +930,7 @@ public:
   }
 };
 
-void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
+void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
 {
   dout(10) << "try_subtree_merge_at " << *dir << dendl;
 
@@ -941,12 +962,15 @@ void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
     subtrees[parent].erase(dir);
 
     // adjust popularity?
-    if (dir->is_auth()) {
+    if (adjust_pop && dir->is_auth()) {
       utime_t now = ceph_clock_now();
+      CDir *cur = dir;
       CDir *p = dir->get_parent_dir();
       while (p) {
        p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
+       p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
        if (p->is_subtree_root()) break;
+       cur = p;
        p = p->inode->get_parent_dir();
       }
     }
@@ -1325,6 +1349,7 @@ void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
   dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
 
   //show_subtrees();
+  utime_t now = ceph_clock_now();
 
   CDir *newdir = diri->get_parent_dir();
 
@@ -1353,12 +1378,12 @@ void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
     CDir *newparent = get_subtree_root(newdir);
     dout(10) << " new parent " << *newparent << dendl;
 
+    if (olddir != newdir)
+      mds->balancer->adjust_pop_for_rename(olddir, dir, now, false);
+
     if (oldparent == newparent) {
       dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
-      continue;
-    }
-
-    if (dir->is_subtree_root()) {
+    } else if (dir->is_subtree_root()) {
       // children are fine.  change parent.
       dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
       assert(subtrees[oldparent].count(dir));
@@ -1366,7 +1391,7 @@ void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
       assert(subtrees.count(newparent));
       subtrees[newparent].insert(dir);
       // caller is responsible for 'eval diri'
-      try_subtree_merge_at(dir, NULL);
+      try_subtree_merge_at(dir, NULL, false);
     } else {
       // mid-subtree.
 
@@ -1391,11 +1416,14 @@ void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
 
       // did auth change?
       if (oldparent->authority() != newparent->authority()) {
-       adjust_subtree_auth(dir, oldparent->authority());
+       adjust_subtree_auth(dir, oldparent->authority(), false);
        // caller is responsible for 'eval diri'
-       try_subtree_merge_at(dir, NULL);
+       try_subtree_merge_at(dir, NULL, false);
       }
     }
+
+    if (olddir != newdir)
+      mds->balancer->adjust_pop_for_rename(newdir, dir, now, true);
   }
 
   show_subtrees();
@@ -1963,7 +1991,7 @@ void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accou
   }
 }
 
-void MDCache::broadcast_quota_to_client(CInode *in)
+void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct)
 {
   if (!in->is_auth() || in->is_frozen())
     return;
@@ -1982,6 +2010,10 @@ void MDCache::broadcast_quota_to_client(CInode *in)
       continue;
 
     Capability *cap = it->second;
+
+    if (exclude_ct >= 0 && exclude_ct != it->first)
+      goto update;
+
     if (cap->last_rbytes == i->rstat.rbytes &&
         cap->last_rsize == i->rstat.rsize())
       continue;
@@ -3164,6 +3196,10 @@ void MDCache::handle_resolve(MMDSResolve *m)
            im.cap_id = ++last_cap_id; // assign a new cap ID
            im.issue_seq = 1;
            im.mseq = q->second.mseq;
+
+           Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+           if (session)
+             rejoin_client_map.emplace(q->first, session->info.inst);
          }
 
          // will process these caps in rejoin stage
@@ -3955,12 +3991,11 @@ void MDCache::rejoin_send_rejoins()
   if (mds->is_rejoin()) {
     map<client_t, set<mds_rank_t> > client_exports;
     for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
-      assert(cap_export_targets.count(p->first));
-      mds_rank_t target = cap_export_targets[p->first];
+      mds_rank_t target = p->second.first;
       if (rejoins.count(target) == 0)
        continue;
-      rejoins[target]->cap_exports[p->first] = p->second;
-      for (auto q = p->second.begin(); q != p->second.end(); ++q)
+      rejoins[target]->cap_exports[p->first] = p->second.second;
+      for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q)
        client_exports[q->first].insert(target);
     }
     for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
@@ -4128,7 +4163,7 @@ void MDCache::rejoin_send_rejoins()
   rejoins_pending = false;
 
   // nothing?
-  if (mds->is_rejoin() && rejoins.empty()) {
+  if (mds->is_rejoin() && rejoin_gather.empty()) {
     dout(10) << "nothing to rejoin" << dendl;
     rejoin_gather_finish();
   }
@@ -4484,7 +4519,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
     }
   } else {
     // done?
-    if (rejoin_gather.empty()) {
+    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
       rejoin_gather_finish();
     } else {
       dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
@@ -4492,14 +4527,6 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
   }
 }
 
-class C_MDC_RejoinGatherFinish : public MDCacheContext {
-public:
-  explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
-  void finish(int r) override {
-    mdcache->rejoin_gather_finish();
-  }
-};
-
 /*
  * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
  *
@@ -4858,7 +4885,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
   // done?
   assert(rejoin_gather.count(from));
   rejoin_gather.erase(from);
-  if (rejoin_gather.empty()) {
+  if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
     rejoin_gather_finish();
   } else {
     dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
@@ -5062,29 +5089,33 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
        p != peer_imported.end();
        ++p) {
-    assert(cap_exports.count(p->first));
-    assert(cap_export_targets.count(p->first));
-    assert(cap_export_targets[p->first] == from);
+    auto& ex = cap_exports.at(p->first);
+    assert(ex.first == from);
     for (map<client_t,Capability::Import>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
-      assert(cap_exports[p->first].count(q->first));
+      auto r = ex.second.find(q->first);
+      assert(r != ex.second.end());
 
       dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
       Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
-      assert(session);
+      if (!session) {
+       dout(10) << " no session for client." << p->first << dendl;
+       ex.second.erase(r);
+       continue;
+      }
 
       // mark client caps stale.
       MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
-                                      cap_exports[p->first][q->first].capinfo.cap_id, 0,
+                                      r->second.capinfo.cap_id, 0,
                                        mds->get_osd_epoch_barrier());
       m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
                      (q->second.cap_id > 0 ? from : -1), 0);
       mds->send_message_client_counted(m, session);
 
-      cap_exports[p->first].erase(q->first);
+      ex.second.erase(r);
     }
-    assert(cap_exports[p->first].empty());
+    assert(ex.second.empty());
   }
 
   // done?
@@ -5174,6 +5205,7 @@ void MDCache::rejoin_gather_finish()
 {
   dout(10) << "rejoin_gather_finish" << dendl;
   assert(mds->is_rejoin());
+  assert(rejoin_ack_gather.count(mds->get_nodeid()));
 
   if (open_undef_inodes_dirfrags())
     return;
@@ -5187,7 +5219,6 @@ void MDCache::rejoin_gather_finish()
   rejoin_send_acks();
   
   // signal completion of fetches, rejoin_gather_finish, etc.
-  assert(rejoin_ack_gather.count(mds->get_nodeid()));
   rejoin_ack_gather.erase(mds->get_nodeid());
 
   // did we already get our acks too?
@@ -5238,22 +5269,19 @@ void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
 
 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
 public:
-  map<client_t,entity_inst_t> client_map;
-  map<client_t,uint64_t> sseqmap;
-
-  C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
-    MDCacheLogContext(c), client_map(cm) {}
+  map<client_t,pair<Session*,uint64_t> > session_map;
+  C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
   void finish(int r) override {
     assert(r == 0);
-    mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
+    mdcache->rejoin_open_sessions_finish(session_map);
   }
 };
 
-void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
-                                         map<client_t,uint64_t>& sseqmap)
+void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
 {
   dout(10) << "rejoin_open_sessions_finish" << dendl;
-  mds->server->finish_force_open_sessions(client_map, sseqmap);
+  mds->server->finish_force_open_sessions(session_map);
+  rejoin_session_map.swap(session_map);
   if (rejoin_gather.empty())
     rejoin_gather_finish();
 }
@@ -5275,6 +5303,8 @@ bool MDCache::process_imported_caps()
     cap_imports_num_opening++;
     dout(10) << "  opening missing ino " << p->first << dendl;
     open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
+    if (!(cap_imports_num_opening % 1000))
+      mds->heartbeat_reset();
   }
 
   if (cap_imports_num_opening > 0)
@@ -5282,21 +5312,16 @@ bool MDCache::process_imported_caps()
 
   // called by rejoin_gather_finish() ?
   if (rejoin_gather.count(mds->get_nodeid()) == 0) {
-    // if sessions for imported caps are all open ?
-    for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
-        p != rejoin_client_map.end();
-        ++p) {
-      if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
-       C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
-       version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
-       ESessions *le = new ESessions(pv, rejoin_client_map);
-       mds->mdlog->start_submit_entry(le, finish);
-       mds->mdlog->flush();
-       rejoin_client_map.clear();
-       return true;
-      }
+    if (!rejoin_client_map.empty() &&
+       rejoin_session_map.empty()) {
+      C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
+      version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
+                                                             finish->session_map);
+      mds->mdlog->start_submit_entry(new ESessions(pv, rejoin_client_map), finish);
+      mds->mdlog->flush();
+      rejoin_client_map.clear();
+      return true;
     }
-    rejoin_client_map.clear();
 
     // process caps that were exported by slave rename
     for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
@@ -5307,9 +5332,11 @@ bool MDCache::process_imported_caps()
       for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
           q != p->second.second.end();
           ++q) {
-       Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
-       assert(session);
+       auto r = rejoin_session_map.find(q->first);
+       if (r == rejoin_session_map.end())
+         continue;
 
+       Session *session = r->second.first;
        Capability *cap = in->get_client_cap(q->first);
        if (!cap)
          cap = in->add_client_cap(q->first, session);
@@ -5339,9 +5366,19 @@ bool MDCache::process_imported_caps()
       }
       assert(in->is_auth());
       for (auto q = p->second.begin(); q != p->second.end(); ++q) {
-       Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
-       assert(session);
+       Session *session;
+       {
+         auto r = rejoin_session_map.find(q->first);
+         session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
+       }
+
        for (auto r = q->second.begin(); r != q->second.end(); ++r) {
+         if (!session) {
+           if (r->first >= 0)
+             (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
+           continue;
+         }
+
          Capability *cap = in->reconnect_cap(q->first, r->second, session);
          add_reconnected_cap(q->first, in->ino(), r->second);
          if (r->first >= 0) {
@@ -5361,11 +5398,10 @@ bool MDCache::process_imported_caps()
   } else {
     trim_non_auth();
 
+    assert(rejoin_gather.count(mds->get_nodeid()));
     rejoin_gather.erase(mds->get_nodeid());
+    assert(!rejoin_ack_gather.count(mds->get_nodeid()));
     maybe_send_pending_rejoins();
-
-    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
-      rejoin_gather_finish();
   }
   return false;
 }
@@ -5817,7 +5853,15 @@ bool MDCache::open_undef_inodes_dirfrags()
   if (fetch_queue.empty())
     return false;
 
-  MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
+  MDSGatherBuilder gather(g_ceph_context,
+      new MDSInternalContextWrapper(mds,
+       new FunctionContext([this](int r) {
+           if (rejoin_gather.empty())
+             rejoin_gather_finish();
+         })
+       )
+      );
+
   for (set<CDir*>::iterator p = fetch_queue.begin();
        p != fetch_queue.end();
        ++p) {
@@ -6740,8 +6784,9 @@ bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCa
     // This is because that unconnected replicas are problematic for
     // subtree migration.
     //
-    if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
+    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
       return true;
+    }
 
     // DIR
     list<CDir*> dfls;
@@ -7510,30 +7555,16 @@ bool MDCache::shutdown_pass()
   }
 
   // empty stray dir
-  if (!shutdown_export_strays()) {
-    dout(7) << "waiting for strays to migrate" << dendl;
-    return false;
-  }
-  
-  // drop our reference to our stray dir inode
-  for (int i = 0; i < NUM_STRAY; ++i) {
-    if (strays[i] &&
-       strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
-      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
-      strays[i]->put(CInode::PIN_STRAY);
-      strays[i]->put_stickydirs();
-    }
-  }
+  bool strays_all_exported = shutdown_export_strays();
 
   // trim cache
   trim(UINT64_MAX);
   dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
 
-  // SUBTREES
+  // Export all auth subtrees to another active MDS (usually rank 0) if we are not rank 0
   int num_auth_subtree = 0;
   if (!subtrees.empty() &&
-      mds->get_nodeid() != 0 && 
-      migrator->get_export_queue_size() == 0) {
+      mds->get_nodeid() != 0) {
     dout(7) << "looking for subtrees to export to mds0" << dendl;
     list<CDir*> ls;
     for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
@@ -7552,6 +7583,8 @@ bool MDCache::shutdown_pass()
        ls.push_back(dir);
       }
     }
+
+    migrator->clear_export_queue();
     for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
       CDir *dir = *p;
       mds_rank_t dest = dir->get_inode()->authority().first;
@@ -7562,7 +7595,13 @@ bool MDCache::shutdown_pass()
     }
   }
 
+  if (!strays_all_exported) {
+    dout(7) << "waiting for strays to migrate" << dendl;
+    return false;
+  }
+
   if (num_auth_subtree > 0) {
+    assert(mds->get_nodeid() > 0);
     dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
     show_subtrees();
     return false;
@@ -7575,6 +7614,25 @@ bool MDCache::shutdown_pass()
     return false;
   }
 
+  // Fully trim the log so that all objects in cache are clean and may be
+  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
+  // trim the log such that the cache eventually becomes clean.
+  mds->mdlog->trim(0);
+  if (mds->mdlog->get_num_segments() > 1) {
+    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
+    return false;
+  }
+
+  // drop our reference to our stray dir inode
+  for (int i = 0; i < NUM_STRAY; ++i) {
+    if (strays[i] &&
+       strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
+      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
+      strays[i]->put(CInode::PIN_STRAY);
+      strays[i]->put_stickydirs();
+    }
+  }
+
   CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
   if (mydir && !mydir->is_subtree_root())
     mydir = NULL;
@@ -7592,13 +7650,6 @@ bool MDCache::shutdown_pass()
   assert(!migrator->is_exporting());
   assert(!migrator->is_importing());
 
-  // flush what we can from the log
-  mds->mdlog->trim(0);
-  if (mds->mdlog->get_num_segments() > 1) {
-    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
-    return false;
-  }
-
   if ((myin && myin->is_auth_pinned()) ||
       (mydir && mydir->is_auth_pinned())) {
     dout(7) << "still have auth pinned objects" << dendl;
@@ -7677,9 +7728,9 @@ bool MDCache::shutdown_export_strays()
 
   list<CDir*> dfs;
   for (int i = 0; i < NUM_STRAY; ++i) {
-    if (!strays[i]) {
+    if (!strays[i] ||
+       !strays[i]->state_test(CInode::STATE_STRAYPINNED))
       continue;
-    }
     strays[i]->get_dirfrags(dfs);
   }
 
@@ -7697,7 +7748,7 @@ bool MDCache::shutdown_export_strays()
     
     for (auto &p : dir->items) {
       CDentry *dn = p.second;
-      CDentry::linkage_t *dnl = dn->get_linkage();
+      CDentry::linkage_t *dnl = dn->get_projected_linkage();
       if (dnl->is_null())
        continue;
       done = false;
@@ -12110,36 +12161,40 @@ void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
   // If the scrub did some repair, then flush the journal at the end of
   // the scrub.  Otherwise in the case of e.g. rewriting a backtrace
   // the on disk state will still look damaged.
-  auto expiry_fin = new FunctionContext([this, header, fin](int r){
-      if (header->get_repaired()) {
-        dout(4) << "Flushing journal because scrub did some repairs" << dendl;
-        mds->mdlog->start_new_segment();
-        mds->mdlog->trim_all();
-        if (fin) {
-          MDSGatherBuilder expiry_gather(g_ceph_context);
-          const std::set<LogSegment*> &expiring_segments = mds->mdlog->get_expiring_segments();
-          for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
-               i != expiring_segments.end(); ++i) {
-            (*i)->wait_for_expiry(expiry_gather.new_sub());
-          }
-          expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
-          expiry_gather.activate();
-        }
-      } else {
-        if (fin) {
-          fin->complete(r);
-        }
+  auto scrub_finish = new FunctionContext([this, header, fin](int r){
+    if (!header->get_repaired()) {
+      if (fin)
+        fin->complete(r);
+      return;
+    }
+
+    auto flush_finish = new FunctionContext([this, fin](int r){
+      dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
+      mds->mdlog->trim_all();
+
+      if (fin) {
+       MDSGatherBuilder gather(g_ceph_context);
+       auto& expiring_segments = mds->mdlog->get_expiring_segments();
+       for (auto logseg : expiring_segments)
+         logseg->wait_for_expiry(gather.new_sub());
+       assert(gather.has_subs());
+       gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
+       gather.activate();
       }
+    });
+
+    dout(4) << "Flushing journal because scrub did some repairs" << dendl;
+    mds->mdlog->start_new_segment();
+    mds->mdlog->flush();
+    mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
   });
 
   if (!header->get_recursive()) {
     mds->scrubstack->enqueue_inode_top(in, header,
-                                      new MDSInternalContextWrapper(mds,
-                                         expiry_fin));
+                                      new MDSInternalContextWrapper(mds, scrub_finish));
   } else {
     mds->scrubstack->enqueue_inode_bottom(in, header, 
-                                      new MDSInternalContextWrapper(mds,
-                                         expiry_fin));
+                                      new MDSInternalContextWrapper(mds, scrub_finish));
   }
 
   mds->server->respond_to_request(mdr, 0);
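
For illustration: the scrub completion is rebuilt as two chained continuations: scrub_finish flushes the journal and waits for it to be safe, and only then does flush_finish expire segments and gather the per-segment expiry waits before running the caller's fin. A standalone sketch of that chained-callback shape (names hypothetical, everything synchronous for brevity):

    #include <functional>
    #include <iostream>

    using Context = std::function<void(int)>;

    // Stand-ins for mdlog->wait_for_safe() and the per-segment expiry gather.
    void wait_for_safe(Context c) { c(0); }
    void gather_expiry(int nsegments, Context fin) {
      for (int i = 0; i < nsegments; i++) { /* wait for segment i to expire */ }
      fin(0);
    }

    int main() {
      bool repaired = true;
      Context fin = [](int r) { std::cout << "scrub complete: " << r << "\n"; };

      Context scrub_finish = [&](int r) {
        if (!repaired) { fin(r); return; }  // no repairs: complete directly
        wait_for_safe([&](int) {            // 1) journal flushed safely
          gather_expiry(3, fin);            // 2) segments expired -> run fin
        });
      };
      scrub_finish(0);
      return 0;
    }
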
index 3d3f9686414b3855592a2a9efd27cae05a87206c..84bda527c30209b2a7e68838e04d70af32e4d81e 100644 (file)
@@ -313,7 +313,7 @@ protected:
 public:
   bool is_subtrees() { return !subtrees.empty(); }
   void list_subtrees(list<CDir*>& ls);
-  void adjust_subtree_auth(CDir *root, mds_authority_t auth);
+  void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
   void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
     adjust_subtree_auth(root, mds_authority_t(a,b));
   }
@@ -327,7 +327,7 @@ public:
   }
   void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
   void try_subtree_merge(CDir *root);
-  void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval);
+  void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
   void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
   void eval_subtree_root(CInode *diri);
   CDir *get_subtree_root(CDir *dir);
@@ -402,7 +402,7 @@ public:
   void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
                                   snapid_t ofirst, snapid_t last, 
                                   CInode *pin, bool cow_head);
-  void broadcast_quota_to_client(CInode *in);
+  void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1);
   void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                CInode *in, CDir *parent,
                                int flags, int linkunlink=0,
@@ -547,9 +547,9 @@ protected:
   map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
   map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
   map<client_t,entity_inst_t> rejoin_client_map;
+  map<client_t,pair<Session*,uint64_t> > rejoin_session_map;
 
-  map<inodeno_t,map<client_t,cap_reconnect_t> > cap_exports; // ino -> client -> capex
-  map<inodeno_t,mds_rank_t> cap_export_targets; // ino -> auth mds
+  map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex
 
   map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
   set<inodeno_t> cap_imports_missing;
@@ -589,13 +589,17 @@ public:
   void rejoin_send_rejoins();
   void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                          int target=-1) {
-    cap_exports[ino][client] = icr;
-    cap_export_targets[ino] = target;
+    auto& ex = cap_exports[ino];
+    ex.first = target;
+    ex.second[client] = icr;
   }
   void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, 
                             mds_rank_t frommds=MDS_RANK_NONE) {
     cap_imports[ino][client][frommds] = icr;
   }
+  void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
+    rejoin_client_map.emplace(client, inst);
+  }
   const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
     if (cap_imports.count(ino) &&
        cap_imports[ino].count(client) &&
@@ -640,8 +644,7 @@ public:
   friend class C_MDC_RejoinOpenInoFinish;
   friend class C_MDC_RejoinSessionsOpened;
   void rejoin_open_ino_finish(inodeno_t ino, int ret);
-  void rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
-                                  map<client_t,uint64_t>& sseqmap);
+  void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
   bool process_imported_caps();
   void choose_lock_states_and_reconnect_caps();
   void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
@@ -892,6 +895,7 @@ public:
   void open_root_inode(MDSInternalContextBase *c);
   void open_root();
   void open_mydir_inode(MDSInternalContextBase *c);
+  void open_mydir_frag(MDSInternalContextBase *c);
   void populate_mydir();
 
   void _create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin);
index bab2c7de843d7302d091f7441119881aab12c8b4..147ee133472166d108e72eb9c9c15f08274d16e1 100644 (file)
@@ -367,7 +367,7 @@ protected:
     }
     if (waiting->empty()) {
       put(PIN_WAITER);
-      waiting.release();
+      waiting.reset();
     }
   }
   void finish_waiting(uint64_t mask, int result = 0);
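
For illustration: the one-liner above fixes a leak. unique_ptr::release() only relinquishes ownership and returns the raw pointer (discarded here), so the emptied waiting map was never freed; reset() actually destroys the managed object. A minimal standalone sketch:

    #include <cassert>
    #include <memory>

    int main() {
      auto p = std::make_unique<int>(42);
      int* raw = p.release();  // p is empty; *raw leaks unless freed manually
      assert(!p);
      delete raw;

      auto q = std::make_unique<int>(7);
      q.reset();               // destroys the int; q is empty -- no leak
      assert(!q);
      return 0;
    }
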
index d1a22df71ab9ef7d49ab4fe013560d491d9ac043..452d5bbf6602be0125b1b629a084790380251b82 100644 (file)
@@ -258,6 +258,11 @@ void MDSDaemon::set_up_admin_socket()
                                     asok_hook,
                                     "dump metadata cache for subtree");
   assert(r == 0);
+  r = admin_socket->register_command("dump loads",
+                                     "dump loads",
+                                     asok_hook,
+                                     "dump metadata loads");
+  assert(r == 0);
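
For illustration: this registers the admin socket command backing the new MDBalancer::dump_loads above. A usage sketch (the daemon name mds.a is illustrative); the output is a JSON object with dirfrags, mds_load, mds_meta_load and mds_import_map sections:

    ceph daemon mds.a dump loads
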
   r = admin_socket->register_command("session evict",
                                     "session evict name=client_id,type=CephString",
                                     asok_hook,
@@ -327,6 +332,7 @@ void MDSDaemon::clean_up_admin_socket()
   admin_socket->unregister_command("dump cache");
   admin_socket->unregister_command("cache status");
   admin_socket->unregister_command("dump tree");
+  admin_socket->unregister_command("dump loads");
   admin_socket->unregister_command("session evict");
   admin_socket->unregister_command("osdmap barrier");
   admin_socket->unregister_command("session ls");
@@ -356,6 +362,7 @@ const char** MDSDaemon::get_tracked_conf_keys() const
     "mds_max_purge_ops",
     "mds_max_purge_ops_per_pg",
     "mds_max_purge_files",
+    "mds_inject_migrator_session_race",
     "clog_to_graylog",
     "clog_to_graylog_host",
     "clog_to_graylog_port",
@@ -1262,7 +1269,8 @@ bool MDSDaemon::ms_handle_refused(Connection *con)
 
 bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
                               int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                              bool& is_valid, CryptoKey& session_key)
+                                    bool& is_valid, CryptoKey& session_key,
+                                    std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
   Mutex::Locker l(mds_lock);
   if (stopping) {
@@ -1294,7 +1302,7 @@ bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
     is_valid = authorize_handler->verify_authorizer(
       cct, keys,
       authorizer_data, authorizer_reply, name, global_id, caps_info,
-      session_key);
+      session_key, nullptr, challenge);
   } else {
     dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
     is_valid = false;
@@ -1325,6 +1333,9 @@ bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
       dout(10) << " new session " << s << " for " << s->info.inst << " con " << con << dendl;
       con->set_priv(s);
       s->connection = con;
+      if (mds_rank) {
+        mds_rank->kick_waiters_for_any_client_connection();
+      }
     } else {
       dout(10) << " existing session " << s << " for " << s->info.inst << " existing con " << s->connection
               << ", new/authorizing con " << con << dendl;
index 8dac42c1b6263b80bb96504b751092f8e3561c11..119b22b2d913c6ef3ec5385af82bbcc16cf773af 100644 (file)
@@ -108,7 +108,8 @@ class MDSDaemon : public Dispatcher, public md_config_obs_t {
   bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
   bool ms_verify_authorizer(Connection *con, int peer_type,
                               int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                              bool& isvalid, CryptoKey& session_key) override;
+                           bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
   void ms_handle_accept(Connection *con) override;
   void ms_handle_connect(Connection *con) override;
   bool ms_handle_reset(Connection *con) override;
index de0a7c3e60af05b90987d7c67cd5ebfce0e46dcb..e6f6db65b077e3ecaae73038b4e15c41cd5f8d61 100644 (file)
@@ -266,11 +266,7 @@ void MDSRankDispatcher::tick()
   }
 
   // log
-  mds_load_t load = balancer->get_load(ceph_clock_now());
-
   if (logger) {
-    logger->set(l_mds_load_cent, 100 * load.mds_load());
-    logger->set(l_mds_dispatch_queue_len, messenger->get_dispatch_queue_len());
     logger->set(l_mds_subtrees, mdcache->num_subtrees());
 
     mdcache->log_stat();
@@ -852,9 +848,29 @@ Session *MDSRank::get_session(Message *m)
 {
   Session *session = static_cast<Session *>(m->get_connection()->get_priv());
   if (session) {
+    session->put(); // do not carry ref
     dout(20) << "get_session have " << session << " " << session->info.inst
             << " state " << session->get_state_name() << dendl;
-    session->put();  // not carry ref
+    // Check if we've imported an open session since (new sessions start closed)
+    if (session->is_closed()) {
+      Session *imported_session = sessionmap.get_session(session->info.inst.name);
+      if (imported_session && imported_session != session) {
+        dout(10) << __func__ << " replacing connection bootstrap session " << session << " with imported session " << imported_session << dendl;
+        imported_session->info.auth_name = session->info.auth_name;
+        //assert(session->info.auth_name == imported_session->info.auth_name);
+        assert(session->info.inst == imported_session->info.inst);
+        imported_session->connection = session->connection;
+        // send out any queued messages
+        while (!session->preopen_out_queue.empty()) {
+          imported_session->connection->send_message(session->preopen_out_queue.front());
+          session->preopen_out_queue.pop_front();
+        }
+        imported_session->auth_caps = session->auth_caps;
+        assert(session->get_nref() == 1);
+        imported_session->connection->set_priv(imported_session->get());
+        session = imported_session;
+      }
+    }
   } else {
     dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
   }
@@ -1094,15 +1110,19 @@ void MDSRank::boot_start(BootStep step, int r)
         MDSGatherBuilder gather(g_ceph_context,
             new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
 
-        mdcache->open_mydir_inode(gather.new_sub());
+       if (is_starting()) {
+         // load mydir frag for the first log segment (creating subtree map)
+         mdcache->open_mydir_frag(gather.new_sub());
+       } else {
+         mdcache->open_mydir_inode(gather.new_sub());
+       }
 
-        if (is_starting() ||
-            whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
-          mdcache->open_root_inode(gather.new_sub());
-        } else {
-          // replay.  make up fake root inode to start with
-          (void)mdcache->create_root_inode();
-        }
+       if (whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
+         mdcache->open_root_inode(gather.new_sub());
+       } else if (is_any_replay()) {
+         // replay.  make up fake root inode to start with
+         mdcache->create_root_inode();
+       }
         gather.activate();
       }
       break;
@@ -1140,7 +1160,7 @@ void MDSRank::boot_start(BootStep step, int r)
 void MDSRank::validate_sessions()
 {
   assert(mds_lock.is_locked_by_me());
-  std::vector<Session*> victims;
+  bool valid = true;
 
   // Identify any sessions which have state inconsistent with other,
   // after they have been loaded from rados during startup.
@@ -1150,19 +1170,15 @@ void MDSRank::validate_sessions()
     Session *session = i.second;
     interval_set<inodeno_t> badones;
     if (inotable->intersects_free(session->info.prealloc_inos, &badones)) {
-      clog->error() << "Client session loaded with invalid preallocated "
-                          "inodes, evicting session " << *session;
-
-      // Make the session consistent with inotable so that it can
-      // be cleanly torn down
-      session->info.prealloc_inos.subtract(badones);
-
-      victims.push_back(session);
+      clog->error() << "client " << *session
+                   << "loaded with preallocated inodes that are inconsistent with inotable";
+      valid = false;
     }
   }
 
-  for (const auto &session: victims) {
-    server->kill_session(session, nullptr);
+  if (!valid) {
+    damaged();
+    assert(valid);
   }
 }
 
@@ -1172,16 +1188,7 @@ void MDSRank::starting_done()
   assert(is_starting());
   request_state(MDSMap::STATE_ACTIVE);
 
-  mdcache->open_root();
-
-  if (mdcache->is_open()) {
-    mdlog->start_new_segment();
-  } else {
-    mdcache->wait_for_open(new MDSInternalContextWrapper(this,
-                          new FunctionContext([this] (int r) {
-                              mdlog->start_new_segment();
-                          })));
-  }
+  mdlog->start_new_segment();
 }
 
 
@@ -1466,7 +1473,8 @@ void MDSRank::active_start()
 {
   dout(1) << "active_start" << dendl;
 
-  if (last_state == MDSMap::STATE_CREATING) {
+  if (last_state == MDSMap::STATE_CREATING ||
+      last_state == MDSMap::STATE_STARTING) {
     mdcache->open_root();
   }
 
@@ -1698,7 +1706,7 @@ void MDSRankDispatcher::handle_mds_map(
 
   // REJOIN
   // is everybody finally rejoining?
-  if (is_starting() || is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
+  if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
     // did we start?
     if (!oldmap->is_rejoining() && mdsmap->is_rejoining())
       rejoin_joint_start();
@@ -1991,6 +1999,13 @@ bool MDSRankDispatcher::handle_asok_command(
         f->reset();
       }
     }
+  } else if (command == "dump loads") {
+    Mutex::Locker l(mds_lock);
+    int r = balancer->dump_loads(f);
+    if (r != 0) {
+      ss << "Failed to dump loads: " << cpp_strerror(r);
+      f->reset();
+    }
   } else if (command == "force_readonly") {
     Mutex::Locker l(mds_lock);
     mdcache->force_readonly();
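(The new "dump loads" command exposes the balancer's per-dirfrag load vectors through the usual admin-socket plumbing; assuming the standard daemon socket, it would be invoked as something like `ceph daemon mds.<id> dump loads` -- the invocation form is assumed here, not shown in this diff.)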
index 1bb4d2724d883602b646b6f54171385d9a8efc01..d26627e8e0c2f7075e11f99b98c31145ae97eaeb 100644 (file)
@@ -220,6 +220,7 @@ class MDSRank {
     void handle_conf_change(const struct md_config_t *conf,
                             const std::set <std::string> &changed)
     {
+      mdcache->migrator->handle_conf_change(conf, changed, *mdsmap);
       purge_queue.handle_conf_change(conf, changed, *mdsmap);
     }
 
@@ -263,6 +264,7 @@ class MDSRank {
     ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)
 
     list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
+    list<MDSInternalContextBase*> waiting_for_any_client_connection;
     list<MDSInternalContextBase*> replay_queue;
     map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
     map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
@@ -377,6 +379,12 @@ class MDSRank {
       waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
     }
 
+    void wait_for_any_client_connection(MDSInternalContextBase *c) {
+      waiting_for_any_client_connection.push_back(c);
+    }
+    void kick_waiters_for_any_client_connection(void) {
+      finish_contexts(g_ceph_context, waiting_for_any_client_connection);
+    }
     void wait_for_active(MDSInternalContextBase *c) {
       waiting_for_active.push_back(c);
     }
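(wait_for_any_client_connection()/kick_waiters_for_any_client_connection() follow the MDS waiter-list idiom: callers park a context on a list, and a later event completes everything on it; finish_contexts() takes the list before running the contexts, so re-registration from inside a callback is safe. A simplified, self-contained version:)

    #include <functional>
    #include <list>
    #include <utility>

    std::list<std::function<void()>> waiters;

    void wait_for_event(std::function<void()> c) {
      waiters.push_back(std::move(c));
    }

    void kick_waiters() {
      auto ls = std::move(waiters);   // take the list first...
      waiters.clear();
      for (auto& c : ls) c();         // ...so callbacks may re-register
    }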
@@ -408,7 +416,7 @@ class MDSRank {
 
     MDSMap *get_mds_map() { return mdsmap; }
 
-    int get_req_rate() const { return logger->get(l_mds_request); }
+    uint64_t get_num_requests() const { return logger->get(l_mds_request); }
   
     int get_mds_slow_req_count() const { return mds_slow_req_count; }
 
index 1e2a3a302f6520269e53e09ddc7859e8aee4a87f..40a89626bc798710342ca357424d873370d4ecf8 100644 (file)
@@ -27,6 +27,7 @@
 #include "Mutation.h"
 
 #include "include/filepath.h"
+#include "common/likely.h"
 
 #include "events/EExport.h"
 #include "events/EImportStart.h"
@@ -118,7 +119,12 @@ void Migrator::dispatch(Message *m)
     handle_export_prep(static_cast<MExportDirPrep*>(m));
     break;
   case MSG_MDS_EXPORTDIR:
-    handle_export_dir(static_cast<MExportDir*>(m));
+    if (unlikely(inject_session_race)) {
+      dout(0) << "waiting for inject_session_race" << dendl;
+      mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
+    } else {
+      handle_export_dir(static_cast<MExportDir*>(m));
+    }
     break;
   case MSG_MDS_EXPORTDIRFINISH:
     handle_export_finish(static_cast<MExportDirFinish*>(m));
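(The race-injection hook is guarded with unlikely() from the newly included common/likely.h -- a conventional branch-prediction hint, typically something like `#define unlikely(x) __builtin_expect(!!(x), 0)` -- so the check is effectively free in production. When the flag is set, the EXPORTDIR message is simply parked on the new any-client-connection waiter list, widening the import/session race window for test purposes.)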
@@ -1511,7 +1517,8 @@ void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
 
     map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
     assert(q != peer_imported.end());
-    m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, peer, 0);
+    m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
+                   (q->second.cap_id > 0 ? peer : -1), 0);
     mds->send_message_client_counted(m, it->first);
   }
   in->clear_client_caps_after_export();
@@ -2442,14 +2449,13 @@ class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
   CDir *dir;
   mds_rank_t from;
 public:
-  map<client_t,entity_inst_t> imported_client_map;
-  map<client_t,uint64_t> sseqmap;
+  map<client_t,pair<Session*,uint64_t> > imported_session_map;
 
   C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
     MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
   }
   void finish(int r) override {
-    mig->import_logged_start(df, dir, from, imported_client_map, sseqmap);
+    mig->import_logged_start(df, dir, from, imported_session_map);
   }
 };
 
@@ -2492,10 +2498,11 @@ void Migrator::handle_export_dir(MExportDir *m)
   // new client sessions, open these after we journal
   // include imported sessions in EImportStart
   bufferlist::iterator cmp = m->client_map.begin();
-  ::decode(onlogged->imported_client_map, cmp);
+  map<client_t,entity_inst_t> client_map;
+  decode(client_map, cmp);
   assert(cmp.end());
-  le->cmapv = mds->server->prepare_force_open_sessions(onlogged->imported_client_map, onlogged->sseqmap);
-  le->client_map.claim(m->client_map);
+  le->cmapv = mds->server->prepare_force_open_sessions(client_map, onlogged->imported_session_map);
+  encode(client_map, le->client_map, mds->mdsmap->get_up_features());
 
   bufferlist::iterator blp = m->export_data.begin();
   int num_imported_inodes = 0;
@@ -2691,24 +2698,24 @@ void Migrator::import_reverse(CDir *dir)
   if (stat.state == IMPORT_ACKING) {
     // remove imported caps
     for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
-       p != stat.peer_exports.end();
-       ++p) {
+        p != stat.peer_exports.end();
+        ++p) {
       CInode *in = p->first;
       for (map<client_t,Capability::Export>::iterator q = p->second.begin();
-         q != p->second.end();
-         ++q) {
+          q != p->second.end();
+          ++q) {
        Capability *cap = in->get_client_cap(q->first);
-       assert(cap);
+       if (!cap) {
+         assert(!stat.session_map.count(q->first));
+         continue;
+       }
        if (cap->is_importing())
          in->remove_client_cap(q->first);
       }
       in->put(CInode::PIN_IMPORTINGCAPS);
     }
-    for (map<client_t,entity_inst_t>::iterator p = stat.client_map.begin();
-        p != stat.client_map.end();
-        ++p) {
-      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
-      assert(session);
+    for (auto& p : stat.session_map) {
+      Session *session = p.second.first;
       session->dec_importing();
     }
   }
@@ -2808,14 +2815,13 @@ void Migrator::import_reverse_final(CDir *dir)
 
 
 void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
-                                  map<client_t,entity_inst_t>& imported_client_map,
-                                  map<client_t,uint64_t>& sseqmap)
+                                  map<client_t,pair<Session*,uint64_t> >& imported_session_map)
 {
   map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
   if (it == import_state.end() ||
       it->second.state != IMPORT_LOGGINGSTART) {
     dout(7) << "import " << df << " must have aborted" << dendl;
-    mds->server->finish_force_open_sessions(imported_client_map, sseqmap);
+    mds->server->finish_force_open_sessions(imported_session_map);
     return;
   }
 
@@ -2827,16 +2833,18 @@ void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
   assert (g_conf->mds_kill_import_at != 7);
 
   // force open client sessions and finish cap import
-  mds->server->finish_force_open_sessions(imported_client_map, sseqmap, false);
-  it->second.client_map.swap(imported_client_map);
+  mds->server->finish_force_open_sessions(imported_session_map, false);
   
   map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
   for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
        p != it->second.peer_exports.end();
        ++p) {
     // parameter 'peer' is NONE, delay sending cap import messages to client
-    finish_import_inode_caps(p->first, MDS_RANK_NONE, true, p->second, imported_caps[p->first->ino()]);
+    finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
+                            p->second, imported_caps[p->first->ino()]);
   }
+
+  it->second.session_map.swap(imported_session_map);
   
   // send notify's etc.
   dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
@@ -2894,8 +2902,11 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last)
       for (map<client_t,Capability::Export>::iterator q = p->second.begin();
          q != p->second.end();
          ++q) {
-       Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
-       assert(session);
+       auto r = it->second.session_map.find(q->first);
+       if (r == it->second.session_map.end())
+         continue;
+
+       Session *session = r->second.first;
        Capability *cap = in->get_client_cap(q->first);
        assert(cap);
        cap->merge(q->second, true);
@@ -2906,11 +2917,8 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last)
       p->second.clear();
       in->replica_caps_wanted = 0;
     }
-    for (map<client_t,entity_inst_t>::iterator p = it->second.client_map.begin();
-        p != it->second.client_map.end();
-        ++p) {
-      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
-      assert(session);
+    for (auto& p : it->second.session_map) {
+      Session *session = p.second.first;
       session->dec_importing();
     }
   }
@@ -3009,6 +3017,9 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
     assert(!dn->get_linkage()->get_inode());
     dn->dir->link_primary_inode(dn, in);
   }
+
+  if (in->is_dir())
+    dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);
  
   // add inode?
   if (added) {
@@ -3056,32 +3067,38 @@ void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
 }
 
 void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
-                                       map<client_t,Capability::Export> &export_map,
+                                       const map<client_t,pair<Session*,uint64_t> >& session_map,
+                                       const map<client_t,Capability::Export> &export_map,
                                        map<client_t,Capability::Import> &import_map)
 {
-  for (map<client_t,Capability::Export>::iterator it = export_map.begin();
-       it != export_map.end();
-       ++it) {
-    dout(10) << "finish_import_inode_caps for client." << it->first << " on " << *in << dendl;
-    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(it->first.v));
-    assert(session);
+  for (auto& it : export_map) {
+    dout(10) << "finish_import_inode_caps for client." << it.first << " on " << *in << dendl;
+
+    auto p = session_map.find(it.first);
+    if (p == session_map.end()) {
+      dout(10) << " no session for client." << it.first << dendl;
+      (void)import_map[it.first];
+      continue;
+    }
 
-    Capability *cap = in->get_client_cap(it->first);
+    Session *session = p->second.first;
+
+    Capability *cap = in->get_client_cap(it.first);
     if (!cap) {
-      cap = in->add_client_cap(it->first, session);
+      cap = in->add_client_cap(it.first, session);
       if (peer < 0)
        cap->mark_importing();
     }
 
-    Capability::Import& im = import_map[it->first];
+    Capability::Import& im = import_map[it.first];
     im.cap_id = cap->get_cap_id();
-    im.mseq = auth_cap ? it->second.mseq : cap->get_mseq();
+    im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
     im.issue_seq = cap->get_last_seq() + 1;
 
     if (peer >= 0) {
-      cap->merge(it->second, auth_cap);
-      mds->mdcache->do_cap_import(session, in, cap, it->second.cap_id,
-                                 it->second.seq, it->second.mseq - 1, peer,
+      cap->merge(it.second, auth_cap);
+      mds->mdcache->do_cap_import(session, in, cap, it.second.cap_id,
+                                 it.second.seq, it.second.mseq - 1, peer,
                                  auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
     }
   }
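(finish_import_inode_caps() now tolerates exporters that reference clients with no local session -- for example clients filtered out as blacklisted in prepare_force_open_sessions(): the client gets a default-constructed Import slot and is skipped rather than tripping an assert. The shape of that pattern, with simplified stand-in types:)

    #include <map>

    struct Import { long cap_id = 0; };   // stand-in for Capability::Import

    void finish_imports(const std::map<int, int>& session_map,   // client -> session
                        const std::map<int, long>& export_map,   // client -> exported cap
                        std::map<int, Import>& import_map) {
      for (auto& it : export_map) {
        if (session_map.find(it.first) == session_map.end()) {
          (void)import_map[it.first];     // leave an empty slot and move on
          continue;
        }
        import_map[it.first].cap_id = it.second;  // normal import path
      }
    }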
@@ -3306,13 +3323,12 @@ class C_M_LoggedImportCaps : public MigratorLogContext {
   CInode *in;
   mds_rank_t from;
 public:
+  map<client_t,pair<Session*,uint64_t> > imported_session_map;
   map<CInode*, map<client_t,Capability::Export> > peer_exports;
-  map<client_t,entity_inst_t> client_map;
-  map<client_t,uint64_t> sseqmap;
 
   C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
   void finish(int r) override {
-    mig->logged_import_caps(in, from, peer_exports, client_map, sseqmap);
+    mig->logged_import_caps(in, from, imported_session_map, peer_exports);
   }  
 };
 
@@ -3326,23 +3342,29 @@ void Migrator::handle_export_caps(MExportCaps *ex)
   assert(in->is_auth());
 
   // FIXME
-  if (!in->can_auth_pin())
+  if (!in->can_auth_pin()) {
+    ex->put();
     return;
+  }
+
   in->auth_pin(this);
 
+  map<client_t,entity_inst_t> client_map;
+  client_map.swap(ex->client_map);
+
   C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
       this, in, mds_rank_t(ex->get_source().num()));
-  finish->client_map = ex->client_map;
 
+  version_t pv = mds->server->prepare_force_open_sessions(client_map,
+                                                         finish->imported_session_map);
   // decode new caps
   bufferlist::iterator blp = ex->cap_bl.begin();
   decode_import_inode_caps(in, false, blp, finish->peer_exports);
   assert(!finish->peer_exports.empty());   // thus, inode is pinned.
 
   // journal open client sessions
-  version_t pv = mds->server->prepare_force_open_sessions(finish->client_map, finish->sseqmap);
   
-  ESessions *le = new ESessions(pv, ex->client_map);
+  ESessions *le = new ESessions(pv, client_map);
   mds->mdlog->start_submit_entry(le, finish);
   mds->mdlog->flush();
 
@@ -3352,22 +3374,33 @@ void Migrator::handle_export_caps(MExportCaps *ex)
 
 void Migrator::logged_import_caps(CInode *in, 
                                  mds_rank_t from,
-                                 map<CInode*, map<client_t,Capability::Export> >& peer_exports,
-                                 map<client_t,entity_inst_t>& client_map,
-                                 map<client_t,uint64_t>& sseqmap) 
+                                 map<client_t,pair<Session*,uint64_t> >& imported_session_map,
+                                 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
 {
   dout(10) << "logged_import_caps on " << *in << dendl;
   // see export_go() vs export_go_synced()
   assert(in->is_auth());
 
   // force open client sessions and finish cap import
-  mds->server->finish_force_open_sessions(client_map, sseqmap);
+  mds->server->finish_force_open_sessions(imported_session_map);
 
   map<client_t,Capability::Import> imported_caps;
 
-  assert(peer_exports.count(in));
+  auto it = peer_exports.find(in);
+  assert(it != peer_exports.end());
+
   // clients will release caps from the exporter when they receive the cap import message.
-  finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
+  finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
   mds->locker->eval(in, CEPH_CAP_LOCKS, true);
   in->auth_unpin(this);
 }
+
+void Migrator::handle_conf_change(const struct md_config_t *conf,
+                                  const std::set <std::string> &changed,
+                                  const MDSMap &mds_map)
+{
+  if (changed.count("mds_inject_migrator_session_race")) {
+    inject_session_race = conf->get_val<bool>("mds_inject_migrator_session_race");
+    dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
+  }
+}
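(Hooking the Migrator into the config-observer chain means the flag can be flipped on a running daemon; assuming the usual Luminous runtime-config machinery, something like `ceph tell mds.<id> injectargs '--mds_inject_migrator_session_race=true'` would take effect immediately via handle_conf_change(). The exact invocation is assumed, not part of this diff.)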
index 6070d0bb391028bb2c7c5ac3d942d1de7ebbc611..148b2fb4fd2c01422109e9f36455929c5269a291 100644 (file)
@@ -31,6 +31,7 @@ class MDSRank;
 class CDir;
 class CInode;
 class CDentry;
+class Session;
 
 class MExportDirDiscover;
 class MExportDirDiscoverAck;
@@ -101,9 +102,13 @@ public:
   }
 
   // -- cons --
-  Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {}
-
+  Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {
+    inject_session_race = g_conf->get_val<bool>("mds_inject_migrator_session_race");
+  }
 
+  void handle_conf_change(const struct md_config_t *conf,
+                          const std::set <std::string> &changed,
+                          const MDSMap &mds_map);
 
 protected:
   // export fun
@@ -137,7 +142,7 @@ protected:
     set<mds_rank_t> bystanders;
     list<dirfrag_t> bound_ls;
     list<ScatterLock*> updated_scatterlocks;
-    map<client_t,entity_inst_t> client_map;
+    map<client_t,pair<Session*,uint64_t> > session_map;
     map<CInode*, map<client_t,Capability::Export> > peer_exports;
     MutationRef mut;
     import_state_t() : state(0), peer(0), tid(0), mut() {}
@@ -185,16 +190,14 @@ protected:
   void import_notify_abort(CDir *dir, set<CDir*>& bounds);
   void import_notify_finish(CDir *dir, set<CDir*>& bounds);
   void import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
-                          map<client_t,entity_inst_t> &imported_client_map,
-                          map<client_t,uint64_t>& sseqmap);
+                          map<client_t,pair<Session*,uint64_t> >& imported_session_map);
   void handle_export_finish(MExportDirFinish *m);
 
   void handle_export_caps(MExportCaps *m);
   void logged_import_caps(CInode *in,
                          mds_rank_t from,
-                         map<CInode*, map<client_t,Capability::Export> >& cap_imports,
-                         map<client_t,entity_inst_t>& client_map,
-                         map<client_t,uint64_t>& sseqmap);
+                         map<client_t,pair<Session*,uint64_t> >& imported_session_map,
+                         map<CInode*, map<client_t,Capability::Export> >& cap_imports);
 
 
   friend class C_MDS_ImportDirLoggedStart;
@@ -330,7 +333,8 @@ public:
   void decode_import_inode_caps(CInode *in, bool auth_cap, bufferlist::iterator &blp,
                                map<CInode*, map<client_t,Capability::Export> >& cap_imports);
   void finish_import_inode_caps(CInode *in, mds_rank_t from, bool auth_cap,
-                               map<client_t,Capability::Export> &export_map,
+                               const map<client_t,pair<Session*,uint64_t> >& smap,
+                               const map<client_t,Capability::Export> &export_map,
                                map<client_t,Capability::Import> &import_map);
   int decode_import_dir(bufferlist::iterator& blp,
                        mds_rank_t oldauth,
@@ -347,6 +351,7 @@ public:
 private:
   MDSRank *mds;
   MDCache *cache;
+  bool inject_session_race = false;
 };
 
 #endif
index abf0078d78a306f0cde6b99ca1643c9880334fb0..c9ffcfe98596aba2752e6f8daf3a42ab44cc2593 100644 (file)
@@ -244,8 +244,7 @@ struct MDRequestImpl : public MutationImpl {
     bool is_remote_frozen_authpin;
     bool is_inode_exporter;
 
-    map<client_t,entity_inst_t> imported_client_map;
-    map<client_t,uint64_t> sseq_map;
+    map<client_t, pair<Session*, uint64_t> > imported_session_map;
     map<CInode*, map<client_t,Capability::Export> > cap_imports;
     
     // for lock/flock
index 49e48b04cc6d12700a4ff97ab149177a8a2f5c2b..cec43ecd34704cc3972a5372712778d62b717203 100644 (file)
@@ -350,7 +350,6 @@ bool PurgeQueue::_consume()
 
   bool could_consume = false;
   while(can_consume()) {
-    could_consume = true;
 
     if (delayed_flush) {
       // We are now going to read from the journal, so any proactive
@@ -376,6 +375,7 @@ bool PurgeQueue::_consume()
       return could_consume;
     }
 
+    could_consume = true;
     // The journaler is readable: consume an entry
     bufferlist bl;
     bool readable = journaler.try_read_entry(bl);
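(The PurgeQueue::_consume() change is a subtle return-value fix: previously could_consume was set as soon as the loop was entered, so the function reported progress even when the journal had no readable entry; now it is set only once an entry is actually about to be consumed. Distilled into a runnable model -- simplified, not the real journaler API:)

    #include <deque>

    bool consume(std::deque<int>& journal, bool can_consume) {
      bool could_consume = false;
      while (can_consume) {
        if (journal.empty())
          return could_consume;   // nothing readable: claim no progress
        could_consume = true;     // an entry is readable; consume it
        journal.pop_front();
      }
      return could_consume;
    }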
index 986f6e6b876c474755c832f6e2fb0aaacc8a9c0d..6e9c61191d3f45d6bcc56243dcf25c1d4f38d3a7 100644 (file)
@@ -236,8 +236,7 @@ void Server::dispatch(Message *m)
 
     bool wait_for_active = true;
     if (mds->is_stopping()) {
-      if (m->get_source().is_mds())
-       wait_for_active = false;
+      wait_for_active = false;
     } else if (mds->is_clientreplay()) {
       if (req->is_queued_for_replay()) {
        wait_for_active = false;
@@ -333,7 +332,8 @@ void Server::handle_client_session(MClientSession *m)
     if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
-       session->is_killing()) {
+       session->is_killing() ||
+       terminating_sessions) {
       dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
       // set client metadata for session opened by prepare_force_open_sessions
       if (!m->client_meta.empty())
@@ -356,7 +356,8 @@ void Server::handle_client_session(MClientSession *m)
         });
 
     if (blacklisted) {
-      dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
+      dout(10) << "rejecting blacklisted client " << session->info.inst.addr << dendl;
+      mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
       m->put();
       return;
     }
@@ -586,32 +587,48 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve
  *  - sessions learned from other MDSs during a cross-MDS rename
  */
 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
-                                             map<client_t,uint64_t>& sseqmap)
+                                             map<client_t, pair<Session*,uint64_t> >& smap)
 {
   version_t pv = mds->sessionmap.get_projected();
 
   dout(10) << "prepare_force_open_sessions " << pv 
           << " on " << cm.size() << " clients"
           << dendl;
-  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
 
+  mds->objecter->with_osdmap(
+      [this, &cm](const OSDMap &osd_map) {
+       for (auto p = cm.begin(); p != cm.end(); ) {
+         if (osd_map.is_blacklisted(p->second.addr)) {
+           dout(10) << " ignoring blacklisted client." << p->first
+                    << " (" <<  p->second.addr << ")" << dendl;
+           cm.erase(p++);
+         } else {
+           ++p;
+         }
+       }
+      });
+
+  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
     Session *session = mds->sessionmap.get_or_add_session(p->second);
     pv = mds->sessionmap.mark_projected(session);
+    uint64_t sseq;
     if (session->is_closed() || 
        session->is_closing() ||
-       session->is_killing())
-      sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
-    else
+       session->is_killing()) {
+      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+    } else {
       assert(session->is_open() ||
             session->is_opening() ||
             session->is_stale());
+      sseq = 0;
+    }
+    smap[p->first] = make_pair(session, sseq);
     session->inc_importing();
   }
   return pv;
 }
 
-void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
-                                       map<client_t,uint64_t>& sseqmap,
+void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
 {
   /*
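(Earlier in this hunk, prepare_force_open_sessions() consults the OSDMap under objecter->with_osdmap() and drops blacklisted clients from the incoming map with the classic erase-while-iterating idiom, `cm.erase(p++)`. A standalone equivalent using the post-C++11 form, where erase() returns the next valid iterator:)

    #include <map>

    template <typename Map, typename Pred>
    void erase_if_matching(Map& m, Pred pred) {
      for (auto p = m.begin(); p != m.end(); ) {
        if (pred(*p))
          p = m.erase(p);   // erase() hands back the next valid iterator
        else
          ++p;
      }
    }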
@@ -619,17 +636,13 @@ void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
    * client trying to close a session and an MDS doing an import
    * trying to force open a session...  
    */
-  dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
+  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;
-  
-
-  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
 
-    Session *session = mds->sessionmap.get_session(p->second.name);
-    assert(session);
-    
-    if (sseqmap.count(p->first)) {
-      uint64_t sseq = sseqmap[p->first];
+  for (auto &it : smap) {
+    Session *session = it.second.first;
+    uint64_t sseq = it.second.second;
+    if (sseq > 0) {
       if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
       } else {
@@ -863,7 +876,13 @@ void Server::journal_close_session(Session *session, int state, Context *on_safe
 void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
 {
   reconnect_done = reconnect_done_;
-  mds->sessionmap.get_client_set(client_reconnect_gather);
+
+  set<Session*> sessions;
+  mds->sessionmap.get_client_session_set(sessions);
+  for (auto session : sessions) {
+    if (session->is_open())
+       client_reconnect_gather.insert(session->get_client());
+  }
 
   if (client_reconnect_gather.empty()) {
     dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
@@ -905,7 +924,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
        << ") from " << m->get_source_inst()
        << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
     deny = true;
-  } else if (session->is_closed()) {
+  } else if (!session->is_open()) {
     dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
     mds->clog->info() << "denied reconnect attempt (mds is "
        << ceph_mds_state_name(mds->get_state())
@@ -986,6 +1005,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
       mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
     }
   }
+  mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
 
   // remove from gather set
   client_reconnect_gather.erase(from);
@@ -1959,6 +1979,7 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
       } else {
        mdcache->request_finish(mdr);
       }
+      m->put();
       return;
     }
   }
@@ -3004,7 +3025,13 @@ void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
     return;
   }
 
-  CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
+  bool want_auth = false;
+  int mask = req->head.args.getattr.mask;
+  if (mask & CEPH_STAT_RSTAT)
+    want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
+
+  CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, want_auth, false, NULL, 
+                                   !is_lookup);
   if (!ref) return;
 
   /*
@@ -3022,7 +3049,6 @@ void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
              mdr->snapid <= cap->client_follows))
     issued = cap->issued();
 
-  int mask = req->head.args.getattr.mask;
   if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
     rdlocks.insert(&ref->linklock);
   if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
@@ -3051,11 +3077,14 @@ void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
   if (!check_access(mdr, ref, MAY_READ))
     return;
 
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
   // note which caps are requested, so we return at least a snapshot
   // value for them.  (currently this matters for xattrs and inline data)
   mdr->getattr_caps = mask;
 
-  mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
+  mds->balancer->hit_inode(now, ref, META_POP_IRD,
                           req->get_source().num());
 
   // reply
@@ -3331,6 +3360,9 @@ void Server::handle_client_open(MDRequestRef& mdr)
   if (!check_access(mdr, cur, mask))
     return;
 
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
   if (cur->is_file() || cur->is_dir()) {
     if (mdr->snapid == CEPH_NOSNAP) {
       // register new cap
@@ -3366,9 +3398,9 @@ void Server::handle_client_open(MDRequestRef& mdr)
   
   // hit pop
   if (cmode & CEPH_FILE_MODE_WR)
-    mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
+    mds->balancer->hit_inode(now, cur, META_POP_IWR);
   else
-    mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
+    mds->balancer->hit_inode(now, cur, META_POP_IRD,
                             mdr->client_request->get_source().num());
 
   CDentry *dn = 0;
@@ -3397,7 +3429,7 @@ public:
     // dirty inode, dn, dir
     newi->inode.version--;   // a bit hacky, see C_MDS_mknod_finish
     newi->mark_dirty(newi->inode.version+1, mdr->ls);
-    newi->_mark_dirty_parent(mdr->ls, true);
+    newi->mark_dirty_parent(mdr->ls, true);
 
     mdr->apply();
 
@@ -3406,7 +3438,8 @@ public:
     MDRequestRef null_ref;
     get_mds()->mdcache->send_dentry_link(dn, null_ref);
 
-    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
+    utime_t now = ceph_clock_now();
+    get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
 
     server->respond_to_request(mdr, 0);
 
@@ -3864,7 +3897,8 @@ public:
       get_mds()->mdcache->truncate_inode(in, mdr->ls);
     }
 
-    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
+    utime_t now = ceph_clock_now();
+    get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
 
     server->respond_to_request(mdr, 0);
 
@@ -4143,7 +4177,7 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
   }
 
   pi.inode.version = cur->pre_dirty();
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.change_attr++;
 
   // log + wait
@@ -4179,7 +4213,7 @@ void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
   // prepare
   auto &pi = in->project_inode();
   pi.inode.version = in->pre_dirty();
-  pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.mtime = pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.change_attr++;
 
   uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
@@ -4297,7 +4331,7 @@ void Server::handle_client_setlayout(MDRequestRef& mdr)
   // add the old pool to the inode
   pi.inode.add_old_pool(old_layout.pool_id);
   pi.inode.version = cur->pre_dirty();
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.change_attr++;
   
   // log + wait
@@ -4650,7 +4684,6 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
     int64_t old_pool = pi.inode.layout.pool_id;
     pi.inode.add_old_pool(old_pool);
     pi.inode.layout = layout;
-    pi.inode.ctime = mdr->get_op_stamp();
     pip = &pi.inode;
   } else if (name.compare(0, 10, "ceph.quota") == 0) { 
     if (!cur->is_dir() || cur->is_root()) {
@@ -4676,6 +4709,9 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
 
     mdr->no_early_reply = true;
     pip = &pi.inode;
+
+    client_t exclude_ct = mdr->get_client();
+    mdcache->broadcast_quota_to_client(cur, exclude_ct);
   } else if (name.find("ceph.dir.pin") == 0) {
     if (!cur->is_dir() || cur->is_root()) {
       respond_to_request(mdr, -EINVAL);
@@ -4706,7 +4742,7 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
   }
 
   pip->change_attr++;
-  pip->ctime = mdr->get_op_stamp();
+  pip->ctime = pip->rstat.rctime = mdr->get_op_stamp();
   pip->version = cur->pre_dirty();
   if (cur->is_file())
     pip->update_backtrace();
@@ -4796,7 +4832,8 @@ public:
     
     mdr->apply();
 
-    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
+    utime_t now = ceph_clock_now();
+    get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
 
     server->respond_to_request(mdr, 0);
   }
@@ -4873,7 +4910,7 @@ void Server::handle_client_setxattr(MDRequestRef& mdr)
   // project update
   auto &pi = cur->project_inode(true);
   pi.inode.version = cur->pre_dirty();
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.change_attr++;
   pi.inode.xattr_version++;
   auto &px = *pi.xattrs;
@@ -4940,7 +4977,7 @@ void Server::handle_client_removexattr(MDRequestRef& mdr)
   auto &pi = cur->project_inode(true);
   auto &px = *pi.xattrs;
   pi.inode.version = cur->pre_dirty();
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.change_attr++;
   pi.inode.xattr_version++;
   px.erase(mempool::mds_co::string(boost::string_view(name)));
@@ -4982,7 +5019,7 @@ public:
     // a new version of the inode since it's just been created)
     newi->inode.version--; 
     newi->mark_dirty(newi->inode.version + 1, mdr->ls);
-    newi->_mark_dirty_parent(mdr->ls, true);
+    newi->mark_dirty_parent(mdr->ls, true);
 
     // mkdir?
     if (newi->inode.is_dir()) { 
@@ -5002,7 +5039,8 @@ public:
       get_mds()->locker->share_inode_max_size(newi);
 
     // hit pop
-    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
+    utime_t now = ceph_clock_now();
+    get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
 
     // reply
     server->respond_to_request(mdr, 0);
@@ -5334,7 +5372,7 @@ void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
   // project inode update
   auto &pi = targeti->project_inode();
   pi.inode.nlink++;
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.change_attr++;
   pi.inode.version = tipv;
 
@@ -5373,8 +5411,9 @@ void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
   mdcache->send_dentry_link(dn, null_ref);
 
   // bump target popularity
-  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
-  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
+  utime_t now = ceph_clock_now();
+  mds->balancer->hit_inode(now, targeti, META_POP_IWR);
+  mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
 
   // reply
   respond_to_request(mdr, 0);
@@ -5499,8 +5538,9 @@ void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
     mdcache->send_dentry_unlink(dn, NULL, null_ref);
   
   // bump target popularity
-  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
-  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
+  utime_t now = ceph_clock_now();
+  mds->balancer->hit_inode(now, targeti, META_POP_IWR);
+  mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
 
   // reply
   respond_to_request(mdr, 0);
@@ -5617,7 +5657,8 @@ void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
   mdr->apply();
 
   // hit pop
-  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
+  utime_t now = ceph_clock_now();
+  mds->balancer->hit_inode(now, targeti, META_POP_IWR);
 
   // done.
   mdr->slave_request->put();
@@ -5727,7 +5768,7 @@ void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef&
   }
 
   // inode
-  pi.inode.ctime = rollback.old_ctime;
+  pi.inode.ctime = pi.inode.rstat.rctime = rollback.old_ctime;
   if (rollback.was_inc)
     pi.inode.nlink--;
   else
@@ -5998,7 +6039,7 @@ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
   }
   mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
   pi.inode.version = in->pre_dirty();
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.change_attr++;
   pi.inode.nlink--;
   if (pi.inode.nlink == 0)
@@ -6081,7 +6122,8 @@ void Server::_unlink_local_finish(MDRequestRef& mdr,
     mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
 
   // bump pop
-  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
+  utime_t now = ceph_clock_now();
+  mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
 
   // reply
   respond_to_request(mdr, 0);
@@ -6926,9 +6968,10 @@ void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn,
     assert(g_conf->mds_kill_rename_at != 6);
   
   // bump popularity
-  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
+  utime_t now = ceph_clock_now();
+  mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
   if (destdnl->is_remote() && in->is_auth())
-    mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
+    mds->balancer->hit_inode(now, in, META_POP_IWR);
 
   // did we import srci?  if so, explicitly ack that import before we unlock and reply.
 
@@ -6998,10 +7041,10 @@ version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, buff
   bufferlist::iterator blp = mdr->more()->inode_import.begin();
          
   // imported caps
-  ::decode(mdr->more()->imported_client_map, blp);
-  ::encode(mdr->more()->imported_client_map, *client_map_bl,
-           mds->mdsmap->get_up_features());
-  prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
+  map<client_t,entity_inst_t> client_map;
+  decode(client_map, blp);
+  prepare_force_open_sessions(client_map, mdr->more()->imported_session_map);
+  encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
 
   list<ScatterLock*> updated_scatterlocks;
   mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
@@ -7189,13 +7232,13 @@ void Server::_rename_prepare(MDRequestRef& mdr,
 
   if (!silent) {
     if (spi) {
-      spi->ctime = mdr->get_op_stamp();
+      spi->ctime = spi->rstat.rctime = mdr->get_op_stamp();
       spi->change_attr++;
       if (linkmerge)
        spi->nlink--;
     }
     if (tpi) {
-      tpi->ctime = mdr->get_op_stamp();
+      tpi->ctime = tpi->rstat.rctime = mdr->get_op_stamp();
       tpi->change_attr++;
       {
         std::string t;
@@ -7437,12 +7480,13 @@ void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, C
       map<client_t,Capability::Import> imported_caps;
       
       // finish cap imports
-      finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
+      finish_force_open_sessions(mdr->more()->imported_session_map);
       if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
        mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
-                                                        mdr->more()->srcdn_auth_mds, true,
-                                                        mdr->more()->cap_imports[destdnl->get_inode()],
-                                                        imported_caps);
+                                                   mdr->more()->srcdn_auth_mds, true,
+                                                   mdr->more()->imported_session_map,
+                                                   mdr->more()->cap_imports[destdnl->get_inode()],
+                                                   imported_caps);
       }
 
       mdr->more()->inode_import.clear();
@@ -7810,10 +7854,10 @@ void Server::_logged_slave_rename(MDRequestRef& mdr,
   destdnl = destdn->get_linkage();
 
   // bump popularity
-  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
+  utime_t now = ceph_clock_now();
+  mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
   if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
-    mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
-                            META_POP_IWR);
+    mds->balancer->hit_inode(now, destdnl->get_inode(), META_POP_IWR);
 
   // done.
   mdr->slave_request->put();
@@ -7861,8 +7905,7 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
       ::decode(peer_imported, bp);
 
       dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
-      mdcache->migrator->finish_export_inode(destdnl->get_inode(),
-                                            mdr->get_mds_stamp(),
+      mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
                                             mdr->slave_to_mds, peer_imported, finished);
       mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.
 
@@ -8087,7 +8130,7 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef
     } else
       pip = in->get_projected_inode();
     if (pip->ctime == rollback.ctime)
-      pip->ctime = rollback.orig_src.old_ctime;
+      pip->ctime = pip->rstat.rctime = rollback.orig_src.old_ctime;
   }
 
   if (srcdn && srcdn->authority().first == whoami) {
@@ -8124,7 +8167,7 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef
     } else 
       ti = target->get_projected_inode();
     if (ti->ctime == rollback.ctime)
-      ti->ctime = rollback.orig_dest.old_ctime;
+      ti->ctime = ti->rstat.rctime = rollback.orig_dest.old_ctime;
     if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
       if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
        assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
@@ -8578,7 +8621,7 @@ void Server::handle_client_mksnap(MDRequestRef& mdr)
   info.stamp = mdr->get_op_stamp();
 
   auto &pi = diri->project_inode(false, true);
-  pi.inode.ctime = info.stamp;
+  pi.inode.ctime = pi.inode.rstat.rctime = info.stamp;
   pi.inode.version = diri->pre_dirty();
 
   // project the snaprealm
@@ -8710,7 +8753,7 @@ void Server::handle_client_rmsnap(MDRequestRef& mdr)
   // journal
   auto &pi = diri->project_inode(false, true);
   pi.inode.version = diri->pre_dirty();
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   
   mdr->ls = mdlog->get_current_segment();
   EUpdate *le = new EUpdate(mdlog, "rmsnap");
@@ -8852,7 +8895,7 @@ void Server::handle_client_renamesnap(MDRequestRef& mdr)
 
   // journal
   auto &pi = diri->project_inode(false, true);
-  pi.inode.ctime = mdr->get_op_stamp();
+  pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
   pi.inode.version = diri->pre_dirty();
 
   // project the snaprealm
index 25c6b90d22302322f80f1c7e1e9b89d481c9d059..c169ea62a3c0ec0bcbc8c813a1f474d0658afb45 100644 (file)
@@ -116,9 +116,8 @@ public:
   void _session_logged(Session *session, uint64_t state_seq, 
                       bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
   version_t prepare_force_open_sessions(map<client_t,entity_inst_t> &cm,
-                                       map<client_t,uint64_t>& sseqmap);
-  void finish_force_open_sessions(map<client_t,entity_inst_t> &cm,
-                                 map<client_t,uint64_t>& sseqmap,
+                                       map<client_t,pair<Session*,uint64_t> >& smap);
+  void finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                  bool dec_import=true);
   void flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather);
   void finish_flush_session(Session *session, version_t seq);
index 50ffde9cc04f3477dacca53705f2c890307b001c..b9714eeb01044a2570bc9330489cf394f74b78a4 100644 (file)
@@ -330,7 +330,11 @@ public:
     num_trim_flushes_warnings(0),
     num_trim_requests_warnings(0) { }
   ~Session() override {
-    assert(!item_session_list.is_on_list());
+    if (state == STATE_CLOSED) {
+      item_session_list.remove_myself();
+    } else {
+      assert(!item_session_list.is_on_list());
+    }
     while (!preopen_out_queue.empty()) {
       preopen_out_queue.front()->put();
       preopen_out_queue.pop_front();
@@ -542,13 +546,6 @@ public:
 
   void dump();
 
-  void get_client_set(set<client_t>& s) {
-    for (ceph::unordered_map<entity_name_t,Session*>::iterator p = session_map.begin();
-        p != session_map.end();
-        ++p)
-      if (p->second->info.inst.name.is_client())
-       s.insert(p->second->info.inst.name.num());
-  }
   void get_client_session_set(set<Session*>& s) const {
     for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
         p != session_map.end();
@@ -557,13 +554,13 @@ public:
        s.insert(p->second);
   }
 
-  void open_sessions(map<client_t,entity_inst_t>& client_map) {
+  void replay_open_sessions(map<client_t,entity_inst_t>& client_map) {
     for (map<client_t,entity_inst_t>::iterator p = client_map.begin(); 
         p != client_map.end(); 
         ++p) {
       Session *s = get_or_add_session(p->second);
       set_state(s, Session::STATE_OPEN);
-      version++;
+      replay_dirty_session(s);
     }
   }
 
index decb7d47b7411a4a835878940c60dcf3e7be1fc0..f86d0c2586a33f7dd30d0d4835e9b665358d7ea3 100644 (file)
@@ -1349,7 +1349,7 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
       if (p->is_dirty())
        in->_mark_dirty(logseg);
       if (p->is_dirty_parent())
-       in->_mark_dirty_parent(logseg, p->is_dirty_pool());
+       in->mark_dirty_parent(logseg, p->is_dirty_pool());
       if (p->need_snapflush())
        logseg->open_files.push_back(&in->item_open_file);
       if (dn->is_auth())
@@ -1857,9 +1857,8 @@ void ESessions::replay(MDSRank *mds)
   } else {
     dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
             << " < " << cmapv << dendl;
-    mds->sessionmap.open_sessions(client_map);
+    mds->sessionmap.replay_open_sessions(client_map);
     assert(mds->sessionmap.get_version() == cmapv);
-    mds->sessionmap.set_projected(mds->sessionmap.get_version());
   }
   update_segment();
 }
@@ -2132,10 +2131,8 @@ void EUpdate::replay(MDSRank *mds)
       map<client_t,entity_inst_t> cm;
       bufferlist::iterator blp = client_map.begin();
       ::decode(cm, blp);
-      mds->sessionmap.open_sessions(cm);
-
+      mds->sessionmap.replay_open_sessions(cm);
       assert(mds->sessionmap.get_version() == cmapv);
-      mds->sessionmap.set_projected(mds->sessionmap.get_version());
     }
   }
   update_segment();
@@ -2960,7 +2957,7 @@ void EImportStart::replay(MDSRank *mds)
     map<client_t,entity_inst_t> cm;
     bufferlist::iterator blp = client_map.begin();
     ::decode(cm, blp);
-    mds->sessionmap.open_sessions(cm);
+    mds->sessionmap.replay_open_sessions(cm);
     if (mds->sessionmap.get_version() != cmapv)
     {
       derr << "sessionmap version " << mds->sessionmap.get_version()
@@ -2969,7 +2966,6 @@ void EImportStart::replay(MDSRank *mds)
       mds->damaged();
       ceph_abort();  // Should be unreachable because damaged() calls respawn()
     }
-    mds->sessionmap.set_projected(mds->sessionmap.get_version());
   }
   update_segment();
 }
index 84fff861f444cfdd55577bdf5dafc81479d669fb..8ca1ced1bdf5aba2fd0e8c8954fd545285ebb501 100644 (file)
@@ -639,6 +639,16 @@ void dirfrag_load_vec_t::dump(Formatter *f) const
   f->close_section();
 }
 
+void dirfrag_load_vec_t::dump(Formatter *f, utime_t now, const DecayRate& rate)
+{
+  f->dump_float("meta_load", meta_load(now, rate));
+  f->dump_float("IRD", get(META_POP_IRD).get(now, rate));
+  f->dump_float("IWR", get(META_POP_IWR).get(now, rate));
+  f->dump_float("READDIR", get(META_POP_READDIR).get(now, rate));
+  f->dump_float("FETCH", get(META_POP_FETCH).get(now, rate));
+  f->dump_float("STORE", get(META_POP_STORE).get(now, rate));
+}
+
 void dirfrag_load_vec_t::generate_test_instances(list<dirfrag_load_vec_t*>& ls)
 {
   utime_t sample;
index 0ede619d54337b4874363190919cc5bc24215a39..cd9164927ccd8d380573526098277180bd75c851 100644 (file)
@@ -1466,6 +1466,7 @@ public:
     decode(sample, p);
   }
   void dump(Formatter *f) const;
+  void dump(Formatter *f, utime_t now, const DecayRate& rate);
   static void generate_test_instances(list<dirfrag_load_vec_t*>& ls);
 
   DecayCounter &get(int t) { 
@@ -1490,7 +1491,7 @@ public:
       2*vec[META_POP_FETCH].get(now, rate) +
       4*vec[META_POP_STORE].get(now, rate);
   }
-  double meta_load() {
+  double meta_load() const {
     return 
       1*vec[META_POP_IRD].get_last() + 
       2*vec[META_POP_IWR].get_last() +
@@ -1523,14 +1524,10 @@ inline void decode(dirfrag_load_vec_t& c, bufferlist::iterator &p) {
   c.decode(sample, p);
 }
 
-inline std::ostream& operator<<(std::ostream& out, dirfrag_load_vec_t& dl)
+inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
 {
-  // ugliness!
-  utime_t now = ceph_clock_now();
-  DecayRate rate(g_conf->mds_decay_halflife);
-  return out << "[" << dl.vec[0].get(now, rate) << "," << dl.vec[1].get(now, rate) 
-            << " " << dl.meta_load(now, rate)
-            << "]";
+  return out << "[" << dl.vec[0].get_last() << "," << dl.vec[1].get_last()
+            << " " << dl.meta_load() << "]";
 }
 
 
@@ -1574,7 +1571,7 @@ inline void decode(mds_load_t &c, bufferlist::iterator &p) {
   c.decode(sample, p);
 }
 
-inline std::ostream& operator<<( std::ostream& out, mds_load_t& load )
+inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
 {
   return out << "mdsload<" << load.auth << "/" << load.all
              << ", req " << load.req_rate 
index e973b63413818eadcf5e0f55557841a077a43f8c..01f5746c4394e841e9ec152705fc665a91b8b462 100644 (file)
 #define        CLIENT_CAPS_SYNC                (0x1)
 
 class MClientCaps : public Message {
-  static const int HEAD_VERSION = 10;
+  static const int HEAD_VERSION = 11;
   static const int COMPAT_VERSION = 1;
 
  public:
   struct ceph_mds_caps_head head;
 
-  uint64_t size, max_size, truncate_size, change_attr;
-  uint32_t truncate_seq;
+  uint64_t size = 0;
+  uint64_t max_size = 0;
+  uint64_t truncate_size = 0;
+  uint64_t change_attr = 0;
+  uint32_t truncate_seq = 0;
   utime_t mtime, atime, ctime, btime;
-  uint32_t time_warp_seq;
+  uint32_t time_warp_seq = 0;
+  int64_t nfiles = -1;         // files in dir
+  int64_t nsubdirs = -1;       // subdirs in dir
 
   struct ceph_mds_cap_peer peer;
 
   bufferlist snapbl;
   bufferlist xattrbl;
   bufferlist flockbl;
-  version_t  inline_version;
+  version_t  inline_version = 0;
   bufferlist inline_data;
 
   // Receivers may not use their new caps until they have this OSD map
-  epoch_t osd_epoch_barrier;
-  ceph_tid_t oldest_flush_tid;
-  uint32_t caller_uid;
-  uint32_t caller_gid;
+  epoch_t osd_epoch_barrier = 0;
+  ceph_tid_t oldest_flush_tid = 0;
+  uint32_t caller_uid = 0;
+  uint32_t caller_gid = 0;
 
   /* advisory CLIENT_CAPS_* flags to send to mds */
-  unsigned flags;
+  unsigned flags = 0;
 
   int      get_caps() { return head.caps; }
   int      get_wanted() { return head.wanted; }
@@ -70,6 +75,9 @@ class MClientCaps : public Message {
   utime_t get_atime() { return atime; }
   __u64 get_change_attr() { return change_attr; }
   __u32 get_time_warp_seq() { return time_warp_seq; }
+  uint64_t get_nfiles() { return nfiles; }
+  uint64_t get_nsubdirs() { return nsubdirs; }
+  bool dirstat_is_valid() const { return nfiles != -1 || nsubdirs != -1; }
 
   const file_layout_t& get_layout() {
     return layout;
@@ -115,19 +123,7 @@ class MClientCaps : public Message {
   void clear_dirty() { head.dirty = 0; }
 
   MClientCaps()
-    : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      size(0),
-      max_size(0),
-      truncate_size(0),
-      change_attr(0),
-      truncate_seq(0),
-      time_warp_seq(0),
-      osd_epoch_barrier(0),
-      oldest_flush_tid(0),
-      caller_uid(0), caller_gid(0),
-      flags(0) {
-    inline_version = 0;
-  }
+    : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION) {}
   MClientCaps(int op,
              inodeno_t ino,
              inodeno_t realm,
@@ -139,16 +135,7 @@ class MClientCaps : public Message {
              int mseq,
               epoch_t oeb)
     : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      size(0),
-      max_size(0),
-      truncate_size(0),
-      change_attr(0),
-      truncate_seq(0),
-      time_warp_seq(0),
-      osd_epoch_barrier(oeb),
-      oldest_flush_tid(0),
-      caller_uid(0), caller_gid(0),
-      flags(0) {
+      osd_epoch_barrier(oeb) {
     memset(&head, 0, sizeof(head));
     head.op = op;
     head.ino = ino;
@@ -160,22 +147,12 @@ class MClientCaps : public Message {
     head.dirty = dirty;
     head.migrate_seq = mseq;
     memset(&peer, 0, sizeof(peer));
-    inline_version = 0;
   }
   MClientCaps(int op,
              inodeno_t ino, inodeno_t realm,
              uint64_t id, int mseq, epoch_t oeb)
     : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      size(0),
-      max_size(0),
-      truncate_size(0),
-      change_attr(0),
-      truncate_seq(0),
-      time_warp_seq(0),
-      osd_epoch_barrier(oeb),
-      oldest_flush_tid(0),
-      caller_uid(0), caller_gid(0),
-      flags(0) {
+      osd_epoch_barrier(oeb) {
     memset(&head, 0, sizeof(head));
     head.op = op;
     head.ino = ino;
@@ -183,7 +160,6 @@ class MClientCaps : public Message {
     head.cap_id = id;
     head.migrate_seq = mseq;
     memset(&peer, 0, sizeof(peer));
-    inline_version = 0;
   }
 private:
   file_layout_t layout;
@@ -279,6 +255,10 @@ public:
     if (header.version >= 10) {
       ::decode(flags, p);
     }
+    if (header.version >= 11) {
+      decode(nfiles, p);
+      decode(nsubdirs, p);
+    }
   }
   void encode_payload(uint64_t features) override {
     header.version = HEAD_VERSION;
@@ -339,6 +319,8 @@ public:
     ::encode(btime, payload);
     ::encode(change_attr, payload);
     ::encode(flags, payload);
+    ::encode(nfiles, payload);
+    ::encode(nsubdirs, payload);
   }
 };
 
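(MClientCaps bumps HEAD_VERSION to 11 and appends nfiles/nsubdirs at the tail of the encoding -- the standard append-only versioning pattern: new fields are written unconditionally but read only when the sender's header version says they exist, so older peers interoperate untouched. A minimal model of the decode side; simplified buffer type, little-endian and no bounds checking assumed, not the real bufferlist API:)

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct Decoder {
      const std::vector<uint8_t>& buf;
      size_t off = 0;
      int64_t get_s64() {           // illustration only: no bounds check
        int64_t v;
        std::memcpy(&v, buf.data() + off, sizeof(v));
        off += sizeof(v);
        return v;
      }
    };

    void decode_dirstat(Decoder& p, unsigned header_version,
                        int64_t& nfiles, int64_t& nsubdirs) {
      nfiles = nsubdirs = -1;        // -1 == "dirstat not provided"
      if (header_version >= 11) {    // gate on what the sender encoded
        nfiles = p.get_s64();
        nsubdirs = p.get_s64();
      }
    }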
index 6cf4a0e37e1cda369e5832213c29bc8a176607f0..ad0d8f8e4818a915b3d3986a10a1ca587d12a7b8 100644 (file)
@@ -21,7 +21,7 @@
 
 class MHeartbeat : public Message {
   mds_load_t load;
-  __s32        beat;
+  __s32        beat = 0;
   map<mds_rank_t, float> import_map;
 
  public:
index 92b4e1cafd9cc60e1b834b9527e5692e1bb3300f..4fd9cc797dd4e75572a46942cc9ff8cebb6731a8 100644 (file)
@@ -34,7 +34,7 @@ public:
 
   MMDSMap() : 
     Message(CEPH_MSG_MDS_MAP, HEAD_VERSION, COMPAT_VERSION) {}
-  MMDSMap(const uuid_d &f, MDSMap *mm) :
+  MMDSMap(const uuid_d &f, const MDSMap *mm) :
     Message(CEPH_MSG_MDS_MAP, HEAD_VERSION, COMPAT_VERSION),
     fsid(f) {
     epoch = mm->get_epoch();
index 865642cf417c30f487fb3a698946a20e86f8075f..e77b4856cb9f478088794b16dddd2cdda4909e22 100644 (file)
@@ -26,6 +26,7 @@ class MOSDMap : public Message {
 
  public:
   uuid_d fsid;
+  uint64_t encode_features = 0;
   map<epoch_t, bufferlist> maps;
   map<epoch_t, bufferlist> incremental_maps;
   epoch_t oldest_map =0, newest_map = 0;
@@ -57,13 +58,12 @@ class MOSDMap : public Message {
 
 
   MOSDMap() : Message(CEPH_MSG_OSD_MAP, HEAD_VERSION) { }
-  MOSDMap(const uuid_d &f)
+  MOSDMap(const uuid_d &f, const uint64_t features)
     : Message(CEPH_MSG_OSD_MAP, HEAD_VERSION),
-      fsid(f),
+      fsid(f), encode_features(features),
       oldest_map(0), newest_map(0) { }
 private:
   ~MOSDMap() override {}
-
 public:
   // marshalling
   void decode_payload() override {
@@ -82,12 +82,8 @@ public:
   void encode_payload(uint64_t features) override {
     header.version = HEAD_VERSION;
     ::encode(fsid, payload);
-    if ((features & CEPH_FEATURE_PGID64) == 0 ||
-       (features & CEPH_FEATURE_PGPOOL3) == 0 ||
-       (features & CEPH_FEATURE_OSDENC) == 0 ||
-        (features & CEPH_FEATURE_OSDMAP_ENC) == 0 ||
-       (features & CEPH_FEATURE_MSG_ADDR2) == 0 ||
-       !HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+    if (OSDMap::get_significant_features(encode_features) !=
+         OSDMap::get_significant_features(features)) {
       if ((features & CEPH_FEATURE_PGID64) == 0 ||
          (features & CEPH_FEATURE_PGPOOL3) == 0)
        header.version = 1;  // old old_client version
@@ -105,13 +101,15 @@ public:
        OSDMap::Incremental inc;
        bufferlist::iterator q = p->second.begin();
        inc.decode(q);
+       // always encode with a subset of the osdmap's canonical features
+       uint64_t f = inc.encode_features & features;
        p->second.clear();
        if (inc.fullmap.length()) {
          // embedded full map?
          OSDMap m;
          m.decode(inc.fullmap);
          inc.fullmap.clear();
-         m.encode(inc.fullmap, features | CEPH_FEATURE_RESERVED);
+         m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
        }
        if (inc.crush.length()) {
          // embedded crush map
@@ -119,17 +117,19 @@ public:
          auto p = inc.crush.begin();
          c.decode(p);
          inc.crush.clear();
-         c.encode(inc.crush, features);
+         c.encode(inc.crush, f);
        }
-       inc.encode(p->second, features | CEPH_FEATURE_RESERVED);
+       inc.encode(p->second, f | CEPH_FEATURE_RESERVED);
       }
       for (map<epoch_t,bufferlist>::iterator p = maps.begin();
           p != maps.end();
           ++p) {
        OSDMap m;
        m.decode(p->second);
+       // always encode with a subset of the osdmap's canonical features
+       uint64_t f = m.get_encoding_features() & features;
        p->second.clear();
-       m.encode(p->second, features | CEPH_FEATURE_RESERVED);
+       m.encode(p->second, f | CEPH_FEATURE_RESERVED);
       }
     }
     ::encode(incremental_maps, payload);
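Note: the MOSDMap change above threads an encode_features member through the message so that full and incremental maps are only re-encoded when the significant feature sets differ, and then with the intersection of the map's canonical encode features and what the peer supports. A tiny sketch of that intersection rule; the feature-bit values are hypothetical, not Ceph's CEPH_FEATURE_* constants.

#include <cstdint>
#include <iostream>

constexpr uint64_t FEAT_PGID64   = 1ull << 0;  // hypothetical bit assignments
constexpr uint64_t FEAT_LUMINOUS = 1ull << 1;
constexpr uint64_t FEAT_ADDR2    = 1ull << 2;

// Never encode with a feature the peer lacks, and never exceed the feature
// set the map was canonically encoded with.
uint64_t choose_encode_features(uint64_t canonical, uint64_t peer) {
  return canonical & peer;
}

int main() {
  uint64_t canonical = FEAT_PGID64 | FEAT_LUMINOUS | FEAT_ADDR2;
  uint64_t peer      = FEAT_PGID64 | FEAT_LUMINOUS;  // an older client
  std::cout << std::hex << "0x" << choose_encode_features(canonical, peer) << "\n";  // 0x3
}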
index ce800aac068b3543fb981a595d0b27578912affd..f4f26ee1f2d45860ebd97a9d5211e145add12f90 100644 (file)
@@ -24,7 +24,7 @@
 
 struct MOSDRepScrub : public MOSDFastDispatchOp {
 
-  static const int HEAD_VERSION = 7;
+  static const int HEAD_VERSION = 9;
   static const int COMPAT_VERSION = 6;
 
   spg_t pgid;             // PG to scrub
@@ -35,7 +35,9 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
   hobject_t start;       // lower bound of scrub, inclusive
   hobject_t end;         // upper bound of scrub, exclusive
   bool deep;             // true if scrub should be deep
-  uint32_t seed;         // seed value for digest calculation
+  bool allow_preemption = false;
+  int32_t priority = 0;
+  bool high_priority = false;
 
   epoch_t get_map_epoch() const override {
     return map_epoch;
@@ -50,11 +52,11 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
   MOSDRepScrub()
     : MOSDFastDispatchOp(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION),
       chunky(false),
-      deep(false),
-      seed(0) { }
+      deep(false) { }
 
   MOSDRepScrub(spg_t pgid, eversion_t scrub_to, epoch_t map_epoch, epoch_t min_epoch,
-               hobject_t start, hobject_t end, bool deep, uint32_t seed)
+               hobject_t start, hobject_t end, bool deep,
+              bool preemption, int prio, bool highprio)
     : MOSDFastDispatchOp(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION),
       pgid(pgid),
       scrub_to(scrub_to),
@@ -64,7 +66,9 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
       start(start),
       end(end),
       deep(deep),
-      seed(seed) { }
+      allow_preemption(preemption),
+      priority(prio),
+      high_priority(highprio) { }
 
 
 private:
@@ -73,15 +77,18 @@ private:
 public:
   const char *get_type_name() const override { return "replica scrub"; }
   void print(ostream& out) const override {
-    out << "replica scrub(pg: ";
-    out << pgid << ",from:" << scrub_from << ",to:" << scrub_to
+    out << "replica_scrub(pg: "        << pgid
+       << ",from:" << scrub_from
+       << ",to:" << scrub_to
         << ",epoch:" << map_epoch << "/" << min_epoch
        << ",start:" << start << ",end:" << end
         << ",chunky:" << chunky
         << ",deep:" << deep
-       << ",seed:" << seed
-        << ",version:" << header.version;
-    out << ")";
+        << ",version:" << header.version
+       << ",allow_preemption:" << (int)allow_preemption
+       << ",priority=" << priority
+       << (high_priority ? " (high)":"")
+       << ")";
   }
 
   void encode_payload(uint64_t features) override {
@@ -94,8 +101,11 @@ public:
     ::encode(end, payload);
     ::encode(deep, payload);
     ::encode(pgid.shard, payload);
-    ::encode(seed, payload);
+    ::encode((uint32_t)-1, payload); // seed
     ::encode(min_epoch, payload);
+    ::encode(allow_preemption, payload);
+    ::encode(priority, payload);
+    ::encode(high_priority, payload);
   }
   void decode_payload() override {
     bufferlist::iterator p = payload.begin();
@@ -108,12 +118,22 @@ public:
     ::decode(end, p);
     ::decode(deep, p);
     ::decode(pgid.shard, p);
-    ::decode(seed, p);
+    {
+      uint32_t seed;
+      ::decode(seed, p);
+    }
     if (header.version >= 7) {
       ::decode(min_epoch, p);
     } else {
       min_epoch = map_epoch;
     }
+    if (header.version >= 8) {
+      ::decode(allow_preemption, p);
+    }
+    if (header.version >= 9) {
+      ::decode(priority, p);
+      ::decode(high_priority, p);
+    }
   }
 };
 
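Note: MOSDRepScrub above retires the seed field while bumping HEAD_VERSION from 7 to 9 and keeping COMPAT_VERSION 6: the encoder still writes a placeholder (uint32_t)-1 where seed used to sit so older decoders' wire offsets stay valid, the decoder reads the value into a scoped temporary and drops it, and the preemption/priority fields are appended behind version checks. A self-contained sketch of the retire-a-field pattern, with stand-in buffer helpers:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

template <typename T> void put(std::vector<uint8_t> &b, const T &v) {
  const uint8_t *q = reinterpret_cast<const uint8_t *>(&v);
  b.insert(b.end(), q, q + sizeof(T));
}
template <typename T> void get(const uint8_t *&p, T &v) {
  std::memcpy(&v, p, sizeof(T));
  p += sizeof(T);
}

int main() {
  std::vector<uint8_t> wire;
  put(wire, true);          // deep
  put(wire, (uint32_t)-1);  // placeholder where `seed` used to live
  put(wire, (int32_t)5);    // priority, appended at a later version

  const uint8_t *p = wire.data();
  bool deep; get(p, deep);
  {
    uint32_t seed;          // decoded into a scoped temporary and discarded,
    get(p, seed);           // exactly like the block in the diff
    (void)seed;
  }
  int32_t priority; get(p, priority);
  std::cout << deep << " " << priority << "\n";  // prints: 1 5
}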
index f17bb0c2cf30f1da0cc73ffeb24977a0ec8d45a5..a3835914b45f45867b7545058ce4fb6d6b4f65ff 100644 (file)
 
 struct MOSDRepScrubMap : public MOSDFastDispatchOp {
 
-  static const int HEAD_VERSION = 1;
+  static const int HEAD_VERSION = 2;
   static const int COMPAT_VERSION = 1;
 
   spg_t pgid;            // primary spg_t
   epoch_t map_epoch = 0;
   pg_shard_t from;   // whose scrubmap this is
   bufferlist scrub_map_bl;
+  bool preempted = false;
 
   epoch_t get_map_epoch() const override {
     return map_epoch;
@@ -54,19 +55,24 @@ public:
   const char *get_type_name() const { return "rep_scrubmap"; }
   void print(ostream& out) const {
     out << "rep_scrubmap(" << pgid << " e" << map_epoch
-       << " from shard " << from << ")";
+       << " from shard " << from
+       << (preempted ? " PREEMPTED":"") << ")";
   }
 
   void encode_payload(uint64_t features) {
     ::encode(pgid, payload);
     ::encode(map_epoch, payload);
     ::encode(from, payload);
+    ::encode(preempted, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
     ::decode(pgid, p);
     ::decode(map_epoch, p);
     ::decode(from, p);
+    if (header.version >= 2) {
+      ::decode(preempted, p);
+    }
   }
 };
 
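Note: MOSDRepScrubMap grows a version-guarded preempted flag (HEAD_VERSION 2, decoded only when header.version >= 2), closing the loop with MOSDRepScrub's allow_preemption above: the primary offers preemption, the replica reports back whether its map build was preempted. A compact sketch of that round-trip decision; the state around it is an assumption.

#include <iostream>

struct RepScrub    { bool allow_preemption; };  // primary -> replica
struct RepScrubMap { bool preempted; };         // replica -> primary

// Replica side: only honor a preemption request if the primary allowed it.
RepScrubMap build_map(const RepScrub &req, bool client_io_waiting) {
  return RepScrubMap{req.allow_preemption && client_io_waiting};
}

int main() {
  RepScrubMap r = build_map(RepScrub{true}, true);
  std::cout << (r.preempted ? "rescheduling chunk" : "map complete") << "\n";
}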
index 82f845091cf92c3a5717fe7c0e5168e1bbe2f390..0694d23082e4f511a6a0f782352a1582ea58fec2 100644 (file)
@@ -260,6 +260,9 @@ PyObject *ActivePyModules::get_python(const std::string &what)
             f.dump_int(i.first.c_str(), i.second);
           }
           f.close_section();
+          f.open_object_section("pg_stats_sum");
+          pg_map.pg_sum.dump(&f);
+          f.close_section();
         }
     );
     return f.get();
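Note: the get_python hunk above adds a pg_stats_sum object to the mgr's Python-visible dump using Ceph's Formatter section protocol (open_object_section / dump / close_section). A toy writer showing that protocol's shape; this is not Ceph's Formatter, just an illustration of the pairing discipline.

#include <iostream>
#include <string>

struct ToyFormatter {
  std::string out;
  void open_object_section(const std::string &n) { out += "\"" + n + "\": {"; }
  void dump_int(const std::string &k, long v) {
    out += "\"" + k + "\": " + std::to_string(v) + ",";
  }
  void close_section() {  // every open_*_section needs a matching close
    if (!out.empty() && out.back() == ',') out.pop_back();
    out += "},";
  }
};

int main() {
  ToyFormatter f;
  f.open_object_section("pg_stats_sum");  // mirrors the added hunk
  f.dump_int("num_objects", 128);         // pg_sum.dump(&f) stands in for fields like this
  f.close_section();
  std::cout << f.out << "\n";  // "pg_stats_sum": {"num_objects": 128},
}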
@@ -555,13 +558,24 @@ PyObject* ActivePyModules::get_counter_python(
     Mutex::Locker l2(metadata->lock);
     if (metadata->perf_counters.instances.count(path)) {
       auto counter_instance = metadata->perf_counters.instances.at(path);
-      const auto &data = counter_instance.get_data();
-      for (const auto &datapoint : data) {
-        f.open_array_section("datapoint");
-        f.dump_unsigned("t", datapoint.t.sec());
-        f.dump_unsigned("v", datapoint.v);
-        f.close_section();
-
+      auto counter_type = metadata->perf_counters.types.at(path);
+      if (counter_type.type & PERFCOUNTER_LONGRUNAVG) {
+        const auto &avg_data = counter_instance.get_data_avg();
+        for (const auto &datapoint : avg_data) {
+          f.open_array_section("datapoint");
+          f.dump_unsigned("t", datapoint.t.sec());
+          f.dump_unsigned("s", datapoint.s);
+          f.dump_unsigned("c", datapoint.c);
+          f.close_section();
+        }
+      } else {
+        const auto &data = counter_instance.get_data();
+        for (const auto &datapoint : data) {
+          f.open_array_section("datapoint");
+          f.dump_unsigned("t", datapoint.t.sec());
+          f.dump_unsigned("v", datapoint.v);
+          f.close_section();
+        }
       }
     } else {
       dout(4) << "Missing counter: '" << path << "' ("
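Note: get_counter_python now distinguishes long-running-average counters, whose datapoints carry a (sum, count) pair per timestamp instead of a single value, so consumers can compute true interval averages. A sketch of that computation over the exported shape; the struct mirrors AvgDataPoint from the diff, while the math is an assumption about intended use.

#include <cstdint>
#include <iostream>
#include <vector>

struct AvgDataPoint { uint64_t t, s, c; };  // timestamp, running sum, running count

// Average over the interval between two samples, e.g. mean op latency.
double interval_avg(const AvgDataPoint &a, const AvgDataPoint &b) {
  uint64_t dc = b.c - a.c;
  return dc ? double(b.s - a.s) / double(dc) : 0.0;
}

int main() {
  std::vector<AvgDataPoint> buf{{100, 5000, 10}, {105, 9000, 18}};
  std::cout << interval_avg(buf[0], buf[1]) << "\n";  // (9000-5000)/(18-10) = 500
}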
index 29f380ca5a6b5d398ca6d613c27062a939f6a65e..013b5fec622ad25e6cc97200979afa96ea732a7f 100644 (file)
@@ -141,13 +141,15 @@ entity_addr_t DaemonServer::get_myaddr() const
 }
 
 
-bool DaemonServer::ms_verify_authorizer(Connection *con,
-    int peer_type,
-    int protocol,
-    ceph::bufferlist& authorizer_data,
-    ceph::bufferlist& authorizer_reply,
-    bool& is_valid,
-    CryptoKey& session_key)
+bool DaemonServer::ms_verify_authorizer(
+  Connection *con,
+  int peer_type,
+  int protocol,
+  ceph::bufferlist& authorizer_data,
+  ceph::bufferlist& authorizer_reply,
+  bool& is_valid,
+  CryptoKey& session_key,
+  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
   AuthAuthorizeHandler *handler = nullptr;
   if (peer_type == CEPH_ENTITY_TYPE_OSD ||
@@ -175,7 +177,9 @@ bool DaemonServer::ms_verify_authorizer(Connection *con,
       authorizer_data,
       authorizer_reply, s->entity_name,
       s->global_id, caps_info,
-      session_key);
+      session_key,
+      nullptr,
+      challenge);
   } else {
     dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
     is_valid = false;
index 1dcc24b2778683c2a2021ff10bcedb294a932bd5..3b11843425a0d6579929040aaed92e2d1c7c0122 100644 (file)
@@ -123,13 +123,15 @@ public:
   bool ms_handle_refused(Connection *con) override;
   bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer,
                          bool force_new) override;
-  bool ms_verify_authorizer(Connection *con,
-      int peer_type,
-      int protocol,
-      ceph::bufferlist& authorizer,
-      ceph::bufferlist& authorizer_reply,
-      bool& isvalid,
-      CryptoKey& session_key) override;
+  bool ms_verify_authorizer(
+    Connection *con,
+    int peer_type,
+    int protocol,
+    ceph::bufferlist& authorizer,
+    ceph::bufferlist& authorizer_reply,
+    bool& isvalid,
+    CryptoKey& session_key,
+    std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
 
   bool handle_open(MMgrOpen *m);
   bool handle_report(MMgrReport *m);
index a7b8f572e1614b3f0a0b420f7685ccdeccb5ecaa..dc6726739d3657f800fac5c0aa5ad3d7d58e37e7 100644 (file)
@@ -133,6 +133,8 @@ void DaemonPerfCounters::update(MMgrReport *report)
   for (const auto &t : report->declare_types) {
     types.insert(std::make_pair(t.path, t));
     session->declared_types.insert(t.path);
+    instances.insert(std::pair<std::string, PerfCounterInstance>(
+                     t.path, PerfCounterInstance(t.type)));
   }
   // Remove any old types
   for (const auto &t : report->undeclare_types) {
@@ -154,9 +156,10 @@ void DaemonPerfCounters::update(MMgrReport *report)
     if (t.type & PERFCOUNTER_LONGRUNAVG) {
       ::decode(avgcount, p);
       ::decode(avgcount2, p);
+      instances.at(t_path).push_avg(now, val, avgcount);
+    } else {
+      instances.at(t_path).push(now, val);
     }
-    // TODO: interface for insertion of avgs
-    instances[t_path].push(now, val);
   }
   DECODE_FINISH(p);
 }
@@ -171,3 +174,8 @@ void PerfCounterInstance::push(utime_t t, uint64_t const &v)
   buffer.push_back({t, v});
 }
 
+void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s,
+                                   uint64_t const &c)
+{
+  avg_buffer.push_back({t, s, c});
+}
index 9a12b1187c6aa8977259bfca7574b1c9eaf7cbae..7dad81593f654248bf380b43e230edad1669ab72 100644 (file)
@@ -45,7 +45,20 @@ class PerfCounterInstance
     {}
   };
 
+  class AvgDataPoint
+  {
+    public:
+    utime_t t;
+    uint64_t s;
+    uint64_t c;
+    AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_)
+      : t(t_), s(s_), c(c_)
+    {}
+  };
+
   boost::circular_buffer<DataPoint> buffer;
+  boost::circular_buffer<AvgDataPoint> avg_buffer;
+
   uint64_t get_current() const;
 
   public:
@@ -53,9 +66,20 @@ class PerfCounterInstance
   {
     return buffer;
   }
+  const boost::circular_buffer<AvgDataPoint> & get_data_avg() const
+  {
+    return avg_buffer;
+  }
   void push(utime_t t, uint64_t const &v);
-  PerfCounterInstance()
-    : buffer(20) {}
+  void push_avg(utime_t t, uint64_t const &s, uint64_t const &c);
+
+  PerfCounterInstance(enum perfcounter_type_d type)
+  {
+    if (type & PERFCOUNTER_LONGRUNAVG)
+      avg_buffer = boost::circular_buffer<AvgDataPoint>(20);
+    else
+      buffer = boost::circular_buffer<DataPoint>(20);
+  };
 };
 
 
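Note: PerfCounterInstance now takes the counter type in its constructor and allocates only the circular buffer that type needs, so plain counters pay nothing for the new avg_buffer and vice versa. A compressed sketch (requires Boost headers; the PERFCOUNTER_LONGRUNAVG bit value is assumed, and the element types are collapsed to one word):

#include <boost/circular_buffer.hpp>
#include <cstdint>
#include <iostream>

enum { PERFCOUNTER_LONGRUNAVG = 0x4 };  // assumed flag value

struct Instance {
  boost::circular_buffer<uint64_t> buffer;      // plain datapoints
  boost::circular_buffer<uint64_t> avg_buffer;  // (sum, count) datapoints in the real class
  explicit Instance(int type) {
    if (type & PERFCOUNTER_LONGRUNAVG)
      avg_buffer = boost::circular_buffer<uint64_t>(20);  // capacity only where used
    else
      buffer = boost::circular_buffer<uint64_t>(20);
  }
};

int main() {
  Instance plain(0), avg(PERFCOUNTER_LONGRUNAVG);
  std::cout << plain.buffer.capacity() << " " << avg.buffer.capacity() << "\n";  // 20 0
}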
index 1f13145a6d929678b9f7d44cae453586f9a8d8d7..965958b1220ecc45f6df3dcf62117fe9140ef812 100644 (file)
@@ -417,6 +417,29 @@ bool AuthMonitor::prep_auth(MonOpRequestRef op, bool paxos_writable)
          supported.erase(CEPH_AUTH_CEPHX);
        }
       }
+    } else if (!m->get_connection()->has_feature(CEPH_FEATURE_CEPHX_V2)) {
+      if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+         entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+         entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+         entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+       if (g_conf->cephx_cluster_require_version >= 2 ||
+           g_conf->cephx_require_version >= 2) {
+         dout(1) << m->get_source_inst()
+                  << " supports cephx but not v2 and"
+                  << " 'cephx [cluster] require version >= 2';"
+                  << " disallowing cephx" << dendl;
+         supported.erase(CEPH_AUTH_CEPHX);
+       }
+      } else {
+       if (g_conf->cephx_service_require_version >= 2 ||
+           g_conf->cephx_require_version >= 2) {
+         dout(1) << m->get_source_inst()
+                  << " supports cephx but not v2 and"
+                  << " 'cephx [service] require version >= 2';"
+                  << " disallowing cephx" << dendl;
+         supported.erase(CEPH_AUTH_CEPHX);
+       }
+      }
     }
 
     int type;
@@ -1325,8 +1348,8 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op)
       for (const auto &sys_cap : wanted_caps) {
        if (entity_auth.caps.count(sys_cap.first) == 0 ||
            !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) {
-         ss << "key for " << entity << " exists but cap " << sys_cap.first
-            << " does not match";
+         ss << entity << " already has fs capabilities that differ from those supplied. To generate a new auth key for "
+            << entity << ", first remove " << entity << " from configuration files, execute 'ceph auth rm " << entity << "', then execute this command again.";
          err = -EINVAL;
          goto done;
        }
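Note: the AuthMonitor hunk above lets operators refuse pre-v2 cephx: daemons (mon/osd/mds/mgr) are gated by cephx_cluster_require_version, everything else by cephx_service_require_version, and cephx_require_version covers both. The same policy reduced to a pure function; the enum values and max() folding are illustrative, while the option names come from the diff.

#include <algorithm>
#include <iostream>

enum EntityType { MON, OSD, MDS, MGR, CLIENT };

bool allow_cephx(bool peer_has_v2, EntityType t,
                 int require_version,          // cephx_require_version
                 int cluster_require_version,  // cephx_cluster_require_version
                 int service_require_version)  // cephx_service_require_version
{
  if (peer_has_v2)
    return true;  // v2-capable peers are always acceptable
  bool is_daemon = (t == MON || t == OSD || t == MDS || t == MGR);
  int required = is_daemon ? std::max(cluster_require_version, require_version)
                           : std::max(service_require_version, require_version);
  return required < 2;  // pre-v2 peers pass only if nothing demands v2
}

int main() {
  std::cout << allow_cephx(false, OSD, 2, 1, 1) << "\n";  // 0: cephx disallowed
  std::cout << allow_cephx(true,  OSD, 2, 1, 1) << "\n";  // 1: fine with v2
}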
index 04044543c21bb46a711d95d511b4a9ecd3b68636..0f553c8a0876dc0d69b30b304978e9c623133f29 100644 (file)
@@ -21,7 +21,8 @@ set(lib_mon_srcs
   PGMonitor.cc
   PGMap.cc
   ConfigKeyService.cc
-  ../mgr/mgr_commands.cc)
+  ../mgr/mgr_commands.cc
+  ../osd/OSDCap.cc)
 add_library(mon STATIC
   ${lib_mon_srcs}
   $<TARGET_OBJECTS:kv_objs>
index 29ae9d959427aa4c38872fddb33607b44b1c6e24..e191f8367c80276d9f7f55c4d8973e269188417a 100644 (file)
@@ -108,6 +108,17 @@ bool ConfigKeyService::store_has_prefix(const string &prefix)
   return false;
 }
 
+static bool is_binary_string(const string& s)
+{
+  for (auto c : s) {
+    // \n and \t are escaped in JSON; other control characters are not.
+    if ((c < 0x20 && c != '\n' && c != '\t') || c >= 0x7f) {
+      return true;
+    }
+  }
+  return false;
+}
+
 void ConfigKeyService::store_dump(stringstream &ss)
 {
   KeyValueDB::Iterator iter =
@@ -117,7 +128,14 @@ void ConfigKeyService::store_dump(stringstream &ss)
   f.open_object_section("config-key store");
 
   while (iter->valid()) {
-    f.dump_string(iter->key().c_str(), iter->value().to_str());
+    string s = iter->value().to_str();
+    if (is_binary_string(s)) {
+      ostringstream ss;
+      ss << "<<< binary blob of length " << s.size() << " >>>";
+      f.dump_string(iter->key().c_str(), ss.str());
+    } else {
+      f.dump_string(iter->key().c_str(), s);
+    }
     iter->next();
   }
   f.close_section();
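Note: the ConfigKeyService change stops the config-key dump from splicing raw binary values into JSON output; anything containing control characters other than \n and \t, or bytes >= 0x7f, is replaced with a length placeholder. The helper as a standalone function, with the unsigned-byte comparison made explicit (the diff iterates plain char):

#include <iostream>
#include <sstream>
#include <string>

static bool is_binary_string(const std::string &s) {
  for (unsigned char c : s) {
    // \n and \t are escaped in JSON; other control characters are not.
    if ((c < 0x20 && c != '\n' && c != '\t') || c >= 0x7f)
      return true;
  }
  return false;
}

std::string printable(const std::string &s) {
  if (!is_binary_string(s))
    return s;
  std::ostringstream os;
  os << "<<< binary blob of length " << s.size() << " >>>";
  return os.str();
}

int main() {
  std::cout << printable("plain text") << "\n";
  std::cout << printable(std::string("\x01\x02raw", 5)) << "\n";  // placeholder
}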
index 178f78589df5fb80c74becba716dd72b6f976689..d4fa0dbe3e5de0cde932ee7899e9bd643c4f5798 100644 (file)
@@ -314,6 +314,7 @@ bool LogMonitor::preprocess_log(MonOpRequestRef op)
   return false;
 
  done:
+  mon->no_reply(op);
   return true;
 }
 
@@ -647,7 +648,7 @@ void LogMonitor::_create_sub_incremental(MLog *mlog, int level, version_t sv)
   }
 
   version_t summary_ver = summary.version;
-  while (sv <= summary_ver) {
+  while (sv && sv <= summary_ver) {
     bufferlist bl;
     int err = get_version(sv, bl);
     assert(err == 0);
index 5c51d2f84d554a46d8389ead0a0659e5e96058ef..d402c0089971c8940141cb7b3086589104439aa5 100644 (file)
@@ -42,8 +42,8 @@
 
 #define dout_subsys ceph_subsys_mon
 #undef dout_prefix
-#define dout_prefix _prefix(_dout, mon, fsmap)
-static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
+#define dout_prefix _prefix(_dout, mon, get_fsmap())
+static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
   return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").mds e" << fsmap.get_epoch() << " ";
@@ -77,7 +77,7 @@ template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
 
 // my methods
 
-void MDSMonitor::print_map(FSMap &m, int dbl)
+void MDSMonitor::print_map(const FSMap &m, int dbl)
 {
   dout(dbl) << "print_map\n";
   m.print(*_dout);
@@ -100,12 +100,12 @@ void MDSMonitor::get_store_prefixes(std::set<string>& s)
 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
 {
   version_t version = get_last_committed();
-  if (version == fsmap.epoch)
+  if (version == get_fsmap().epoch)
     return;
 
   dout(10) << __func__ << " version " << version
-          << ", my e " << fsmap.epoch << dendl;
-  assert(version > fsmap.epoch);
+          << ", my e " << get_fsmap().epoch << dendl;
+  assert(version > get_fsmap().epoch);
 
   load_health();
 
@@ -117,13 +117,13 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap)
 
   assert(fsmap_bl.length() > 0);
   dout(10) << __func__ << " got " << version << dendl;
-  fsmap.decode(fsmap_bl);
+  PaxosFSMap::decode(fsmap_bl);
 
   // new map
   dout(4) << "new map" << dendl;
-  print_map(fsmap, 0);
+  print_map(get_fsmap(), 0);
   if (!g_conf->mon_mds_skip_sanity) {
-    fsmap.sanity();
+    get_fsmap().sanity();
   }
 
   check_subs();
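Note: the MDSMonitor rewrite that begins here swaps direct fsmap / pending_fsmap member access for accessors from a PaxosFSMap base: get_fsmap() for the committed map, create_pending() / get_pending_fsmap_writeable() for leader-side mutation, and get_working_fsmap() to pick whichever is authoritative for the caller. A minimal model of that pattern; the class internals are assumptions, only the method names mirror the diff.

#include <iostream>

struct FSMap { unsigned epoch = 0; };

class PaxosFSMap {
  FSMap committed, pending;
  bool leader = true;
public:
  const FSMap &get_fsmap() const { return committed; }        // committed, read-only
  const FSMap &get_pending_fsmap() const { return pending; }  // pending, read-only
  FSMap &create_pending() {  // start a new pending epoch from the committed map
    pending = committed;
    pending.epoch++;
    return pending;
  }
  FSMap &get_pending_fsmap_writeable() { return pending; }    // leader-side mutation
  // Leaders consult pending (it already reflects queued changes); peons see committed.
  const FSMap &get_working_fsmap() const { return leader ? pending : committed; }
};

int main() {
  PaxosFSMap m;
  FSMap &p = m.create_pending();
  std::cout << p.epoch << " " << m.get_fsmap().epoch << "\n";  // 1 0
}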
@@ -137,43 +137,44 @@ void MDSMonitor::init()
 
 void MDSMonitor::create_pending()
 {
-  pending_fsmap = fsmap;
-  pending_fsmap.epoch++;
+  auto &fsmap = PaxosFSMap::create_pending();
 
   if (mon->osdmon()->is_readable()) {
-    auto &osdmap = mon->osdmon()->osdmap;
-    pending_fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
+    const auto &osdmap = mon->osdmon()->osdmap;
+    fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
   }
 
-  dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
+  dout(10) << "create_pending e" << fsmap.epoch << dendl;
 }
 
 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 {
-  dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
+  auto &pending = get_pending_fsmap_writeable();
+  auto &epoch = pending.epoch;
 
+  dout(10) << "encode_pending e" << epoch << dendl;
 
   // print map iff 'debug mon = 30' or higher
-  print_map(pending_fsmap, 30);
+  print_map(get_pending_fsmap(), 30);
   if (!g_conf->mon_mds_skip_sanity) {
-    pending_fsmap.sanity();
+    pending.sanity();
   }
 
   // Set 'modified' on maps modified this epoch
-  for (auto &i : fsmap.filesystems) {
-    if (i.second->mds_map.epoch == fsmap.epoch) {
-      i.second->mds_map.modified = ceph_clock_now();
+  for (auto &p : pending.filesystems) {
+    if (p.second->mds_map.epoch == epoch) {
+      p.second->mds_map.modified = ceph_clock_now();
     }
   }
 
   // apply to paxos
-  assert(get_last_committed() + 1 == pending_fsmap.epoch);
-  bufferlist fsmap_bl;
-  pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
+  assert(get_last_committed() + 1 == pending.epoch);
+  bufferlist pending_bl;
+  pending.encode(pending_bl, mon->get_quorum_con_features());
 
   /* put everything in the transaction */
-  put_version(t, pending_fsmap.epoch, fsmap_bl);
-  put_last_committed(t, pending_fsmap.epoch);
+  put_version(t, pending.epoch, pending_bl);
+  put_last_committed(t, pending.epoch);
 
   // Encode MDSHealth data
   for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
@@ -192,7 +193,7 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 
   // health
   health_check_map_t new_checks;
-  const auto info_map = pending_fsmap.get_mds_info();
+  const auto &info_map = pending.get_mds_info();
   for (const auto &i : info_map) {
     const auto &gid = i.first;
     const auto &info = i.second;
@@ -221,18 +222,20 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
        mds_metric_summary(metric.type));
       ostringstream ss;
       ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
-      for (auto p = metric.metadata.begin();
-          p != metric.metadata.end();
-          ++p) {
-       if (p != metric.metadata.begin()) {
+      bool first = true;
+      for (auto &p : metric.metadata) {
+       if (first) {
+         ss << " ";
+       } else {
          ss << ", ";
-       }
-       ss << p->first << ": " << p->second;
+        }
+       ss << p.first << ": " << p.second;
+        first = false;
       }
       check->detail.push_back(ss.str());
     }
   }
-  pending_fsmap.get_health_checks(&new_checks);
+  pending.get_health_checks(&new_checks);
   for (auto& p : new_checks.checks) {
     p.second.summary = boost::regex_replace(
       p.second.summary,
@@ -276,6 +279,8 @@ void MDSMonitor::update_logger()
 {
   dout(10) << "update_logger" << dendl;
 
+  const auto &fsmap = get_fsmap();
+
   uint64_t up = 0;
   uint64_t in = 0;
   uint64_t failed = 0;
@@ -335,6 +340,8 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
   MDSMap::mds_info_t info;
   epoch_t effective_epoch = 0;
 
+  const auto &fsmap = get_working_fsmap();
+
   // check privileges, ignore if fails
   MonSession *session = m->get_session();
   assert(session);
@@ -367,11 +374,11 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
   }
 
   // fw to leader?
-  if (!mon->is_leader())
+  if (!is_leader())
     return false;
 
   // booted, but not in map?
-  if (!pending_fsmap.gid_exists(gid)) {
+  if (!fsmap.gid_exists(gid)) {
     if (state != MDSMap::STATE_BOOT) {
       dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
               << ceph_mds_state_name(state) << ")" << dendl;
@@ -386,7 +393,7 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
     }
   }
   dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
-  info = pending_fsmap.get_info_gid(gid);
+  info = fsmap.get_info_gid(gid);
 
   // old seq?
   if (info.state_seq > seq) {
@@ -396,11 +403,11 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
 
   // Work out the latest epoch that this daemon should have seen
   {
-    fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
+    fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
     if (fscid == FS_CLUSTER_ID_NONE) {
-      effective_epoch = pending_fsmap.standby_epochs.at(gid);
+      effective_epoch = fsmap.standby_epochs.at(gid);
     } else {
-      effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
+      effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
     }
     if (effective_epoch != m->get_last_epoch_seen()) {
       dout(10) << "mds_beacon " << *m
@@ -470,6 +477,8 @@ bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
   op->mark_mdsmon_event(__func__);
   MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
   dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
+
+  auto &fsmap = get_working_fsmap();
   
   // check privileges, ignore message if fails
   MonSession *session = m->get_session();
@@ -527,6 +536,8 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
   MDSMap::DaemonState state = m->get_state();
   version_t seq = m->get_seq();
 
+  auto &pending = get_pending_fsmap_writeable();
+
   dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
 
   // Calculate deltas of health metrics created and removed
@@ -547,16 +558,8 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
 
   for (const auto &new_metric: new_health) {
     if (old_types.count(new_metric.type) == 0) {
-      std::stringstream msg;
-      msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
-          << new_metric.message;
-      if (new_metric.sev == HEALTH_ERR) {
-        mon->clog->error() << msg.str();
-      } else if (new_metric.sev == HEALTH_WARN) {
-        mon->clog->warn() << msg.str();
-      } else {
-        mon->clog->info() << msg.str();
-      }
+      dout(10) << "MDS health message (" << m->get_orig_source_inst().name
+              << "): " << new_metric.sev << " " << new_metric.message << dendl;
     }
   }
 
@@ -576,13 +579,13 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
     // zap previous instance of this name?
     if (g_conf->mds_enforce_unique_name) {
       bool failed_mds = false;
-      while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
+      while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
         if (!mon->osdmon()->is_writeable()) {
           mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
           return false;
         }
         const MDSMap::mds_info_t &existing_info =
-          pending_fsmap.get_info_gid(existing);
+          pending.get_info_gid(existing);
         mon->clog->info() << existing_info.human_name() << " restarted";
        fail_mds_gid(existing);
         failed_mds = true;
@@ -594,7 +597,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
     }
 
     // Add this daemon to the map
-    if (pending_fsmap.mds_roles.count(gid) == 0) {
+    if (pending.mds_roles.count(gid) == 0) {
       MDSMap::mds_info_t new_info;
       new_info.global_id = gid;
       new_info.name = m->get_name();
@@ -606,19 +609,19 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
       new_info.standby_for_name = m->get_standby_for_name();
       new_info.standby_for_fscid = m->get_standby_for_fscid();
       new_info.standby_replay = m->get_standby_replay();
-      pending_fsmap.insert(new_info);
+      pending.insert(new_info);
     }
 
     // Resolve standby_for_name to a rank
-    const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
+    const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
     if (!info.standby_for_name.empty()) {
-      const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
+      const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
           info.standby_for_name);
       if (leaderinfo && (leaderinfo->rank >= 0)) {
-        auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
-        auto fs = pending_fsmap.get_filesystem(fscid);
+        const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
+        const auto &fs = pending.get_filesystem(fscid);
 
-        pending_fsmap.modify_daemon(gid, [fscid, leaderinfo](
+        pending.modify_daemon(gid, [fscid, leaderinfo](
               MDSMap::mds_info_t *info) {
             info->standby_for_rank = leaderinfo->rank;
             info->standby_for_fscid = fscid;
@@ -631,22 +634,22 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
     last_beacon[gid].seq = seq;
 
     // new incompat?
-    if (!pending_fsmap.compat.writeable(m->get_compat())) {
-      dout(10) << " fsmap " << pending_fsmap.compat
+    if (!pending.compat.writeable(m->get_compat())) {
+      dout(10) << " fsmap " << pending.compat
                << " can't write to new mds' " << m->get_compat()
               << ", updating fsmap and killing old mds's"
               << dendl;
-      pending_fsmap.update_compat(m->get_compat());
+      pending.update_compat(m->get_compat());
     }
 
     update_metadata(m->get_global_id(), m->get_sys_info());
   } else {
     // state update
-    const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
+    const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
     // Old MDS daemons don't mention that they're standby replay until
     // after they've sent their boot beacon, so update this field.
     if (info.standby_replay != m->get_standby_replay()) {
-      pending_fsmap.modify_daemon(info.global_id, [&m](
+      pending.modify_daemon(info.global_id, [&m](
             MDSMap::mds_info_t *i)
         {
           i->standby_replay = m->get_standby_replay();
@@ -663,7 +666,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
 
     if (info.laggy()) {
       dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
-      pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
+      pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
         {
           info->clear_laggy();
         }
@@ -676,15 +679,15 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
             << "  standby_for_rank=" << m->get_standby_for_rank()
             << dendl;
     if (state == MDSMap::STATE_STOPPED) {
-      const auto fscid = pending_fsmap.mds_roles.at(gid);
-      auto fs = pending_fsmap.get_filesystem(fscid);
+      const auto fscid = pending.mds_roles.at(gid);
+      const auto &fs = pending.get_filesystem(fscid);
 
       mon->clog->info() << info.human_name() << " finished "
                         << "deactivating rank " << info.rank << " in filesystem "
                         << fs->mds_map.fs_name << " (now has "
                         << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
 
-      auto erased = pending_fsmap.stop(gid);
+      auto erased = pending.stop(gid);
       erased.push_back(gid);
 
       for (const auto &erased_gid : erased) {
@@ -713,14 +716,14 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
       until += g_conf->get_val<double>("mon_mds_blacklist_interval");
       const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
       request_proposal(mon->osdmon());
-      pending_fsmap.damaged(gid, blacklist_epoch);
+      pending.damaged(gid, blacklist_epoch);
       last_beacon.erase(gid);
 
       // Respond to MDS, so that it knows it can continue to shut down
       mon->send_reply(op,
                      new MMDSBeacon(
                        mon->monmap->fsid, m->get_global_id(),
-                       m->get_name(), fsmap.get_epoch(), state, seq,
+                       m->get_name(), pending.get_epoch(), state, seq,
                        CEPH_FEATURES_SUPPORTED_DEFAULT));
     } else if (state == MDSMap::STATE_DNE) {
       if (!mon->osdmon()->is_writeable()) {
@@ -738,7 +741,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
       mon->send_reply(op,
                      new MMDSBeacon(
                        mon->monmap->fsid, m->get_global_id(),
-                       m->get_name(), fsmap.get_epoch(), state, seq,
+                       m->get_name(), pending.get_epoch(), state, seq,
                        CEPH_FEATURES_SUPPORTED_DEFAULT));
     } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
       // Standby daemons should never modify their own
@@ -756,8 +759,8 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
       return true;
     } else {
       if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
-        auto fscid = pending_fsmap.mds_roles.at(gid);
-        auto fs = pending_fsmap.get_filesystem(fscid);
+        const auto &fscid = pending.mds_roles.at(gid);
+        const auto &fs = pending.get_filesystem(fscid);
         mon->clog->info() << info.human_name() << " is now active in "
                           << "filesystem " << fs->mds_map.fs_name << " as rank "
                           << info.rank;
@@ -765,7 +768,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
 
       // Made it through special cases and validations, record the
       // daemon's reported state to the FSMap.
-      pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
+      pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
         info->state = state;
         info->state_seq = seq;
       });
@@ -773,7 +776,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
   }
 
   dout(7) << "prepare_beacon pending map now:" << dendl;
-  print_map(pending_fsmap);
+  print_map(pending);
   
   wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
     if (r >= 0)
@@ -790,12 +793,14 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
 
 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
 {
+  auto &pending = get_pending_fsmap_writeable();
+
   op->mark_mdsmon_event(__func__);
   MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
   mds_gid_t gid = m->global_id;
-  if (pending_fsmap.gid_has_rank(gid)) {
+  if (pending.gid_has_rank(gid)) {
     dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
-    pending_fsmap.update_export_targets(gid, m->targets);
+    pending.update_export_targets(gid, m->targets);
   } else {
     dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
   }
@@ -810,6 +815,7 @@ bool MDSMonitor::should_propose(double& delay)
 
 void MDSMonitor::_updated(MonOpRequestRef op)
 {
+  const auto &fsmap = get_fsmap();
   op->mark_mdsmon_event(__func__);
   MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
   dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
@@ -838,8 +844,8 @@ void MDSMonitor::on_active()
   tick();
   update_logger();
 
-  if (mon->is_leader()) {
-    mon->clog->debug() << "fsmap " << fsmap;
+  if (is_leader()) {
+    mon->clog->debug() << "fsmap " << get_fsmap();
   }
 }
 
@@ -847,10 +853,12 @@ void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
                            list<pair<health_status_t, string> > *detail,
                            CephContext* cct) const
 {
+  const auto &fsmap = get_fsmap();
+
   fsmap.get_health(summary, detail);
 
   // For each MDS GID...
-  const auto info_map = fsmap.get_mds_info();
+  const auto &info_map = fsmap.get_mds_info();
   for (const auto &i : info_map) {
     const auto &gid = i.first;
     const auto &info = i.second;
@@ -898,7 +906,7 @@ void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
 void MDSMonitor::dump_info(Formatter *f)
 {
   f->open_object_section("fsmap");
-  fsmap.dump(f);
+  get_fsmap().dump(f);
   f->close_section();
 
   f->dump_unsigned("mdsmap_first_committed", get_first_committed());
@@ -914,6 +922,8 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
   stringstream ss, ds;
 
   map<string, cmd_vartype> cmdmap;
+  const auto &fsmap = get_working_fsmap();
+
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     // ss has reason for failure
     string rs = ss.str();
@@ -947,87 +957,80 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
     int64_t epocharg;
     epoch_t epoch;
 
-    FSMap *p = &fsmap;
+    const FSMap *fsmapp = &get_fsmap();
+    FSMap dummy;
     if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
       epoch = epocharg;
       bufferlist b;
       int err = get_version(epoch, b);
       if (err == -ENOENT) {
-       p = 0;
        r = -ENOENT;
+        goto out;
       } else {
        assert(err == 0);
        assert(b.length());
-       p = new FSMap;
-       p->decode(b);
+        dummy.decode(b);
+        fsmapp = &dummy;
       }
     }
-    if (p) {
-      stringstream ds;
-      const MDSMap *mdsmap = nullptr;
-      MDSMap blank;
-      blank.epoch = fsmap.epoch;
-      if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
-        mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
-      } else {
-        mdsmap = &blank;
-      }
-      if (f != NULL) {
-       f->open_object_section("mdsmap");
-       mdsmap->dump(f.get());
-       f->close_section();
-       f->flush(ds);
-       r = 0;
-      } else {
-       mdsmap->print(ds);
-       r = 0;
-      }
-
-      rdata.append(ds);
-      ss << "dumped fsmap epoch " << p->get_epoch();
 
-      if (p != &fsmap) {
-       delete p;
-      }
+    stringstream ds;
+    const MDSMap *mdsmapp = nullptr;
+    MDSMap blank;
+    blank.epoch = fsmapp->epoch;
+    if (fsmapp->legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+      mdsmapp = &fsmapp->filesystems.at(fsmapp->legacy_client_fscid)->mds_map;
+    } else {
+      mdsmapp = &blank;
     }
+    if (f != NULL) {
+      f->open_object_section("mdsmap");
+      mdsmapp->dump(f.get());
+      f->close_section();
+      f->flush(ds);
+      r = 0;
+    } else {
+      mdsmapp->print(ds);
+      r = 0;
+    }
+
+    rdata.append(ds);
+    ss << "dumped fsmap epoch " << fsmapp->get_epoch();
   } else if (prefix == "fs dump") {
     int64_t epocharg;
     epoch_t epoch;
 
-    FSMap *p = &fsmap;
+    const FSMap *fsmapp = &get_fsmap();
+    FSMap dummy;
     if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
       epoch = epocharg;
       bufferlist b;
       int err = get_version(epoch, b);
       if (err == -ENOENT) {
-       p = 0;
        r = -ENOENT;
+        goto out;
       } else {
        assert(err == 0);
        assert(b.length());
-       p = new FSMap;
-       p->decode(b);
+       dummy.decode(b);
+        fsmapp = &dummy;
       }
     }
-    if (p) {
-      stringstream ds;
-      if (f != NULL) {
-       f->open_object_section("fsmap");
-       p->dump(f.get());
-       f->close_section();
-       f->flush(ds);
-       r = 0;
-      } else {
-       p->print(ds);
-       r = 0;
-      }
-
-      rdata.append(ds);
-      ss << "dumped fsmap epoch " << p->get_epoch();
 
-      if (p != &fsmap)
-       delete p;
+    stringstream ds;
+    if (f != NULL) {
+      f->open_object_section("fsmap");
+      fsmapp->dump(f.get());
+      f->close_section();
+      f->flush(ds);
+      r = 0;
+    } else {
+      fsmapp->print(ds);
+      r = 0;
     }
+
+    rdata.append(ds);
+    ss << "dumped fsmap epoch " << fsmapp->get_epoch();
   } else if (prefix == "mds metadata") {
     if (!f)
       f.reset(Formatter::create("json-pretty"));
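Note: both dump paths above replace the heap-allocated FSMap copy (and the easy-to-miss delete on every exit path) with a stack-local dummy plus a const pointer that is retargeted when a historical epoch is requested, bailing out via goto on -ENOENT. The shape of that pattern, with stand-in types and a boolean in place of the epoch lookup:

#include <iostream>

struct FSMap {
  unsigned epoch = 3;
  void decode(unsigned e) { epoch = e; }  // stands in for decoding the stored bufferlist
};

void dump(const FSMap &current, bool want_old, unsigned old_epoch) {
  const FSMap *fsmapp = &current;  // default: the live map
  FSMap dummy;                     // storage for a historical epoch, if needed
  if (want_old) {
    dummy.decode(old_epoch);
    fsmapp = &dummy;               // no heap allocation, nothing to delete
  }
  std::cout << "dumped fsmap epoch " << fsmapp->epoch << "\n";
}

int main() {
  FSMap live;
  dump(live, false, 0);  // epoch 3
  dump(live, true, 1);   // epoch 1
}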
@@ -1119,7 +1122,7 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
   } else if (prefix == "fs get") {
     string fs_name;
     cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
-    auto fs = fsmap.get_filesystem(fs_name);
+    const auto &fs = fsmap.get_filesystem(fs_name);
     if (fs == nullptr) {
       ss << "filesystem '" << fs_name << "' not found";
       r = -ENOENT;
@@ -1139,8 +1142,8 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
     if (f) {
       f->open_array_section("filesystems");
       {
-        for (const auto i : fsmap.filesystems) {
-          const auto fs = i.second;
+        for (const auto &p : fsmap.filesystems) {
+          const auto &fs = p.second;
           f->open_object_section("filesystem");
           {
             const MDSMap &mds_map = fs->mds_map;
@@ -1177,8 +1180,8 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
       f->close_section();
       f->flush(ds);
     } else {
-      for (const auto i : fsmap.filesystems) {
-        const auto fs = i.second;
+      for (const auto &p : fsmap.filesystems) {
+        const auto &fs = p.second;
         const MDSMap &mds_map = fs->mds_map;
         const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
             mds_map.metadata_pool);
@@ -1199,6 +1202,7 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
     r = 0;
   }
 
+out:
   if (r != -1) {
     rdata.append(ds);
     string rs;
@@ -1211,7 +1215,9 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
 
 bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
 {
-  const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
+  auto &pending = get_pending_fsmap_writeable();
+
+  const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
   dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
 
   epoch_t blacklist_epoch = 0;
@@ -1221,7 +1227,7 @@ bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
     blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
   }
 
-  pending_fsmap.erase(gid, blacklist_epoch);
+  pending.erase(gid, blacklist_epoch);
   last_beacon.erase(gid);
   if (pending_daemon_health.count(gid)) {
     pending_daemon_health.erase(gid);
@@ -1233,7 +1239,7 @@ bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
 
 mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
 {
-  const FSMap *relevant_fsmap = mon->is_leader() ? &pending_fsmap : &fsmap;
+  const auto &fsmap = get_working_fsmap();
 
   // Try parsing as a role
   mds_role_t role;
@@ -1241,7 +1247,7 @@ mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
   int r = parse_role(arg, &role, ignore_err);
   if (r == 0) {
     // See if a GID is assigned to this role
-    auto fs = relevant_fsmap->get_filesystem(role.fscid);
+    const auto &fs = fsmap.get_filesystem(role.fscid);
     assert(fs != nullptr);  // parse_role ensures it exists
     if (fs->mds_map.is_up(role.rank)) {
       dout(10) << __func__ << ": validated rank/GID " << role
@@ -1255,7 +1261,7 @@ mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
   unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
   if (!err.empty()) {
     // Not a role or a GID, try as a daemon name
-    const MDSMap::mds_info_t *mds_info = relevant_fsmap->find_by_name(arg);
+    const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
     if (!mds_info) {
       ss << "MDS named '" << arg
         << "' does not exist, or is not up";
@@ -1269,7 +1275,7 @@ mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
     dout(10) << __func__ << ": treating MDS reference '" << arg
             << "' as an integer " << maybe_gid << dendl;
 
-    if (relevant_fsmap->gid_exists(mds_gid_t(maybe_gid))) {
+    if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
       return mds_gid_t(maybe_gid);
     }
   }
@@ -1294,7 +1300,7 @@ int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg,
 
   // Take a copy of the info before removing the MDS from the map,
   // so that the caller knows which mds (if any) they ended up removing.
-  *failed_info = pending_fsmap.get_info_gid(gid);
+  *failed_info = get_pending_fsmap().get_info_gid(gid);
 
   fail_mds_gid(gid);
   ss << "failed mds gid " << gid;
@@ -1328,14 +1334,16 @@ bool MDSMonitor::prepare_command(MonOpRequestRef op)
     return true;
   }
 
+  auto &pending = get_pending_fsmap_writeable();
+
   bool batched_propose = false;
-  for (auto h : handlers) {
+  for (const auto &h : handlers) {
     if (h->can_handle(prefix)) {
       batched_propose = h->batched_propose();
       if (batched_propose) {
         paxos->plug();
       }
-      r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
+      r = h->handle(mon, pending, op, cmdmap, ss);
       if (batched_propose) {
         paxos->unplug();
       }
@@ -1347,7 +1355,7 @@ bool MDSMonitor::prepare_command(MonOpRequestRef op)
       } else {
         if (r == 0) {
           // On successful updates, print the updated map
-          print_map(pending_fsmap);
+          print_map(pending);
         }
         // Successful or not, we're done: respond.
         goto out;
@@ -1367,8 +1375,8 @@ bool MDSMonitor::prepare_command(MonOpRequestRef op)
   }
 
   // Only handle legacy commands if there is a filesystem configured
-  if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
-    if (pending_fsmap.filesystems.size() == 0) {
+  if (pending.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
+    if (pending.filesystems.size() == 0) {
       ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
     } else {
       ss << "No filesystem set for use with legacy commands";
@@ -1419,11 +1427,7 @@ int MDSMonitor::parse_role(
     mds_role_t *role,
     std::ostream &ss)
 {
-  const FSMap *relevant_fsmap = &fsmap;
-  if (mon->is_leader()) {
-    relevant_fsmap = &pending_fsmap;
-  }
-  return relevant_fsmap->parse_role(role_str, role, ss);
+  return get_working_fsmap().parse_role(role_str, role, ss);
 }
 
 int MDSMonitor::filesystem_command(
@@ -1438,15 +1442,15 @@ int MDSMonitor::filesystem_command(
   string whostr;
   cmd_getval(g_ceph_context, cmdmap, "who", whostr);
 
+  auto &pending = get_pending_fsmap_writeable();
   if (prefix == "mds stop" ||
       prefix == "mds deactivate") {
-
     mds_role_t role;
     r = parse_role(whostr, &role, ss);
     if (r < 0 ) {
       return r;
     }
-    auto fs = pending_fsmap.get_filesystem(role.fscid);
+    const auto &fs = pending.get_filesystem(role.fscid);
 
     if (!fs->mds_map.is_active(role.rank)) {
       r = -EEXIST;
@@ -1469,9 +1473,9 @@ int MDSMonitor::filesystem_command(
       r = 0;
       mds_gid_t gid = fs->mds_map.up.at(role.rank);
       ss << "telling mds." << role << " "
-         << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
+         << pending.get_info_gid(gid).addr << " to deactivate";
 
-      pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
+      pending.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
         info->state = MDSMap::STATE_STOPPING;
       });
     }
@@ -1488,8 +1492,8 @@ int MDSMonitor::filesystem_command(
          << cmd_vartype_stringify(cmdmap["state"]) << "'";
       return -EINVAL;
     }
-    if (pending_fsmap.gid_exists(gid)) {
-      pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
+    if (pending.gid_exists(gid)) {
+      pending.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
         info->state = state;
       });
       ss << "set mds gid " << gid << " to state " << state << " "
@@ -1519,17 +1523,18 @@ int MDSMonitor::filesystem_command(
          << cmd_vartype_stringify(cmdmap["gid"]) << "'";
       return -EINVAL;
     }
-    if (!pending_fsmap.gid_exists(gid)) {
+    if (!pending.gid_exists(gid)) {
       ss << "mds gid " << gid << " dne";
       r = 0;
     } else {
-      MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
+      const auto &info = pending.get_info_gid(gid);
+      MDSMap::DaemonState state = info.state;
       if (state > 0) {
-        ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
-           << " rank " << pending_fsmap.get_info_gid(gid).rank;
+        ss << "cannot remove active mds." << info.name
+           << " rank " << info.rank;
         return -EBUSY;
       } else {
-        pending_fsmap.erase(gid, {});
+        pending.erase(gid, {});
         ss << "removed mds gid " << gid;
         return 0;
       }
@@ -1552,7 +1557,7 @@ int MDSMonitor::filesystem_command(
       return -EINVAL;
     }
 
-    pending_fsmap.modify_filesystem(
+    pending.modify_filesystem(
         role.fscid,
         [role](std::shared_ptr<Filesystem> fs)
     {
@@ -1568,13 +1573,13 @@ int MDSMonitor::filesystem_command(
          << cmd_vartype_stringify(cmdmap["feature"]) << "'";
       return -EINVAL;
     }
-    if (pending_fsmap.compat.compat.contains(f)) {
+    if (pending.compat.compat.contains(f)) {
       ss << "removing compat feature " << f;
-      CompatSet modified = pending_fsmap.compat;
+      CompatSet modified = pending.compat;
       modified.compat.remove(f);
-      pending_fsmap.update_compat(modified);
+      pending.update_compat(modified);
     } else {
-      ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
+      ss << "compat feature " << f << " not present in " << pending.compat;
     }
     r = 0;
   } else if (prefix == "mds compat rm_incompat") {
@@ -1584,13 +1589,13 @@ int MDSMonitor::filesystem_command(
          << cmd_vartype_stringify(cmdmap["feature"]) << "'";
       return -EINVAL;
     }
-    if (pending_fsmap.compat.incompat.contains(f)) {
+    if (pending.compat.incompat.contains(f)) {
       ss << "removing incompat feature " << f;
-      CompatSet modified = pending_fsmap.compat;
+      CompatSet modified = pending.compat;
       modified.incompat.remove(f);
-      pending_fsmap.update_compat(modified);
+      pending.update_compat(modified);
     } else {
-      ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
+      ss << "incompat feature " << f << " not present in " << pending.compat;
     }
     r = 0;
   } else if (prefix == "mds repaired") {
@@ -1602,7 +1607,7 @@ int MDSMonitor::filesystem_command(
       return r;
     }
 
-    bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
+    bool modified = pending.undamaged(role.fscid, role.rank);
     if (modified) {
       dout(4) << "repaired: restoring rank " << role << dendl;
     } else {
@@ -1623,6 +1628,7 @@ int MDSMonitor::filesystem_command(
 void MDSMonitor::modify_legacy_filesystem(
     std::function<void(std::shared_ptr<Filesystem> )> fn)
 {
+  auto &pending_fsmap = get_pending_fsmap_writeable();
   pending_fsmap.modify_filesystem(
     pending_fsmap.legacy_client_fscid,
     fn
@@ -1652,6 +1658,8 @@ int MDSMonitor::legacy_filesystem_command(
   string whostr;
   cmd_getval(g_ceph_context, cmdmap, "who", whostr);
 
+  auto &pending_fsmap = get_pending_fsmap_writeable();
+
   assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
 
   if (prefix == "mds set_max_mds") {
@@ -1721,8 +1729,8 @@ void MDSMonitor::check_subs()
   types.push_back("fsmap");
   types.push_back("fsmap.user");
   types.push_back("mdsmap");
-  for (const auto &i : fsmap.filesystems) {
-    auto fscid = i.first;
+  for (const auto &p : get_fsmap().filesystems) {
+    const auto &fscid = p.first;
     std::ostringstream oss;
     oss << "mdsmap." << fscid;
     types.push_back(oss.str());
@@ -1745,6 +1753,8 @@ void MDSMonitor::check_sub(Subscription *sub)
 {
   dout(20) << __func__ << ": " << sub->type << dendl;
 
+  const auto &fsmap = get_fsmap();
+
   if (sub->type == "fsmap") {
     if (sub->next <= fsmap.get_epoch()) {
       sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
@@ -1759,12 +1769,10 @@ void MDSMonitor::check_sub(Subscription *sub)
       FSMapUser fsmap_u;
       fsmap_u.epoch = fsmap.get_epoch();
       fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
-      for (auto p = fsmap.filesystems.begin();
-          p != fsmap.filesystems.end();
-          ++p) {
-       FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
-       fs_info.cid = p->first;
-       fs_info.name= p->second->mds_map.fs_name;
+      for (const auto &p : fsmap.filesystems) {
+       FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
+       fs_info.cid = p.second->fscid;
+       fs_info.name = p.second->mds_map.fs_name;
       }
       sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
       if (sub->onetime) {
@@ -1828,24 +1836,25 @@ void MDSMonitor::check_sub(Subscription *sub)
     dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
 
     // Work out the effective latest epoch
-    MDSMap *mds_map = nullptr;
+    const MDSMap *mds_map = nullptr;
     MDSMap null_map;
     null_map.compat = fsmap.compat;
     if (fscid == FS_CLUSTER_ID_NONE) {
       // For a client, we should have already dropped out
       assert(is_mds);
 
-      if (fsmap.standby_daemons.count(mds_gid)) {
+      auto it = fsmap.standby_daemons.find(mds_gid);
+      if (it != fsmap.standby_daemons.end()) {
         // For an MDS, we need to feed it an MDSMap with its own state in
-        null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
-        null_map.epoch = fsmap.standby_epochs[mds_gid];
+        null_map.mds_info[mds_gid] = it->second;
+        null_map.epoch = fsmap.standby_epochs.at(mds_gid);
       } else {
         null_map.epoch = fsmap.epoch;
       }
       mds_map = &null_map;
     } else {
       // Check the effective epoch 
-      mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
+      mds_map = &fsmap.get_filesystem(fscid)->mds_map;
     }
 
     assert(mds_map != nullptr);
@@ -1888,7 +1897,7 @@ void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
   bool update = false;
   for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
        i != pending_metadata.end(); ) {
-    if (!pending_fsmap.gid_exists(i->first)) {
+    if (!get_pending_fsmap().gid_exists(i->first)) {
       pending_metadata.erase(i++);
       update = true;
     } else {
@@ -1985,11 +1994,11 @@ int MDSMonitor::print_nodes(Formatter *f)
       continue;
     }
     const mds_gid_t gid = it->first;
-    if (!fsmap.gid_exists(gid)) {
+    if (!get_fsmap().gid_exists(gid)) {
       dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
       continue;
     }
-    const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
+    const MDSMap::mds_info_t& mds_info = get_fsmap().get_info_gid(gid);
     // FIXME: include filesystem name with rank here
     mdses[hostname->second].push_back(mds_info.rank);
   }
@@ -2002,9 +2011,10 @@ int MDSMonitor::print_nodes(Formatter *f)
  * If a cluster is undersized (with respect to max_mds), then
  * attempt to find daemons to grow it.
  */
-bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
+bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
 {
   bool do_propose = false;
+  auto &pending = get_pending_fsmap_writeable();
 
   if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
     return do_propose;
@@ -2017,13 +2027,13 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
     while (fs->mds_map.is_in(mds)) {
       mds++;
     }
-    mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
+    mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds},
                          name, g_conf->mon_force_standby_active);
     if (newgid == MDS_GID_NONE) {
       break;
     }
 
-    const auto &new_info = pending_fsmap.get_info_gid(newgid);
+    const auto &new_info = pending.get_info_gid(newgid);
     dout(1) << "assigned standby " << new_info.addr
             << " as mds." << mds << dendl;
 
@@ -2031,7 +2041,7 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
                          "filesystem " << fs->mds_map.fs_name << " as rank "
                       << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
                       << " ranks)";
-    pending_fsmap.promote(newgid, fs, mds);
+    pending.promote(newgid, fs, mds);
     do_propose = true;
   }
 
@@ -2050,7 +2060,8 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info
   assert(mds_propose != nullptr);
   assert(osd_propose != nullptr);
 
-  const auto fscid = pending_fsmap.mds_roles.at(gid);
+  auto &pending = get_pending_fsmap_writeable();
+  const auto fscid = pending.mds_roles.at(gid);
 
   // We will only take decisive action (replacing/removing a daemon)
   // if we have some indicating that some other daemon(s) are successfully
@@ -2070,12 +2081,12 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info
       info.state != MDSMap::STATE_STANDBY &&
       info.state != MDSMap::STATE_STANDBY_REPLAY &&
       may_replace &&
-      !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
-      (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
+      !pending.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
+      (sgid = pending.find_replacement_for({fscid, info.rank}, info.name,
                 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
   {
     
-    MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
+    MDSMap::mds_info_t si = pending.get_info_gid(sgid);
     dout(10) << " replacing " << gid << " " << info.addr << " mds."
       << info.rank << "." << info.inc
       << " " << ceph_mds_state_name(info.state)
@@ -2087,14 +2098,14 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info
                       << " with standby " << si.human_name();
 
     // Remember what NS the old one was in
-    const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
+    const fs_cluster_id_t fscid = pending.mds_roles.at(gid);
 
     // Remove the old one
     *osd_propose |= fail_mds_gid(gid);
 
     // Promote the replacement
-    auto fs = pending_fsmap.filesystems.at(fscid);
-    pending_fsmap.promote(sgid, fs, info.rank);
+    auto fs = pending.filesystems.at(fscid);
+    pending.promote(sgid, fs, info.rank);
 
     *mds_propose = true;
   } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
@@ -2110,17 +2121,19 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info
       dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
         << " " << ceph_mds_state_name(info.state)
         << " laggy" << dendl;
-      pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
+      pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
           info->laggy_since = ceph_clock_now();
       });
       *mds_propose = true;
   }
 }
 
-bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
+bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> &fs)
 {
   assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
 
+  auto &pending = get_pending_fsmap_writeable();
+
   bool do_propose = false;
 
   // have a standby take over?
@@ -2130,17 +2143,17 @@ bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
     set<mds_rank_t>::iterator p = failed.begin();
     while (p != failed.end()) {
       mds_rank_t f = *p++;
-      mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
+      mds_gid_t sgid = pending.find_replacement_for({fs->fscid, f}, {},
           g_conf->mon_force_standby_active);
       if (sgid) {
-        const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
+        const MDSMap::mds_info_t si = pending.get_info_gid(sgid);
         dout(0) << " taking over failed mds." << f << " with " << sgid
                 << "/" << si.name << " " << si.addr << dendl;
         mon->clog->info() << "Standby " << si.human_name()
                           << " assigned to filesystem " << fs->mds_map.fs_name
                           << " as rank " << f;
 
-        pending_fsmap.promote(sgid, fs, f);
+        pending.promote(sgid, fs, f);
        do_propose = true;
       }
     }
@@ -2152,12 +2165,12 @@ bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
     // them while perhaps-modifying standby_daemons during the loop
     // (if we promote anyone they are removed from standby_daemons)
     std::vector<mds_gid_t> standby_gids;
-    for (const auto &j : pending_fsmap.standby_daemons) {
+    for (const auto &j : pending.standby_daemons) {
       standby_gids.push_back(j.first);
     }
 
     for (const auto &gid : standby_gids) {
-      const auto &info = pending_fsmap.standby_daemons.at(gid);
+      const auto &info = pending.standby_daemons.at(gid);
       assert(info.state == MDSMap::STATE_STANDBY);
 
       if (!info.standby_replay) {
@@ -2176,14 +2189,14 @@ bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
         // the standby_for_rank refers to: lookup via legacy_client_fscid
         mds_role_t target_role = {
           info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
-            pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
+            pending.legacy_client_fscid : info.standby_for_fscid,
           info.standby_for_rank};
 
         // It is possible that the map contains a standby_for_fscid
         // that doesn't correspond to an existing filesystem, especially
         // if we loaded from a version with a bug (#17466)
         if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
-            && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
+            && !pending.filesystem_exists(info.standby_for_fscid)) {
           derr << "gid " << gid << " has invalid standby_for_fscid "
                << info.standby_for_fscid << dendl;
           continue;
@@ -2191,7 +2204,7 @@ bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
 
         // If we managed to resolve a full target role
         if (target_role.fscid != FS_CLUSTER_ID_NONE) {
-          auto fs = pending_fsmap.get_filesystem(target_role.fscid);
+          const auto &fs = pending.get_filesystem(target_role.fscid);
           if (fs->mds_map.is_followable(target_role.rank)) {
             do_propose |= try_standby_replay(
                 info,
@@ -2204,23 +2217,32 @@ bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
       }
 
       // check everyone
-      for (auto fs_i : pending_fsmap.filesystems) {
-        const MDSMap &mds_map = fs_i.second->mds_map;
-        for (auto mds_i : mds_map.mds_info) {
-          MDSMap::mds_info_t &cand_info = mds_i.second;
+      for (const auto &p : pending.filesystems) {
+       if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
+           info.standby_for_fscid != p.first)
+         continue;
+
+       bool assigned = false;
+        const auto &fs = p.second;
+        const MDSMap &mds_map = fs->mds_map;
+        for (const auto &mds_i : mds_map.mds_info) {
+          const MDSMap::mds_info_t &cand_info = mds_i.second;
           if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
             if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
                 info.standby_for_rank != MDS_RANK_NONE) {
               continue;   // we're supposed to follow someone else
             }
 
-            if (try_standby_replay(info, *(fs_i.second), cand_info)) {
-              do_propose = true;
+            if (try_standby_replay(info, *fs, cand_info)) {
+             assigned = true;
               break;
             }
-            continue;
           }
         }
+       if (assigned) {
+         do_propose = true;
+         break;
+       }
       }
     }
   }
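
The rewritten loop above changes two behaviors: a standby pinned to a filesystem via standby_for_fscid is no longer considered for other filesystems, and scanning stops after the first successful standby-replay assignment instead of continuing across remaining candidates. A self-contained sketch of that control flow (all names are stand-ins, not the real Ceph types):

    #include <map>
    #include <vector>

    using fscid_t = int;
    constexpr fscid_t FSCID_NONE = -1;

    // Honor a pinned fscid and stop after the first successful
    // standby-replay assignment, mirroring the rewritten loop above.
    bool promote_one_standby(
        const std::map<fscid_t, std::vector<int>> &followable_ranks,
        fscid_t pinned_fscid,
        bool (*try_assign)(fscid_t fscid, int rank)) {
      for (const auto &p : followable_ranks) {
        if (pinned_fscid != FSCID_NONE && pinned_fscid != p.first)
          continue;                     // daemon follows another filesystem
        for (int rank : p.second) {
          if (try_assign(p.first, rank))
            return true;                // at most one assignment per pass
        }
      }
      return false;                     // caller: do_propose |= result
    }
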
@@ -2232,19 +2254,22 @@ void MDSMonitor::tick()
 {
   // make sure mds's are still alive
   // ...if i am an active leader
+
   if (!is_active()) return;
 
-  dout(10) << fsmap << dendl;
+  dout(10) << get_working_fsmap() << dendl;
 
-  bool do_propose = false;
+  if (!is_leader()) return;
+
+  auto &pending = get_pending_fsmap_writeable();
 
-  if (!mon->is_leader()) return;
+  bool do_propose = false;
 
-  do_propose |= pending_fsmap.check_health();
+  do_propose |= pending.check_health();
 
   // expand mds cluster (add new nodes to @in)?
-  for (auto i : pending_fsmap.filesystems) {
-    do_propose |= maybe_expand_cluster(i.second);
+  for (auto &p : pending.filesystems) {
+    do_propose |= maybe_expand_cluster(p.second);
   }
 
   const auto now = ceph_clock_now();
@@ -2270,7 +2295,7 @@ void MDSMonitor::tick()
   cutoff -= g_conf->mds_beacon_grace;
 
   // make sure last_beacon is fully populated
-  for (const auto &p : pending_fsmap.mds_roles) {
+  for (auto &p : pending.mds_roles) {
     auto &gid = p.first;
     if (last_beacon.count(gid) == 0) {
       last_beacon[gid].stamp = now;
@@ -2286,14 +2311,14 @@ void MDSMonitor::tick()
     auto beacon_info = p->second;
     ++p;
 
-    if (!pending_fsmap.gid_exists(gid)) {
+    if (!pending.gid_exists(gid)) {
       // clean it out
       last_beacon.erase(gid);
       continue;
     }
 
     if (beacon_info.stamp < cutoff) {
-      auto &info = pending_fsmap.get_info_gid(gid);
+      auto &info = pending.get_info_gid(gid);
       dout(1) << "no beacon from mds." << info.rank << "." << info.inc
               << " (gid: " << gid << " addr: " << info.addr
               << " state: " << ceph_mds_state_name(info.state) << ")"
@@ -2309,8 +2334,8 @@ void MDSMonitor::tick()
     request_proposal(mon->osdmon());
   }
 
-  for (auto i : pending_fsmap.filesystems) {
-    auto fs = i.second;
+  for (auto &p : pending.filesystems) {
+    auto &fs = p.second;
     if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
       do_propose |= maybe_promote_standby(fs);
     }
@@ -2338,7 +2363,7 @@ bool MDSMonitor::try_standby_replay(
   } else {
     // Assign the new role to the standby
     dout(10) << "  setting to follow mds rank " << ainfo.rank << dendl;
-    pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
+    get_pending_fsmap_writeable().assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
     return true;
   }
 }
index 3d84f92a811fc6ec393fd348773942e7d2168f9c..88e9decf266ca25b9672f517a680f3a23e249620 100644 (file)
@@ -23,8 +23,7 @@
 using namespace std;
 
 #include "include/types.h"
-#include "mds/FSMap.h"
-#include "mds/MDSMap.h"
+#include "PaxosFSMap.h"
 #include "PaxosService.h"
 #include "msg/Messenger.h"
 #include "messages/MMDSBeacon.h"
@@ -34,7 +33,7 @@ class MMDSLoadTargets;
 class MMDSMap;
 class FileSystemCommandHandler;
 
-class MDSMonitor : public PaxosService {
+class MDSMonitor : public PaxosService, public PaxosFSMap {
  public:
   MDSMonitor(Monitor *mn, Paxos *p, string service_name);
 
@@ -59,8 +58,6 @@ class MDSMonitor : public PaxosService {
   void check_subs();
   void check_sub(Subscription *sub);
 
-  const FSMap &get_pending() const { return pending_fsmap; }
-  const FSMap &get_fsmap() const { return fsmap; }
   void dump_info(Formatter *f);
   int print_nodes(Formatter *f);
 
@@ -68,13 +65,12 @@ class MDSMonitor : public PaxosService {
    * Return true if a blacklist was done (i.e. OSD propose needed)
    */
   bool fail_mds_gid(mds_gid_t gid);
- protected:
-  // mds maps
-  FSMap fsmap;           // current
-  FSMap pending_fsmap;  // current + pending updates
 
+  bool is_leader() const override { return mon->is_leader(); }
+
+ protected:
   // my helpers
-  void print_map(FSMap &m, int dbl=7);
+  void print_map(const FSMap &m, int dbl=7);
   void update_logger();
 
   void _updated(MonOpRequestRef op);
@@ -127,8 +123,8 @@ class MDSMonitor : public PaxosService {
 
   std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
 
-  bool maybe_promote_standby(std::shared_ptr<Filesystem> fs);
-  bool maybe_expand_cluster(std::shared_ptr<Filesystem> fs);
+  bool maybe_promote_standby(std::shared_ptr<Filesystem> &fs);
+  bool maybe_expand_cluster(std::shared_ptr<Filesystem> &fs);
   void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
       bool *mds_propose, bool *osd_propose);
   void tick() override;     // check state, take actions
index 900aa5fce76e4612d68ed13da75653bdf07b2250..fe5bf828ac6fd86609ce4b299c81b16f45157234 100644 (file)
@@ -618,7 +618,7 @@ void MgrMonitor::tick()
                         << " daemon " << pending_map.active_name;
     } else {
       dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
-      mon->clog->warn() << "Manager daemon " << old_active_name
+      mon->clog->info() << "Manager daemon " << old_active_name
                         << " is unresponsive.  No standby daemons available.";
     }
   } else if (pending_map.active_gid == 0) {
index 176ca4055f5e3f6aff2ac96e018e6583168159f8..f5bb5c5bbf54844d2ed306d4153b21ff40e19315 100644 (file)
@@ -941,7 +941,7 @@ COMMAND("osd pool rename " \
        "rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
 COMMAND("osd pool get " \
        "name=pool,type=CephPoolname " \
-       "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block", \
+       "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites", \
        "get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
        "name=pool,type=CephPoolname " \
index 4be4cd78b209b308c51543ba8e48cac9438741e8..6348f44c97b8cdc6fac8cb486f951492d74f9e6f 100644 (file)
@@ -5898,7 +5898,8 @@ bool Monitor::ms_get_authorizer(int service_id, AuthAuthorizer **authorizer,
 bool Monitor::ms_verify_authorizer(Connection *con, int peer_type,
                                   int protocol, bufferlist& authorizer_data,
                                   bufferlist& authorizer_reply,
-                                  bool& isvalid, CryptoKey& session_key)
+                                  bool& isvalid, CryptoKey& session_key,
+                                  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
   dout(10) << "ms_verify_authorizer " << con->get_peer_addr()
           << " " << ceph_entity_type_name(peer_type)
@@ -5917,7 +5918,7 @@ bool Monitor::ms_verify_authorizer(Connection *con, int peer_type,
       
       if (authorizer_data.length()) {
        bool ret = cephx_verify_authorizer(g_ceph_context, &keyring, iter,
-                                         auth_ticket_info, authorizer_reply);
+                                          auth_ticket_info, challenge, authorizer_reply);
        if (ret) {
          session_key = auth_ticket_info.session_key;
          isvalid = true;
index 4ff8706455163ff599d66ccfd139acc5f3c08dbf..008947e85be34d2203421f8d12f80e9bc00fd2cd 100644 (file)
@@ -906,7 +906,8 @@ public:
   bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
   bool ms_verify_authorizer(Connection *con, int peer_type,
                            int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                           bool& isvalid, CryptoKey& session_key) override;
+                           bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
   bool ms_handle_reset(Connection *con) override;
   void ms_handle_remote_reset(Connection *con) override {}
   bool ms_handle_refused(Connection *con) override;
index 17921fd077e23f40199fc4d0c06e17de73c094c1..f8efabb0399b333b243cac66bdf1956d655ad309 100644 (file)
@@ -76,6 +76,9 @@
 #include "include/str_map.h"
 #include "include/scope_guard.h"
 
+#include "auth/cephx/CephxKeyServer.h"
+#include "osd/OSDCap.h"
+
 #include "json_spirit/json_spirit_reader.h"
 
 #include <boost/algorithm/string/predicate.hpp>
@@ -90,6 +93,87 @@ const uint32_t MAX_POOL_APPLICATIONS = 4;
 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
 
+bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
+  // Note: this doesn't include support for the application tag match
+  if ((grant.spec.allow & OSD_CAP_W) != 0) {
+    auto& match = grant.match;
+    if (match.is_match_all()) {
+      return true;
+    } else if (pool_name != nullptr && match.auid < 0 &&
+               !match.pool_namespace.pool_name.empty() &&
+               match.pool_namespace.pool_name == *pool_name) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool is_unmanaged_snap_op_permitted(CephContext* cct,
+                                    const KeyServer& key_server,
+                                    const EntityName& entity_name,
+                                    const MonCap& mon_caps,
+                                    const std::string* pool_name)
+{
+  typedef std::map<std::string, std::string> CommandArgs;
+
+  if (mon_caps.is_capable(cct, CEPH_ENTITY_TYPE_MON,
+                               entity_name, "osd",
+                               "osd pool op unmanaged-snap",
+                               (pool_name == nullptr ?
+                                  CommandArgs{} /* pool DNE, require unrestricted cap */ :
+                                  CommandArgs{{"poolname", *pool_name}}),
+                                false, true, false)) {
+    return true;
+  }
+
+  AuthCapsInfo caps_info;
+  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
+                                   caps_info)) {
+    dout(10) << "unable to locate OSD cap data for " << entity_name
+             << " in auth db" << dendl;
+    return false;
+  }
+
+  string caps_str;
+  if (caps_info.caps.length() > 0) {
+    auto p = caps_info.caps.begin();
+    try {
+      decode(caps_str, p);
+    } catch (const buffer::error &err) {
+      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
+           << dendl;
+      return false;
+    }
+  }
+
+  OSDCap osd_cap;
+  if (!osd_cap.parse(caps_str, nullptr)) {
+    dout(10) << "unable to parse OSD cap data for " << entity_name
+             << " in auth db" << dendl;
+    return false;
+  }
+
+  // if the entity has write permissions in one or all pools, permit
+  // usage of unmanaged-snapshots
+  if (osd_cap.allow_all()) {
+    return true;
+  }
+
+  for (auto& grant : osd_cap.grants) {
+    if (grant.profile.is_valid()) {
+      for (auto& profile_grant : grant.profile_grants) {
+        if (is_osd_writable(profile_grant, pool_name)) {
+          return true;
+        }
+      }
+    } else if (is_osd_writable(grant, pool_name)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 } // anonymous namespace
 
 void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
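
The new anonymous-namespace helpers admit unmanaged-snapshot operations for any entity that either holds a mon cap covering the "osd pool op unmanaged-snap" command or has OSD write capability on the target pool (or on all pools). For example, with a hypothetical client entity but real cap syntax, caps of

    mon 'allow r'
    osd 'allow rw pool=rbd'

permit creating and deleting unmanaged snapshots in pool rbd only, while osd 'allow *' permits it everywhere; when the pool does not exist, an unrestricted cap is required, as the CommandArgs comment above notes.
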
@@ -187,7 +271,6 @@ OSDMonitor::OSDMonitor(
    cct(cct),
    inc_osd_cache(g_conf->mon_osd_cache_size),
    full_osd_cache(g_conf->mon_osd_cache_size),
-   last_attempted_minwait_time(utime_t()),
    mapper(mn->cct, &mn->cpu_tp),
    op_tracker(cct, true, 1)
 {}
@@ -259,8 +342,9 @@ void OSDMonitor::create_initial()
   }
 
   // encode into pending incremental
+  uint64_t features = newmap.get_encoding_features();
   newmap.encode(pending_inc.fullmap,
-                mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
+                features | CEPH_FEATURE_RESERVED);
   pending_inc.full_crc = newmap.get_crc();
   dout(20) << " full crc " << pending_inc.full_crc << dendl;
 }
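
create_initial() now derives encoding features from the map itself via OSDMap::get_encoding_features() rather than from the mon quorum's connection features. A rough sketch of what such a helper does, with hypothetical feature masks (the hand-rolled per-release stripping it replaces is deleted in the encode_pending hunk below):

    #include <cstdint>

    // Hypothetical stand-in for OSDMap::get_encoding_features(): derive
    // the feature mask from the map's required OSD release rather than
    // from whatever the mon quorum happens to support.
    uint64_t encoding_features(int require_osd_release, uint64_t all,
                               uint64_t luminous_bits, uint64_t kraken_bits,
                               uint64_t jewel_bits) {
      uint64_t f = all;
      if (require_osd_release < 12 /* luminous */) f &= ~luminous_bits;
      if (require_osd_release < 11 /* kraken */)   f &= ~kraken_bits;
      if (require_osd_release < 10 /* jewel */)    f &= ~jewel_bits;
      return f;
    }
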
@@ -1240,7 +1324,7 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
           uint32_t match_count = 0;
 
           // CephFS
-          FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
+          const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
           if (pending_fsmap.pool_in_use(pool_id)) {
             dout(10) << __func__ << " auto-enabling CephFS on pool '"
                      << pool_name << "'" << dendl;
@@ -1329,7 +1413,7 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);
 
   // features for osdmap and its incremental
-  uint64_t features = mon->get_quorum_con_features();
+  uint64_t features;
 
   // encode full map and determine its crc
   OSDMap tmp;
@@ -1338,22 +1422,13 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     tmp.apply_incremental(pending_inc);
 
     // determine appropriate features
-    if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
-      dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
-              << dendl;
-      features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
-    }
-    if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
-      dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
-              << "MSG_ADDR2" << dendl;
-      features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
-                   CEPH_FEATURE_MSG_ADDR2);
-    }
-    if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
-      dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
-      features &= ~CEPH_FEATURE_SERVER_JEWEL;
-    }
-    dout(10) << __func__ << " encoding full map with " << features << dendl;
+    features = tmp.get_encoding_features();
+    dout(10) << __func__ << " encoding full map with "
+            << ceph_release_name(tmp.require_osd_release)
+            << " features " << features << dendl;
+
+    // the features should be a subset of the mon quorum's features!
+    assert((features & ~mon->get_quorum_con_features()) == 0);
 
     bufferlist fullbl;
     ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
@@ -1557,8 +1632,13 @@ void OSDMonitor::share_map_with_random_osd()
   }
 
   dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
+
+  // get feature of the peer
+  // use quorum_con_features, if it's an anonymous connection.
+  uint64_t features = s->con_features ? s->con_features :
+                                        mon->get_quorum_con_features();
   // whatev, they'll request more if they need it
-  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
+  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
   s->con->send_message(m);
   // NOTE: do *not* record osd has up to this epoch (as we do
   // elsewhere) as they may still need to request older values.
@@ -1737,22 +1817,6 @@ bool OSDMonitor::should_propose(double& delay)
     return true;
   }
 
-  // propose as fast as possible if updating up_thru or pg_temp
-  // want to merge OSDMap changes as much as possible
-  if ((pending_inc.new_primary_temp.size() == 1
-      || pending_inc.new_up_thru.size() == 1)
-      && pending_inc.new_state.size() < 2) {
-    dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;
-
-    utime_t now = ceph_clock_now();
-    if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
-       && now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
-      delay = g_conf->paxos_min_wait;
-      last_attempted_minwait_time = now;
-      return true;
-    }
-  }
-
   return PaxosService::should_propose(delay);
 }
 
@@ -1765,21 +1829,26 @@ bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
 {
   op->mark_osdmon_event(__func__);
   MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
+
+  uint64_t features = mon->get_quorum_con_features();
+  if (m->get_session() && m->get_session()->con_features)
+    features = m->get_session()->con_features;
+
   dout(10) << __func__ << " " << *m << dendl;
-  MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
+  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
   epoch_t first = get_first_committed();
   epoch_t last = osdmap.get_epoch();
   int max = g_conf->osd_map_message_max;
   for (epoch_t e = MAX(first, m->get_full_first());
        e <= MIN(last, m->get_full_last()) && max > 0;
        ++e, --max) {
-    int r = get_version_full(e, reply->maps[e]);
+    int r = get_version_full(e, features, reply->maps[e]);
     assert(r >= 0);
   }
   for (epoch_t e = MAX(first, m->get_inc_first());
        e <= MIN(last, m->get_inc_last()) && max > 0;
        ++e, --max) {
-    int r = get_version(e, reply->incremental_maps[e]);
+    int r = get_version(e, features, reply->incremental_maps[e]);
     assert(r >= 0);
   }
   reply->oldest_map = first;
@@ -1870,6 +1939,7 @@ bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
   return false;
 
  didit:
+  mon->no_reply(op);
   return true;
 }
 
@@ -2254,6 +2324,7 @@ void OSDMonitor::process_failures()
           o->mark_event(__func__);
           MOSDFailure *m = o->get_req<MOSDFailure>();
           send_latest(o, m->get_epoch());
+         mon->no_reply(o);
         }
        ls.pop_front();
       }
@@ -3126,25 +3197,25 @@ void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
 }
 
 
-MOSDMap *OSDMonitor::build_latest_full()
+MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
 {
-  MOSDMap *r = new MOSDMap(mon->monmap->fsid);
-  get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
+  MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
+  get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
   r->oldest_map = get_first_committed();
   r->newest_map = osdmap.get_epoch();
   return r;
 }
 
-MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
+MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
 {
-  dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
-  MOSDMap *m = new MOSDMap(mon->monmap->fsid);
+  dout(10) << "build_incremental [" << from << ".." << to << "] with features " << std::hex << features << dendl;
+  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
   m->oldest_map = get_first_committed();
   m->newest_map = osdmap.get_epoch();
 
   for (epoch_t e = to; e >= from && e > 0; e--) {
     bufferlist bl;
-    int err = get_version(e, bl);
+    int err = get_version(e, features, bl);
     if (err == 0) {
       assert(bl.length());
       // if (get_version(e, bl) > 0) {
@@ -3154,7 +3225,7 @@ MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
     } else {
       assert(err == -ENOENT);
       assert(!bl.length());
-      get_version_full(e, bl);
+      get_version_full(e, features, bl);
       if (bl.length() > 0) {
       //else if (get_version("full", e, bl) > 0) {
       dout(20) << "build_incremental   full " << e << " "
@@ -3172,7 +3243,7 @@ void OSDMonitor::send_full(MonOpRequestRef op)
 {
   op->mark_osdmon_event(__func__);
   dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
-  mon->send_reply(op, build_latest_full());
+  mon->send_reply(op, build_latest_full(op->get_session()->con_features));
 }
 
 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
@@ -3205,6 +3276,11 @@ void OSDMonitor::send_incremental(epoch_t first,
   dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->inst << dendl;
 
+  // get feature of the peer
+  // use quorum_con_features, if it's an anonymous connection.
+  uint64_t features = session->con_features ? session->con_features :
+    mon->get_quorum_con_features();
+
   if (first <= session->osd_epoch) {
     dout(10) << __func__ << " " << session->inst << " should already have epoch "
             << session->osd_epoch << dendl;
@@ -3214,14 +3290,14 @@ void OSDMonitor::send_incremental(epoch_t first,
   if (first < get_first_committed()) {
     first = get_first_committed();
     bufferlist bl;
-    int err = get_version_full(first, bl);
+    int err = get_version_full(first, features, bl);
     assert(err == 0);
     assert(bl.length());
 
     dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
 
-    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
+    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
     m->oldest_map = get_first_committed();
     m->newest_map = osdmap.get_epoch();
     m->maps[first] = bl;
@@ -3238,9 +3314,9 @@ void OSDMonitor::send_incremental(epoch_t first,
   }
 
   while (first <= osdmap.get_epoch()) {
-    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
-                      osdmap.get_epoch());
-    MOSDMap *m = build_incremental(first, last);
+    epoch_t last = std::min<epoch_t>(first + g_conf->osd_map_message_max - 1,
+                                    osdmap.get_epoch());
+    MOSDMap *m = build_incremental(first, last, features);
 
     if (req) {
       // send some maps.  it may not be all of them, but it will get them
@@ -3258,26 +3334,98 @@ void OSDMonitor::send_incremental(epoch_t first,
 
 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
 {
-    if (inc_osd_cache.lookup(ver, &bl)) {
-      return 0;
-    }
-    int ret = PaxosService::get_version(ver, bl);
-    if (!ret) {
-      inc_osd_cache.add(ver, bl);
-    }
+  return get_version(ver, mon->get_quorum_con_features(), bl);
+}
+
+void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
+{
+  OSDMap::Incremental inc;
+  bufferlist::iterator q = bl.begin();
+  inc.decode(q);
+  // always encode with subset of osdmap's canonical features
+  uint64_t f = features & inc.encode_features;
+  dout(20) << __func__ << " " << inc.epoch << " with features " << f
+          << dendl;
+  bl.clear();
+  if (inc.fullmap.length()) {
+    // embedded full map?
+    OSDMap m;
+    m.decode(inc.fullmap);
+    inc.fullmap.clear();
+    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
+  }
+  if (inc.crush.length()) {
+    // embedded crush map
+    CrushWrapper c;
+    auto p = inc.crush.begin();
+    c.decode(p);
+    inc.crush.clear();
+    c.encode(inc.crush, f);
+  }
+  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
+{
+  OSDMap m;
+  bufferlist::iterator q = bl.begin();
+  m.decode(q);
+  // always encode with subset of osdmap's canonical features
+  uint64_t f = features & m.get_encoding_features();
+  dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
+          << dendl;
+  bl.clear();
+  m.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
+{
+  uint64_t significant_features = OSDMap::get_significant_features(features);
+  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
+    return 0;
+  }
+  int ret = PaxosService::get_version(ver, bl);
+  if (ret < 0) {
     return ret;
+  }
+  // NOTE: this check is imprecise; the OSDMap encoding features may
+  // be a subset of the latest mon quorum features, but worst case we
+  // reencode once and then cache the (identical) result under both
+  // feature masks.
+  if (significant_features !=
+      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
+    reencode_incremental_map(bl, features);
+  }
+  inc_osd_cache.add({ver, significant_features}, bl);
+  return 0;
 }
 
 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
 {
-    if (full_osd_cache.lookup(ver, &bl)) {
-      return 0;
-    }
-    int ret = PaxosService::get_version_full(ver, bl);
-    if (!ret) {
-      full_osd_cache.add(ver, bl);
-    }
+  return get_version_full(ver, mon->get_quorum_con_features(), bl);
+}
+
+int OSDMonitor::get_version_full(version_t ver, uint64_t features,
+                                bufferlist& bl)
+{
+  uint64_t significant_features = OSDMap::get_significant_features(features);
+  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
+    return 0;
+  }
+  int ret = PaxosService::get_version_full(ver, bl);
+  if (ret < 0) {
     return ret;
+  }
+  // NOTE: this check is imprecise; the OSDMap encoding features may
+  // be a subset of the latest mon quorum features, but worst case we
+  // reencode once and then cache the (identical) result under both
+  // feature masks.
+  if (significant_features !=
+      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
+    reencode_full_map(bl, features);
+  }
+  full_osd_cache.add({ver, significant_features}, bl);
+  return 0;
 }
 
 epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
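
Together with the reencode helpers above, the cache changes mean an epoch is now cached once per significant-feature mask rather than once globally: peers with older feature sets receive a down-converted encoding, and the (identical or re-encoded) blob is cached under its own key. A self-contained sketch of the keying idea, with std::map standing in for the SimpleLRU used in the header change below:

    #include <cstdint>
    #include <map>
    #include <string>
    #include <utility>

    using version_t = uint64_t;
    using osdmap_key_t = std::pair<version_t, uint64_t>;

    int main() {
      std::map<osdmap_key_t, std::string> cache;  // value stands in for bufferlist
      const uint64_t new_mask = 0xff, old_mask = 0x0f;  // hypothetical masks
      cache[{42, new_mask}] = "full-feature encoding of epoch 42";
      cache[{42, old_mask}] = "re-encoded blob for an older peer";
      // Same epoch, two encodings: a lookup never hands an old peer a
      // blob it cannot decode, and never collides with the new one.
      return cache.size() == 2 ? 0 : 1;
    }
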
@@ -3314,7 +3462,7 @@ void OSDMonitor::check_osdmap_sub(Subscription *sub)
     if (sub->next >= 1)
       send_incremental(sub->next, sub->session, sub->incremental_onetime);
     else
-      sub->session->con->send_message(build_latest_full());
+      sub->session->con->send_message(build_latest_full(sub->session->con_features));
     if (sub->onetime)
       mon->session_map.remove_sub(sub);
     else
@@ -4060,7 +4208,7 @@ void OSDMonitor::dump_info(Formatter *f)
 namespace {
   enum osd_pool_get_choices {
     SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
-    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
+    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
     NODELETE, NOPGCHANGE, NOSIZECHANGE,
     WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
     HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
@@ -4661,8 +4809,8 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       {"min_size", MIN_SIZE},
       {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
       {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
-      {"crush_rule", CRUSH_RULE},
-      {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
+      {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
+      {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
       {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
       {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
       {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
@@ -4710,7 +4858,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
     };
     const choices_set_t ONLY_ERASURE_CHOICES = {
-      ERASURE_CODE_PROFILE
+      EC_OVERWRITES, ERASURE_CODE_PROFILE
     };
 
     choices_set_t selected_choices;
@@ -4801,6 +4949,10 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
              f->dump_string("crush_rule", stringify(p->get_crush_rule()));
            }
            break;
+         case EC_OVERWRITES:
+           f->dump_bool("allow_ec_overwrites",
+                         p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
+           break;
          case HASHPSPOOL:
          case NODELETE:
          case NOPGCHANGE:
@@ -5018,6 +5170,11 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
            ss << "hit_set_search_last_n: " <<
              p->hit_set_search_last_n << "\n";
            break;
+         case EC_OVERWRITES:
+           ss << "allow_ec_overwrites: " <<
+             (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
+             "\n";
+           break;
          case HASHPSPOOL:
          case NODELETE:
          case NOPGCHANGE:
@@ -11480,11 +11637,61 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   return true;
 }
 
-bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op) 
+bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
 {
   op->mark_osdmon_event(__func__);
+
   MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
-  
+  MonSession *session = m->get_session();
+  if (!session) {
+    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+    return true;
+  }
+
+  switch (m->op) {
+  case POOL_OP_CREATE_UNMANAGED_SNAP:
+  case POOL_OP_DELETE_UNMANAGED_SNAP:
+    {
+      const std::string* pool_name = nullptr;
+      const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
+      if (pg_pool != nullptr) {
+        pool_name = &osdmap.get_pool_name(m->pool);
+      }
+
+      if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
+                                          session->entity_name, session->caps,
+                                          pool_name)) {
+        dout(0) << "got unmanaged-snap pool op from entity with insufficient "
+                << "privileges. message: " << *m  << std::endl
+                << "caps: " << session->caps << dendl;
+        _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+        return true;
+      }
+    }
+    break;
+  default:
+    if (!session->is_capable("osd", MON_CAP_W)) {
+      dout(0) << "got pool op from entity with insufficient privileges. "
+              << "message: " << *m  << std::endl
+              << "caps: " << session->caps << dendl;
+      _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+      return true;
+    }
+    break;
+  }
+
+  return false;
+}
+
+bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
+
+  if (enforce_pool_op_caps(op)) {
+    return true;
+  }
+
   if (m->fsid != mon->monmap->fsid) {
     dout(0) << __func__ << " drop message on fsid " << m->fsid
             << " != " << mon->monmap->fsid << " for " << *m << dendl;
@@ -11564,19 +11771,6 @@ bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
 {
   op->mark_osdmon_event(__func__);
   MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
-  MonSession *session = m->get_session();
-  if (!session) {
-    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
-    return true;
-  }
-  if (!session->is_capable("osd", MON_CAP_W)) {
-    dout(5) << "attempt to create new pool without sufficient auid privileges!"
-           << "message: " << *m  << std::endl
-           << "caps: " << session->caps << dendl;
-    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
-    return true;
-  }
-
   int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
   if (pool >= 0) {
     _pool_op_reply(op, 0, osdmap.get_epoch());
@@ -11703,6 +11897,10 @@ bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
 
   case POOL_OP_DELETE_UNMANAGED_SNAP:
     if (!pp.is_removed_snap(m->snapid)) {
+      if (m->snapid > pp.get_snap_seq()) {
+        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
+        return false;
+      }
       pp.remove_unmanaged_snap(m->snapid);
       changed = true;
     }
@@ -11744,7 +11942,7 @@ int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
   const string& poolstr = osdmap.get_pool_name(pool_id);
 
   // If the Pool is in use by CephFS, refuse to delete it
-  FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
+  FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
   if (pending_fsmap.pool_in_use(pool_id)) {
     *ss << "pool '" << poolstr << "' is in use by CephFS";
     return -EBUSY;
@@ -11793,7 +11991,7 @@ bool OSDMonitor::_check_become_tier(
   const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
   const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
 
-  const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
+  const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
   if (pending_fsmap.pool_in_use(tier_pool_id)) {
     *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
     *err = -EBUSY;
@@ -11853,7 +12051,7 @@ bool OSDMonitor::_check_remove_tier(
   const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
 
   // Apply CephFS-specific checks
-  const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
+  const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
   if (pending_fsmap.pool_in_use(base_pool_id)) {
     if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
       // If the underlying pool is erasure coded and does not allow EC
index c3db2332fd312018e2790abab7fc43fcd03cdf72..076e301f2c7b11103ab175437ef8ebd69a698ce6 100644 (file)
@@ -42,6 +42,9 @@ class MOSDMap;
 
 #include "erasure-code/ErasureCodeInterface.h"
 #include "mon/MonOpRequest.h"
+#include <boost/functional/hash.hpp>
+// re-include our assert to clobber the system one; fix dout:
+#include "include/assert.h"
 
 /// information about a particular peer's failure reports for one osd
 struct failure_reporter_t {
@@ -140,16 +143,18 @@ public:
 
   map<int,double> osd_weight;
 
-  SimpleLRU<version_t, bufferlist> inc_osd_cache;
-  SimpleLRU<version_t, bufferlist> full_osd_cache;
+  using osdmap_key_t = std::pair<version_t, uint64_t>;
+  using osdmap_cache_t = SimpleLRU<osdmap_key_t,
+                                   bufferlist,
+                                   std::less<osdmap_key_t>,
+                                   boost::hash<osdmap_key_t>>;
+  osdmap_cache_t inc_osd_cache;
+  osdmap_cache_t full_osd_cache;
 
   bool check_failures(utime_t now);
   bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
   void force_failure(int target_osd, int by);
 
-  // the time of last msg(MSG_ALIVE and MSG_PGTEMP) proposed without delay
-  utime_t last_attempted_minwait_time;
-
   bool _have_pending_crush();
   CrushWrapper &_get_stable_crush();
   void _get_pending_crush(CrushWrapper& newcrush);
@@ -244,8 +249,8 @@ private:
   bool can_mark_in(int o);
 
   // ...
-  MOSDMap *build_latest_full();
-  MOSDMap *build_incremental(epoch_t first, epoch_t last);
+  MOSDMap *build_latest_full(uint64_t features);
+  MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
   void send_full(MonOpRequestRef op);
   void send_incremental(MonOpRequestRef op, epoch_t first);
 public:
@@ -300,6 +305,7 @@ private:
   int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
   int _prepare_rename_pool(int64_t pool, string newname);
 
+  bool enforce_pool_op_caps(MonOpRequestRef op);
   bool preprocess_pool_op (MonOpRequestRef op);
   bool preprocess_pool_op_create (MonOpRequestRef op);
   bool prepare_pool_op (MonOpRequestRef op);
@@ -428,6 +434,9 @@ private:
 
   int load_metadata(int osd, map<string, string>& m, ostream *err);
   void count_metadata(const string& field, Formatter *f);
+
+  void reencode_incremental_map(bufferlist& bl, uint64_t features);
+  void reencode_full_map(bufferlist& bl, uint64_t features);
 public:
   void count_metadata(const string& field, map<string,int> *out);
 protected:
@@ -534,6 +543,9 @@ public:
   }
 
   int get_version(version_t ver, bufferlist& bl) override;
+  int get_version(version_t ver, uint64_t feature, bufferlist& bl);
+
+  int get_version_full(version_t ver, uint64_t feature, bufferlist& bl);
   int get_version_full(version_t ver, bufferlist& bl) override;
 
   epoch_t blacklist(const entity_addr_t& a, utime_t until);
index 504adb53cd9420c09b98355b2a6c96cd4f882fb2..909e3a8163085cc45ce3069c197b91ca6e570257 100644 (file)
@@ -2883,6 +2883,39 @@ void PGMap::get_health_checks(
     checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
   }
 
+  // LARGE_OMAP_OBJECTS
+  if (pg_sum.stats.sum.num_large_omap_objects) {
+    list<string> detail;
+    for (auto &pool : pools) {
+      const string& pool_name = osdmap.get_pool_name(pool.first);
+      auto it2 = pg_pool_sum.find(pool.first);
+      if (it2 == pg_pool_sum.end()) {
+        continue;
+      }
+      const pool_stat_t *pstat = &it2->second;
+      if (pstat == nullptr) {
+        continue;
+      }
+      const object_stat_sum_t& sum = pstat->stats.sum;
+      if (sum.num_large_omap_objects) {
+        stringstream ss;
+        ss << sum.num_large_omap_objects << " large objects found in pool "
+           << "'" << pool_name << "'";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
+      auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
+      stringstream tip;
+      tip << "Search the cluster log for 'Large omap object found' for more "
+          << "details.";
+      detail.push_back(tip.str());
+      d.detail.swap(detail);
+    }
+  }
+
   // CACHE_POOL_NEAR_FULL
   {
     list<string> detail;
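
With num_large_omap_objects wired into the health checks, a cluster that has flagged oversized omap objects surfaces a warning built from the strings above; it should read roughly like this (count and pool name hypothetical):

    HEALTH_WARN 3 large omap objects
    LARGE_OMAP_OBJECTS 3 large omap objects
        3 large objects found in pool 'default.rgw.buckets.index'
        Search the cluster log for 'Large omap object found' for more details.
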
@@ -3185,14 +3218,11 @@ void PGMap::get_health_checks(
     }
 
     if (!warn_detail.empty()) {
-      ostringstream ss;
-      ss << warn << " slow requests are blocked > "
-        << cct->_conf->mon_osd_warn_op_age << " sec";
-      auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
-      d.detail.swap(warn_detail);
       int left = max;
+      set<int> implicated_osds;
       for (auto& p : warn_osd_by_max) {
        ostringstream ss;
+        implicated_osds.insert(p.second.begin(), p.second.end());
        if (p.second.size() > 1) {
          ss << "osds " << p.second
              << " have blocked requests > " << p.first << " sec";
@@ -3200,21 +3230,24 @@ void PGMap::get_health_checks(
          ss << "osd." << *p.second.begin()
              << " has blocked requests > " << p.first << " sec";
        }
-       d.detail.push_back(ss.str());
+       warn_detail.push_back(ss.str());
        if (--left == 0) {
          break;
        }
       }
+      ostringstream ss;
+      ss << warn << " slow requests are blocked > "
+        << cct->_conf->mon_osd_warn_op_age << " sec. Implicated osds "
+         << implicated_osds;
+      auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
+      d.detail.swap(warn_detail);
     }
     if (!error_detail.empty()) {
-      ostringstream ss;
-      ss << error << " stuck requests are blocked > "
-        << err_age << " sec";
-      auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
-      d.detail.swap(error_detail);
       int left = max;
+      set<int> implicated_osds;
       for (auto& p : error_osd_by_max) {
        ostringstream ss;
+        implicated_osds.insert(p.second.begin(), p.second.end());
        if (p.second.size() > 1) {
          ss << "osds " << p.second
              << " have stuck requests > " << p.first << " sec";
@@ -3222,11 +3255,16 @@ void PGMap::get_health_checks(
          ss << "osd." << *p.second.begin()
              << " has stuck requests > " << p.first << " sec";
        }
-       d.detail.push_back(ss.str());
+       error_detail.push_back(ss.str());
        if (--left == 0) {
          break;
        }
       }
+      ostringstream ss;
+      ss << error << " stuck requests are blocked > "
+        << err_age << " sec. Implicated osds " << implicated_osds;
+      auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
+      d.detail.swap(error_detail);
     }
   }
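
The REQUEST_SLOW and REQUEST_STUCK summaries now name the implicated OSDs directly, so the top-level health line identifies the culprits without expanding the detail entries: given the format strings above, a summary reads roughly "5 slow requests are blocked > 32 sec. Implicated osds 3,11", and the per-OSD detail lines that used to be collected after the summary are now gathered first so the implicated set can be folded into it.
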
 
index 896aafa07eaf1a714c8adac57ae2300399278aa8..ce0d1b75781b034d9e4bddd2a9cf71320770781c 100644 (file)
@@ -1240,14 +1240,6 @@ public:
    * @return the first committed version
    */
   version_t get_first_committed() { return first_committed; }
-  /** 
-   * Get the last commit time
-   *
-   * @returns Our last commit time
-  */
-  utime_t get_last_commit_time() const{
-    return last_commit_time;
-  }
   /**
    * Check if a given version is readable.
    *
diff --git a/ceph/src/mon/PaxosFSMap.h b/ceph/src/mon/PaxosFSMap.h
new file mode 100644 (file)
index 0000000..8d7c8c1
--- /dev/null
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAXOS_FSMAP_H
+#define CEPH_PAXOS_FSMAP_H
+
+#include "mds/FSMap.h"
+#include "mds/MDSMap.h"
+
+#include "include/assert.h"
+
+class PaxosFSMap {
+public:
+  virtual ~PaxosFSMap() {}
+
+  const FSMap &get_pending_fsmap() const { assert(is_leader()); return pending_fsmap; }
+  const FSMap &get_fsmap() const { return fsmap; }
+
+  virtual bool is_leader() const = 0;
+
+protected:
+  FSMap &get_pending_fsmap_writeable() { assert(is_leader()); return pending_fsmap; }
+
+  /* get_working_fsmap returns the "relevant" version of the fsmap (see MDSMonitor.cc history)
+   * used by various helper methods of MDSMonitor.cc.
+   *
+   * This is technically evil and will be removed in the future.
+   *
+   * See discussion: https://github.com/ceph/ceph/pull/21458#discussion_r182081366
+   */
+  const FSMap &get_working_fsmap() const { return is_leader() ? pending_fsmap : fsmap; }
+
+  FSMap &create_pending() {
+    assert(is_leader());
+    pending_fsmap = fsmap;
+    pending_fsmap.epoch++;
+    return pending_fsmap;
+  }
+
+  void decode(bufferlist &bl) {
+    fsmap.decode(bl);
+    pending_fsmap = FSMap(); /* nuke it to catch invalid access */
+  }
+
+private:
+  /* Keep these PRIVATE to prevent unprotected manipulation. */
+  FSMap fsmap; /* the current epoch */
+  FSMap pending_fsmap; /* the next epoch */
+};
+
+
+#endif
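
The whole MDSMonitor refactor in this commit hangs off this small header: the committed and pending FSMaps become private, and every access to the pending map goes through an accessor that asserts leadership. A self-contained sketch of the pattern (stand-in types, not the real Ceph classes):

    #include <cassert>

    struct MapLike { int epoch = 0; };   // stand-in for FSMap

    class PaxosMapLike {
    public:
      virtual ~PaxosMapLike() = default;
      virtual bool is_leader() const = 0;
      const MapLike &get_map() const { return map; }
      const MapLike &get_pending() const { assert(is_leader()); return pending; }
    protected:
      MapLike &get_pending_writeable() { assert(is_leader()); return pending; }
      MapLike &create_pending() {
        assert(is_leader());
        pending = map;        // start from the committed map
        pending.epoch++;      // stage the next epoch
        return pending;
      }
      void decode_committed(const MapLike &m) {
        map = m;
        pending = MapLike();  // nuke pending to catch invalid access
      }
    private:
      MapLike map;            // committed epoch
      MapLike pending;        // committed + staged updates, leader only
    };
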
index 5af27ea17f389ed62ac7e0d2959c16dd0f1212e3..64adefbd3f28cbb695a5dbdc4d7fc754ffe54ebc 100644 (file)
@@ -17,6 +17,7 @@
 #define CEPH_DISPATCHER_H
 
 #include "include/assert.h"
+#include <memory>
 #include "include/buffer_fwd.h"
 #include "include/assert.h"
 
@@ -26,6 +27,7 @@ class Connection;
 class AuthAuthorizer;
 class CryptoKey;
 class CephContext;
+class AuthAuthorizerChallenge;
 
 class Dispatcher {
 public:
@@ -204,7 +206,10 @@ public:
                                    ceph::bufferlist& authorizer,
                                    ceph::bufferlist& authorizer_reply,
                                    bool& isvalid,
-                                   CryptoKey& session_key) { return false; }
+                                   CryptoKey& session_key,
+                                   std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
+    return false;
+  }
   /**
    * @} //Authentication
    */
index c6dbcc17694d4e4ea594b4612313f753885bda55..5975c583d3a4ca5c6112fe173f71116dc9bd298f 100644 (file)
@@ -806,11 +806,13 @@ public:
    */
   bool ms_deliver_verify_authorizer(Connection *con, int peer_type,
                                    int protocol, bufferlist& authorizer, bufferlist& authorizer_reply,
-                                   bool& isvalid, CryptoKey& session_key) {
+                                   bool& isvalid, CryptoKey& session_key,
+                                   std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
     for (list<Dispatcher*>::iterator p = dispatchers.begin();
         p != dispatchers.end();
         ++p) {
-      if ((*p)->ms_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply, isvalid, session_key))
+      if ((*p)->ms_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply,
+                                    isvalid, session_key, challenge))
        return true;
     }
     return false;
index f14be27412b3e612350652b70154b8f194b78ef6..80231cccf4b9b3a1f8e832eb8af3538f02b753a5 100644 (file)
@@ -986,7 +986,8 @@ ssize_t AsyncConnection::_process_connection()
         ldout(async_msgr->cct, 20) << __func__ << " connect peer addr for me is " << peer_addr_for_me << dendl;
         lock.unlock();
         async_msgr->learned_addr(peer_addr_for_me);
-        if (async_msgr->cct->_conf->ms_inject_internal_delays) {
+        if (async_msgr->cct->_conf->ms_inject_internal_delays
+            && async_msgr->cct->_conf->ms_inject_socket_failures) {
           if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
             ldout(msgr->cct, 10) << __func__ << " sleep for "
                                  << async_msgr->cct->_conf->ms_inject_internal_delays << dendl;
@@ -1025,8 +1026,7 @@ ssize_t AsyncConnection::_process_connection()
 
     case STATE_CONNECTING_SEND_CONNECT_MSG:
       {
-        if (!got_bad_auth) {
-          delete authorizer;
+        if (!authorizer) {
           authorizer = async_msgr->get_authorizer(peer_type, false);
         }
         bufferlist bl;
@@ -1106,7 +1106,15 @@ ssize_t AsyncConnection::_process_connection()
           }
 
           authorizer_reply.append(state_buffer, connect_reply.authorizer_len);
-          bufferlist::iterator iter = authorizer_reply.begin();
+
+         if (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+           ldout(async_msgr->cct,10) << __func__ << " connect got auth challenge" << dendl;
+           authorizer->add_challenge(async_msgr->cct, authorizer_reply);
+           state = STATE_CONNECTING_SEND_CONNECT_MSG;
+           break;
+         }
+
+          auto iter = authorizer_reply.begin();
           if (authorizer && !authorizer->verify_reply(iter)) {
             ldout(async_msgr->cct, 0) << __func__ << " failed verifying authorize reply" << dendl;
             goto fail;
@@ -1484,20 +1492,32 @@ ssize_t AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlis
   // require signatures for cephx?
   if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) {
     if (peer_type == CEPH_ENTITY_TYPE_OSD ||
-        peer_type == CEPH_ENTITY_TYPE_MDS) {
+        peer_type == CEPH_ENTITY_TYPE_MDS ||
+       peer_type == CEPH_ENTITY_TYPE_MGR) {
       if (async_msgr->cct->_conf->cephx_require_signatures ||
           async_msgr->cct->_conf->cephx_cluster_require_signatures) {
         ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for cluster" << dendl;
         policy.features_required |= CEPH_FEATURE_MSG_AUTH;
       }
+      if (async_msgr->cct->_conf->cephx_require_version >= 2 ||
+         async_msgr->cct->_conf->cephx_cluster_require_version >= 2) {
+        ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring cephx v2 feature bit for cluster" << dendl;
+        policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
     } else {
       if (async_msgr->cct->_conf->cephx_require_signatures ||
           async_msgr->cct->_conf->cephx_service_require_signatures) {
         ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for service" << dendl;
         policy.features_required |= CEPH_FEATURE_MSG_AUTH;
       }
+      if (async_msgr->cct->_conf->cephx_require_version >= 2 ||
+         async_msgr->cct->_conf->cephx_service_require_version >= 2) {
+        ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring cephx v2 feature bit for service" << dendl;
+        policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
     }
   }
+
   uint64_t feat_missing = policy.features_required & ~(uint64_t)connect.features;
   if (feat_missing) {
     ldout(async_msgr->cct, 1) << __func__ << " peer missing required features "
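
Once every daemon in a cluster supports cephx v2, an operator can require it via the cephx_require_version family of options referenced in the hunk above, for example (values hypothetical for a fully upgraded cluster; spaces and underscores are interchangeable in ceph.conf):

    [global]
            cephx require version = 2
            cephx cluster require version = 2
            cephx service require version = 2
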
@@ -1508,12 +1528,26 @@ ssize_t AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlis
   lock.unlock();
 
   bool authorizer_valid;
-  if (!async_msgr->verify_authorizer(this, peer_type, connect.authorizer_protocol, authorizer_bl,
-                               authorizer_reply, authorizer_valid, session_key) || !authorizer_valid) {
+  bool need_challenge = HAVE_FEATURE(connect.features, CEPHX_V2);
+  bool had_challenge = (bool)authorizer_challenge;
+  if (!async_msgr->verify_authorizer(
+       this, peer_type, connect.authorizer_protocol, authorizer_bl,
+       authorizer_reply, authorizer_valid, session_key,
+       need_challenge ? &authorizer_challenge : nullptr) ||
+      !authorizer_valid) {
     lock.lock();
-    ldout(async_msgr->cct,0) << __func__ << ": got bad authorizer" << dendl;
+    char tag;
+    if (need_challenge && !had_challenge && authorizer_challenge) {
+      ldout(async_msgr->cct,0) << __func__ << ": challenging authorizer"
+                              << dendl;
+      assert(authorizer_reply.length());
+      tag = CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER;
+    } else {
+      ldout(async_msgr->cct,0) << __func__ << ": got bad authorizer" << dendl;
+      tag = CEPH_MSGR_TAG_BADAUTHORIZER;
+    }
     session_security.reset();
-    return _reply_accept(CEPH_MSGR_TAG_BADAUTHORIZER, connect, reply, authorizer_reply);
+    return _reply_accept(tag, connect, reply, authorizer_reply);
   }
 
   // We've verified the authorizer for this AsyncConnection, so set up the session security structure.  PLR
@@ -1707,6 +1741,8 @@ ssize_t AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlis
     // there shouldn't exist any buffer
     assert(recv_start == recv_end);
 
+    existing->authorizer_challenge.reset();
+
     auto deactivate_existing = std::bind(
         [existing, new_worker, new_center, connect, reply, authorizer_reply](ConnectedSocket &cs) mutable {
       // we need to delete time event in original thread
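
Putting the AsyncConnection changes together: on accept, a v2-capable peer that fails verification the first time is sent CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER instead of a rejection; the connect side (handled in the STATE_CONNECTING_SEND_CONNECT_MSG hunk above) folds the challenge into a fresh authorizer and retries, and only a second failure produces CEPH_MSGR_TAG_BADAUTHORIZER. A self-contained sketch of the accept-side tag decision, mirroring handle_connect_msg above:

    enum class Tag { Ready, ChallengeAuthorizer, BadAuthorizer };

    // Challenge exactly once, and only peers advertising CEPHX_V2.
    Tag accept_tag(bool verified, bool need_challenge, bool had_challenge,
                   bool challenge_issued) {
      if (verified)
        return Tag::Ready;
      if (need_challenge && !had_challenge && challenge_issued)
        return Tag::ChallengeAuthorizer;  // ask the peer to prove freshness
      return Tag::BadAuthorizer;
    }
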
index ab2ff2c4ab05054a8a2b579d0eed26c33a97f9ad..64a7502403bb12f9a1471ff9bd3b7f24f3b375be 100644 (file)
@@ -371,6 +371,7 @@ class AsyncConnection : public Connection {
   Worker *worker;
   EventCenter *center;
   ceph::shared_ptr<AuthSessionHandler> session_security;
+  std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge; // accept side
 
  public:
   // used by eventcallback
index 7ebc7777c93e611f1ca49efa78f5edc4654a0b61..6659cf29004f60b89b67ce9dbc775043ad1f9a61 100644 (file)
@@ -384,9 +384,10 @@ public:
    * This wraps ms_deliver_verify_authorizer; we use it for AsyncConnection.
    */
   bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth, bufferlist& auth_reply,
-                         bool& isvalid, CryptoKey& session_key) {
+                         bool& isvalid, CryptoKey& session_key,
+                        std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
     return ms_deliver_verify_authorizer(con, peer_type, protocol, auth,
-                                        auth_reply, isvalid, session_key);
+                                        auth_reply, isvalid, session_key, challenge);
   }
   /**
    * Increment the global sequence for this AsyncMessenger and return it.
index 848efd45c0090f2381a930d11a6aae1014ed6572..2c1415ebc694a23d741ec959f4f912008003da6a 100644 (file)
@@ -354,6 +354,10 @@ int Pipe::accept()
   // used for reading in the remote acked seq on connect
   uint64_t newly_acked_seq = 0;
 
+  bool need_challenge = false;
+  bool had_challenge = false;
+  std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge;
+
   recv_reset();
 
   set_socket_options();
@@ -477,18 +481,29 @@ int Pipe::accept()
     // require signatures for cephx?
     if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) {
       if (peer_type == CEPH_ENTITY_TYPE_OSD ||
-         peer_type == CEPH_ENTITY_TYPE_MDS) {
+         peer_type == CEPH_ENTITY_TYPE_MDS ||
+         peer_type == CEPH_ENTITY_TYPE_MGR) {
        if (msgr->cct->_conf->cephx_require_signatures ||
            msgr->cct->_conf->cephx_cluster_require_signatures) {
          ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for cluster" << dendl;
          policy.features_required |= CEPH_FEATURE_MSG_AUTH;
        }
+       if (msgr->cct->_conf->cephx_require_version >= 2 ||
+           msgr->cct->_conf->cephx_cluster_require_version >= 2) {
+         ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for cluster" << dendl;
+         policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+       }
       } else {
        if (msgr->cct->_conf->cephx_require_signatures ||
            msgr->cct->_conf->cephx_service_require_signatures) {
          ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for service" << dendl;
          policy.features_required |= CEPH_FEATURE_MSG_AUTH;
        }
+       if (msgr->cct->_conf->cephx_require_version >= 2 ||
+           msgr->cct->_conf->cephx_service_require_version >= 2) {
+         ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for service" << dendl;
+         policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+       }
       }
     }
 
@@ -503,14 +518,27 @@ int Pipe::accept()
 
     pipe_lock.Unlock();
 
-    if (!msgr->verify_authorizer(connection_state.get(), peer_type, connect.authorizer_protocol, authorizer,
-                                authorizer_reply, authorizer_valid, session_key) ||
+    need_challenge = HAVE_FEATURE(connect.features, CEPHX_V2);
+    had_challenge = (bool)authorizer_challenge;
+    authorizer_reply.clear();
+    if (!msgr->verify_authorizer(
+         connection_state.get(), peer_type, connect.authorizer_protocol, authorizer,
+         authorizer_reply, authorizer_valid, session_key,
+         need_challenge ? &authorizer_challenge : nullptr) ||
        !authorizer_valid) {
-      ldout(msgr->cct,0) << "accept: got bad authorizer" << dendl;
       pipe_lock.Lock();
       if (state != STATE_ACCEPTING)
        goto shutting_down_msgr_unlocked;
-      reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER;
+      if (!had_challenge && need_challenge && authorizer_challenge) {
+       ldout(msgr->cct,0) << "accept: challenging authorizer "
+                          << authorizer_reply.length()
+                          << " bytes" << dendl;
+       assert(authorizer_reply.length());
+       reply.tag = CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER;
+      } else {
+       ldout(msgr->cct,0) << "accept: got bad authorizer" << dendl;
+       reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER;
+      }
       session_security.reset();
       goto reply;
     } 
@@ -1116,8 +1144,9 @@ int Pipe::connect()
 
 
   while (1) {
-    delete authorizer;
-    authorizer = msgr->get_authorizer(peer_type, false);
+    if (!authorizer) {
+      authorizer = msgr->get_authorizer(peer_type, false);
+    }
     bufferlist authorizer_reply;
 
     ceph_msg_connect connect;
@@ -1184,6 +1213,13 @@ int Pipe::connect()
       authorizer_reply.push_back(bp);
     }
 
+    if (reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+      authorizer->add_challenge(msgr->cct, authorizer_reply);
+      ldout(msgr->cct,10) << " got authorizer challenge, " << authorizer_reply.length()
+                         << " bytes" << dendl;
+      continue;
+    }
+
     if (authorizer) {
       bufferlist::iterator iter = authorizer_reply.begin();
       if (!authorizer->verify_reply(iter)) {
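
On the connect side, the two changes above cooperate: the authorizer is no longer rebuilt on every pass through the loop, and a CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER reply folds the server's challenge into that same authorizer before retrying. A condensed sketch of the loop shape, with hypothetical get_authorizer()/send_connect() helpers standing in for the Pipe machinery:

    #include <memory>
    #include <string>

    struct Authorizer {
      std::string challenge;                      // server nonce, once received
      void add_challenge(const std::string &c) { challenge = c; }
    };

    enum Tag { TAG_READY, TAG_BADAUTHORIZER, TAG_CHALLENGE_AUTHORIZER };
    struct Reply { Tag tag; std::string payload; };

    static std::unique_ptr<Authorizer> get_authorizer() {
      return std::unique_ptr<Authorizer>(new Authorizer);
    }

    // Hypothetical transport call: the first attempt is challenged, the
    // retry (which now carries the challenge) is accepted.
    static Reply send_connect(const Authorizer &a) {
      if (a.challenge.empty())
        return {TAG_CHALLENGE_AUTHORIZER, "nonce"};
      return {TAG_READY, ""};
    }

    static bool do_connect() {
      std::unique_ptr<Authorizer> authorizer;
      while (true) {
        if (!authorizer)                 // keep it across iterations: the
          authorizer = get_authorizer(); // absorbed challenge must survive
        Reply reply = send_connect(*authorizer);
        if (reply.tag == TAG_CHALLENGE_AUTHORIZER) {
          authorizer->add_challenge(reply.payload);  // re-sign with the nonce
          continue;
        }
        return reply.tag == TAG_READY;
      }
    }

    int main() { return do_connect() ? 0 : 1; }
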
index 78e190d027e1c65c34792face5597938a285a41d..d75804b531787b12e382961f24c0c2629e8aaf8b 100644 (file)
@@ -415,9 +415,12 @@ AuthAuthorizer *SimpleMessenger::get_authorizer(int peer_type, bool force_new)
 
 bool SimpleMessenger::verify_authorizer(Connection *con, int peer_type,
                                        int protocol, bufferlist& authorizer, bufferlist& authorizer_reply,
-                                       bool& isvalid,CryptoKey& session_key)
+                                       bool& isvalid,CryptoKey& session_key,
+                                       std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
-  return ms_deliver_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply, isvalid,session_key);
+  return ms_deliver_verify_authorizer(con, peer_type, protocol, authorizer, authorizer_reply,
+                                     isvalid, session_key,
+                                     challenge);
 }
 
 ConnectionRef SimpleMessenger::get_connection(const entity_inst_t& dest)
index 0a0512382eb3c337e2b8fb473fed3a6bde712466..aebc190e14e38fc8ffe3f6a495dbee1dae3e9eb6 100644 (file)
@@ -346,8 +346,10 @@ public:
   /**
    * This wraps ms_deliver_verify_authorizer; we use it for Pipe.
    */
-  bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth, bufferlist& auth_reply,
-                         bool& isvalid,CryptoKey& session_key);
+  bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth,
+                        bufferlist& auth_reply,
+                         bool& isvalid,CryptoKey& session_key,
+                        std::unique_ptr<AuthAuthorizerChallenge> *challenge);
   /**
    * Increment the global sequence for this SimpleMessenger and return it.
    * This is for the connect protocol, although it doesn't hurt if somebody
index 0efd5ad15a1754c02f347928145550a3813dcae8..12879511047dd737536c45e705fd3126dfdfc916 100644 (file)
@@ -2049,6 +2049,9 @@ public:
   virtual void inject_mdata_error(const ghobject_t &oid) {}
 
   virtual void compact() {}
+  virtual bool has_builtin_csum() const {
+    return false;
+  }
 };
 WRITE_CLASS_ENCODER(ObjectStore::Transaction)
 WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData)
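
has_builtin_csum() is added as a virtual on the ObjectStore base with a conservative false default; BlueStore (further down in this commit) overrides it to true, while FileStore keeps false. Callers can then ask the backend whether data is already checksummed instead of hard-coding store types. The pattern in isolation, with stand-in class names:

    #include <iostream>

    struct Store {                                 // stand-in for ObjectStore
      virtual ~Store() = default;
      virtual bool has_builtin_csum() const { return false; }  // safe default
    };

    struct ChecksummingStore : Store {             // stand-in for BlueStore
      bool has_builtin_csum() const override { return true; }
    };

    int main() {
      ChecksummingStore bs;
      const Store &store = bs;
      std::cout << (store.has_builtin_csum()
                        ? "backend verifies data checksums itself\n"
                        : "caller must hash data during deep scrub\n");
    }
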
index efbfd7f33a817ea961e4ff57223bb071500768ce..6fdd11b170b2403af505c2cf68717d686314893f 100644 (file)
@@ -1667,16 +1667,9 @@ void BlueStore::SharedBlob::put()
                             << " removing self from set " << get_parent()
                             << dendl;
     if (get_parent()) {
-      if (get_parent()->try_remove(this)) {
-       delete this;
-      } else {
-       ldout(coll->store->cct, 20)
-         << __func__ << " " << this << " lost race to remove myself from set"
-         << dendl;
-      }
-    } else {
-      delete this;
+      get_parent()->remove(this);
     }
+    delete this;
   }
 }
 
@@ -4564,9 +4557,7 @@ int BlueStore::_open_db(bool create)
     string bfn;
     struct stat st;
 
-    if (read_meta("path_block.db", &bfn) < 0) {
-      bfn = path + "/block.db";
-    }
+    bfn = path + "/block.db";
     if (::stat(bfn.c_str(), &st) == 0) {
       r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
       if (r < 0) {
@@ -4595,19 +4586,20 @@ int BlueStore::_open_db(bool create)
       }
       bluefs_shared_bdev = BlueFS::BDEV_SLOW;
       bluefs_single_shared_device = false;
-    } else if (::lstat(bfn.c_str(), &st) == -1) {
-      bluefs_shared_bdev = BlueFS::BDEV_DB;
     } else {
-      //symlink exist is bug
-      derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
       r = -errno;
-      goto free_bluefs;
+      if (::lstat(bfn.c_str(), &st) == -1) {
+       r = 0;
+       bluefs_shared_bdev = BlueFS::BDEV_DB;
+      } else {
+       derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+            << cpp_strerror(r) << dendl;
+       goto free_bluefs;
+      }
     }
 
     // shared device
-    if (read_meta("path_block", &bfn) < 0) {
-      bfn = path + "/block";
-    }
+    bfn = path + "/block";
     r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
     if (r < 0) {
       derr << __func__ << " add block device(" << bfn << ") returned: " 
@@ -4636,9 +4628,7 @@ int BlueStore::_open_db(bool create)
       bluefs_extents.insert(start, initial);
     }
 
-    if (read_meta("path_block.wal", &bfn) < 0) {
-      bfn = path + "/block.wal";
-    }
+    bfn = path + "/block.wal";
     if (::stat(bfn.c_str(), &st) == 0) {
       r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
       if (r < 0) {
@@ -4667,13 +4657,16 @@ int BlueStore::_open_db(bool create)
       }
       cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
       bluefs_single_shared_device = false;
-    } else if (::lstat(bfn.c_str(), &st) == -1) {
-      cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
     } else {
-      //symlink exist is bug
-      derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
       r = -errno;
-      goto free_bluefs;
+      if (::lstat(bfn.c_str(), &st) == -1) {
+       r = 0;
+       cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
+      } else {
+       derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+            << cpp_strerror(r) << dendl;
+       goto free_bluefs;
+      }
     }
 
     if (create) {
@@ -5011,6 +5004,7 @@ void BlueStore::_commit_bluefs_freespace(
 
 int BlueStore::_open_collections(int *errors)
 {
+  dout(10) << __func__ << dendl;
   assert(coll_map.empty());
   KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
   for (it->upper_bound(string());
@@ -5032,7 +5026,8 @@ int BlueStore::_open_collections(int *errors)
              << pretty_binary_string(it->key()) << dendl;
         return -EIO;
       }   
-      dout(20) << __func__ << " opened " << cid << " " << c << dendl;
+      dout(20) << __func__ << " opened " << cid << " " << c
+              << " " << c->cnode << dendl;
       coll_map[cid] = c;
     } else {
       derr << __func__ << " unrecognized collection " << it->key() << dendl;
@@ -5116,30 +5111,13 @@ int BlueStore::_setup_block_symlink_or_file(
        }
 
        if (cct->_conf->bluestore_block_preallocate_file) {
-#ifdef HAVE_POSIX_FALLOCATE
-         r = ::posix_fallocate(fd, 0, size);
-         if (r) {
+          r = ::ceph_posix_fallocate(fd, 0, size);
+          if (r > 0) {
            derr << __func__ << " failed to prefallocate " << name << " file to "
              << size << ": " << cpp_strerror(r) << dendl;
            VOID_TEMP_FAILURE_RETRY(::close(fd));
            return -r;
          }
-#else
-         char data[1024*128];
-         for (uint64_t off = 0; off < size; off += sizeof(data)) {
-           if (off + sizeof(data) > size)
-             r = ::write(fd, data, size - off);
-           else
-             r = ::write(fd, data, sizeof(data));
-           if (r < 0) {
-             r = -errno;
-             derr << __func__ << " failed to prefallocate w/ write " << name << " file to "
-               << size << ": " << cpp_strerror(r) << dendl;
-             VOID_TEMP_FAILURE_RETRY(::close(fd));
-             return r;
-           }
-         }
-#endif
        }
        dout(1) << __func__ << " resized " << name << " file to "
                << pretty_si_t(size) << "B" << dendl;
@@ -5254,17 +5232,6 @@ int BlueStore::mkfs()
   if (r < 0)
     goto out_close_fsid;
 
-  {
-    string wal_path = cct->_conf->get_val<string>("bluestore_block_wal_path");
-    if (wal_path.size()) {
-      write_meta("path_block.wal", wal_path);
-    }
-    string db_path = cct->_conf->get_val<string>("bluestore_block_db_path");
-    if (db_path.size()) {
-      write_meta("path_block.db", db_path);
-    }
-  }
-
   // choose min_alloc_size
   if (cct->_conf->bluestore_min_alloc_size) {
     min_alloc_size = cct->_conf->bluestore_min_alloc_size;
@@ -5782,7 +5749,8 @@ int BlueStore::_fsck(bool deep, bool repair)
          continue;
        }
        c->cid.is_pg(&pgid);
-       dout(20) << __func__ << "  collection " << c->cid << dendl;
+       dout(20) << __func__ << "  collection " << c->cid << " " << c->cnode
+                << dendl;
       }
 
       if (!expecting_shards.empty()) {
@@ -6469,7 +6437,7 @@ int BlueStore::read(
   }
 
  out:
-  if (r == 0 && _debug_data_eio(oid)) {
+  if (r >= 0 && _debug_data_eio(oid)) {
     r = -EIO;
     derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
   } else if (cct->_conf->bluestore_debug_random_read_err &&
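
The preallocation hunk above collapses the old HAVE_POSIX_FALLOCATE/#else write-loop into a single ceph_posix_fallocate() call; note the r > 0 test, which implies the wrapper follows the posix_fallocate() convention of returning a positive errno rather than setting errno. A hypothetical sketch of a wrapper under that contract (the real one lives in Ceph's compat layer and also handles platforms such as macOS):

    #include <cerrno>
    #include <fcntl.h>
    #include <unistd.h>

    // Hypothetical: ensure [offset, offset+len) is allocated; return 0 on
    // success or a positive errno, matching the callers' r > 0 check.
    static int sketch_posix_fallocate(int fd, off_t offset, off_t len) {
    #ifdef HAVE_POSIX_FALLOCATE
      return ::posix_fallocate(fd, offset, len);  // already returns an errno
    #else
      // crude fallback: grow the file; blocks are then allocated on write
      if (::ftruncate(fd, offset + len) < 0)
        return errno;
      return 0;
    #endif
    }
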
index 32be8fe5b4590874a8a563dae33115e85815e7ff..387c22373243ba6c038d45dfb59d12081ec29f1c 100644 (file)
@@ -431,7 +431,8 @@ public:
     SharedBlobRef lookup(uint64_t sbid) {
       std::lock_guard<std::mutex> l(lock);
       auto p = sb_map.find(sbid);
-      if (p == sb_map.end()) {
+      if (p == sb_map.end() ||
+         p->second->nref == 0) {
         return nullptr;
       }
       return p->second;
@@ -443,20 +444,15 @@ public:
       sb->coll = coll;
     }
 
-    bool try_remove(SharedBlob *sb) {
-      std::lock_guard<std::mutex> l(lock);
-      if (sb->nref == 0) {
-       assert(sb->get_parent() == this);
-       sb_map.erase(sb->get_sbid());
-       return true;
-      }
-      return false;
-    }
-
     void remove(SharedBlob *sb) {
       std::lock_guard<std::mutex> l(lock);
       assert(sb->get_parent() == this);
-      sb_map.erase(sb->get_sbid());
+      // only remove if it still points to us
+      auto p = sb_map.find(sb->get_sbid());
+      if (p != sb_map.end() &&
+         p->second == sb) {
+       sb_map.erase(p);
+      }
     }
 
     bool empty() {
@@ -2417,7 +2413,10 @@ public:
     assert(db);
     db->compact();
   }
-  
+  bool has_builtin_csum() const override {
+    return true;
+  }
+
 private:
   bool _debug_data_eio(const ghobject_t& o) {
     if (!cct->_conf->bluestore_debug_inject_read_err) {
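
Together with the SharedBlob::put() hunk earlier, the map changes above close a use-after-free race: a racing lookup() could resurrect an entry whose refcount had already hit zero, and an unconditional erase-by-sbid could drop a newer SharedBlob that had reused the same id. lookup() now treats nref == 0 entries as absent, and remove() only erases the slot if it still points at the dying object. The idiom, reduced to its essentials with simplified types:

    #include <cassert>
    #include <map>
    #include <mutex>

    struct SharedBlob { int nref = 0; uint64_t sbid = 0; };

    struct SharedBlobSet {
      std::mutex lock;
      std::map<uint64_t, SharedBlob*> sb_map;

      SharedBlob *lookup(uint64_t sbid) {
        std::lock_guard<std::mutex> l(lock);
        auto p = sb_map.find(sbid);
        if (p == sb_map.end() || p->second->nref == 0)
          return nullptr;           // dying entry: pretend it is already gone
        return p->second;
      }
      void remove(SharedBlob *sb) {
        std::lock_guard<std::mutex> l(lock);
        auto p = sb_map.find(sb->sbid);
        if (p != sb_map.end() && p->second == sb)
          sb_map.erase(p);          // only erase if the slot still points to us
      }
    };

    int main() {
      SharedBlobSet set;
      SharedBlob a{1, 42}, b{1, 42};     // b reuses a's sbid after a dies
      set.sb_map[42] = &b;
      set.remove(&a);                    // must NOT evict b
      assert(set.lookup(42) == &b);
    }
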
index ec4cc8bb0f844b2ea6cee09c3cc600bee855591b..6c2d5ec1bf16fd719b85df6c1508ee124ac23031 100644 (file)
@@ -338,6 +338,15 @@ void KernelDevice::_aio_stop()
   }
 }
 
+static bool is_expected_ioerr(const int r)
+{
+  // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
+  return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
+         r == -ENOLINK || r == -EREMOTEIO || r == -EBADE ||
+         r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
+         r == -EAGAIN || r == -EREMCHG || r == -EIO);
+}
+
 void KernelDevice::_aio_thread()
 {
   dout(10) << __func__ << " start" << dendl;
@@ -372,11 +381,15 @@ void KernelDevice::_aio_thread()
 
        long r = aio[i]->get_return_value();
         if (r < 0) {
-          derr << __func__ << " got " << cpp_strerror(r) << dendl;
-          if (ioc->allow_eio && r == -EIO) {
-            ioc->set_return_value(r);
+          derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")"
+              << dendl;
+          if (ioc->allow_eio && is_expected_ioerr(r)) {
+            derr << __func__ << " translating the error to EIO for the upper layer"
+                << dendl;
+            ioc->set_return_value(-EIO);
           } else {
-            assert(0 == "got unexpected error from io_getevents");
+            assert(0 == "got unexpected error from aio_t::get_return_value. "
+                       "This may suggest a hardware issue. Please check your dmesg!");
           }
         } else if (aio[i]->length != (uint64_t)r) {
           derr << "aio to " << aio[i]->offset << "~" << aio[i]->length
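
is_expected_ioerr() whitelists the errno values the Linux block layer can legitimately return for a failed bio (see the linked blk-core.c table); with allow_eio set they are all normalized to -EIO for the layer above, while anything outside the list still trips the assert, since it most likely indicates broken hardware or a kernel bug. The handler shape, reduced (Linux-specific errno constants assumed available):

    #include <cassert>
    #include <cerrno>

    static bool is_expected_ioerr(int r) {      // Linux block-layer error set
      return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
              r == -ENOLINK || r == -EREMOTEIO || r == -EBADE ||
              r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
              r == -EAGAIN || r == -EREMCHG || r == -EIO);
    }

    // hypothetical completion hook mirroring the _aio_thread() logic
    static int on_aio_complete(int r, bool allow_eio) {
      if (r >= 0)
        return r;
      if (allow_eio && is_expected_ioerr(r))
        return -EIO;                            // collapse for upper layers
      assert(0 == "unexpected aio error: check dmesg");
      return r;
    }

    int main() {
      assert(on_aio_complete(-ENOSPC, true) == -EIO);
      assert(on_aio_complete(512, true) == 512);
    }
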
index 7e3402a720b6daf09c762a17dd91f125cfbe783f..65d7baf59d8b1b31cfdbc5d05af9f6f3367c6dc5 100644 (file)
@@ -306,9 +306,9 @@ void StupidAllocator::init_rm_free(uint64_t offset, uint64_t length)
               ldout(cct, 30) << __func__ << " demoting1 0x" << std::hex << off << "~" << len
                              << std::dec << " to bin " << newbin << dendl;
               _insert_free(off, len);
-              return false;
+              return true;
             }
-            return true;
+            return false;
           });
         ++it;
       }
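
The StupidAllocator fix inverts the lambda's return value: evidently the enumeration helper it is passed to treats true as "this interval has been claimed, remove it", so the pre-fix code kept exactly the fragments it had just re-inserted into a smaller bin, leaving duplicates behind. A reduced sketch of that callback contract, using a hypothetical erase_if helper rather than the btree interval-set API:

    #include <cassert>
    #include <iterator>
    #include <list>

    // Hypothetical erase-if helper with the contract the fix implies:
    // the callback returns true when the element should be removed.
    template <typename T, typename Pred>
    void erase_if(std::list<T> &l, Pred should_remove) {
      for (auto it = l.begin(); it != l.end(); )
        it = should_remove(*it) ? l.erase(it) : std::next(it);
    }

    int main() {
      std::list<int> bins = {1, 2, 3, 4};
      // move even entries elsewhere, then signal removal with `true`;
      // returning `false` here (the pre-fix behaviour) would leave the
      // moved entries duplicated in the original container
      erase_if(bins, [](int v) { return v % 2 == 0; });
      assert(bins == (std::list<int>{1, 3}));
    }
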
index 4996e73452b119a8c9c50085d5c858e71534ece5..256f3db97fd551f2d8a91c8f533835db4b886bb5 100644 (file)
@@ -1,6 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
+#include <algorithm>
 #include "aio.h"
 
 #if defined(HAVE_LIBAIO)
@@ -48,7 +49,7 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
   }
   int done = 0;
   while (left > 0) {
-    int r = io_submit(ctx, left, piocb + done);
+    int r = io_submit(ctx, std::min(left, max_iodepth), piocb + done);
     if (r < 0) {
       if (r == -EAGAIN && attempts-- > 0) {
        usleep(delay);
@@ -61,6 +62,8 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
     assert(r > 0);
     done += r;
     left -= r;
+    attempts = 16;
+    delay = 125;
   }
   return done;
 }
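
Two fixes in one hunk: each io_submit() call is now capped at max_iodepth iocbs, and the EAGAIN retry budget (attempts/delay) is reset after every successful submission, so a large batch can no longer exhaust the budget that was meant to cover a single stall. The loop shape, with a stub standing in for io_submit():

    #include <algorithm>
    #include <unistd.h>

    static int submit_some(int n) { return n; }  // stub: pretend all n went in

    static int submit_batch(int total, int max_iodepth) {
      int done = 0, left = total;
      int attempts = 16, delay = 125;
      while (left > 0) {
        int r = submit_some(std::min(left, max_iodepth));
        if (r < 0) {
          if (r == -EAGAIN && attempts-- > 0) {
            usleep(delay);
            continue;                  // transient queue-full: retry
          }
          return r;                    // hard error
        }
        done += r;
        left -= r;
        attempts = 16;                 // progress made: fresh retry budget
        delay = 125;
      }
      return done;
    }

    int main() { return submit_batch(1000, 128) == 1000 ? 0 : 1; }
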
index b5bf775cb7b31b44c1d4e07e3419f8fca3e4979a..6aec1a31a8b6dc877b5a6ba15ea4f2a74a674db7 100644 (file)
@@ -127,7 +127,7 @@ struct bluefs_super_t {
       block_size(4096) { }
 
   uint64_t block_mask() const {
-    return ~(block_size - 1);
+    return ~((uint64_t)block_size - 1);
   }
 
   void encode(bufferlist& bl) const;
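
The block_mask() cast fixes a classic integer-promotion bug: block_size is a 32-bit field, so ~(block_size - 1) is computed in 32 bits and zero-extends into a mask whose upper 32 bits are clear, silently truncating any offset past 4 GiB. A worked example:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t block_size = 4096;
      uint64_t offset = 0x1234567890ull;   // an offset beyond 4 GiB

      uint64_t bad  = ~(block_size - 1);            // 0x00000000fffff000
      uint64_t good = ~((uint64_t)block_size - 1);  // 0xfffffffffffff000

      assert((offset & bad)  == 0x34567000ull);     // high bits wiped out
      assert((offset & good) == 0x1234567000ull);   // correct aligned offset
    }
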
index 06f64d21d4b34b318525f464af2fa29ddec1ef48..4c0a5f77bf34188c8571c8ec36a2220fd77f2224 100644 (file)
@@ -116,6 +116,11 @@ void bluestore_cnode_t::generate_test_instances(list<bluestore_cnode_t*>& o)
   o.push_back(new bluestore_cnode_t(123));
 }
 
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l)
+{
+  return out << "cnode(bits " << l.bits << ")";
+}
+
 // bluestore_extent_ref_map_t
 
 void bluestore_extent_ref_map_t::_check() const
index c2c0e39a089dc77e694e7938a0feabc4ae9c3998..bb57242ba3abe68d54ba93e882f7f9bc91f0669a 100644 (file)
@@ -64,6 +64,8 @@ struct bluestore_cnode_t {
 };
 WRITE_CLASS_DENC(bluestore_cnode_t)
 
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l);
+
 class AllocExtent;
 typedef mempool::bluestore_alloc::vector<AllocExtent> AllocExtentVector;
 class AllocExtent {
index 6037ee4860279342ba0ec3bd2aae3a1634394cbc..6482317fce3d9d913797f10b8fba22a100321b0c 100644 (file)
@@ -198,32 +198,13 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
           << newsize << " bytes: " << cpp_strerror(err) << dendl;
       return -err;
     }
-#ifdef HAVE_POSIX_FALLOCATE
-    ret = ::posix_fallocate(fd, 0, newsize);
+    ret = ceph_posix_fallocate(fd, 0, newsize);
     if (ret) {
       derr << "FileJournal::_open_file : unable to preallocation journal to "
           << newsize << " bytes: " << cpp_strerror(ret) << dendl;
       return -ret;
     }
     max_size = newsize;
-#elif defined(__APPLE__)
-    fstore_t store;
-    store.fst_flags = F_ALLOCATECONTIG;
-    store.fst_posmode = F_PEOFPOSMODE;
-    store.fst_offset = 0;
-    store.fst_length = newsize;
-
-    ret = ::fcntl(fd, F_PREALLOCATE, &store);
-    if (ret == -1) {
-      ret = -errno;
-      derr << "FileJournal::_open_file : unable to preallocation journal to "
-          << newsize << " bytes: " << cpp_strerror(ret) << dendl;
-      return ret;
-    }
-    max_size = newsize;
-#else
-# error "Journal pre-allocation not supported on platform."
-#endif
   }
   else {
     max_size = oldsize;
index cd4972c1a9d55e0295f1251993e3ceef0ca36fd5..c3c9e3759c4ef968e28df15a8fcaa58ed44b22d0 100644 (file)
@@ -612,7 +612,8 @@ FileStore::FileStore(CephContext* cct, const std::string &base,
   plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied");
   plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue");
   plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied");
-  plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency");
+  plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency",
+                   NULL, PerfCountersBuilder::PRIO_USEFUL);
   plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs");
   plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written");
   plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
@@ -628,7 +629,8 @@ FileStore::FileStore(CephContext* cct, const std::string &base,
   plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits");
   plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit");
   plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full");
-  plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg", "Store operation queue latency");
+  plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg",
+                   "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL);
   plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs");
 
   logger = plb.create_perf_counters();
@@ -2981,7 +2983,8 @@ void FileStore::_do_transaction(
         const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
           _cid : _cid.get_temp();
         tracepoint(objectstore, omap_clear_enter, osr_name);
-        r = _omap_clear(cid, oid, spos);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+         r = _omap_clear(cid, oid, spos);
         tracepoint(objectstore, omap_clear_exit, r);
       }
       break;
@@ -2994,7 +2997,8 @@ void FileStore::_do_transaction(
         map<string, bufferlist> aset;
         i.decode_attrset(aset);
         tracepoint(objectstore, omap_setkeys_enter, osr_name);
-        r = _omap_setkeys(cid, oid, aset, spos);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+         r = _omap_setkeys(cid, oid, aset, spos);
         tracepoint(objectstore, omap_setkeys_exit, r);
       }
       break;
@@ -3007,7 +3011,8 @@ void FileStore::_do_transaction(
         set<string> keys;
         i.decode_keyset(keys);
         tracepoint(objectstore, omap_rmkeys_enter, osr_name);
-        r = _omap_rmkeys(cid, oid, keys, spos);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+         r = _omap_rmkeys(cid, oid, keys, spos);
         tracepoint(objectstore, omap_rmkeys_exit, r);
       }
       break;
@@ -3021,7 +3026,8 @@ void FileStore::_do_transaction(
         first = i.decode_string();
         last = i.decode_string();
         tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
-        r = _omap_rmkeyrange(cid, oid, first, last, spos);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+         r = _omap_rmkeyrange(cid, oid, first, last, spos);
         tracepoint(objectstore, omap_rmkeyrange_exit, r);
       }
       break;
@@ -3034,7 +3040,8 @@ void FileStore::_do_transaction(
         bufferlist bl;
         i.decode_bl(bl);
         tracepoint(objectstore, omap_setheader_enter, osr_name);
-        r = _omap_setheader(cid, oid, bl, spos);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+         r = _omap_setheader(cid, oid, bl, spos);
         tracepoint(objectstore, omap_setheader_exit, r);
       }
       break;
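
The omap hunks above bring those operations in line with the rest of _do_transaction(): every non-idempotent write is preceded by a _check_replay_guard() test, because journal replay after a crash can re-deliver transactions that already reached the filesystem, and blindly re-applying an omap clear or rmkeys would destroy newer data. A schematic of the guard contract, with hypothetical types (the real guard is persisted with the collection/object):

    // Schematic only. _check_replay_guard() compares the op's journal
    // position against a persisted marker: > 0 means proceed, <= 0 means
    // the op was already applied and must be skipped.
    struct SequencerPos { unsigned seq = 0; };

    static int check_replay_guard(const SequencerPos &stored,
                                  const SequencerPos &op) {
      if (stored.seq == 0)
        return 1;                      // no guard recorded: proceed
      return op.seq > stored.seq ? 1 : -1;
    }

    static void do_omap_setkeys(const SequencerPos &stored,
                                const SequencerPos &op) {
      if (check_replay_guard(stored, op) > 0) {
        // ... apply the omap mutation; a replayed op falls through as a no-op
      }
    }

    int main() {
      SequencerPos stored{5}, replayed{4}, fresh{6};
      do_omap_setkeys(stored, replayed);   // skipped
      do_omap_setkeys(stored, fresh);      // applied
    }
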
index 7af6863fed41eec7b7afa2065529bdcdba3ec667..1b725676a6ca8f97ab808f306687bc13dc08282f 100644 (file)
@@ -647,6 +647,10 @@ public:
     object_map->compact();
   }
 
+  bool has_builtin_csum() const override {
+    return false;
+  }
+
   void debug_obj_on_delete(const ghobject_t &oid);
   bool debug_data_eio(const ghobject_t &oid);
   bool debug_mdata_eio(const ghobject_t &oid);
index 3e6663630d13fff2a6418c34517f64284a884c60..23e5a50f0166ceb891f11e602ea81503f0412307 100644 (file)
@@ -254,14 +254,17 @@ struct OnRecoveryReadComplete :
 struct RecoveryMessages {
   map<hobject_t,
       ECBackend::read_request_t> reads;
+  map<hobject_t, set<int>> want_to_read;
   void read(
     ECBackend *ec,
     const hobject_t &hoid, uint64_t off, uint64_t len,
+    set<int> &&_want_to_read,
     const set<pg_shard_t> &need,
     bool attrs) {
     list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
     to_read.push_back(boost::make_tuple(off, len, 0));
     assert(!reads.count(hoid));
+    want_to_read.insert(make_pair(hoid, std::move(_want_to_read)));
     reads.insert(
       make_pair(
        hoid,
@@ -526,6 +529,7 @@ void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
     return;
   start_read_op(
     priority,
+    m.want_to_read,
     m.reads,
     OpRequestRef(),
     false, true);
@@ -571,6 +575,7 @@ void ECBackend::continue_recovery_op(
        op.hoid,
        op.recovery_progress.data_recovered_to,
        amount,
+       std::move(want),
        to_read,
        op.recovery_progress.first && !op.obc);
       op.extent_requested = make_pair(
@@ -758,6 +763,7 @@ bool ECBackend::_handle_message(
     // not conflict with ECSubWrite's operator<<.
     MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
       _op->get_nonconst_req());
+    parent->maybe_preempt_replica_scrub(op->op.soid);
     handle_sub_write(op->op.from, _op, op->op, _op->pg_trace);
     return true;
   }
@@ -1203,10 +1209,9 @@ void ECBackend::handle_sub_read_reply(
         have.insert(j->first.shard);
         dout(20) << __func__ << " have shard=" << j->first.shard << dendl;
       }
-      set<int> want_to_read, dummy_minimum;
-      get_want_to_read_shards(&want_to_read);
+      set<int> dummy_minimum;
       int err;
-      if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) {
+      if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) {
        dout(20) << __func__ << " minimum_to_decode failed" << dendl;
         if (rop.in_progress.empty()) {
          // If we don't have enough copies and we haven't sent reads for all shards
@@ -1489,6 +1494,7 @@ void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
 
 void ECBackend::get_all_avail_shards(
   const hobject_t &hoid,
+  const set<pg_shard_t> &error_shards,
   set<int> &have,
   map<shard_id_t, pg_shard_t> &shards,
   bool for_recovery)
@@ -1499,6 +1505,8 @@ void ECBackend::get_all_avail_shards(
        ++i) {
     dout(10) << __func__ << ": checking acting " << *i << dendl;
     const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+    if (error_shards.find(*i) != error_shards.end())
+      continue;
     if (!missing.is_missing(hoid)) {
       assert(!have.count(i->shard));
       have.insert(i->shard);
@@ -1512,6 +1520,8 @@ void ECBackend::get_all_avail_shards(
           get_parent()->get_backfill_shards().begin();
         i != get_parent()->get_backfill_shards().end();
         ++i) {
+      if (error_shards.find(*i) != error_shards.end())
+       continue;
       if (have.count(i->shard)) {
        assert(shards.count(i->shard));
        continue;
@@ -1538,6 +1548,8 @@ void ECBackend::get_all_avail_shards(
        if (m) {
          assert(!(*m).is_missing(hoid));
        }
+       if (error_shards.find(*i) != error_shards.end())
+         continue;
        have.insert(i->shard);
        shards.insert(make_pair(i->shard, *i));
       }
@@ -1557,8 +1569,9 @@ int ECBackend::get_min_avail_to_read_shards(
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
 
   set<int> need;
   int r = ec_impl->minimum_to_decode(want, have, &need);
@@ -1584,6 +1597,8 @@ int ECBackend::get_min_avail_to_read_shards(
 int ECBackend::get_remaining_shards(
   const hobject_t &hoid,
   const set<int> &avail,
+  const set<int> &want,
+  const read_result_t &result,
   set<pg_shard_t> *to_read,
   bool for_recovery)
 {
@@ -1591,21 +1606,41 @@ int ECBackend::get_remaining_shards(
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
+  for (auto &p : result.errors) {
+    error_shards.insert(p.first);
+  }
+
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+  set<int> need;
+  int r = ec_impl->minimum_to_decode(want, have, &need);
+  if (r < 0) {
+    dout(0) << __func__ << " not enough shards left to try for " << hoid
+           << " read result was " << result << dendl;
+    return -EIO;
+  }
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  set<int> shards_left;
+  for (auto p : need) {
+    if (avail.find(p) == avail.end()) {
+      shards_left.insert(p);
+    }
+  }
 
-  for (set<int>::iterator i = have.begin();
-       i != have.end();
+  for (set<int>::iterator i = shards_left.begin();
+       i != shards_left.end();
        ++i) {
     assert(shards.count(shard_id_t(*i)));
-    if (avail.find(*i) == avail.end())
-      to_read->insert(shards[shard_id_t(*i)]);
+    assert(avail.find(*i) == avail.end());
+    to_read->insert(shards[shard_id_t(*i)]);
   }
   return 0;
 }
 
 void ECBackend::start_read_op(
   int priority,
+  map<hobject_t, set<int>> &want_to_read,
   map<hobject_t, read_request_t> &to_read,
   OpRequestRef _op,
   bool do_redundant_reads,
@@ -1621,6 +1656,7 @@ void ECBackend::start_read_op(
       do_redundant_reads,
       for_recovery,
       _op,
+      std::move(want_to_read),
       std::move(to_read))).first->second;
   dout(10) << __func__ << ": starting " << op << dendl;
   if (_op) {
@@ -2274,6 +2310,7 @@ void ECBackend::objects_read_and_reconstruct(
     return;
   }
 
+  map<hobject_t, set<int>> obj_want_to_read;
   set<int> want_to_read;
   get_want_to_read_shards(&want_to_read);
     
@@ -2301,10 +2338,12 @@ void ECBackend::objects_read_and_reconstruct(
          shards,
          false,
          c)));
+    obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
   }
 
   start_read_op(
     CEPH_MSG_PRIO_DEFAULT,
+    obj_want_to_read,
     for_read_op,
     OpRequestRef(),
     fast_read, false);
@@ -2322,31 +2361,24 @@ int ECBackend::send_all_remaining_reads(
     already_read.insert(i->shard);
   dout(10) << __func__ << " have/error shards=" << already_read << dendl;
   set<pg_shard_t> shards;
-  int r = get_remaining_shards(hoid, already_read, &shards, rop.for_recovery);
+  int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
+                              rop.complete[hoid], &shards, rop.for_recovery);
   if (r)
     return r;
-  if (shards.empty())
-    return -EIO;
-
-  dout(10) << __func__ << " Read remaining shards " << shards << dendl;
 
-  // TODOSAM: this doesn't seem right
   list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets =
     rop.to_read.find(hoid)->second.to_read;
   GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
     rop.to_read.find(hoid)->second.cb;
 
-  map<hobject_t, read_request_t> for_read_op;
-  for_read_op.insert(
-    make_pair(
+  rop.to_read.erase(hoid);
+  rop.to_read.insert(make_pair(
       hoid,
       read_request_t(
        offsets,
        shards,
        false,
        c)));
-
-  rop.to_read.swap(for_read_op);
   do_read_op(rop);
   return 0;
 }
@@ -2386,47 +2418,60 @@ void ECBackend::rollback_append(
       old_size));
 }
 
-void ECBackend::be_deep_scrub(
+int ECBackend::be_deep_scrub(
   const hobject_t &poid,
-  uint32_t seed,
-  ScrubMap::object &o,
-  ThreadPool::TPHandle &handle) {
-  bufferhash h(-1); // we always used -1
+  ScrubMap &map,
+  ScrubMapBuilder &pos,
+  ScrubMap::object &o)
+{
+  dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
   int r;
+
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+                           CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+  utime_t sleeptime;
+  sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+  if (sleeptime != utime_t()) {
+    lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+    sleeptime.sleep();
+  }
+
+  if (pos.data_pos == 0) {
+    pos.data_hash = bufferhash(-1);
+  }
+
   uint64_t stride = cct->_conf->osd_deep_scrub_stride;
   if (stride % sinfo.get_chunk_size())
     stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size());
-  uint64_t pos = 0;
 
-  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
-
-  while (true) {
-    bufferlist bl;
-    handle.reset_tp_timeout();
-    r = store->read(
-      ch,
-      ghobject_t(
-       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-      pos,
-      stride, bl,
-      fadvise_flags);
-    if (r < 0)
-      break;
-    if (bl.length() % sinfo.get_chunk_size()) {
-      r = -EIO;
-      break;
-    }
-    pos += r;
-    h << bl;
-    if ((unsigned)r < stride)
-      break;
+  bufferlist bl;
+  r = store->read(
+    ch,
+    ghobject_t(
+      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+    pos.data_pos,
+    stride, bl,
+    fadvise_flags);
+  if (r < 0) {
+    dout(20) << __func__ << "  " << poid << " got "
+            << r << " on read, read_error" << dendl;
+    o.read_error = true;
+    return 0;
   }
-
-  if (r == -EIO) {
-    dout(0) << "_scan_list  " << poid << " got "
-           << r << " on read, read_error" << dendl;
+  if (bl.length() % sinfo.get_chunk_size()) {
+    dout(20) << __func__ << "  " << poid << " got "
+            << r << " on read, not chunk size " << sinfo.get_chunk_size() << " aligned"
+            << dendl;
     o.read_error = true;
-    return;
+    return 0;
+  }
+  if (r > 0) {
+    pos.data_hash << bl;
+  }
+  pos.data_pos += r;
+  if (r == (int)stride) {
+    return -EINPROGRESS;
   }
 
   ECUtil::HashInfoRef hinfo = get_hash_info(poid, false, &o.attrs);
@@ -2434,20 +2479,27 @@ void ECBackend::be_deep_scrub(
     dout(0) << "_scan_list  " << poid << " could not retrieve hash info" << dendl;
     o.read_error = true;
     o.digest_present = false;
-    return;
+    return 0;
   } else {
     if (!get_parent()->get_pool().allows_ecoverwrites()) {
       assert(hinfo->has_chunk_hash());
-      if (hinfo->get_total_chunk_size() != pos) {
-       dout(0) << "_scan_list  " << poid << " got incorrect size on read" << dendl;
+      if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
+       dout(0) << "_scan_list  " << poid << " got incorrect size on read 0x"
+               << std::hex << pos
+               << " expected 0x" << hinfo->get_total_chunk_size() << std::dec
+               << dendl;
        o.ec_size_mismatch = true;
-       return;
+       return 0;
       }
 
-      if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != h.digest()) {
-       dout(0) << "_scan_list  " << poid << " got incorrect hash on read" << dendl;
+      if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
+         pos.data_hash.digest()) {
+       dout(0) << "_scan_list  " << poid << " got incorrect hash on read 0x"
+               << std::hex << pos.data_hash.digest() << " != expected 0x"
+               << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
+               << std::dec << dendl;
        o.ec_hash_mismatch = true;
-       return;
+       return 0;
       }
 
       /* We checked above that we match our own stored hash.  We cannot
@@ -2467,6 +2519,7 @@ void ECBackend::be_deep_scrub(
     }
   }
 
-  o.omap_digest = seed;
+  o.omap_digest = -1;
   o.omap_digest_present = true;
+  return 0;
 }
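
be_deep_scrub() changes from an internal read-everything loop into a resumable step: the running hash and read offset now live in the ScrubMapBuilder, each invocation reads a single stride, and a full-stride read returns -EINPROGRESS so the scrubber can yield (and be preempted, per the maybe_preempt_replica_scrub() hook above) between chunks. The shape of such a step, with a hypothetical read_chunk() in place of store->read():

    #include <cerrno>
    #include <cstdint>
    #include <vector>

    struct Pos { uint64_t data_pos = 0; /* running hash lives here too */ };

    // hypothetical backend read: returns bytes read (possibly short) or < 0
    static int read_chunk(uint64_t off, uint64_t len, std::vector<char> &buf) {
      buf.assign(len, 0);
      return (int)len;                 // stub: always a full stride
    }

    // 0 = object finished (ok, or with read_error set); -EINPROGRESS = resume
    static int deep_scrub_step(Pos &pos, uint64_t stride, bool &read_error) {
      std::vector<char> buf;
      int r = read_chunk(pos.data_pos, stride, buf);
      if (r < 0) {
        read_error = true;             // record the error, finish this object
        return 0;
      }
      // ... fold buf[0..r) into the running hash ...
      pos.data_pos += r;
      if (r == (int)stride)
        return -EINPROGRESS;           // more data may follow: yield here
      return 0;                        // short read: end of object, verify digests
    }

    int main() {
      Pos p;
      bool err = false;
      return deep_scrub_step(p, 65536, err) == -EINPROGRESS ? 0 : 1;
    }
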
index 120e2d298bfadefc0b518975c975d7ccd08b957c..49a88f74d1ff701a5f1cf3c0a7694cb0f39e11c7 100644 (file)
@@ -322,6 +322,7 @@ private:
     RecoveryMessages *m);
   void get_all_avail_shards(
     const hobject_t &hoid,
+    const set<pg_shard_t> &error_shards,
     set<int> &have,
     map<shard_id_t, pg_shard_t> &shards,
     bool for_recovery);
@@ -386,6 +387,7 @@ public:
 
     ZTracer::Trace trace;
 
+    map<hobject_t, set<int>> want_to_read;
     map<hobject_t, read_request_t> to_read;
     map<hobject_t, read_result_t> complete;
 
@@ -402,9 +404,11 @@ public:
       bool do_redundant_reads,
       bool for_recovery,
       OpRequestRef op,
+      map<hobject_t, set<int>> &&_want_to_read,
       map<hobject_t, read_request_t> &&_to_read)
       : priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads),
-       for_recovery(for_recovery), to_read(std::move(_to_read)) {
+       for_recovery(for_recovery), want_to_read(std::move(_want_to_read)),
+       to_read(std::move(_to_read)) {
       for (auto &&hpair: to_read) {
        auto &returned = complete[hpair.first].returned;
        for (auto &&extent: hpair.second.to_read) {
@@ -430,6 +434,7 @@ public:
   map<pg_shard_t, set<ceph_tid_t> > shard_to_read_map;
   void start_read_op(
     int priority,
+    map<hobject_t, set<int>> &want_to_read,
     map<hobject_t, read_request_t> &to_read,
     OpRequestRef op,
     bool do_redundant_reads, bool for_recovery);
@@ -655,6 +660,8 @@ public:
   int get_remaining_shards(
     const hobject_t &hoid,
     const set<int> &avail,
+    const set<int> &want,
+    const read_result_t &result,
     set<pg_shard_t> *to_read,
     bool for_recovery);
 
@@ -670,11 +677,11 @@ public:
   bool scrub_supported() override { return true; }
   bool auto_repair_supported() const override { return true; }
 
-  void be_deep_scrub(
-    const hobject_t &obj,
-    uint32_t seed,
-    ScrubMap::object &o,
-    ThreadPool::TPHandle &handle) override;
+  int be_deep_scrub(
+    const hobject_t &poid,
+    ScrubMap &map,
+    ScrubMapBuilder &pos,
+    ScrubMap::object &o) override;
   uint64_t be_get_ondisk_size(uint64_t logical_size) override {
     return sinfo.logical_to_next_chunk_offset(logical_size);
   }
index 2579e7fdc4b04c3a27b6917c027cbd89e5a608db..54fedcddcabf0624d4820294494ee9ab58691a8b 100644 (file)
@@ -1273,10 +1273,12 @@ bool OSDService::can_inc_scrubs_pending()
 
   if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
     dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
-            << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+            << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
+            << ")" << dendl;
     can_inc = true;
   } else {
-    dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
+            << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
   }
 
   return can_inc;
@@ -1411,7 +1413,8 @@ void OSDService::got_stop_ack()
 MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                                OSDSuperblock& sblock)
 {
-  MOSDMap *m = new MOSDMap(monc->get_fsid());
+  MOSDMap *m = new MOSDMap(monc->get_fsid(),
+                          osdmap->get_encoding_features());
   m->oldest_map = max_oldest_map;
   m->newest_map = sblock.newest_map;
 
@@ -1451,7 +1454,8 @@ void OSDService::send_incremental_map(epoch_t since, Connection *con,
     OSDSuperblock sblock(get_superblock());
     if (since < sblock.oldest_map) {
       // just send latest full map
-      MOSDMap *m = new MOSDMap(monc->get_fsid());
+      MOSDMap *m = new MOSDMap(monc->get_fsid(),
+                              osdmap->get_encoding_features());
       m->oldest_map = max_oldest_map;
       m->newest_map = sblock.newest_map;
       get_map_bl(to, m->maps[to]);
@@ -2660,6 +2664,12 @@ int OSD::init()
   update_log_config();
 
   peering_tp.start();
+  
+  service.init();
+  service.publish_map(osdmap);
+  service.publish_superblock(superblock);
+  service.max_oldest_map = superblock.oldest_map;
+
   osd_op_tp.start();
   disk_tp.start();
   command_tp.start();
@@ -2676,11 +2686,6 @@ int OSD::init()
     tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
   }
 
-  service.init();
-  service.publish_map(osdmap);
-  service.publish_superblock(superblock);
-  service.max_oldest_map = superblock.oldest_map;
-
   osd_lock.Unlock();
 
   r = monc->authenticate();
@@ -7233,9 +7238,11 @@ bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool for
 }
 
 
-bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
-                              int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                              bool& isvalid, CryptoKey& session_key)
+bool OSD::ms_verify_authorizer(
+  Connection *con, int peer_type,
+  int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
+  bool& isvalid, CryptoKey& session_key,
+  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
 {
   AuthAuthorizeHandler *authorize_handler = 0;
   switch (peer_type) {
@@ -7267,7 +7274,7 @@ bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
     isvalid = authorize_handler->verify_authorizer(
       cct, keys,
       authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
-      &auid);
+      &auid, challenge);
   } else {
     dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
     isvalid = false;
@@ -7522,6 +7529,25 @@ bool OSD::scrub_time_permit(utime_t now)
   struct tm bdt;
   time_t tt = now.sec();
   localtime_r(&tt, &bdt);
+
+  bool day_permit = false;
+  if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
+    if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+      day_permit = true;
+    }
+  } else {
+    if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+      day_permit = true;
+    }
+  }
+
+  if (!day_permit) {
+    dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
+            << " - " << cct->_conf->osd_scrub_end_week_day
+            << " now " << bdt.tm_wday << " = no" << dendl;
+    return false;
+  }
+
   bool time_permit = false;
   if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
     if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
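
The new week-day gate uses the same wrap-around interval test as the existing hour check below it: when begin < end the window is a single span, otherwise it wraps past the end of the cycle (for tm_wday, begin=5 and end=2 permits Fri, Sat, Sun, and Mon). Extracted as a small predicate:

    #include <cassert>

    // half-open window [begin, end) on a cyclic range, e.g. tm_wday in 0..6
    static bool in_cyclic_window(int begin, int end, int now) {
      if (begin < end)
        return now >= begin && now < end;    // plain span
      return now >= begin || now < end;      // wraps around the cycle
    }

    int main() {
      assert(in_cyclic_window(1, 5, 3));     // Wed inside Mon..Thu
      assert(!in_cyclic_window(1, 5, 6));    // Sat outside
      assert(in_cyclic_window(5, 2, 0));     // Sun inside the wrapped window
      assert(!in_cyclic_window(5, 2, 3));    // Wed outside the wrapped window
    }
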
index ea611cbae1cb7010d09dc8c431bf6e94c07dcbc0..4523fb2807f2e62db2fc3532ed65e13f88492976 100644 (file)
@@ -1582,7 +1582,8 @@ public:
     }
     bool ms_verify_authorizer(Connection *con, int peer_type,
                              int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
-                             bool& isvalid, CryptoKey& session_key) override {
+                             bool& isvalid, CryptoKey& session_key,
+                             std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
       isvalid = true;
       return true;
     }
@@ -2359,7 +2360,8 @@ private:
   bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
   bool ms_verify_authorizer(Connection *con, int peer_type,
                            int protocol, bufferlist& authorizer, bufferlist& authorizer_reply,
-                           bool& isvalid, CryptoKey& session_key) override;
+                           bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
   void ms_handle_connect(Connection *con) override;
   void ms_handle_fast_connect(Connection *con) override;
   void ms_handle_fast_accept(Connection *con) override;
index f2c4e34023b7c851588d746aaedfe39ffe439793..dc9e2e5083ec0663bb40ac1b87e5f8b151a43efa 100644 (file)
@@ -236,6 +236,8 @@ int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
   return 0;
 }
 
+// ----------------------------------
+// OSDMap
 
 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
 {
@@ -1608,125 +1610,123 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
   OSDMap tmpmap;
   tmpmap.deepish_copy_from(osdmap);
   tmpmap.apply_incremental(*pending_inc);
+  set<pg_t> to_check;
+  set<pg_t> to_cancel;
+  map<int, map<int, float>> rule_weight_map;
 
   for (auto& p : tmpmap.pg_upmap) {
-    ldout(cct, 10) << __func__ << " pg_upmap entry "
-                   << "[" << p.first << ":" << p.second << "]"
-                   << dendl;
-    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    to_check.insert(p.first);
+  }
+  for (auto& p : tmpmap.pg_upmap_items) {
+    to_check.insert(p.first);
+  }
+  for (auto& p : pending_inc->new_pg_upmap) {
+    to_check.insert(p.first);
+  }
+  for (auto& p : pending_inc->new_pg_upmap_items) {
+    to_check.insert(p.first);
+  }
+  for (auto& pg : to_check) {
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
     if (crush_rule < 0) {
       lderr(cct) << __func__ << " unable to load crush-rule of pg "
-                 << p.first << dendl;
+                 << pg << dendl;
       continue;
     }
+    map<int, float> weight_map;
+    auto it = rule_weight_map.find(crush_rule);
+    if (it == rule_weight_map.end()) {
+      auto r = tmpmap.crush->get_rule_weight_osd_map(crush_rule, &weight_map);
+      if (r < 0) {
+        lderr(cct) << __func__ << " unable to get crush weight_map for "
+                   << "crush_rule " << crush_rule << dendl;
+        continue;
+      }
+      rule_weight_map[crush_rule] = weight_map;
+    } else {
+      weight_map = it->second;
+    }
     auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
     if (type < 0) {
       lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
-                 << p.first << dendl;
-      continue;
-    } else if (type == 0) {
-      ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
-                     << " is osd-level, skipping"
-                     << dendl;
+                 << pg << dendl;
       continue;
     }
-    ldout(cct, 10) << __func__ << " pg " << p.first
+    ldout(cct, 10) << __func__ << " pg " << pg
                    << " crush-rule-id " << crush_rule
+                   << " weight_map " << weight_map
                    << " failure-domain-type " << type
                    << dendl;
     vector<int> raw;
     int primary;
-    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    tmpmap.pg_to_raw_up(pg, &raw, &primary);
     set<int> parents;
-    bool error = false;
-    bool collide = false;
     for (auto osd : raw) {
-      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
-      if (parent >= 0) {
-        lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
-                   << ", pg " << p.first
-                   << dendl;
-        error = true;
+      if (type > 0) {
+        auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule);
+        if (parent >= 0) {
+          lderr(cct) << __func__ << " unable to get parent of raw osd."
+                     << osd << " of pg " << pg
+                     << dendl;
+          break;
+        }
+        auto r = parents.insert(parent);
+        if (!r.second) {
+          // two up-set osds come from same parent
+          to_cancel.insert(pg);
+          break;
+        }
+      }
+      // the above check validates collisions only;
+      // below we continue checking against crush-topology changes
+      auto it = weight_map.find(osd);
+      if (it == weight_map.end()) {
+        // osd is gone or has been moved out of the specific crush-tree
+        to_cancel.insert(pg);
         break;
       }
-      auto r = parents.insert(parent);
-      if (!r.second) {
-        collide = true;
+      auto adjusted_weight = tmpmap.get_weightf(it->first) * it->second;
+      if (adjusted_weight == 0) {
+        // osd is out/crush-out
+        to_cancel.insert(pg);
         break;
       }
     }
-    if (!error && collide) {
-      ldout(cct, 10) << __func__ << " removing invalid pg_upmap "
-                     << "[" << p.first << ":" << p.second << "]"
-                     << ", final mapping result will be: " << raw
-                     << dendl;
-      auto it = pending_inc->new_pg_upmap.find(p.first);
+  }
+  for (auto &pg: to_cancel) {
+    { // pg_upmap
+      auto it = pending_inc->new_pg_upmap.find(pg);
       if (it != pending_inc->new_pg_upmap.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid pending "
+                       << "pg_upmap entry "
+                       << it->first << "->" << it->second
+                       << dendl;
         pending_inc->new_pg_upmap.erase(it);
       }
-      if (osdmap.pg_upmap.count(p.first)) {
-        pending_inc->old_pg_upmap.insert(p.first);
-      }
-    }
-  }
-  for (auto& p : tmpmap.pg_upmap_items) {
-    ldout(cct, 10) << __func__ << " pg_upmap_items entry "
-                   << "[" << p.first << ":" << p.second << "]"
-                   << dendl;
-    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
-    if (crush_rule < 0) {
-      lderr(cct) << __func__ << " unable to load crush-rule of pg "
-                 << p.first << dendl;
-      continue;
-    }
-    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
-    if (type < 0) {
-      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
-                 << p.first << dendl;
-      continue;
-    } else if (type == 0) {
-      ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
-                     << " is osd-level, skipping"
-                     << dendl;
-      continue;
-    }
-    ldout(cct, 10) << __func__ << " pg " << p.first
-                   << " crush_rule_id " << crush_rule
-                   << " failure_domain_type " << type
-                   << dendl;
-    vector<int> raw;
-    int primary;
-    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
-    set<int> parents;
-    bool error = false;
-    bool collide = false;
-    for (auto osd : raw) {
-      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
-      if (parent >= 0) {
-        lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
-                   << ", pg " << p.first
-                   << dendl;
-        error = true;
-        break;
-      }
-      auto r = parents.insert(parent);
-      if (!r.second) {
-        collide = true;
-        break;
+      if (osdmap.pg_upmap.count(pg)) {
+        ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
+                       << osdmap.pg_upmap.find(pg)->first << "->"
+                       << osdmap.pg_upmap.find(pg)->second
+                       << dendl;
+        pending_inc->old_pg_upmap.insert(pg);
       }
     }
-    if (!error && collide) {
-      ldout(cct, 10) << __func__ << " removing invalid pg_upmap_items "
-                     << "[" << p.first << ":" << p.second << "]"
-                     << ", final mapping result will be: " << raw
-                     << dendl;
-      // This is overkilling, but simpler..
-      auto it = pending_inc->new_pg_upmap_items.find(p.first);
+    { // pg_upmap_items
+      auto it = pending_inc->new_pg_upmap_items.find(pg);
       if (it != pending_inc->new_pg_upmap_items.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid pending "
+                       << "pg_upmap_items entry "
+                       << it->first << "->" << it->second
+                       << dendl;
         pending_inc->new_pg_upmap_items.erase(it);
       }
-      if (osdmap.pg_upmap_items.count(p.first)) {
-        pending_inc->old_pg_upmap_items.insert(p.first);
+      if (osdmap.pg_upmap_items.count(pg)) {
+        ldout(cct, 10) << __func__ << " cancel invalid "
+                       << "pg_upmap_items entry "
+                       << osdmap.pg_upmap_items.find(pg)->first << "->"
+                       << osdmap.pg_upmap_items.find(pg)->second
+                       << dendl;
+        pending_inc->old_pg_upmap_items.insert(pg);
       }
     }
   }
@@ -2349,6 +2349,24 @@ bool OSDMap::primary_changed(
   return false;      // same primary (tho replicas may have changed)
 }
 
+uint64_t OSDMap::get_encoding_features() const
+{
+  uint64_t f = SIGNIFICANT_FEATURES;
+  if (require_osd_release < CEPH_RELEASE_LUMINOUS) {
+    f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
+          CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
+  }
+  if (require_osd_release < CEPH_RELEASE_KRAKEN) {
+    f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
+          CEPH_FEATURE_MSG_ADDR2 |
+          CEPH_FEATURE_CRUSH_TUNABLES5);
+  }
+  if (require_osd_release < CEPH_RELEASE_JEWEL) {
+    f &= ~(CEPH_FEATURE_SERVER_JEWEL |
+          CEPH_FEATURE_NEW_OSDOP_ENCODING);
+  }
+  return f;
+}
 
 // serialize, unserialize
 void OSDMap::encode_client_old(bufferlist& bl) const
@@ -2487,6 +2505,8 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
   ENCODE_START(8, 7, bl);
 
   {
+    // NOTE: any new encoding dependencies must be reflected by
+    // SIGNIFICANT_FEATURES
     uint8_t v = 6;
     if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
       v = 3;
@@ -2557,6 +2577,8 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
   }
 
   {
+    // NOTE: any new encoding dependencies must be reflected by
+    // SIGNIFICANT_FEATURES
     uint8_t target_v = 5;
     if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
       target_v = 1;
@@ -4074,6 +4096,8 @@ int OSDMap::calc_pg_upmaps(
     for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
       int osd = p->second;
       float deviation = p->first;
+      // make sure osd is still there (belongs to this crush-tree)
+      assert(osd_weight.count(osd));
       float target = osd_weight[osd] * pgs_per_weight;
       assert(target > 0);
       if (deviation/target < max_deviation_ratio) {
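
get_encoding_features() (added in the hunk above, with SIGNIFICANT_FEATURES in the header change below) derives the feature set used to encode OSDMaps from require_osd_release rather than from the sender's own capabilities: bits newer than the oldest release the cluster still requires are masked out, so older daemons can decode the maps the OSD re-sends. The masking pattern, with illustrative bit values that are not Ceph's real masks:

    #include <cstdint>

    enum : uint64_t {            // illustrative bits, not Ceph's feature masks
      F_JEWEL    = 1 << 0,
      F_KRAKEN   = 1 << 1,
      F_LUMINOUS = 1 << 2,
    };
    enum Release { JEWEL = 10, KRAKEN = 11, LUMINOUS = 12 };

    static uint64_t encoding_features(Release require_osd_release) {
      uint64_t f = F_JEWEL | F_KRAKEN | F_LUMINOUS;  // "significant" superset
      if (require_osd_release < LUMINOUS) f &= ~F_LUMINOUS;
      if (require_osd_release < KRAKEN)   f &= ~F_KRAKEN;
      if (require_osd_release < JEWEL)    f &= ~F_JEWEL;
      return f;
    }

    int main() {
      return encoding_features(KRAKEN) == (F_JEWEL | F_KRAKEN) ? 0 : 1;
    }

The OSD.cc hunks earlier in this commit feed the result into MOSDMap construction, so the message envelope is encoded compatibly as well.
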
index 847c7ecb39fa16b72c73aa01f4f4c8bc1d47e6bf..5d59754cef911a2186de7284d75aa9952af9d8a8 100644 (file)
@@ -507,6 +507,20 @@ private:
   int32_t max_osd;
   vector<uint32_t> osd_state;
 
+  // These features affect OSDMap[::Incremental] encoding, or the
+  // encoding of some type embedded therein (CrushWrapper, something
+  // from osd_types, etc.).
+  static constexpr uint64_t SIGNIFICANT_FEATURES =
+    CEPH_FEATUREMASK_PGID64 |
+    CEPH_FEATUREMASK_PGPOOL3 |
+    CEPH_FEATUREMASK_OSDENC |
+    CEPH_FEATUREMASK_OSDMAP_ENC |
+    CEPH_FEATUREMASK_OSD_POOLRESEND |
+    CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
+    CEPH_FEATUREMASK_MSG_ADDR2 |
+    CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
+    CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
+    CEPH_FEATUREMASK_SERVER_LUMINOUS;
   struct addrs_s {
     mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > client_addr;
     mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > cluster_addr;
@@ -591,6 +605,13 @@ private:
   OSDMap& operator=(const OSDMap& other) = default;
 public:
 
+  /// return feature mask subset that is relevant to OSDMap encoding
+  static uint64_t get_significant_features(uint64_t features) {
+    return SIGNIFICANT_FEATURES & features;
+  }
+
+  uint64_t get_encoding_features() const;
+
   void deepish_copy_from(const OSDMap& o) {
     *this = o;
     primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
index 1b5107f7be71ed0fac08af3c460f0a16307021d5..60a604ae29fd6f116eafd14c3380043c6a6c63b7 100644 (file)
@@ -1011,13 +1011,12 @@ PG::Scrubber::Scrubber()
  : reserved(false), reserve_failed(false),
    epoch_start(0),
    active(false),
-   waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
+   shallow_errors(0), deep_errors(0), fixed(0),
    must_scrub(false), must_deep_scrub(false), must_repair(false),
    auto_repair(false),
    num_digest_updates_pending(0),
    state(INACTIVE),
-   deep(false),
-   seed(0)
+   deep(false)
 {}
 
 PG::Scrubber::~Scrubber() {}
@@ -3788,9 +3787,15 @@ void PG::do_replica_scrub_map(OpRequestRef op)
           << scrubber.received_maps[m->from].valid_through
           << dendl;
 
-  --scrubber.waiting_on;
+  dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
+          << dendl;
+  assert(scrubber.waiting_on_whom.count(m->from));
   scrubber.waiting_on_whom.erase(m->from);
-  if (scrubber.waiting_on == 0) {
+  if (m->preempted) {
+    dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+    scrub_preempted = true;
+  }
+  if (scrubber.waiting_on_whom.empty()) {
     if (ops_blocked_by_scrub()) {
       requeue_scrub(true);
     } else {
@@ -3827,10 +3832,9 @@ void PG::sub_op_scrub_map(OpRequestRef op)
             << scrubber.received_maps[m->from].valid_through
             << dendl;
 
-  --scrubber.waiting_on;
   scrubber.waiting_on_whom.erase(m->from);
 
-  if (scrubber.waiting_on == 0) {
+  if (scrubber.waiting_on_whom.empty()) {
     if (ops_blocked_by_scrub()) {
       requeue_scrub(true);
     } else {
@@ -3843,16 +3847,20 @@ void PG::sub_op_scrub_map(OpRequestRef op)
 void PG::_request_scrub_map(
   pg_shard_t replica, eversion_t version,
   hobject_t start, hobject_t end,
-  bool deep, uint32_t seed)
+  bool deep,
+  bool allow_preemption)
 {
   assert(replica != pg_whoami);
   dout(10) << "scrub  requesting scrubmap from osd." << replica
-          << " deep " << (int)deep << " seed " << seed << dendl;
+          << " deep " << (int)deep << dendl;
   MOSDRepScrub *repscrubop = new MOSDRepScrub(
     spg_t(info.pgid.pgid, replica.shard), version,
     get_osdmap()->get_epoch(),
     get_last_peering_reset(),
-    start, end, deep, seed);
+    start, end, deep,
+    allow_preemption,
+    scrubber.priority,
+    ops_blocked_by_scrub());
   // default priority, we want the rep scrub processed prior to any recovery
   // or client io messages (we are holding a lock!)
   osd->send_message_osd_cluster(
@@ -4072,12 +4080,19 @@ void PG::_scan_snaps(ScrubMap &smap)
 {
   hobject_t head;
   SnapSet snapset;
+
+  // The test qa/standalone/scrub/osd-scrub-snaps.sh relies on this message
+  // to verify that the caller went through clean_meta_map() properly.
+  dout(20) << __func__ << " start" << dendl;
+
   for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
        i != smap.objects.rend();
        ++i) {
     const hobject_t &hoid = i->first;
     ScrubMap::object &o = i->second;
 
+    dout(20) << __func__ << " " << hoid << dendl;
+
     if (hoid.is_head() || hoid.is_snapdir()) {
       // parse the SnapSet
       bufferlist bl;
@@ -4230,42 +4245,67 @@ void PG::_repair_oinfo_oid(ScrubMap &smap)
     }
   }
 }
-
-/*
- * build a scrub map over a chunk without releasing the lock
- * only used by chunky scrub
- */
 int PG::build_scrub_map_chunk(
   ScrubMap &map,
-  hobject_t start, hobject_t end, bool deep, uint32_t seed,
+  ScrubMapBuilder &pos,
+  hobject_t start,
+  hobject_t end,
+  bool deep,
   ThreadPool::TPHandle &handle)
 {
   dout(10) << __func__ << " [" << start << "," << end << ") "
-          << " seed " << seed << dendl;
-
-  map.valid_through = info.last_update;
+          << " pos " << pos
+          << dendl;
 
-  // objects
-  vector<hobject_t> ls;
-  vector<ghobject_t> rollback_obs;
-  int ret = get_pgbackend()->objects_list_range(
-    start,
-    end,
-    0,
-    &ls,
-    &rollback_obs);
-  if (ret < 0) {
-    dout(5) << "objects_list_range error: " << ret << dendl;
-    return ret;
+  // start
+  while (pos.empty()) {
+    pos.deep = deep;
+    map.valid_through = info.last_update;
+    osr->flush();
+
+    // objects
+    vector<ghobject_t> rollback_obs;
+    pos.ret = get_pgbackend()->objects_list_range(
+      start,
+      end,
+      0,
+      &pos.ls,
+      &rollback_obs);
+    if (pos.ret < 0) {
+      dout(5) << "objects_list_range error: " << pos.ret << dendl;
+      return pos.ret;
+    }
+    if (pos.ls.empty()) {
+      break;
+    }
+    _scan_rollback_obs(rollback_obs, handle);
+    pos.pos = 0;
+    return -EINPROGRESS;
   }
 
+  // scan objects
+  while (!pos.done()) {
+    int r = get_pgbackend()->be_scan_list(map, pos);
+    if (r == -EINPROGRESS) {
+      return r;
+    }
+  }
 
-  get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
-  _scan_rollback_obs(rollback_obs, handle);
-  _scan_snaps(map);
+  // finish
+  dout(20) << __func__ << " finishing" << dendl;
+  assert(pos.done());
   _repair_oinfo_oid(map);
+  if (!is_primary()) {
+    ScrubMap for_meta_scrub;
+    // In case we restarted with a smaller chunk, clear old data
+    scrubber.cleaned_meta_map.clear_from(scrubber.start);
+    scrubber.cleaned_meta_map.insert(map);
+    scrubber.clean_meta_map(for_meta_scrub);
+    _scan_snaps(for_meta_scrub);
+  }
 
-  dout(20) << __func__ << " done" << dendl;
+  dout(20) << __func__ << " done, got " << map.objects.size() << " items"
+          << dendl;
   return 0;
 }
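
build_scrub_map_chunk() is now resumable, driven by a ScrubMapBuilder cursor
instead of scanning the whole chunk in one call. The struct's definition is
not part of this excerpt; a minimal sketch consistent with how it is used here
and in be_scan_list()/be_deep_scrub() below (field types are assumptions):

    struct ScrubMapBuilder {
      std::vector<hobject_t> ls;  // objects in the current chunk
      size_t pos = 0;             // index of the object being scanned
      bool deep = false;
      int ret = 0;                // sticky error from the listing, if any
      int64_t data_pos = 0;       // byte offset within the current object
      bufferhash data_hash;       // running CRC of the deep-scrub reads
      bool empty() const { return ls.empty(); }
      bool done() const { return pos >= ls.size(); }
      bool data_done() const { return data_pos < 0; }
      void next_object() { ++pos; data_pos = 0; }
      void reset() { *this = ScrubMapBuilder(); }
    };
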
 
@@ -4351,8 +4391,6 @@ void PG::replica_scrub(
     return;
   }
 
-  ScrubMap map;
-
   assert(msg->chunky);
   if (last_update_applied < msg->scrub_to) {
     dout(10) << "waiting for last_update_applied to catch up" << dendl;
@@ -4366,45 +4404,24 @@ void PG::replica_scrub(
     return;
   }
 
-  // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
-  hobject_t start = msg->start;
-  hobject_t end = msg->end;
-  if (!start.is_max())
-    start.pool = info.pgid.pool();
-  if (!end.is_max())
-    end.pool = info.pgid.pool();
-
-  build_scrub_map_chunk(
-    map, start, end, msg->deep, msg->seed,
-    handle);
-
-  if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
-    MOSDRepScrubMap *reply = new MOSDRepScrubMap(
-      spg_t(info.pgid.pgid, get_primary().shard),
-      msg->map_epoch,
-      pg_whoami);
-    ::encode(map, reply->get_data());
-    osd->send_message_osd_cluster(reply, msg->get_connection());
+  scrubber.state = Scrubber::BUILD_MAP_REPLICA;
+  scrubber.replica_scrub_start = msg->min_epoch;
+  scrubber.start = msg->start;
+  scrubber.end = msg->end;
+  scrubber.max_end = msg->end;
+  scrubber.deep = msg->deep;
+  scrubber.epoch_start = info.history.same_interval_since;
+  if (msg->priority) {
+    scrubber.priority = msg->priority;
   } else {
-    // for jewel compatibility
-    vector<OSDOp> scrub(1);
-    scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
-    hobject_t poid;
-    eversion_t v;
-    osd_reqid_t reqid;
-    MOSDSubOp *subop = new MOSDSubOp(
-      reqid,
-      pg_whoami,
-      spg_t(info.pgid.pgid, get_primary().shard),
-      poid,
-      0,
-      msg->map_epoch,
-      osd->get_tid(),
-      v);
-    ::encode(map, subop->get_data());
-    subop->ops = scrub;
-    osd->send_message_osd_cluster(subop, msg->get_connection());
+    scrubber.priority = get_scrub_priority();
   }
+
+  scrub_can_preempt = msg->allow_preemption;
+  scrub_preempted = false;
+  scrubber.replica_scrubmap_pos.reset();
+
+  requeue_scrub(msg->high_priority);
 }
 
 /* Scrub:
@@ -4461,6 +4478,13 @@ void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
   scrub_queued = false;
   scrubber.needs_sleep = true;
 
+  // for the replica
+  if (!is_primary() &&
+      scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
+    chunky_scrub(handle);
+    return;
+  }
+
   if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
     dout(10) << "scrub -- not primary or active or not clean" << dendl;
     state_clear(PG_STATE_SCRUBBING);
@@ -4576,11 +4600,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 
   while (!done) {
     dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
-            << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
+            << " [" << scrubber.start << "," << scrubber.end << ")"
+            << " max_end " << scrubber.max_end << dendl;
 
     switch (scrubber.state) {
       case PG::Scrubber::INACTIVE:
         dout(10) << "scrub start" << dendl;
+       assert(is_primary());
 
         publish_stats_to_osd();
         scrubber.epoch_start = info.history.same_interval_since;
@@ -4613,14 +4639,25 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          osd->clog->debug(oss);
        }
 
-       scrubber.seed = -1;
-
+       scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+         "osd_scrub_max_preemptions");
+       scrubber.preempt_divisor = 1;
         break;
 
       case PG::Scrubber::NEW_CHUNK:
         scrubber.primary_scrubmap = ScrubMap();
         scrubber.received_maps.clear();
 
+       // begin (possible) preemption window
+       if (scrub_preempted) {
+         scrubber.preempt_left--;
+         scrubber.preempt_divisor *= 2;
+         dout(10) << __func__ << " preempted, " << scrubber.preempt_left
+                  << " left" << dendl;
+         scrub_preempted = false;
+       }
+       scrub_can_preempt = scrubber.preempt_left > 0;
+
         {
           /* get the start and end of our scrub chunk
           *
@@ -4638,14 +4675,18 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           * left end of the range if we are a tier because they may legitimately
           * not exist (see _scrub).
           */
-         int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
+         int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
+                                     scrubber.preempt_divisor);
+         int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
+                                      scrubber.preempt_divisor);
           hobject_t start = scrubber.start;
          hobject_t candidate_end;
          vector<hobject_t> objects;
+         osr->flush();
          ret = get_pgbackend()->objects_list_partial(
            start,
            min,
-           MAX(min, cct->_conf->osd_scrub_chunk_max),
+           max,
            &objects,
            &candidate_end);
          assert(ret >= 0);
@@ -4680,6 +4721,8 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
            break;
          }
          scrubber.end = candidate_end;
+         if (scrubber.end > scrubber.max_end)
+           scrubber.max_end = scrubber.end;
         }
 
         // walk the log to find the latest update that affects our chunk
@@ -4709,7 +4752,6 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         // ask replicas to wait until
         // last_update_applied >= scrubber.subset_last_update and then scan
         scrubber.waiting_on_whom.insert(pg_whoami);
-        ++scrubber.waiting_on;
 
         // request maps from replicas
        for (set<pg_shard_t>::iterator i = actingbackfill.begin();
@@ -4718,13 +4760,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          if (*i == pg_whoami) continue;
           _request_scrub_map(*i, scrubber.subset_last_update,
                              scrubber.start, scrubber.end, scrubber.deep,
-                            scrubber.seed);
+                            scrubber.preempt_left > 0);
           scrubber.waiting_on_whom.insert(*i);
-          ++scrubber.waiting_on;
         }
+       dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
+                << dendl;
 
         scrubber.state = PG::Scrubber::WAIT_PUSHES;
-
         break;
 
       case PG::Scrubber::WAIT_PUSHES:
@@ -4737,49 +4779,76 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         break;
 
       case PG::Scrubber::WAIT_LAST_UPDATE:
-        if (last_update_applied >= scrubber.subset_last_update) {
-          scrubber.state = PG::Scrubber::BUILD_MAP;
-        } else {
+        if (last_update_applied < scrubber.subset_last_update) {
           // will be requeued by op_applied
           dout(15) << "wait for writes to flush" << dendl;
           done = true;
-        }
+         break;
+       }
+
+       scrubber.state = PG::Scrubber::BUILD_MAP;
+       scrubber.primary_scrubmap_pos.reset();
         break;
 
       case PG::Scrubber::BUILD_MAP:
         assert(last_update_applied >= scrubber.subset_last_update);
 
         // build my own scrub map
-        ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
-                                    scrubber.start, scrubber.end,
-                                    scrubber.deep, scrubber.seed,
-                                   handle);
-        if (ret < 0) {
-          dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted" << dendl;
+         scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
+         break;
+       }
+       ret = build_scrub_map_chunk(
+         scrubber.primary_scrubmap,
+         scrubber.primary_scrubmap_pos,
+         scrubber.start, scrubber.end,
+         scrubber.deep,
+         handle);
+       if (ret == -EINPROGRESS) {
+         requeue_scrub();
+         done = true;
+         break;
+       }
+       scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
+       break;
+
+      case PG::Scrubber::BUILD_MAP_DONE:
+       if (scrubber.primary_scrubmap_pos.ret < 0) {
+         dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
+                 << ", aborting" << dendl;
           scrub_clear_state();
           scrub_unreserve_replicas();
           return;
         }
-
-        --scrubber.waiting_on;
+       dout(10) << __func__ << " waiting_on_whom was "
+                << scrubber.waiting_on_whom << dendl;
+       assert(scrubber.waiting_on_whom.count(pg_whoami));
         scrubber.waiting_on_whom.erase(pg_whoami);
 
         scrubber.state = PG::Scrubber::WAIT_REPLICAS;
         break;
 
       case PG::Scrubber::WAIT_REPLICAS:
-        if (scrubber.waiting_on > 0) {
+        if (!scrubber.waiting_on_whom.empty()) {
           // will be requeued by sub_op_scrub_map
           dout(10) << "wait for replicas to build scrub map" << dendl;
           done = true;
-        } else {
+         break;
+       }
+       // end (possible) preemption window
+       scrub_can_preempt = false;
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted, restarting chunk" << dendl;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
+       } else {
           scrubber.state = PG::Scrubber::COMPARE_MAPS;
         }
         break;
 
       case PG::Scrubber::COMPARE_MAPS:
         assert(last_update_applied >= scrubber.subset_last_update);
-        assert(scrubber.waiting_on == 0);
+        assert(scrubber.waiting_on_whom.empty());
 
         scrub_compare_maps();
        scrubber.start = scrubber.end;
@@ -4801,8 +4870,12 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          break;
        }
 
+       scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+         "osd_scrub_max_preemptions");
+       scrubber.preempt_divisor = 1;
+
        if (!(scrubber.end.is_max())) {
-          scrubber.state = PG::Scrubber::NEW_CHUNK;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
          requeue_scrub();
           done = true;
         } else {
@@ -4823,12 +4896,99 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 
         break;
 
+      case PG::Scrubber::BUILD_MAP_REPLICA:
+        // build my own scrub map
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted" << dendl;
+         ret = 0;
+       } else {
+         ret = build_scrub_map_chunk(
+           scrubber.replica_scrubmap,
+           scrubber.replica_scrubmap_pos,
+           scrubber.start, scrubber.end,
+           scrubber.deep,
+           handle);
+       }
+       if (ret == -EINPROGRESS) {
+         requeue_scrub();
+         done = true;
+         break;
+       }
+       // reply
+       if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
+         MOSDRepScrubMap *reply = new MOSDRepScrubMap(
+           spg_t(info.pgid.pgid, get_primary().shard),
+           scrubber.replica_scrub_start,
+           pg_whoami);
+         reply->preempted = scrub_preempted;
+         ::encode(scrubber.replica_scrubmap, reply->get_data());
+         osd->send_message_osd_cluster(
+           get_primary().osd, reply,
+           scrubber.replica_scrub_start);
+       } else {
+         // for jewel compatibility
+         vector<OSDOp> scrub(1);
+         scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
+         hobject_t poid;
+         eversion_t v;
+         osd_reqid_t reqid;
+         MOSDSubOp *subop = new MOSDSubOp(
+           reqid,
+           pg_whoami,
+           spg_t(info.pgid.pgid, get_primary().shard),
+           poid,
+           0,
+           scrubber.replica_scrub_start,
+           osd->get_tid(),
+           v);
+         ::encode(scrubber.replica_scrubmap, subop->get_data());
+         subop->ops = scrub;
+         osd->send_message_osd_cluster(
+           get_primary().osd, subop,
+           scrubber.replica_scrub_start);
+       }
+       scrub_preempted = false;
+       scrub_can_preempt = false;
+       scrubber.state = PG::Scrubber::INACTIVE;
+       scrubber.replica_scrubmap = ScrubMap();
+       scrubber.replica_scrubmap_pos = ScrubMapBuilder();
+       scrubber.start = hobject_t();
+       scrubber.end = hobject_t();
+       scrubber.max_end = hobject_t();
+       done = true;
+       break;
+
       default:
         ceph_abort();
     }
   }
   dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
-          << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
+          << " [" << scrubber.start << "," << scrubber.end << ")"
+          << " max_end " << scrubber.max_end << dendl;
+}
+
+bool PG::write_blocked_by_scrub(const hobject_t& soid)
+{
+  if (soid < scrubber.start || soid >= scrubber.end) {
+    return false;
+  }
+  if (scrub_can_preempt) {
+    if (!scrub_preempted) {
+      dout(10) << __func__ << " " << soid << " preempted" << dendl;
+      scrub_preempted = true;
+    } else {
+      dout(10) << __func__ << " " << soid << " already preempted" << dendl;
+    }
+    return false;
+  }
+  return true;
+}
+
+bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
+{
+  // does [start, end] intersect [scrubber.start, scrubber.max_end)
+  return (start < scrubber.max_end &&
+         end >= scrubber.start);
 }
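
Together these two helpers implement the preemption policy spelled out next to
their declarations in PG.h below: within the preemption budget a conflicting
write flips scrub_preempted and proceeds; once the budget is spent it blocks.
A self-contained restatement of the rule (names illustrative):

    bool blocks_write(const hobject_t& soid,
                      const hobject_t& start, const hobject_t& end,
                      bool can_preempt, bool *preempted) {
      if (soid < start || soid >= end)
        return false;             // outside the active chunk
      if (can_preempt) {
        *preempted = true;        // the scrub will redo this chunk...
        return false;             // ...and the write goes ahead
      }
      return true;                // budget exhausted: the write waits
    }
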
 
 void PG::scrub_clear_state()
@@ -4857,33 +5017,52 @@ void PG::scrub_compare_maps()
 
   // construct authoritative scrub map for type specific scrubbing
   scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
-  map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
+  map<hobject_t,
+      pair<boost::optional<uint32_t>,
+           boost::optional<uint32_t>>> missing_digest;
+
+  map<pg_shard_t, ScrubMap *> maps;
+  maps[pg_whoami] = &scrubber.primary_scrubmap;
+
+  for (const auto& i : actingbackfill) {
+    if (i == pg_whoami) continue;
+    dout(2) << __func__ << " replica " << i << " has "
+            << scrubber.received_maps[i].objects.size()
+            << " items" << dendl;
+    maps[i] = &scrubber.received_maps[i];
+  }
+
+  set<hobject_t> master_set;
+
+  // Construct master set
+  for (const auto& map : maps) {
+    for (const auto& i : map.second->objects) {
+      master_set.insert(i.first);
+    }
+  }
+
+  stringstream ss;
+  get_pgbackend()->be_large_omap_check(maps, master_set,
+                                       scrubber.large_omap_objects, ss);
+  if (!ss.str().empty()) {
+    osd->clog->warn(ss);
+  }
 
   if (acting.size() > 1) {
     dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
 
-    stringstream ss;
-
     // Map from object with errors to good peer
     map<hobject_t, list<pg_shard_t>> authoritative;
-    map<pg_shard_t, ScrubMap *> maps;
 
     dout(2) << __func__ << "   osd." << acting[0] << " has "
            << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
-    maps[pg_whoami] = &scrubber.primary_scrubmap;
 
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-        i != actingbackfill.end();
-        ++i) {
-      if (*i == pg_whoami) continue;
-      dout(2) << __func__ << " replica " << *i << " has "
-             << scrubber.received_maps[*i].objects.size()
-             << " items" << dendl;
-      maps[*i] = &scrubber.received_maps[*i];
-    }
+    ss.str("");
+    ss.clear();
 
     get_pgbackend()->be_compare_scrubmaps(
       maps,
+      master_set,
       state_test(PG_STATE_REPAIR),
       scrubber.missing,
       scrubber.inconsistent,
@@ -4926,26 +5105,12 @@ void PG::scrub_compare_maps()
   }
 
   ScrubMap for_meta_scrub;
-  if (scrubber.end.is_max() ||
-      scrubber.cleaned_meta_map.objects.empty()) {
-    scrubber.cleaned_meta_map.swap(for_meta_scrub);
-  } else {
-    auto iter = scrubber.cleaned_meta_map.objects.end();
-    --iter; // not empty, see if clause
-    auto begin = scrubber.cleaned_meta_map.objects.begin();
-    while (iter != begin) {
-      auto next = iter--;
-      if (next->first.get_head() != iter->first.get_head()) {
-       ++iter;
-       break;
-      }
-    }
-    for_meta_scrub.objects.insert(begin, iter);
-    scrubber.cleaned_meta_map.objects.erase(begin, iter);
-  }
+  scrubber.clean_meta_map(for_meta_scrub);
 
   // ok, do the pg-type specific scrubbing
   scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+  // Called here on the primary; a non-primary instead runs _scan_snaps from
+  // build_scrub_map_chunk(), where it can use the cleaned (authoritative) map
+  _scan_snaps(for_meta_scrub);
   if (!scrubber.store->empty()) {
     if (state_test(PG_STATE_REPAIR)) {
       dout(10) << __func__ << ": discarding scrub results" << dendl;
@@ -5077,6 +5242,7 @@ void PG::scrub_finish()
       info.history.last_clean_scrub_stamp = now;
     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
     info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
+    info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
   } else {
     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
     // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
@@ -5153,9 +5319,6 @@ bool PG::append_log_entries_update_missing(
   assert(entries.begin()->version > info.last_update);
 
   PGLogEntryHandler rollbacker{this, &t};
-  if (roll_forward_to) {
-    pg_log.roll_forward(&rollbacker);
-  }
   bool invalidate_stats =
     pg_log.append_new_log_entries(info.last_backfill,
                                  info.last_backfill_bitwise,
@@ -7125,6 +7288,12 @@ boost::statechart::result
 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
 {
   PG *pg = context< RecoveryMachine >().pg;
+  if (!pg->state_test(PG_STATE_RECOVERING)) {
+    // we may have finished recovery and have an AllReplicasRecovered
+    // event queued to move us to the next state.
+    ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
+    return discard_event();
+  }
   ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
   pg->state_clear(PG_STATE_RECOVERING);
   pg->state_set(PG_STATE_RECOVERY_WAIT);
@@ -7483,10 +7652,9 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
     q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
     q.f->dump_stream("scrubber.start") << pg->scrubber.start;
     q.f->dump_stream("scrubber.end") << pg->scrubber.end;
+    q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
     q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
     q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
-    q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
-    q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
     {
       q.f->open_array_section("scrubber.waiting_on_whom");
       for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
@@ -8174,7 +8342,8 @@ boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &adv
   int64_t poolnum = pg->info.pgid.pool();
 
  // Reset if min_size turned smaller than its previous value; the pg might now be able to go active
-  if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
+  if (!advmap.osdmap->have_pg_pool(poolnum) ||
+      advmap.lastmap->get_pools().find(poolnum)->second.min_size >
       advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
     post_event(advmap);
     return transit< Reset >();
index 37cbf454e72d92b541afd6d3d7fa54146b2549af..932dc51a181c3bd00567c5fc882bac5aee031d55 100644 (file)
@@ -1211,12 +1211,16 @@ public:
 
     // common to both scrubs
     bool active;
-    int waiting_on;
     set<pg_shard_t> waiting_on_whom;
     int shallow_errors;
     int deep_errors;
+    int large_omap_objects = 0;
     int fixed;
     ScrubMap primary_scrubmap;
+    ScrubMapBuilder primary_scrubmap_pos;
+    epoch_t replica_scrub_start = 0;
+    ScrubMap replica_scrubmap;
+    ScrubMapBuilder replica_scrubmap_pos;
     map<pg_shard_t, ScrubMap> received_maps;
     OpRequestRef active_rep_scrub;
     utime_t scrub_reg_stamp;  // stamp we registered for
@@ -1245,11 +1249,36 @@ public:
     // Cleaned map pending snap metadata scrub
     ScrubMap cleaned_meta_map;
 
+    void clean_meta_map(ScrubMap &for_meta_scrub) {
+      if (end.is_max() ||
+          cleaned_meta_map.objects.empty()) {
+         cleaned_meta_map.swap(for_meta_scrub);
+      } else {
+        auto iter = cleaned_meta_map.objects.end();
+        --iter; // not empty, see if clause
+        auto begin = cleaned_meta_map.objects.begin();
+        if (iter->first.has_snapset()) {
+          ++iter;
+        } else {
+          while (iter != begin) {
+            auto next = iter--;
+            if (next->first.get_head() != iter->first.get_head()) {
+             ++iter;
+             break;
+            }
+          }
+        }
+        for_meta_scrub.objects.insert(begin, iter);
+        cleaned_meta_map.objects.erase(begin, iter);
+      }
+    }
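
clean_meta_map() hands the metadata scrub only complete head-plus-clones
groups, holding back a trailing group whose head has not been scanned yet
(unless the last object already carries its SnapSet). An illustrative layout,
recalling that hobject_t ordering sorts clones before their head:

    // cleaned_meta_map before: a:1, a:2, a(head), b:1, b:2  (chunk ended early)
    // after clean_meta_map():
    //   for_meta_scrub:   a:1, a:2, a(head)
    //   cleaned_meta_map: b:1, b:2  (kept until a later chunk brings b's head)
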
+
     // digest updates which we are waiting on
     int num_digest_updates_pending;
 
     // chunky scrub
-    hobject_t start, end;
+    hobject_t start, end;    // [start,end)
+    hobject_t max_end;       // Largest end that may have been sent to replicas
     eversion_t subset_last_update;
 
     // chunky scrub state
@@ -1259,16 +1288,19 @@ public:
       WAIT_PUSHES,
       WAIT_LAST_UPDATE,
       BUILD_MAP,
+      BUILD_MAP_DONE,
       WAIT_REPLICAS,
       COMPARE_MAPS,
       WAIT_DIGEST_UPDATES,
       FINISH,
+      BUILD_MAP_REPLICA,
     } state;
 
     std::unique_ptr<Scrub::Store> store;
     // deep scrub
     bool deep;
-    uint32_t seed;
+    int preempt_left;
+    int preempt_divisor;
 
     list<Context*> callbacks;
     void add_callback(Context *context) {
@@ -1293,26 +1325,21 @@ public:
         case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
         case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
         case BUILD_MAP: ret = "BUILD_MAP"; break;
+        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
         case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
         case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
         case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
         case FINISH: ret = "FINISH"; break;
+        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
       }
       return ret;
     }
 
     bool is_chunky_scrub_active() const { return state != INACTIVE; }
 
-    // classic (non chunk) scrubs block all writes
-    // chunky scrubs only block writes to a range
-    bool write_blocked_by_scrub(const hobject_t &soid) {
-      return (soid >= start && soid < end);
-    }
-
     // clear all state
     void reset() {
       active = false;
-      waiting_on = 0;
       waiting_on_whom.clear();
       if (active_rep_scrub) {
         active_rep_scrub = OpRequestRef();
@@ -1327,17 +1354,22 @@ public:
       state = PG::Scrubber::INACTIVE;
       start = hobject_t();
       end = hobject_t();
+      max_end = hobject_t();
       subset_last_update = eversion_t();
       shallow_errors = 0;
       deep_errors = 0;
+      large_omap_objects = 0;
       fixed = 0;
       deep = false;
-      seed = 0;
       run_callbacks();
       inconsistent.clear();
       missing.clear();
       authoritative.clear();
       num_digest_updates_pending = 0;
+      primary_scrubmap = ScrubMap();
+      primary_scrubmap_pos.reset();
+      replica_scrubmap = ScrubMap();
+      replica_scrubmap_pos.reset();
       cleaned_meta_map = ScrubMap();
       sleeping = false;
       needs_sleep = true;
@@ -1352,6 +1384,17 @@ public:
 
   int active_pushes;
 
+  bool scrub_can_preempt = false;
+  bool scrub_preempted = false;
+
+  // we allow some number of preemptions of the scrub, during which we do
+  // not block; after that we start to block.  once we start blocking, we
+  // do not stop until the scrub range is completed.
+  bool write_blocked_by_scrub(const hobject_t &soid);
+
+  /// true if the given range intersects the scrub interval in any way
+  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
+
   void repair_object(
     const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
     pg_shard_t bad_peer);
@@ -1373,10 +1416,11 @@ public:
     ThreadPool::TPHandle &handle);
   void _request_scrub_map(pg_shard_t replica, eversion_t version,
                           hobject_t start, hobject_t end, bool deep,
-                         uint32_t seed);
+                         bool allow_preemption);
   int build_scrub_map_chunk(
     ScrubMap &map,
-    hobject_t start, hobject_t end, bool deep, uint32_t seed,
+    ScrubMapBuilder &pos,
+    hobject_t start, hobject_t end, bool deep,
     ThreadPool::TPHandle &handle);
   /**
    * returns true if [begin, end) is good to scrub at this time
@@ -1387,7 +1431,9 @@ public:
     const hobject_t &begin, const hobject_t &end) = 0;
   virtual void scrub_snapshot_metadata(
     ScrubMap &map,
-    const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
+    const std::map<hobject_t,
+                   pair<boost::optional<uint32_t>,
+                        boost::optional<uint32_t>>> &missing_digest) { }
   virtual void _scrub_clear_state() { }
   virtual void _scrub_finish() { }
   virtual void split_colls(
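
Switching missing_digest to boost::optional values makes "no digest available"
representable, so the repair path can clear a stale digest instead of
recording a meaningless one. A sketch of the producer side of that contract
(helper name illustrative):

    // Empty optional => clear_*_digest(); engaged => set_*_digest(value).
    boost::optional<uint32_t> digest_or_none(bool present, uint32_t digest) {
      boost::optional<uint32_t> ret;
      if (present)
        ret = digest;
      return ret;
    }
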
index 673ff89390382f0f9e190d18819ed9359f0db56d..7c508adf396d5ddb102f6fa2b9111404f3efddc4 100644 (file)
@@ -570,58 +570,53 @@ PGBackend *PGBackend::build_pg_backend(
   }
 }
 
-/*
- * pg lock may or may not be held
- */
-void PGBackend::be_scan_list(
-  ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
-  ThreadPool::TPHandle &handle)
+int PGBackend::be_scan_list(
+  ScrubMap &map,
+  ScrubMapBuilder &pos)
 {
-  dout(10) << __func__ << " scanning " << ls.size() << " objects"
-           << (deep ? " deeply" : "") << dendl;
-  int i = 0;
-  for (vector<hobject_t>::const_iterator p = ls.begin();
-       p != ls.end();
-       ++p, i++) {
-    handle.reset_tp_timeout();
-    hobject_t poid = *p;
-
-    struct stat st;
-    int r = store->stat(
+  dout(10) << __func__ << " " << pos << dendl;
+  assert(!pos.done());
+  assert(pos.pos < pos.ls.size());
+  hobject_t& poid = pos.ls[pos.pos];
+
+  struct stat st;
+  int r = store->stat(
+    ch,
+    ghobject_t(
+      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+    &st,
+    true);
+  if (r == 0) {
+    ScrubMap::object &o = map.objects[poid];
+    o.size = st.st_size;
+    assert(!o.negative);
+    store->getattrs(
       ch,
       ghobject_t(
        poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-      &st,
-      true);
-    if (r == 0) {
-      ScrubMap::object &o = map.objects[poid];
-      o.size = st.st_size;
-      assert(!o.negative);
-      store->getattrs(
-       ch,
-       ghobject_t(
-         poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-       o.attrs);
-
-      // calculate the CRC32 on deep scrubs
-      if (deep) {
-       be_deep_scrub(*p, seed, o, handle);
-      }
+      o.attrs);
 
-      dout(25) << __func__ << "  " << poid << dendl;
-    } else if (r == -ENOENT) {
-      dout(25) << __func__ << "  " << poid << " got " << r
-              << ", skipping" << dendl;
-    } else if (r == -EIO) {
-      dout(25) << __func__ << "  " << poid << " got " << r
-              << ", stat_error" << dendl;
-      ScrubMap::object &o = map.objects[poid];
-      o.stat_error = true;
-    } else {
-      derr << __func__ << " got: " << cpp_strerror(r) << dendl;
-      ceph_abort();
+    if (pos.deep) {
+      r = be_deep_scrub(poid, map, pos, o);
     }
+    dout(25) << __func__ << "  " << poid << dendl;
+  } else if (r == -ENOENT) {
+    dout(25) << __func__ << "  " << poid << " got " << r
+            << ", skipping" << dendl;
+  } else if (r == -EIO) {
+    dout(25) << __func__ << "  " << poid << " got " << r
+            << ", stat_error" << dendl;
+    ScrubMap::object &o = map.objects[poid];
+    o.stat_error = true;
+  } else {
+    derr << __func__ << " got: " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+  if (r == -EINPROGRESS) {
+    return -EINPROGRESS;
   }
+  pos.next_object();
+  return 0;
 }
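
be_scan_list() now handles exactly one object per call and propagates
-EINPROGRESS when a deep scrub yields mid-object; the ScrubMapBuilder cursor
remembers where to resume. The calling convention restated as a standalone
driver (a sketch; the real loop lives in build_scrub_map_chunk() above):

    int scan_chunk(PGBackend *be, ScrubMap& map, ScrubMapBuilder& pos) {
      while (!pos.done()) {
        int r = be->be_scan_list(map, pos);
        if (r == -EINPROGRESS)
          return r;   // requeue the scrub and call again with the same pos
      }
      return 0;       // pos.ret still carries any listing error
    }
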
 
 bool PGBackend::be_compare_scrub_objects(
@@ -934,29 +929,21 @@ out:
 
 void PGBackend::be_compare_scrubmaps(
   const map<pg_shard_t,ScrubMap*> &maps,
+  const set<hobject_t> &master_set,
   bool repair,
   map<hobject_t, set<pg_shard_t>> &missing,
   map<hobject_t, set<pg_shard_t>> &inconsistent,
   map<hobject_t, list<pg_shard_t>> &authoritative,
-  map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
+  map<hobject_t, pair<boost::optional<uint32_t>,
+                      boost::optional<uint32_t>>> &missing_digest,
   int &shallow_errors, int &deep_errors,
   Scrub::Store *store,
   const spg_t& pgid,
   const vector<int> &acting,
   ostream &errorstream)
 {
-  map<hobject_t,ScrubMap::object>::const_iterator i;
-  map<pg_shard_t, ScrubMap *>::const_iterator j;
-  set<hobject_t> master_set;
   utime_t now = ceph_clock_now();
 
-  // Construct master set
-  for (j = maps.begin(); j != maps.end(); ++j) {
-    for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
-      master_set.insert(i->first);
-    }
-  }
-
   // Check maps against master set and each other
   for (set<hobject_t>::const_iterator k = master_set.begin();
        k != master_set.end();
@@ -989,7 +976,7 @@ void PGBackend::be_compare_scrubmaps(
     set<pg_shard_t> cur_missing;
     set<pg_shard_t> cur_inconsistent;
 
-    for (j = maps.begin(); j != maps.end(); ++j) {
+    for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
       if (j == auth)
        shard_map[auth->first].selected_oi = true;
       if (j->second->objects.count(*k)) {
@@ -1072,16 +1059,12 @@ void PGBackend::be_compare_scrubmaps(
        FORCE = 2,
       } update = NO;
 
-      if (auth_object.digest_present && auth_object.omap_digest_present &&
-         (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
-       dout(20) << __func__ << " missing digest on " << *k << dendl;
+      if (auth_object.digest_present && !auth_oi.is_data_digest()) {
+       dout(20) << __func__ << " missing data digest on " << *k << dendl;
        update = MAYBE;
       }
-      if (auth_object.digest_present && auth_object.omap_digest_present &&
-         cct->_conf->osd_debug_scrub_chance_rewrite_digest &&
-         (((unsigned)rand() % 100) >
-          cct->_conf->osd_debug_scrub_chance_rewrite_digest)) {
-       dout(20) << __func__ << " randomly updating digest on " << *k << dendl;
+      if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
+       dout(20) << __func__ << " missing omap digest on " << *k << dendl;
        update = MAYBE;
       }
 
@@ -1111,9 +1094,16 @@ void PGBackend::be_compare_scrubmaps(
        utime_t age = now - auth_oi.local_mtime;
        if (update == FORCE ||
            age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
-         dout(20) << __func__ << " will update digest on " << *k << dendl;
-         missing_digest[*k] = make_pair(auth_object.digest,
-                                        auth_object.omap_digest);
+          boost::optional<uint32_t> data_digest, omap_digest;
+          if (auth_object.digest_present) {
+            data_digest = auth_object.digest;
+           dout(20) << __func__ << " will update data digest on " << *k << dendl;
+          }
+          if (auth_object.omap_digest_present) {
+            omap_digest = auth_object.omap_digest;
+           dout(20) << __func__ << " will update omap digest on " << *k << dendl;
+          }
+         missing_digest[*k] = make_pair(data_digest, omap_digest);
        } else {
          dout(20) << __func__ << " missing digest but age " << age
                   << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
@@ -1131,3 +1121,35 @@ out:
     }
   }
 }
+
+void PGBackend::be_large_omap_check(const map<pg_shard_t,ScrubMap*> &maps,
+  const set<hobject_t> &master_set,
+  int& large_omap_objects,
+  ostream &warnstream) const
+{
+  bool needs_check = false;
+  for (const auto& map : maps) {
+    if (map.second->has_large_omap_object_errors) {
+      needs_check = true;
+      break;
+    }
+  }
+
+  if (!needs_check) {
+    return;
+  }
+
+  // Iterate through objects and check large omap object flag
+  for (const auto& k : master_set) {
+    for (const auto& map : maps) {
+      ScrubMap::object& obj = map.second->objects[k];
+      if (obj.large_omap_object_found) {
+        large_omap_objects++;
+        warnstream << "Large omap object found. Object: " << k << " Key count: "
+                   << obj.large_omap_object_key_count << " Size (bytes): "
+                   << obj.large_omap_object_value_size << '\n';
+        break;
+      }
+    }
+  }
+}
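
be_large_omap_check() only aggregates flags already recorded in the shard
maps, and the break counts each object at most once even if several shards
flagged it. How an object presumably gets flagged during deep scrub (the
threshold option names are assumptions, not taken from this diff):

    // Sketch of the producer side, with assumed config option names.
    if (keys > cct->_conf->osd_deep_scrub_large_omap_object_key_threshold ||
        bytes > cct->_conf->osd_deep_scrub_large_omap_object_value_size_threshold) {
      o.large_omap_object_found = true;
      o.large_omap_object_key_count = keys;
      o.large_omap_object_value_size = bytes;
      map.has_large_omap_object_errors = true;
    }
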
index cb8a1115c06f6e7892930f71934c5cc205ec1a0a..d69e511d36f90a7a1ff68acac9fdab539044262c 100644 (file)
@@ -132,6 +132,7 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
                                        eversion_t v,
                                        Context *on_complete) = 0;
 
+
      /**
       * Bless a context
       *
@@ -288,6 +289,7 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
 
      virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
 
+     virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
      virtual ~Listener() {}
    };
    Listener *parent;
@@ -558,9 +560,9 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
 
    virtual bool scrub_supported() = 0;
    virtual bool auto_repair_supported() const = 0;
-   void be_scan_list(
-     ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
-     ThreadPool::TPHandle &handle);
+   int be_scan_list(
+     ScrubMap &map,
+     ScrubMapBuilder &pos);
    bool be_compare_scrub_objects(
      pg_shard_t auth_shard,
      const ScrubMap::object &auth,
@@ -577,11 +579,13 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
      inconsistent_obj_wrapper &object_error);
    void be_compare_scrubmaps(
      const map<pg_shard_t,ScrubMap*> &maps,
+     const set<hobject_t> &master_set,
      bool repair,
      map<hobject_t, set<pg_shard_t>> &missing,
      map<hobject_t, set<pg_shard_t>> &inconsistent,
      map<hobject_t, list<pg_shard_t>> &authoritative,
-     map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
+     map<hobject_t, pair<boost::optional<uint32_t>,
+                         boost::optional<uint32_t>>> &missing_digest,
      int &shallow_errors, int &deep_errors,
      Scrub::Store *store,
      const spg_t& pgid,
@@ -589,11 +593,16 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
      ostream &errorstream);
    virtual uint64_t be_get_ondisk_size(
      uint64_t logical_size) = 0;
-   virtual void be_deep_scrub(
-     const hobject_t &poid,
-     uint32_t seed,
-     ScrubMap::object &o,
-     ThreadPool::TPHandle &handle) = 0;
+   virtual int be_deep_scrub(
+     const hobject_t &oid,
+     ScrubMap &map,
+     ScrubMapBuilder &pos,
+     ScrubMap::object &o) = 0;
+   void be_large_omap_check(
+     const map<pg_shard_t,ScrubMap*> &maps,
+     const set<hobject_t> &master_set,
+     int& large_omap_objects,
+     ostream &warnstream) const;
 
    static PGBackend *build_pg_backend(
      const pg_pool_t &pool,
index a77a9dae78fb19cff64415a15fc7ed27ac0e56ff..8dcbd700c1fe5367ca84abda354aee3e988e5d0f 100644 (file)
@@ -363,7 +363,9 @@ public:
     ) {
     auto &op = get_object_op_for_modify(hoid);
     for (auto &&i: attrs) {
-      op.attr_updates[i.first] = i.second;
+      auto& d = op.attr_updates[i.first];
+      d = i.second;
+      d->rebuild();
     }
   }
   void setattr(
@@ -372,7 +374,9 @@ public:
     bufferlist &bl                 ///< [in] val to write, may be claimed
     ) {
     auto &op = get_object_op_for_modify(hoid);
-    op.attr_updates[attrname] = bl;
+    auto& d = op.attr_updates[attrname];
+    d = bl;
+    d->rebuild();
   }
   void rmattr(
     const hobject_t &hoid,         ///< [in] object to write
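
attr_updates now stores boost::optional<bufferlist> (an empty optional
presumably marking the attr for removal, see rmattr), and every stored value
is flattened with rebuild() so later readers see one contiguous buffer. A tiny
sketch of what rebuild() buys (helper name illustrative):

    // A bufferlist assembled piecewise may hold many small buffer::ptrs;
    // rebuild() reallocates it into a single contiguous one.
    void flatten(bufferlist& bl) {
      if (bl.get_num_buffers() > 1)
        bl.rebuild();
    }
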
index 1358eefa69a9d3451dbc4d4c4bf3dc1f47e54e79..aaf9136a45e672b91acdbfcc2dc6f7c3ed3e3f28 100644 (file)
@@ -2059,8 +2059,8 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
     return;
   }
 
-  if (write_ordered &&
-      scrubber.write_blocked_by_scrub(head)) {
+  if (write_ordered && scrubber.is_chunky_scrub_active() &&
+      write_blocked_by_scrub(head)) {
     dout(20) << __func__ << ": waiting for scrub" << dendl;
     waiting_for_scrub.push_back(op);
     op->mark_delayed("waiting for scrub");
@@ -3127,7 +3127,7 @@ void PrimaryLogPG::promote_object(ObjectContextRef obc,
 {
   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
   assert(hoid != hobject_t());
-  if (scrubber.write_blocked_by_scrub(hoid)) {
+  if (write_blocked_by_scrub(hoid)) {
     dout(10) << __func__ << " " << hoid
             << " blocked by scrub" << dendl;
     if (op) {
@@ -4532,6 +4532,9 @@ int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
                              bufferlist::iterator *bl_it)
 {
   dout(20) << __func__ << dendl;
+  bool skip_data_digest =
+    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+    g_conf->osd_distrust_data_digest;
 
   auto& op = osd_op.op;
   if (op.checksum.chunk_size > 0) {
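
The skip_data_digest predicate introduced here reappears verbatim at each
read/write/copy site below: skip whole-object data digests when the store has
its own checksums (osd_skip_data_digest), or when recorded digests are not to
be trusted at all (osd_distrust_data_digest). Restated as a shared helper
(sketch; the diff inlines it at every call site):

    static bool skip_data_digest(ObjectStore *store) {
      return (store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
             g_conf->osd_distrust_data_digest;
    }
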
@@ -4586,7 +4589,8 @@ int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
     // If there is a data digest and it is possible we are reading
     // entire object, pass the digest.
     boost::optional<uint32_t> maybe_crc;
-    if (oi.is_data_digest() && op.checksum.offset == 0 &&
+    if (!skip_data_digest &&
+       oi.is_data_digest() && op.checksum.offset == 0 &&
         op.checksum.length >= oi.size) {
       maybe_crc = oi.data_digest;
     }
@@ -4734,6 +4738,9 @@ int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
 {
   dout(20) << __func__ << dendl;
   ceph_osd_op& op = osd_op.op;
+  bool skip_data_digest =
+    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+    g_conf->osd_distrust_data_digest;
 
   auto& oi = ctx->new_obs.oi;
   uint64_t size = oi.size;
@@ -4758,7 +4765,8 @@ int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
     // If there is a data digest and it is possible we are reading
     // entire object, pass the digest.
     boost::optional<uint32_t> maybe_crc;
-    if (oi.is_data_digest() && op.checksum.offset == 0 &&
+    if (!skip_data_digest &&
+       oi.is_data_digest() && op.checksum.offset == 0 &&
         op.checksum.length >= oi.size) {
       maybe_crc = oi.data_digest;
     }
@@ -4816,6 +4824,9 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
   __u32 seq = oi.truncate_seq;
   uint64_t size = oi.size;
   bool trimmed_read = false;
+  bool skip_data_digest =
+    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+    g_conf->osd_distrust_data_digest;
 
   // are we beyond truncate_size?
   if ( (seq < op.extent.truncate_seq) &&
@@ -4844,7 +4855,8 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
     // If there is a data digest and it is possible we are reading
     // entire object, pass the digest.  FillInVerifyExtent will
     // will check the oi.size again.
-    if (oi.is_data_digest() && op.extent.offset == 0 &&
+    if (!skip_data_digest &&
+       oi.is_data_digest() && op.extent.offset == 0 &&
         op.extent.length >= oi.size)
       maybe_crc = oi.data_digest;
     ctx->pending_async_reads.push_back(
@@ -4874,7 +4886,8 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
             << " bytes from obj " << soid << dendl;
 
     // whole object?  can we verify the checksum?
-    if (op.extent.length == oi.size && oi.is_data_digest()) {
+    if (!skip_data_digest &&
+       op.extent.length == oi.size && oi.is_data_digest()) {
       uint32_t crc = osd_op.outdata.crc32c(-1);
       if (oi.data_digest != crc) {
         osd->clog->error() << info.pgid << std::hex
@@ -4899,6 +4912,9 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
   auto& op = osd_op.op;
   auto& oi = ctx->new_obs.oi;
   auto& soid = oi.soid;
+  bool skip_data_digest =
+    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+    g_conf->osd_distrust_data_digest;
 
   if (op.extent.truncate_seq) {
     dout(0) << "sparse_read does not support truncation sequence " << dendl;
@@ -5012,7 +5028,8 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
    // At first there may not be many whole objects; with continued use, more
    // and more whole objects will exist. So from this point on, adding a
    // checksum for sparse-read makes sense.
-    if (total_read == oi.size && oi.is_data_digest()) {
+    if (!skip_data_digest &&
+       total_read == oi.size && oi.is_data_digest()) {
       uint32_t crc = data_bl.crc32c(-1);
       if (oi.data_digest != crc) {
         osd->clog->error() << info.pgid << std::hex
@@ -5045,6 +5062,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
   ObjectState& obs = ctx->new_obs;
   object_info_t& oi = obs.oi;
   const hobject_t& soid = oi.soid;
+  bool skip_data_digest =
+    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+    g_conf->osd_distrust_data_digest;
 
   PGTransaction* t = ctx->op_t.get();
 
@@ -5857,12 +5877,18 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
            soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
        }
 
-       if (op.extent.offset == 0 && op.extent.length >= oi.size)
+       if (op.extent.offset == 0 && op.extent.length >= oi.size
+            && !skip_data_digest) {
          obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
-       else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
-         obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
-       else
+       } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
+          if (skip_data_digest) {
+            obs.oi.clear_data_digest();
+          } else {
+           obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
+          }
+       } else {
          obs.oi.clear_data_digest();
+        }
        write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
                                    op.extent.offset, op.extent.length);
 
@@ -5894,7 +5920,11 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
        if (op.extent.length) {
          t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
        }
-       obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+        if (!skip_data_digest) {
+         obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+        } else {
+         obs.oi.clear_data_digest();
+       }
 
        write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
            0, op.extent.length, true);
@@ -7824,6 +7854,10 @@ int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
   int result = 0;
   object_copy_cursor_t cursor;
   uint64_t out_max;
+  bool skip_data_digest =
+    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
+    g_conf->osd_distrust_data_digest;
+
   try {
     ::decode(cursor, bp);
     ::decode(out_max, bp);
@@ -7858,7 +7892,7 @@ int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
   } else {
     reply_obj.snap_seq = obc->ssc->snapset.seq;
   }
-  if (oi.is_data_digest()) {
+  if (!skip_data_digest && oi.is_data_digest()) {
     reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
     reply_obj.data_digest = oi.data_digest;
   }
@@ -8428,8 +8462,16 @@ void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
   // CopyFromCallback fills this in for us
   obs.oi.user_version = ctx->user_at_version;
 
-  obs.oi.set_data_digest(cb->results->data_digest);
-  obs.oi.set_omap_digest(cb->results->omap_digest);
+  if (cb->results->is_data_digest()) {
+    obs.oi.set_data_digest(cb->results->data_digest);
+  } else {
+    obs.oi.clear_data_digest();
+  }
+  if (cb->results->is_omap_digest()) {
+    obs.oi.set_omap_digest(cb->results->omap_digest);
+  } else {
+    obs.oi.clear_omap_digest();
+  }
 
   obs.oi.truncate_seq = cb->results->truncate_seq;
   obs.oi.truncate_size = cb->results->truncate_size;
@@ -8620,11 +8662,16 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results,
     }
     tctx->new_obs.oi.size = results->object_size;
     tctx->new_obs.oi.user_version = results->user_version;
-    // Don't care src object whether have data or omap digest
-    if (results->object_size)
+    if (results->is_data_digest()) {
       tctx->new_obs.oi.set_data_digest(results->data_digest);
-    if (results->has_omap)
+    } else {
+      tctx->new_obs.oi.clear_data_digest();
+    }
+    if (results->is_omap_digest()) {
       tctx->new_obs.oi.set_omap_digest(results->omap_digest);
+    } else {
+      tctx->new_obs.oi.clear_omap_digest();
+    }
     tctx->new_obs.oi.truncate_seq = results->truncate_seq;
     tctx->new_obs.oi.truncate_size = results->truncate_size;
 
@@ -9042,7 +9089,7 @@ int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
   }
 
   if (!fop->blocking &&
-      scrubber.write_blocked_by_scrub(oid)) {
+      write_blocked_by_scrub(oid)) {
     if (fop->op) {
       dout(10) << __func__ << " blocked by scrub" << dendl;
       requeue_op(fop->op);
@@ -9082,10 +9129,18 @@ int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
        fop->op)) {
     dout(20) << __func__ << " took write lock" << dendl;
   } else if (fop->op) {
-    dout(10) << __func__ << " waiting on write lock" << dendl;
+    dout(10) << __func__ << " waiting on write lock " << fop->op << " "
+            << fop->dup_ops << dendl;
-    close_op_ctx(ctx.release());
-    requeue_op(fop->op);
-    requeue_ops(fop->dup_ops);
+    // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
+    for (auto op : fop->dup_ops) {
+      bool locked = ctx->lock_manager.get_lock_type(
+       ObjectContext::RWState::RWWRITE,
+       oid,
+       obc,
+       op);
+      assert(!locked);
+    }
+    // release ctx only after the dup ops have been queued via lock_manager
+    close_op_ctx(ctx.release());
     return -EAGAIN;    // will retry
   } else {
     dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
@@ -9776,7 +9831,7 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
     return;
   }
 
-  if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+  if (write_blocked_by_scrub(obc->obs.oi.soid)) {
     dout(10) << "handle_watch_timeout waiting for scrub on obj "
             << obc->obs.oi.soid
             << dendl;
@@ -10187,6 +10242,11 @@ int PrimaryLogPG::find_object_context(const hobject_t& oid,
   } else {
     auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
     assert(p != obc->ssc->snapset.clone_snaps.end());
+    if (p->second.empty()) {
+      dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
+      assert(!cct->_conf->osd_debug_verify_snaps);
+      return -ENOENT;
+    }
     first = p->second.back();
     last = p->second.front();
   }
@@ -12435,10 +12495,10 @@ void PrimaryLogPG::update_range(
   if (bi->version < info.log_tail) {
     dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
             << dendl;
+    osr->flush();
     if (last_update_applied >= info.log_tail) {
       bi->version = last_update_applied;
     } else {
-      osr->flush();
       bi->version = info.last_update;
     }
     scan_range(local_min, local_max, bi, handle);
@@ -12658,7 +12718,7 @@ void PrimaryLogPG::hit_set_remove_all()
     // Once we hit a degraded object just skip
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (scrubber.write_blocked_by_scrub(aoid))
+    if (write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -12777,7 +12837,7 @@ void PrimaryLogPG::hit_set_persist()
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (scrubber.write_blocked_by_scrub(aoid))
+    if (write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -12811,7 +12871,7 @@ void PrimaryLogPG::hit_set_persist()
     new_hset.using_gmt);
 
   // If the current object is degraded we skip this persist request
-  if (scrubber.write_blocked_by_scrub(oid))
+  if (write_blocked_by_scrub(oid))
     return;
 
   hit_set->seal();
@@ -13055,7 +13115,8 @@ bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
-    if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+    if (range_intersects_scrub(obc->obs.oi.soid,
+                              obc->obs.oi.soid.get_head())) {
       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
       osd->logger->inc(l_osd_agent_skip);
       continue;
@@ -13803,7 +13864,9 @@ unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
  */
 void PrimaryLogPG::scrub_snapshot_metadata(
   ScrubMap &scrubmap,
-  const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
+  const map<hobject_t,
+            pair<boost::optional<uint32_t>,
+                 boost::optional<uint32_t>>> &missing_digest)
 {
   dout(10) << __func__ << dendl;
 
@@ -14145,10 +14208,7 @@ void PrimaryLogPG::scrub_snapshot_metadata(
   if (head && (head_error.errors || soid_error_count))
     scrubber.store->add_snap_error(pool.id, head_error);
 
-  for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
-        missing_digest.begin();
-       p != missing_digest.end();
-       ++p) {
+  for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
     if (p->first.is_snapdir())
       continue;
     dout(10) << __func__ << " recording digests for " << p->first << dendl;
@@ -14168,8 +14228,16 @@ void PrimaryLogPG::scrub_snapshot_metadata(
     OpContextUPtr ctx = simple_opc_create(obc);
     ctx->at_version = get_next_version();
     ctx->mtime = utime_t();      // do not update mtime
-    ctx->new_obs.oi.set_data_digest(p->second.first);
-    ctx->new_obs.oi.set_omap_digest(p->second.second);
+    if (p->second.first) {
+      ctx->new_obs.oi.set_data_digest(*p->second.first);
+    } else {
+      ctx->new_obs.oi.clear_data_digest();
+    }
+    if (p->second.second) {
+      ctx->new_obs.oi.set_omap_digest(*p->second.second);
+    } else {
+      ctx->new_obs.oi.clear_omap_digest();
+    }
     finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
 
     ctx->register_on_success(
index e59f8c662daa6a9a473c855b786fe1f6483973d3..3f10cef187526c3ad2c0457989a6daf8f12a82fa 100644 (file)
@@ -829,7 +829,10 @@ protected:
     if (!to_req.empty()) {
       // requeue at front of scrub blocking queue if we are blocked by scrub
       for (auto &&p: to_req) {
-       if (scrubber.write_blocked_by_scrub(p.first.get_head())) {
+       if (write_blocked_by_scrub(p.first.get_head())) {
+          for (auto& op : p.second) {
+            op->mark_delayed("waiting for scrub");
+          }
          waiting_for_scrub.splice(
            waiting_for_scrub.begin(),
            p.second,
@@ -1318,7 +1321,9 @@ protected:
     const hobject_t &begin, const hobject_t &end) override;
   void scrub_snapshot_metadata(
     ScrubMap &map,
-    const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) override;
+    const std::map<hobject_t,
+                   pair<boost::optional<uint32_t>,
+                        boost::optional<uint32_t>>> &missing_digest) override;
   void _scrub_clear_state() override;
   void _scrub_finish() override;
   object_stat_collection_t scrub_cstat;
@@ -1785,6 +1790,9 @@ public:
   void on_shutdown() override;
   bool check_failsafe_full(ostream &ss) override;
   bool check_osdmap_full(const set<pg_shard_t> &missing_on) override;
+  bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
+    return write_blocked_by_scrub(oid);
+  }
   int rep_repair_primary_object(const hobject_t& soid, OpRequestRef op);
 
   // attr cache handling
index 081204a033fd91f4885f18d57b5ead56129a7d20..602a0f08f76b1338de66995acd2ce87a971612f0 100644 (file)
@@ -703,103 +703,140 @@ void ReplicatedBackend::do_repop_reply(OpRequestRef op)
   }
 }
 
-void ReplicatedBackend::be_deep_scrub(
+int ReplicatedBackend::be_deep_scrub(
   const hobject_t &poid,
-  uint32_t seed,
-  ScrubMap::object &o,
-  ThreadPool::TPHandle &handle)
+  ScrubMap &map,
+  ScrubMapBuilder &pos,
+  ScrubMap::object &o)
 {
-  dout(10) << __func__ << " " << poid << " seed " 
-          << std::hex << seed << std::dec << dendl;
-  bufferhash h(seed), oh(seed);
-  bufferlist bl, hdrbl;
+  dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
   int r;
-  __u64 pos = 0;
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+                           CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
 
-  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
-
-  while (true) {
-    handle.reset_tp_timeout();
-    r = store->read(
-         ch,
-         ghobject_t(
-           poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-         pos,
-         cct->_conf->osd_deep_scrub_stride, bl,
-         fadvise_flags);
-    if (r <= 0)
-      break;
-
-    h << bl;
-    pos += bl.length();
-    bl.clear();
+  utime_t sleeptime;
+  sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+  if (sleeptime != utime_t()) {
+    lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+    sleeptime.sleep();
   }
-  if (r == -EIO) {
-    dout(25) << __func__ << "  " << poid << " got "
-            << r << " on read, read_error" << dendl;
-    o.read_error = true;
-    return;
-  }
-  o.digest = h.digest();
-  o.digest_present = true;
 
-  bl.clear();
-  r = store->omap_get_header(
-    coll,
-    ghobject_t(
-      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-    &hdrbl, true);
-  // NOTE: bobtail to giant, we would crc the head as (len, head).
-  // that changes at the same time we start using a non-zero seed.
-  if (r == 0 && hdrbl.length()) {
-    dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length())
-             << dendl;
-    if (seed == 0) {
-      // legacy
-      bufferlist bl;
-      ::encode(hdrbl, bl);
-      oh << bl;
-    } else {
-      oh << hdrbl;
+  assert(poid == pos.ls[pos.pos]);
+  if (!pos.data_done()) {
+    if (pos.data_pos == 0) {
+      pos.data_hash = bufferhash(-1);
+    }
+
+    bufferlist bl;
+    r = store->read(
+      ch,
+      ghobject_t(
+       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      pos.data_pos,
+      cct->_conf->osd_deep_scrub_stride, bl,
+      fadvise_flags);
+    if (r < 0) {
+      dout(20) << __func__ << "  " << poid << " got "
+              << r << " on read, read_error" << dendl;
+      o.read_error = true;
+      return 0;
+    }
+    if (r > 0) {
+      pos.data_hash << bl;
+    }
+    pos.data_pos += r;
+    if (r == cct->_conf->osd_deep_scrub_stride) {
+      dout(20) << __func__ << "  " << poid << " more data, digest so far 0x"
+              << std::hex << pos.data_hash.digest() << std::dec << dendl;
+      return -EINPROGRESS;
+    }
+    // done with bytes
+    pos.data_pos = -1;
+    o.digest = pos.data_hash.digest();
+    o.digest_present = true;
+    dout(20) << __func__ << "  " << poid << " done with data, digest 0x"
+            << std::hex << o.digest << std::dec << dendl;
+  }
+
+  // omap header
+  if (pos.omap_pos.empty()) {
+    pos.omap_hash = bufferhash(-1);
+
+    bufferlist hdrbl;
+    r = store->omap_get_header(
+      coll,
+      ghobject_t(
+       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      &hdrbl, true);
+    if (r == -EIO) {
+      dout(20) << __func__ << "  " << poid << " got "
+              << r << " on omap header read, read_error" << dendl;
+      o.read_error = true;
+      return 0;
+    }
+    if (r == 0 && hdrbl.length()) {
+      dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length())
+              << dendl;
+      pos.omap_hash << hdrbl;
     }
-  } else if (r == -EIO) {
-    dout(25) << __func__ << "  " << poid << " got "
-            << r << " on omap header read, read_error" << dendl;
-    o.read_error = true;
-    return;
   }
 
+  // omap
   ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
     coll,
     ghobject_t(
       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
   assert(iter);
-  for (iter->seek_to_first(); iter->status() == 0 && iter->valid();
-    iter->next(false)) {
-    handle.reset_tp_timeout();
-
-    dout(25) << "CRC key " << iter->key() << " value:\n";
-    iter->value().hexdump(*_dout);
-    *_dout << dendl;
-
+  if (pos.omap_pos.length()) {
+    iter->lower_bound(pos.omap_pos);
+  } else {
+    iter->seek_to_first();
+  }
+  int max = g_conf->osd_deep_scrub_keys;
+  while (iter->status() == 0 && iter->valid()) {
+    pos.omap_bytes += iter->value().length();
+    ++pos.omap_keys;
+    --max;
+    // fixme: we can do this more efficiently.
+    bufferlist bl;
     ::encode(iter->key(), bl);
     ::encode(iter->value(), bl);
-    oh << bl;
-    bl.clear();
+    pos.omap_hash << bl;
+
+    iter->next();
+
+    if (iter->valid() && max == 0) {
+      pos.omap_pos = iter->key();
+      return -EINPROGRESS;
+    }
+    if (iter->status() < 0) {
+      dout(25) << __func__ << "  " << poid
+              << " on omap scan, db status error" << dendl;
+      o.read_error = true;
+      return 0;
+    }
   }
 
-  if (iter->status() < 0) {
-    dout(25) << __func__ << "  " << poid
-             << " on omap scan, db status error" << dendl;
-    o.read_error = true;
-    return;
+  if (pos.omap_keys > cct->_conf->
+       osd_deep_scrub_large_omap_object_key_threshold ||
+      pos.omap_bytes > cct->_conf->
+       osd_deep_scrub_large_omap_object_value_sum_threshold) {
+    dout(25) << __func__ << " " << poid
+            << " large omap object detected. Object has " << pos.omap_keys
+            << " keys and size " << pos.omap_bytes << " bytes" << dendl;
+    o.large_omap_object_found = true;
+    o.large_omap_object_key_count = pos.omap_keys;
+    o.large_omap_object_value_size = pos.omap_bytes;
+    map.has_large_omap_object_errors = true;
   }
 
-  //Store final calculated CRC32 of omap header & key/values
-  o.omap_digest = oh.digest();
+  o.omap_digest = pos.omap_hash.digest();
   o.omap_digest_present = true;
-  dout(20) << __func__ << "  " << poid << " omap_digest "
+  dout(20) << __func__ << " done with " << poid << " omap_digest "
           << std::hex << o.omap_digest << std::dec << dendl;
+
+  // done!
+  return 0;
 }
 
 void ReplicatedBackend::_do_push(OpRequestRef op)
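
be_deep_scrub now returns int instead of void: it hashes at most one osd_deep_scrub_stride read (or a batch of omap keys) per call, parks its position in the ScrubMapBuilder, and returns -EINPROGRESS until the object is finished, which is what makes the scrub resumable and preemptible between calls. A toy, self-contained model of that contract; the stride, digest, and names here are illustrative, not the real backend API:

    #include <algorithm>
    #include <cerrno>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <vector>

    struct Builder { std::size_t data_pos = 0; };  // models ScrubMapBuilder

    constexpr std::size_t kStride = 4;             // models osd_deep_scrub_stride

    int deep_scrub_step(const std::vector<uint8_t>& obj, Builder& pos,
                        uint32_t& hash) {
      std::size_t n = std::min(kStride, obj.size() - pos.data_pos);
      for (std::size_t i = 0; i < n; ++i)
        hash = hash * 31 + obj[pos.data_pos + i];  // toy digest, not CRC32c
      pos.data_pos += n;
      return n == kStride ? -EINPROGRESS : 0;      // full stride read: more left
    }

    int main() {
      std::vector<uint8_t> obj(10);
      std::iota(obj.begin(), obj.end(), 0);
      Builder pos;
      uint32_t hash = 0;
      // The caller can yield between steps, letting client writes preempt.
      while (deep_scrub_step(obj, pos, hash) == -EINPROGRESS) { /* yield */ }
      std::cout << "digest 0x" << std::hex << hash << "\n";
    }
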
@@ -1094,6 +1131,8 @@ void ReplicatedBackend::do_repop(OpRequestRef op)
   // we better not be missing this.
   assert(!parent->get_log().get_missing().is_missing(soid));
 
+  parent->maybe_preempt_replica_scrub(soid);
+
   int ackerosd = m->get_source().num();
 
   op->mark_started();
index 7cb1df40c63a6f84b70ff586ee73437adc3ad332..5c5c1bd2eed08028cccfb5ad10c75dbd3bc8ce54 100644 (file)
@@ -430,11 +430,11 @@ private:
   bool auto_repair_supported() const override { return false; }
 
 
-  void be_deep_scrub(
-    const hobject_t &obj,
-    uint32_t seed,
-    ScrubMap::object &o,
-    ThreadPool::TPHandle &handle) override;
+  int be_deep_scrub(
+    const hobject_t &poid,
+    ScrubMap &map,
+    ScrubMapBuilder &pos,
+    ScrubMap::object &o) override;
   uint64_t be_get_ondisk_size(uint64_t logical_size) override { return logical_size; }
 };
 
index 374e2429b439add892f84b47b2b76ba39c644e68..823b1428215145cbf2fd2573889dc7a3a283b603 100644 (file)
@@ -151,7 +151,10 @@ int SnapMapper::get_snaps(
     bufferlist::iterator bp = got.begin()->second.begin();
     ::decode(*out, bp);
     dout(20) << __func__ << " " << oid << " " << out->snaps << dendl;
-    assert(!out->snaps.empty());
+    if (out->snaps.empty()) {
+      dout(1) << __func__ << " " << oid << " empty snapset" << dendl;
+      assert(!cct->_conf->osd_debug_verify_snaps);
+    }
   } else {
     dout(20) << __func__ << " " << oid << " (out == NULL)" << dendl;
   }
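
Instead of asserting unconditionally on an empty snapset, get_snaps now logs the anomaly and aborts only when the osd_debug_verify_snaps debug option is enabled, so production OSDs survive stale SnapMapper entries while QA runs still fail fast. A minimal sketch of that log-then-conditionally-assert idiom; the names are illustrative, not the real SnapMapper interface:

    #include <cassert>
    #include <iostream>
    #include <set>

    bool debug_verify_snaps = false;  // models cct->_conf->osd_debug_verify_snaps

    void check_snaps(const std::set<int>& snaps) {
      if (snaps.empty()) {
        std::cerr << "empty snapset\n";  // dout(1)-style warning
        assert(!debug_verify_snaps);     // fail fast only in test runs
      }
    }

    int main() {
      check_snaps({});                   // logs, but survives by default
      debug_verify_snaps = true;         // QA suites flip this on...
      // check_snaps({});                // ...which would abort here
    }
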
index f90e8a28eb797fcb40c5a4a7e984bf0427bc82b5..5fb226221d1289a34696cd019b4bb7a347680444 100644 (file)
@@ -1354,7 +1354,10 @@ void pg_pool_t::remove_unmanaged_snap(snapid_t s)
   assert(is_unmanaged_snaps_mode());
   removed_snaps.insert(s);
   snap_seq = snap_seq + 1;
-  removed_snaps.insert(get_snap_seq());
+  // also insert the new snap_seq, to try to keep the interval_set contiguous
+  if (!removed_snaps.contains(get_snap_seq())) {
+    removed_snaps.insert(get_snap_seq());
+  }
 }
 
 SnapContext pg_pool_t::get_snap_context() const
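
remove_unmanaged_snap guards the second insert because Ceph's interval_set asserts on overlapping inserts: if the freshly bumped snap_seq is itself already in removed_snaps, the unguarded insert would abort the OSD. A compilable sketch of why the contains() check matters, using std::set to model membership (the real interval_set additionally coalesces adjacent ids into ranges):

    #include <cassert>
    #include <cstdint>
    #include <set>

    struct RemovedSnaps {                       // stand-in for interval_set<snapid_t>
      std::set<uint64_t> s;
      bool contains(uint64_t v) const { return s.count(v) != 0; }
      void insert(uint64_t v) { assert(!contains(v)); s.insert(v); }
    };

    int main() {
      RemovedSnaps removed_snaps;
      uint64_t snap_seq = 5;
      removed_snaps.insert(6);                  // snap 6 was already removed
      snap_seq = snap_seq + 1;                  // bump to 6, as the hunk does
      if (!removed_snaps.contains(snap_seq))    // the new guard
        removed_snaps.insert(snap_seq);         // unguarded, this would assert
    }
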
@@ -1539,6 +1542,8 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   }
 
   uint8_t v = 26;
+  // NOTE: any new encoding dependencies must be reflected by
+  // SIGNIFICANT_FEATURES
   if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
     // this was the first post-hammer thing we added; if it's missing, encode
     // like hammer.
@@ -1943,11 +1948,12 @@ void object_stat_sum_t::dump(Formatter *f) const
   f->dump_int("num_evict_mode_full", num_evict_mode_full);
   f->dump_int("num_objects_pinned", num_objects_pinned);
   f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
+  f->dump_int("num_large_omap_objects", num_large_omap_objects);
 }
 
 void object_stat_sum_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(16, 14, bl);
+  ENCODE_START(17, 14, bl);
 #if defined(CEPH_LITTLE_ENDIAN)
   bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
 #else
@@ -1986,6 +1992,7 @@ void object_stat_sum_t::encode(bufferlist& bl) const
   ::encode(num_objects_pinned, bl);
   ::encode(num_objects_missing, bl);
   ::encode(num_legacy_snapsets, bl);
+  ::encode(num_large_omap_objects, bl);
 #endif
   ENCODE_FINISH(bl);
 }
@@ -1993,9 +2000,9 @@ void object_stat_sum_t::encode(bufferlist& bl) const
 void object_stat_sum_t::decode(bufferlist::iterator& bl)
 {
   bool decode_finish = false;
-  DECODE_START(16, bl);
+  DECODE_START(17, bl);  // make sure to also update fast decode below
 #if defined(CEPH_LITTLE_ENDIAN)
-  if (struct_v >= 16) {
+  if (struct_v >= 17) {  // this must match newest decode version
     bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
     decode_finish = true;
   }
@@ -2040,6 +2047,9 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl)
     } else {
       num_legacy_snapsets = num_object_clones;  // upper bound
     }
+    if (struct_v >= 17) {
+      ::decode(num_large_omap_objects, bl);
+    }
   }
   DECODE_FINISH(bl);
 }
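
num_large_omap_objects follows the usual Ceph wire-compatibility rule visible in the two hunks above: the new field is appended at the end of the encoding, the version in ENCODE_START is bumped from 16 to 17, and decode reads the field only when struct_v >= 17. A toy encoder/decoder showing the version gate; note the real ENCODE_START also records a compat version and byte length so older decoders can skip trailing fields, which this sketch omits:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Stats {
      int64_t num_bytes = 0;
      int64_t num_large_omap_objects = 0;       // the field added in v17
    };

    void encode(const Stats& s, std::vector<int64_t>& bl) {
      bl.push_back(17);                         // struct_v, bumped from 16
      bl.push_back(s.num_bytes);
      bl.push_back(s.num_large_omap_objects);   // new fields always go last
    }

    Stats decode(const std::vector<int64_t>& bl) {
      Stats s;
      std::size_t i = 0;
      int64_t struct_v = bl[i++];
      s.num_bytes = bl[i++];
      if (struct_v >= 17)                       // tolerate pre-v17 encoders
        s.num_large_omap_objects = bl[i++];
      return s;
    }

    int main() {
      Stats s; s.num_bytes = 42; s.num_large_omap_objects = 5;
      std::vector<int64_t> bl;
      encode(s, bl);
      assert(decode(bl).num_large_omap_objects == 5);
    }
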
@@ -2079,6 +2089,7 @@ void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
   a.num_evict_mode_some = 1;
   a.num_evict_mode_full = 0;
   a.num_objects_pinned = 20;
+  a.num_large_omap_objects = 5;
   o.push_back(new object_stat_sum_t(a));
 }
 
@@ -2119,6 +2130,7 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
   num_evict_mode_full += o.num_evict_mode_full;
   num_objects_pinned += o.num_objects_pinned;
   num_legacy_snapsets += o.num_legacy_snapsets;
+  num_large_omap_objects += o.num_large_omap_objects;
 }
 
 void object_stat_sum_t::sub(const object_stat_sum_t& o)
@@ -2158,6 +2170,7 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o)
   num_evict_mode_full -= o.num_evict_mode_full;
   num_objects_pinned -= o.num_objects_pinned;
   num_legacy_snapsets -= o.num_legacy_snapsets;
+  num_large_omap_objects -= o.num_large_omap_objects;
 }
 
 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
@@ -2197,7 +2210,8 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
     l.num_evict_mode_some == r.num_evict_mode_some &&
     l.num_evict_mode_full == r.num_evict_mode_full &&
     l.num_objects_pinned == r.num_objects_pinned &&
-    l.num_legacy_snapsets == r.num_legacy_snapsets;
+    l.num_legacy_snapsets == r.num_legacy_snapsets &&
+    l.num_large_omap_objects == r.num_large_omap_objects;
 }
 
 // -- object_stat_collection_t --
@@ -5839,7 +5853,7 @@ void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
 void ScrubMap::object::encode(bufferlist& bl) const
 {
   bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
-  ENCODE_START(8, 7, bl);
+  ENCODE_START(9, 7, bl);
   ::encode(size, bl);
   ::encode(negative, bl);
   ::encode(attrs, bl);
@@ -5854,12 +5868,15 @@ void ScrubMap::object::encode(bufferlist& bl) const
   ::encode(read_error, bl);
   ::encode(ec_hash_mismatch, bl);
   ::encode(ec_size_mismatch, bl);
+  ::encode(large_omap_object_found, bl);
+  ::encode(large_omap_object_key_count, bl);
+  ::encode(large_omap_object_value_size, bl);
   ENCODE_FINISH(bl);
 }
 
 void ScrubMap::object::decode(bufferlist::iterator& bl)
 {
-  DECODE_START(8, bl);
+  DECODE_START(9, bl);
   ::decode(size, bl);
   bool tmp, compat_read_error = false;
   ::decode(tmp, bl);
@@ -5891,6 +5908,12 @@ void ScrubMap::object::decode(bufferlist::iterator& bl)
   // If older encoder found a read_error, set read_error
   if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
     read_error = true;
+  if (struct_v >= 9) {
+    ::decode(tmp, bl);
+    large_omap_object_found = tmp;
+    ::decode(large_omap_object_key_count, bl);
+    ::decode(large_omap_object_value_size, bl);
+  }
   DECODE_FINISH(bl);
 }
 
index 2d7da93d7a811b286690c9ebb7461ddd5d732067..ec268b118c6fcc1261a372648d511817ec89211a 100644 (file)
@@ -1656,6 +1656,7 @@ struct object_stat_sum_t {
   int64_t num_objects_pinned;
   int64_t num_objects_missing;
   int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
+  int64_t num_large_omap_objects = 0;
 
   object_stat_sum_t()
     : num_bytes(0),
@@ -1702,6 +1703,7 @@ struct object_stat_sum_t {
     FLOOR(num_rd_kb);
     FLOOR(num_wr);
     FLOOR(num_wr_kb);
+    FLOOR(num_large_omap_objects);
     FLOOR(num_shallow_scrub_errors);
     FLOOR(num_deep_scrub_errors);
     num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
@@ -1762,6 +1764,7 @@ struct object_stat_sum_t {
       out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
                                out[i].num_deep_scrub_errors;
     }
+    SPLIT(num_large_omap_objects);
     SPLIT(num_objects_recovered);
     SPLIT(num_bytes_recovered);
     SPLIT(num_keys_recovered);
@@ -1816,6 +1819,7 @@ struct object_stat_sum_t {
         sizeof(num_wr) +
         sizeof(num_wr_kb) +
         sizeof(num_scrub_errors) +
+        sizeof(num_large_omap_objects) +
         sizeof(num_objects_recovered) +
         sizeof(num_bytes_recovered) +
         sizeof(num_keys_recovered) +
@@ -4778,8 +4782,8 @@ struct object_info_t {
     omap_digest = -1;
   }
   void new_object() {
-    set_data_digest(-1);
-    set_omap_digest(-1);
+    clear_data_digest();
+    clear_omap_digest();
   }
 
   void encode(bufferlist& bl, uint64_t features) const;
@@ -4940,12 +4944,16 @@ struct ScrubMap {
     bool stat_error:1;
     bool ec_hash_mismatch:1;
     bool ec_size_mismatch:1;
+    bool large_omap_object_found:1;
+    uint64_t large_omap_object_key_count = 0;
+    uint64_t large_omap_object_value_size = 0;
 
     object() :
       // Init invalid size so it won't match if we get a stat EIO error
       size(-1), omap_digest(0), digest(0),
-      negative(false), digest_present(false), omap_digest_present(false), 
-      read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}
+      negative(false), digest_present(false), omap_digest_present(false),
+      read_error(false), stat_error(false), ec_hash_mismatch(false),
+      ec_size_mismatch(false), large_omap_object_found(false) {}
 
     void encode(bufferlist& bl) const;
     void decode(bufferlist::iterator& bl);
@@ -4957,8 +4965,12 @@ struct ScrubMap {
   map<hobject_t,object> objects;
   eversion_t valid_through;
   eversion_t incr_since;
+  bool has_large_omap_object_errors:1;
 
   void merge_incr(const ScrubMap &l);
+  void clear_from(const hobject_t& start) {
+    objects.erase(objects.lower_bound(start), objects.end());
+  }
   void insert(const ScrubMap &r) {
     objects.insert(r.objects.begin(), r.objects.end());
   }
@@ -4977,6 +4989,60 @@ struct ScrubMap {
 WRITE_CLASS_ENCODER(ScrubMap::object)
 WRITE_CLASS_ENCODER(ScrubMap)
 
+struct ScrubMapBuilder {
+  bool deep = false;
+  vector<hobject_t> ls;
+  size_t pos = 0;
+  int64_t data_pos = 0;
+  string omap_pos;
+  int ret = 0;
+  bufferhash data_hash, omap_hash;  ///< accumulating hash values
+  uint64_t omap_keys = 0;
+  uint64_t omap_bytes = 0;
+
+  bool empty() {
+    return ls.empty();
+  }
+  bool done() {
+    return pos >= ls.size();
+  }
+  void reset() {
+    *this = ScrubMapBuilder();
+  }
+
+  bool data_done() {
+    return data_pos < 0;
+  }
+
+  void next_object() {
+    ++pos;
+    data_pos = 0;
+    omap_pos.clear();
+    omap_keys = 0;
+    omap_bytes = 0;
+  }
+
+  friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
+    out << "(" << pos.pos << "/" << pos.ls.size();
+    if (pos.pos < pos.ls.size()) {
+      out << " " << pos.ls[pos.pos];
+    }
+    if (pos.data_pos < 0) {
+      out << " byte " << pos.data_pos;
+    }
+    if (!pos.omap_pos.empty()) {
+      out << " key " << pos.omap_pos;
+    }
+    if (pos.deep) {
+      out << " deep";
+    }
+    if (pos.ret) {
+      out << " ret " << pos.ret;
+    }
+    return out << ")";
+  }
+};
+
 struct OSDOp {
   ceph_osd_op op;
   sobject_t soid;
index 94386101a1d34326216d4ed0fbada085c692d2a5..8ca0c4b27188eddf8db2856db60ebb1fcb3bade4 100644 (file)
@@ -853,6 +853,13 @@ void Journaler::_finish_prezero(int r, uint64_t start, uint64_t len)
     if (waiting_for_zero_pos > flush_pos) {
       _do_flush(waiting_for_zero_pos - flush_pos);
     }
+
+    if (prezero_pos == prezeroing_pos &&
+       !waitfor_prezero.empty()) {
+      list<Context*> ls;
+      ls.swap(waitfor_prezero);
+      finish_contexts(cct, ls, 0);
+    }
   } else {
     pending_zero.insert(start, len);
   }
@@ -862,6 +869,17 @@ void Journaler::_finish_prezero(int r, uint64_t start, uint64_t len)
                 << dendl;
 }
 
+void Journaler::wait_for_prezero(Context *onfinish)
+{
+  assert(onfinish);
+  lock_guard l(lock);
+
+  if (prezero_pos == prezeroing_pos) {
+    finisher->queue(onfinish, 0);
+    return;
+  }
+  waitfor_prezero.push_back(wrap_finisher(onfinish));
+}
 
 
 /***************** READING *******************/
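
wait_for_prezero follows the standard Journaler waiter idiom: complete the Context immediately if prezeroing has already caught up, otherwise queue it on waitfor_prezero, which _finish_prezero drains once prezero_pos reaches prezeroing_pos. A self-contained model of that pattern, using std::function in place of Ceph's Context*:

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <list>
    #include <utility>

    struct Journal {
      uint64_t prezero_pos = 0, prezeroing_pos = 0;
      std::list<std::function<void(int)>> waitfor_prezero;

      void wait_for_prezero(std::function<void(int)> onfinish) {
        if (prezero_pos == prezeroing_pos) { onfinish(0); return; }
        waitfor_prezero.push_back(std::move(onfinish));
      }
      void finish_prezero(uint64_t new_pos) {   // models _finish_prezero
        prezero_pos = new_pos;
        if (prezero_pos == prezeroing_pos && !waitfor_prezero.empty()) {
          std::list<std::function<void(int)>> ls;
          ls.swap(waitfor_prezero);             // swap-then-run, as in the hunk
          for (auto& c : ls) c(0);
        }
      }
    };

    int main() {
      Journal j;
      j.prezeroing_pos = 8;                     // zeroing in flight up to 8
      j.wait_for_prezero([](int r) { std::cout << "prezero done r=" << r << "\n"; });
      j.finish_prezero(8);                      // caught up: drain the waiters
    }
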
index d65419a652be45bdf675bf799f81ca3306484bb8..521752017529ee33a1f2e525418285d574683890 100644 (file)
@@ -312,6 +312,8 @@ private:
 
   uint64_t waiting_for_zero_pos;
   interval_set<uint64_t> pending_zero;  // non-contig bits we've zeroed
+  list<Context*> waitfor_prezero;
+
   std::map<uint64_t, uint64_t> pending_safe; // flush_pos -> safe_pos
   // when safe through given offset
   std::map<uint64_t, std::list<Context*> > waitfor_safe;
@@ -459,6 +461,7 @@ public:
   void flush(Context *onsafe = 0);
   void wait_for_readable(Context *onfinish);
   bool have_waiter() const;
+  void wait_for_prezero(Context *onfinish);
 
   // Synchronous setters
   // ===================
index ff1bf87b17c9d8af463b41fd06d9d01e6b3e1332..bb974bed670f2305ac13ca06563d0573820ff796 100644 (file)
@@ -561,7 +561,8 @@ void ObjectCacher::Object::truncate(loff_t s)
   }
 }
 
-void ObjectCacher::Object::discard(loff_t off, loff_t len)
+void ObjectCacher::Object::discard(loff_t off, loff_t len,
+                                   C_GatherBuilder* commit_gather)
 {
   assert(oc->lock.is_locked());
   ldout(oc->cct, 10) << "discard " << *this << " " << off << "~" << len
@@ -596,8 +597,24 @@ void ObjectCacher::Object::discard(loff_t off, loff_t len)
 
     ++p;
     ldout(oc->cct, 10) << "discard " << *this << " bh " << *bh << dendl;
-    assert(bh->waitfor_read.empty());
     replace_journal_tid(bh, 0);
+
+    if (bh->is_tx() && commit_gather != nullptr) {
+      // wait for the writeback to commit
+      waitfor_commit[bh->last_write_tid].emplace_back(commit_gather->new_sub());
+    } else if (bh->is_rx()) {
+      // cannot remove bh with in-flight read, but we can ensure the
+      // read won't overwrite the discard
+      bh->last_read_tid = ++oc->last_read_tid;
+      bh->bl.clear();
+      bh->set_nocache(true);
+      oc->mark_zero(bh);
+      // mark all Rx bhs as zero
+      continue;
+    } else {
+      assert(bh->waitfor_read.empty());
+    }
+
     oc->bh_remove(this, bh);
     delete bh;
   }
@@ -2450,32 +2467,79 @@ void ObjectCacher::clear_nonexistence(ObjectSet *oset)
 void ObjectCacher::discard_set(ObjectSet *oset, const vector<ObjectExtent>& exls)
 {
   assert(lock.is_locked());
-  if (oset->objects.empty()) {
-    ldout(cct, 10) << "discard_set on " << oset << " dne" << dendl;
+  bool was_dirty = oset->dirty_or_tx > 0;
+
+  _discard(oset, exls, nullptr);
+  _discard_finish(oset, was_dirty, nullptr);
+}
+
+/**
+ * discard object extents from an ObjectSet by removing the objects in
+ * exls from the in-memory oset. If the bh is in TX state, the discard
+ * will wait for the write to commit prior to invoking on_finish.
+ */
+void ObjectCacher::discard_writeback(ObjectSet *oset,
+                                     const vector<ObjectExtent>& exls,
+                                     Context* on_finish)
+{
+  assert(lock.is_locked());
+  bool was_dirty = oset->dirty_or_tx > 0;
+
+  C_GatherBuilder gather(cct);
+  _discard(oset, exls, &gather);
+
+  if (gather.has_subs()) {
+    bool flushed = was_dirty && oset->dirty_or_tx == 0;
+    gather.set_finisher(new FunctionContext(
+      [this, oset, flushed, on_finish](int) {
+       assert(lock.is_locked());
+       if (flushed && flush_set_callback)
+         flush_set_callback(flush_set_callback_arg, oset);
+       if (on_finish)
+         on_finish->complete(0);
+      }));
+    gather.activate();
     return;
   }
 
-  ldout(cct, 10) << "discard_set " << oset << dendl;
+  _discard_finish(oset, was_dirty, on_finish);
+}
+
+void ObjectCacher::_discard(ObjectSet *oset, const vector<ObjectExtent>& exls,
+                            C_GatherBuilder* gather)
+{
+  if (oset->objects.empty()) {
+    ldout(cct, 10) << __func__ << " on " << oset << " dne" << dendl;
+    return;
+  }
 
-  bool were_dirty = oset->dirty_or_tx > 0;
+  ldout(cct, 10) << __func__ << " " << oset << dendl;
 
-  for (vector<ObjectExtent>::const_iterator p = exls.begin();
-       p != exls.end();
-       ++p) {
-    ldout(cct, 10) << "discard_set " << oset << " ex " << *p << dendl;
-    const ObjectExtent &ex = *p;
+  for (auto& ex : exls) {
+    ldout(cct, 10) << __func__ << " " << oset << " ex " << ex << dendl;
     sobject_t soid(ex.oid, CEPH_NOSNAP);
     if (objects[oset->poolid].count(soid) == 0)
       continue;
     Object *ob = objects[oset->poolid][soid];
 
-    ob->discard(ex.offset, ex.length);
+    ob->discard(ex.offset, ex.length, gather);
   }
+}
+
+void ObjectCacher::_discard_finish(ObjectSet *oset, bool was_dirty,
+                                   Context* on_finish)
+{
+  assert(lock.is_locked());
 
   // did we truncate off dirty data?
-  if (flush_set_callback &&
-      were_dirty && oset->dirty_or_tx == 0)
+  if (flush_set_callback && was_dirty && oset->dirty_or_tx == 0) {
     flush_set_callback(flush_set_callback_arg, oset);
+  }
+
+  // notify that in-flight writeback has completed
+  if (on_finish != nullptr) {
+    on_finish->complete(0);
+  }
 }
 
 void ObjectCacher::verify_stats() const
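
discard_writeback builds on Ceph's C_GatherBuilder: every buffer head still in TX state contributes a sub-context via new_sub(), and the finisher (which invokes flush_set_callback and on_finish) runs only after all pending writebacks commit; if no sub was taken, _discard_finish completes synchronously. A simplified, self-contained model of that gather/finisher flow (not the real C_GatherBuilder API):

    #include <functional>
    #include <iostream>
    #include <utility>

    class Gather {
      int subs_ = 0;
      bool activated_ = false;
      std::function<void(int)> finisher_;
    public:
      std::function<void(int)> new_sub() {
        ++subs_;
        return [this](int r) { if (--subs_ == 0 && activated_) finisher_(r); };
      }
      bool has_subs() const { return subs_ > 0; }
      void set_finisher(std::function<void(int)> f) { finisher_ = std::move(f); }
      void activate() { activated_ = true; if (subs_ == 0 && finisher_) finisher_(0); }
    };

    int main() {
      Gather gather;
      auto sub = gather.new_sub();              // a bh in TX state registers
      if (gather.has_subs()) {
        gather.set_finisher([](int) { std::cout << "discard complete\n"; });
        gather.activate();                      // armed; nothing fires yet
        sub(0);                                 // writeback commits -> finisher
      } else {
        std::cout << "nothing in flight, finish inline\n";
      }
    }
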
index 01ab8829b88acdd0164a456e436dad4b4116455b..60f049ef55d5b2a4bf142b570a33fd728b4e7f0b 100644 (file)
@@ -355,7 +355,7 @@ class ObjectCacher {
 
     void replace_journal_tid(BufferHead *bh, ceph_tid_t tid);
     void truncate(loff_t s);
-    void discard(loff_t off, loff_t len);
+    void discard(loff_t off, loff_t len, C_GatherBuilder* commit_gather);
 
     // reference counting
     int get() {
@@ -620,6 +620,10 @@ private:
   void maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace);
   bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
 
+  void _discard(ObjectSet *oset, const vector<ObjectExtent>& exls,
+                C_GatherBuilder* gather);
+  void _discard_finish(ObjectSet *oset, bool was_dirty, Context* on_finish);
+
 public:
   bool set_is_empty(ObjectSet *oset);
   bool set_is_cached(ObjectSet *oset);
@@ -637,6 +641,8 @@ public:
   uint64_t release_all();
 
   void discard_set(ObjectSet *oset, const vector<ObjectExtent>& ex);
+  void discard_writeback(ObjectSet *oset, const vector<ObjectExtent>& ex,
+                         Context* on_finish);
 
   /**
    * Retry any in-flight reads that get -ENOENT instead of marking
index 18ec1cf0d1839d1b8f81331cfc14b2f03755b992..4e99ec0bbbc25303966888a7f28897b7453f83da 100644 (file)
@@ -594,6 +594,10 @@ void Objecter::_linger_commit(LingerOp *info, int r, bufferlist& outbl)
     info->on_reg_commit->complete(r);
     info->on_reg_commit = NULL;
   }
+  if (r < 0 && info->on_notify_finish) {
+    info->on_notify_finish->complete(r);
+    info->on_notify_finish = nullptr;
+  }
 
   // only tell the user the first time we do this
   info->registered = true;
@@ -1639,8 +1643,14 @@ void Objecter::_check_linger_pool_dne(LingerOp *op, bool *need_unregister)
   }
   if (op->map_dne_bound > 0) {
     if (osdmap->get_epoch() >= op->map_dne_bound) {
+      LingerOp::unique_lock wl{op->watch_lock};
       if (op->on_reg_commit) {
        op->on_reg_commit->complete(-ENOENT);
+       op->on_reg_commit = nullptr;
+      }
+      if (op->on_notify_finish) {
+        op->on_notify_finish->complete(-ENOENT);
+        op->on_notify_finish = nullptr;
       }
       *need_unregister = true;
     }
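
Both Objecter hunks above apply the same complete-once rule: a stored completion (on_reg_commit, on_notify_finish) is invoked and then nulled, so the error paths (a failed _linger_commit and the pool-does-not-exist check) cannot fire the notify callback twice. A small sketch of the idiom, with LingerOp reduced to just the callback field:

    #include <functional>
    #include <iostream>

    struct LingerOp {
      std::function<void(int)> on_notify_finish;
    };

    void fail_linger(LingerOp& op, int r) {
      if (op.on_notify_finish) {
        op.on_notify_finish(r);
        op.on_notify_finish = nullptr;          // arm against double completion
      }
    }

    int main() {
      LingerOp op;
      op.on_notify_finish = [](int r) { std::cout << "notify finished r=" << r << "\n"; };
      fail_linger(op, -2);                      // e.g. -ENOENT: fires once
      fail_linger(op, -2);                      // second call is a no-op
    }
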
@@ -1696,7 +1706,9 @@ void Objecter::C_Command_Map_Latest::finish(int r)
   if (c->map_dne_bound == 0)
     c->map_dne_bound = latest;
 
+  OSDSession::unique_lock sul(c->session->lock);
   objecter->_check_command_map_dne(c);
+  sul.unlock();
 
   c->put();
 }
@@ -1704,6 +1716,7 @@ void Objecter::C_Command_Map_Latest::finish(int r)
 void Objecter::_check_command_map_dne(CommandOp *c)
 {
   // rwlock is locked unique
+  // session is locked unique
 
   ldout(cct, 10) << "_check_command_map_dne tid " << c->tid
                 << " current " << osdmap->get_epoch()
@@ -1721,6 +1734,7 @@ void Objecter::_check_command_map_dne(CommandOp *c)
 void Objecter::_send_command_map_check(CommandOp *c)
 {
   // rwlock is locked unique
+  // session is locked unique
 
   // ask the monitor
   if (check_latest_map_commands.count(c->tid) == 0) {
@@ -4770,8 +4784,10 @@ void Objecter::handle_command_reply(MCommandReply *m)
 
   sl.unlock();
 
-
+  OSDSession::unique_lock sul(s->lock);
   _finish_command(c, m->r, m->rs);
+  sul.unlock();
+
   m->put();
   if (s)
     s->put();
@@ -4918,13 +4934,16 @@ int Objecter::command_op_cancel(OSDSession *s, ceph_tid_t tid, int r)
 
   CommandOp *op = it->second;
   _command_cancel_map_check(op);
+  OSDSession::unique_lock sl(op->session->lock);
   _finish_command(op, r, "");
+  sl.unlock();
   return 0;
 }
 
 void Objecter::_finish_command(CommandOp *c, int r, string rs)
 {
   // rwlock is locked unique
+  // session lock is locked
 
   ldout(cct, 10) << "_finish_command " << c->tid << " = " << r << " "
                 << rs << dendl;
@@ -4937,9 +4956,7 @@ void Objecter::_finish_command(CommandOp *c, int r, string rs)
     timer.cancel_event(c->ontimeout);
 
   OSDSession *s = c->session;
-  OSDSession::unique_lock sl(s->lock);
   _session_command_op_remove(c->session, c);
-  sl.unlock();
 
   c->put();
 
index ac01807b69362a3b4c927e1d021b165ecf6b529b..d38f72be9a37ab9e7773e1da3e09832ef05cc3b6 100644 (file)
@@ -204,7 +204,7 @@ CEPHFSVOLUMECLIENT_VERSION_HISTORY = """
 
     * 1 - Initial version
     * 2 - Added get_object, put_object, delete_object methods to CephFSVolumeClient
-
+    * 3 - Allow volumes to be created without RADOS namespace isolation
 """
 
 
@@ -228,7 +228,7 @@ class CephFSVolumeClient(object):
     """
 
     # Current version
-    version = 2
+    version = 3
 
     # Where shall we create our volumes?
     POOL_PREFIX = "fsvolume_"
@@ -598,7 +598,7 @@ class CephFSVolumeClient(object):
             except cephfs.ObjectNotFound:
                 self.fs.mkdir(subpath, 0o755)
 
-    def create_volume(self, volume_path, size=None, data_isolated=False):
+    def create_volume(self, volume_path, size=None, data_isolated=False, namespace_isolated=True):
         """
         Set up metadata, pools and auth for a volume.
 
@@ -608,6 +608,7 @@ class CephFSVolumeClient(object):
         :param volume_path: VolumePath instance
         :param size: In bytes, or None for no size limit
         :param data_isolated: If true, create a separate OSD pool for this volume
+        :param namespace_isolated: If true, use a separate RADOS namespace for this volume
         :return:
         """
         path = self._get_path(volume_path)
@@ -630,10 +631,17 @@ class CephFSVolumeClient(object):
                 })
             self.fs.setxattr(path, 'ceph.dir.layout.pool', pool_name, 0)
 
-        # enforce security isolation, use seperate namespace for this volume
-        namespace = "{0}{1}".format(self.pool_ns_prefix, volume_path.volume_id)
-        log.info("create_volume: {0}, using rados namespace {1} to isolate data.".format(volume_path, namespace))
-        self.fs.setxattr(path, 'ceph.dir.layout.pool_namespace', namespace, 0)
+        # enforce security isolation, use separate namespace for this volume
+        if namespace_isolated:
+            namespace = "{0}{1}".format(self.pool_ns_prefix, volume_path.volume_id)
+            log.info("create_volume: {0}, using rados namespace {1} to isolate data.".format(volume_path, namespace))
+            self.fs.setxattr(path, 'ceph.dir.layout.pool_namespace', namespace, 0)
+        else:
+            # If the volume's namespace layout is not set, the volume's pool
+            # layout remains unset and would undesirably change whenever an
+            # ancestor's pool layout changes, so pin it explicitly.
+            pool_name = self._get_ancestor_xattr(path, "ceph.dir.layout.pool")
+            self.fs.setxattr(path, 'ceph.dir.layout.pool', pool_name, 0)
 
         # Create a volume meta file, if it does not already exist, to store
         # data about auth ids having access to the volume
@@ -1021,15 +1029,23 @@ class CephFSVolumeClient(object):
         # First I need to work out what the data pool is for this share:
         # read the layout
         pool_name = self._get_ancestor_xattr(path, "ceph.dir.layout.pool")
-        namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+
+        try:
+            namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+        except cephfs.NoData:
+            namespace = None
 
         # Now construct auth capabilities that give the guest just enough
         # permissions to access the share
         client_entity = "client.{0}".format(auth_id)
         want_access_level = 'r' if readonly else 'rw'
         want_mds_cap = 'allow {0} path={1}'.format(want_access_level, path)
-        want_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
-            want_access_level, pool_name, namespace)
+        if namespace:
+            want_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
+                want_access_level, pool_name, namespace)
+        else:
+            want_osd_cap = 'allow {0} pool={1}'.format(want_access_level,
+                                                       pool_name)
 
         try:
             existing = self._rados_command(
@@ -1057,26 +1073,41 @@ class CephFSVolumeClient(object):
             # auth caps.
             unwanted_access_level = 'r' if want_access_level == 'rw' else 'rw'
             unwanted_mds_cap = 'allow {0} path={1}'.format(unwanted_access_level, path)
-            unwanted_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
-                unwanted_access_level, pool_name, namespace)
+            if namespace:
+                unwanted_osd_cap = 'allow {0} pool={1} namespace={2}'.format(
+                    unwanted_access_level, pool_name, namespace)
+            else:
+                unwanted_osd_cap = 'allow {0} pool={1}'.format(
+                    unwanted_access_level, pool_name)
+
+            def cap_update(
+                    orig_mds_caps, orig_osd_caps, want_mds_cap,
+                    want_osd_cap, unwanted_mds_cap, unwanted_osd_cap):
 
-            def cap_update(orig, want, unwanted):
-                # Updates the existing auth caps such that there is a single
-                # occurrence of wanted auth caps and no occurrence of
-                # conflicting auth caps.
+                if not orig_mds_caps:
+                    return want_mds_cap, want_osd_cap
 
-                if not orig:
-                    return want
+                mds_cap_tokens = orig_mds_caps.split(",")
+                osd_cap_tokens = orig_osd_caps.split(",")
 
-                cap_tokens = set(orig.split(","))
+                if want_mds_cap in mds_cap_tokens:
+                    return orig_mds_caps, orig_osd_caps
 
-                cap_tokens.discard(unwanted)
-                cap_tokens.add(want)
+                if unwanted_mds_cap in mds_cap_tokens:
+                    mds_cap_tokens.remove(unwanted_mds_cap)
+                    osd_cap_tokens.remove(unwanted_osd_cap)
 
-                return ",".join(cap_tokens)
+                mds_cap_tokens.append(want_mds_cap)
+                osd_cap_tokens.append(want_osd_cap)
 
-            osd_cap_str = cap_update(cap['caps'].get('osd', ""), want_osd_cap, unwanted_osd_cap)
-            mds_cap_str = cap_update(cap['caps'].get('mds', ""), want_mds_cap, unwanted_mds_cap)
+                return ",".join(mds_cap_tokens), ",".join(osd_cap_tokens)
+
+            orig_mds_caps = cap['caps'].get('mds', "")
+            orig_osd_caps = cap['caps'].get('osd', "")
+
+            mds_cap_str, osd_cap_str = cap_update(
+                orig_mds_caps, orig_osd_caps, want_mds_cap, want_osd_cap,
+                unwanted_mds_cap, unwanted_osd_cap)
 
             caps = self._rados_command(
                 'auth caps',
@@ -1183,16 +1214,23 @@ class CephFSVolumeClient(object):
         client_entity = "client.{0}".format(auth_id)
         path = self._get_path(volume_path)
         pool_name = self._get_ancestor_xattr(path, "ceph.dir.layout.pool")
-        namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+        try:
+            namespace = self.fs.getxattr(path, "ceph.dir.layout.pool_namespace")
+        except cephfs.NoData:
+            namespace = None
 
         # The auth_id might have read-only or read-write mount access for the
         # volume path.
         access_levels = ('r', 'rw')
-        want_mds_caps = {'allow {0} path={1}'.format(access_level, path)
-                         for access_level in access_levels}
-        want_osd_caps = {'allow {0} pool={1} namespace={2}'.format(
-                         access_level, pool_name, namespace)
-                         for access_level in access_levels}
+        want_mds_caps = ['allow {0} path={1}'.format(access_level, path)
+                         for access_level in access_levels]
+        if namespace:
+            want_osd_caps = ['allow {0} pool={1} namespace={2}'.format(access_level, pool_name, namespace)
+                             for access_level in access_levels]
+        else:
+            want_osd_caps = ['allow {0} pool={1}'.format(access_level, pool_name)
+                             for access_level in access_levels]
+
 
         try:
             existing = self._rados_command(
@@ -1202,14 +1240,25 @@ class CephFSVolumeClient(object):
                 }
             )
 
-            def cap_remove(orig, want):
-                cap_tokens = set(orig.split(","))
-                return ",".join(cap_tokens.difference(want))
+            def cap_remove(orig_mds_caps, orig_osd_caps, want_mds_caps, want_osd_caps):
+                mds_cap_tokens = orig_mds_caps.split(",")
+                osd_cap_tokens = orig_osd_caps.split(",")
+
+                for want_mds_cap, want_osd_cap in zip(want_mds_caps, want_osd_caps):
+                    if want_mds_cap in mds_cap_tokens:
+                        mds_cap_tokens.remove(want_mds_cap)
+                        osd_cap_tokens.remove(want_osd_cap)
+                        break
+
+                return ",".join(mds_cap_tokens), ",".join(osd_cap_tokens)
 
             cap = existing[0]
-            osd_cap_str = cap_remove(cap['caps'].get('osd', ""), want_osd_caps)
-            mds_cap_str = cap_remove(cap['caps'].get('mds', ""), want_mds_caps)
-            if (not osd_cap_str) and (not mds_cap_str):
+            orig_mds_caps = cap['caps'].get('mds', "")
+            orig_osd_caps = cap['caps'].get('osd', "")
+            mds_cap_str, osd_cap_str = cap_remove(orig_mds_caps, orig_osd_caps,
+                                                  want_mds_caps, want_osd_caps)
+
+            if not mds_cap_str:
                 self._rados_command('auth del', {'entity': client_entity}, decode=False)
             else:
                 self._rados_command(
index adeb452701d81394df5fc2a5e540de4a98f7d0d5..b4039ac70806576dc1ea2b4997272a9ffa95eade 100644 (file)
@@ -1,53 +1,90 @@
-
 from datetime import datetime
 from threading import Event
 import json
 import errno
+import time
 
 from mgr_module import MgrModule
 
 try:
     from influxdb import InfluxDBClient
     from influxdb.exceptions import InfluxDBClientError
+    from requests.exceptions import ConnectionError
 except ImportError:
     InfluxDBClient = None
 
+
 class Module(MgrModule):
     COMMANDS = [
+        {
+            "cmd": "influx config-set name=key,type=CephString "
+                   "name=value,type=CephString",
+            "desc": "Set a configuration value",
+            "perm": "rw"
+        },
+        {
+            "cmd": "influx config-show",
+            "desc": "Show current configuration",
+            "perm": "r"
+        },
+        {
+            "cmd": "influx send",
+            "desc": "Force sending data to Influx",
+            "perm": "rw"
+        },
         {
             "cmd": "influx self-test",
             "desc": "debug the module",
-            "perm": "rw"  
+            "perm": "rw"
         },
     ]
 
+    config_keys = {
+        'hostname': None,
+        'port': 8086,
+        'database': 'ceph',
+        'username': None,
+        'password': None,
+        'interval': 5,
+        'ssl': 'false',
+        'verify_ssl': 'true'
+    }
 
     def __init__(self, *args, **kwargs):
         super(Module, self).__init__(*args, **kwargs)
         self.event = Event()
-        self.run = True 
+        self.run = True
+        self.config = dict()
 
+    def get_fsid(self):
+        return self.get('mon_map')['fsid']
 
     def get_latest(self, daemon_type, daemon_name, stat):
         data = self.get_counter(daemon_type, daemon_name, stat)[stat]
         if data:
             return data[-1][1]
-        else:
-            return 0
 
+        return 0
 
     def get_df_stats(self):
         df = self.get("df")
         data = []
 
+        now = datetime.utcnow().isoformat() + 'Z'
+
         df_types = [
             'bytes_used',
+            'kb_used',
             'dirty',
+            'rd',
             'rd_bytes',
             'raw_bytes_used',
+            'wr',
             'wr_bytes',
             'objects',
-            'max_avail'
+            'max_avail',
+            'quota_objects',
+            'quota_bytes'
         ]
 
         for df_type in df_types:
@@ -55,15 +92,15 @@ class Module(MgrModule):
                 point = {
                     "measurement": "ceph_pool_stats",
                     "tags": {
-                        "pool_name" : pool['name'],
-                        "pool_id" : pool['id'],
-                        "type_instance" : df_type,
-                        "mgr_id" : self.get_mgr_id(),
+                        "pool_name": pool['name'],
+                        "pool_id": pool['id'],
+                        "type_instance": df_type,
+                        "fsid": self.get_fsid()
                     },
-                        "time" : datetime.utcnow().isoformat() + 'Z',
-                        "fields": {
-                            "value" : pool['stats'][df_type],
-                        }
+                    "time": now,
+                    "fields": {
+                        "value": pool['stats'][df_type],
+                    }
                 }
                 data.append(point)
         return data
@@ -71,8 +108,10 @@ class Module(MgrModule):
     def get_daemon_stats(self):
         data = []
 
+        now = datetime.utcnow().isoformat() + 'Z'
+
         for daemon, counters in self.get_all_perf_counters().iteritems():
-            svc_type, svc_id = daemon.split(".")
+            svc_type, svc_id = daemon.split(".", 1)
             metadata = self.get_metadata(svc_type, svc_id)
 
             for path, counter_info in counters.items():
@@ -86,9 +125,10 @@ class Module(MgrModule):
                     "tags": {
                         "ceph_daemon": daemon,
                         "type_instance": path,
-                        "host": metadata['hostname']
+                        "host": metadata['hostname'],
+                        "fsid": self.get_fsid()
                     },
-                    "time": datetime.utcnow().isoformat() + 'Z',
+                    "time": now,
                     "fields": {
                         "value": value
                     }
@@ -96,32 +136,105 @@ class Module(MgrModule):
 
         return data
 
+    def set_config_option(self, option, value):
+        if option not in self.config_keys.keys():
+            raise RuntimeError('{0} is an unknown configuration '
+                               'option'.format(option))
+
+        if option in ['port', 'interval']:
+            try:
+                value = int(value)
+            except (ValueError, TypeError):
+                raise RuntimeError('invalid {0} configured. Please specify '
+                                   'a valid integer'.format(option))
+
+        if option == 'interval' and value < 5:
+            raise RuntimeError('interval should be set to at least 5 seconds')
+
+        if option in ['ssl', 'verify_ssl']:
+            value = value.lower() == 'true'
+
+        self.config[option] = value
+
+    def init_module_config(self):
+        self.config['hostname'] = \
+            self.get_config("hostname", default=self.config_keys['hostname'])
+        self.config['port'] = \
+            int(self.get_config("port", default=self.config_keys['port']))
+        self.config['database'] = \
+            self.get_config("database", default=self.config_keys['database'])
+        self.config['username'] = \
+            self.get_config("username", default=self.config_keys['username'])
+        self.config['password'] = \
+            self.get_config("password", default=self.config_keys['password'])
+        self.config['interval'] = \
+            int(self.get_config("interval",
+                                default=self.config_keys['interval']))
+        ssl = self.get_config("ssl", default=self.config_keys['ssl'])
+        self.config['ssl'] = ssl.lower() == 'true'
+        verify_ssl = \
+            self.get_config("verify_ssl", default=self.config_keys['verify_ssl'])
+        self.config['verify_ssl'] = verify_ssl.lower() == 'true'
+
     def send_to_influx(self):
-        host = self.get_config("hostname")
-        if not host:
-            self.log.error("No InfluxDB server configured, please set"
-                           "`hostname` configuration key.")
+        if not self.config['hostname']:
+            self.log.error("No Influx server configured, please set one using: "
+                           "ceph influx config-set hostname <hostname>")
+            self.set_health_checks({
+                'MGR_INFLUX_NO_SERVER': {
+                    'severity': 'warning',
+                    'summary': 'No InfluxDB server configured',
+                    'detail': ['Configuration option hostname not set']
+                }
+            })
             return
 
-        port = int(self.get_config("port", default="8086"))
-        database = self.get_config("database", default="ceph")
-
         # If influx server has authentication turned off then
         # missing username/password is valid.
-        username = self.get_config("username", default="")
-        password = self.get_config("password", default="")
-
-        client = InfluxDBClient(host, port, username, password, database)
+        self.log.debug("Sending data to Influx host: %s",
+                       self.config['hostname'])
+        client = InfluxDBClient(self.config['hostname'], self.config['port'],
+                                self.config['username'],
+                                self.config['password'],
+                                self.config['database'],
+                                self.config['ssl'],
+                                self.config['verify_ssl'])
 
-        # using influx client get_list_database requires admin privs, instead we'll catch the not found exception and inform the user if db can't be created
+    # using the influx client's get_list_database requires admin privs;
+    # instead we'll catch the not-found exception and inform the user if
+    # the db cannot be created
         try:
             client.write_points(self.get_df_stats(), 'ms')
             client.write_points(self.get_daemon_stats(), 'ms')
+            self.set_health_checks(dict())
+        except ConnectionError as e:
+            self.log.exception("Failed to connect to Influx host %s:%d",
+                               self.config['hostname'], self.config['port'])
+            self.set_health_checks({
+                'MGR_INFLUX_SEND_FAILED': {
+                    'severity': 'warning',
+                    'summary': 'Failed to send data to InfluxDB server at %s:%d'
+                               ' due to a connection error'
+                               % (self.config['hostname'], self.config['port']),
+                    'detail': [str(e)]
+                }
+            })
         except InfluxDBClientError as e:
             if e.code == 404:
-                self.log.info("Database '{0}' not found, trying to create (requires admin privs).  You can also create manually and grant write privs to user '{1}'".format(database,username))
-                client.create_database(database)
+                self.log.info("Database '%s' not found, trying to create "
+                              "(requires admin privs).  You can also create "
+                              "manually and grant write privs to user "
+                              "'%s'", self.config['database'],
+                              self.config['username'])
+                client.create_database(self.config['database'])
             else:
+                self.set_health_checks({
+                    'MGR_INFLUX_SEND_FAILED': {
+                        'severity': 'warning',
+                        'summary': 'Failed to send data to InfluxDB',
+                        'detail': [str(e)]
+                    }
+                })
                 raise
 
     def shutdown(self):
@@ -130,18 +243,35 @@ class Module(MgrModule):
         self.event.set()
 
     def handle_command(self, cmd):
+        if cmd['prefix'] == 'influx config-show':
+            return 0, json.dumps(self.config), ''
+        elif cmd['prefix'] == 'influx config-set':
+            key = cmd['key']
+            value = cmd['value']
+            if not value:
+                return -errno.EINVAL, '', 'Value should not be empty or None'
+
+            self.log.debug('Setting configuration option %s to %s', key, value)
+            self.set_config_option(key, value)
+            self.set_config(key, value)
+            return 0, 'Configuration option {0} updated'.format(key), ''
+        elif cmd['prefix'] == 'influx send':
+            self.send_to_influx()
+            return 0, 'Sending data to Influx', ''
         if cmd['prefix'] == 'influx self-test':
             daemon_stats = self.get_daemon_stats()
             assert len(daemon_stats)
             df_stats = self.get_df_stats()
+
             result = {
                 'daemon_stats': daemon_stats,
                 'df_stats': df_stats
             }
+
             return 0, json.dumps(result, indent=2), 'Self-test OK'
-        else:
-            return (-errno.EINVAL, '',
-                    "Command not found '{0}'".format(cmd['prefix']))
+
+        return (-errno.EINVAL, '',
+                "Command not found '{0}'".format(cmd['prefix']))
 
     def serve(self):
         if InfluxDBClient is None:
@@ -150,13 +280,14 @@ class Module(MgrModule):
             return
 
         self.log.info('Starting influx module')
+        self.init_module_config()
         self.run = True
+
         while self.run:
+            start = time.time()
             self.send_to_influx()
-            self.log.debug("Running interval loop")
-            interval = self.get_config("interval")
-            if interval is None:
-                interval = 5
-            self.log.debug("sleeping for %d seconds",interval)
-            self.event.wait(interval)
-            
+            runtime = time.time() - start
+            self.log.debug('Finished sending data to Influx in %.3f seconds',
+                           runtime)
+            self.log.debug("Sleeping for %d seconds", self.config['interval'])
+            self.event.wait(self.config['interval'])
index 38283d2446d34bf0ef957c1c5f1e18eab3cfb76e..230d6f20b928d3d7ad9a86e45994af5f214480a7 100644 (file)
@@ -221,7 +221,7 @@ class MgrModule(ceph_module.BaseMgrModule):
     PERFCOUNTER_LONGRUNAVG = 4
     PERFCOUNTER_COUNTER = 8
     PERFCOUNTER_HISTOGRAM = 0x10
-    PERFCOUNTER_TYPE_MASK = ~2
+    PERFCOUNTER_TYPE_MASK = ~3
 
     def __init__(self, module_name, py_modules_ptr, this_ptr):
         self.module_name = module_name
@@ -313,6 +313,13 @@ class MgrModule(ceph_module.BaseMgrModule):
         
         return ''
 
+    def _perfvalue_to_value(self, stattype, value):
+        if stattype & self.PERFCOUNTER_TIME:
+            # Convert from ns to seconds
+            return value / 1000000000.0
+        else:
+            return value
+
     def get_server(self, hostname):
         """
         Called by the plugin to load information about a particular
@@ -547,6 +554,13 @@ class MgrModule(ceph_module.BaseMgrModule):
             else:
                 return 0
 
+        def get_latest_avg(daemon_type, daemon_name, counter):
+            data = self.get_counter(daemon_type, daemon_name, counter)[counter]
+            if data:
+                return (data[-1][1], data[-1][2])
+            else:
+                return (0, 0)
+
         for server in self.list_servers():
             for service in server['services']:
                 if service['type'] not in ("rgw", "mds", "osd", "mon"):
@@ -572,8 +586,24 @@ class MgrModule(ceph_module.BaseMgrModule):
                     if counter_schema['priority'] < prio_limit:
                         continue
 
-                    counter_info = counter_schema
-                    counter_info['value'] = get_latest(service['type'], service['id'], counter_path)
+                    counter_info = dict(counter_schema)
+
+                    # Also populate count for the long running avgs
+                    if counter_schema['type'] & self.PERFCOUNTER_LONGRUNAVG:
+                        v, c = get_latest_avg(
+                            service['type'],
+                            service['id'],
+                            counter_path
+                        )
+                        counter_info['value'], counter_info['count'] = v, c
+                        result[svc_full_name][counter_path] = counter_info
+                    else:
+                        counter_info['value'] = get_latest(
+                            service['type'],
+                            service['id'],
+                            counter_path
+                        )
+
                     result[svc_full_name][counter_path] = counter_info
 
         self.log.debug("returning {0} counter".format(len(result)))
@@ -597,4 +627,4 @@ class MgrModule(ceph_module.BaseMgrModule):
         and/or the monitor cluster is down.
         """
 
-        return self._ceph_have_mon_connection()
\ No newline at end of file
+        return self._ceph_have_mon_connection()
index c7daa128dd8af16fee1ae408895653f2684d8d18..2c4598a39e254737f8ec17e8a0ff85f6c557b0d9 100644 (file)
@@ -83,14 +83,15 @@ DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
 OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
              'norecover', 'noscrub', 'nodeep-scrub')
 
-FS_METADATA = ('data_pools', 'id', 'metadata_pool', 'name')
+FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')
 
-MDS_METADATA = ('id', 'fs', 'hostname', 'public_addr', 'rank', 'ceph_version')
+MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
+                'ceph_version')
 
-MON_METADATA = ('id', 'hostname', 'public_addr', 'rank', 'ceph_version')
+MON_METADATA = ('ceph_daemon', 'hostname', 'public_addr', 'rank', 'ceph_version')
 
-OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'hostname', 'public_addr',
-                'ceph_version')
+OSD_METADATA = ('ceph_daemon', 'cluster_addr', 'device_class', 'hostname',
+                'public_addr', 'ceph_version')
 
 OSD_STATUS = ['weight', 'up', 'in']
 
@@ -98,9 +99,11 @@ OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']
 
 POOL_METADATA = ('pool_id', 'name')
 
-RGW_METADATA = ('id', 'hostname', 'ceph_version')
+RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')
 
-DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
+DISK_OCCUPATION = ('ceph_daemon', 'device', 'instance')
+
+NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
 
 
 class Metrics(object):
@@ -253,6 +256,13 @@ class Metrics(object):
                 'DF pool {}'.format(state),
                 ('pool_id',)
             )
+        for state in NUM_OBJECTS:
+            path = 'num_objects_{}'.format(state)
+            metrics[path] = Metric(
+                'gauge',
+                path,
+                'Number of {} objects'.format(state),
+            )
 
         return metrics
 
@@ -373,13 +383,14 @@ class Module(MgrModule):
                                  fs['id'],
                                  fs['mdsmap']['metadata_pool'],
                                  fs['mdsmap']['fs_name']))
+            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
             for gid, daemon in fs['mdsmap']['info'].items():
                 id_ = daemon['name']
                 host_version = servers.get((id_, 'mds'), ('',''))
                 self.metrics.append('mds_metadata', 1,
-                                    (id_, fs['id'], host_version[0],
-                                     daemon['addr'], daemon['rank'],
-                                     host_version[1]))
+                                    ('mds.{}'.format(id_), fs['id'],
+                                     host_version[0], daemon['addr'],
+                                     daemon['rank'], host_version[1]))
 
     def get_quorum_status(self):
         mon_status = json.loads(self.get('mon_status')['json'])
@@ -389,12 +400,12 @@ class Module(MgrModule):
             id_ = mon['name']
             host_version = servers.get((id_, 'mon'), ('',''))
             self.metrics.append('mon_metadata', 1,
-                                (id_, host_version[0],
+                                ('mon.{}'.format(id_), host_version[0],
                                  mon['public_addr'].split(':')[0], rank,
                                  host_version[1]))
             in_quorum = int(rank in mon_status['quorum'])
             self.metrics.append('mon_quorum_status', in_quorum,
-                                ('mon_{}'.format(id_),))
+                                ('mon.{}'.format(id_),))
 
     def get_pg_status(self):
         # TODO add per pool status?
@@ -477,9 +488,10 @@ class Module(MgrModule):
             host_version = servers.get((str(id_), 'osd'), ('',''))
 
             self.metrics.append('osd_metadata', 1, (
+                'osd.{}'.format(id_),
                 c_addr,
                 dev_class,
-                id_, host_version[0],
+                host_version[0],
                 p_addr, host_version[1]
             ))
 
@@ -505,9 +517,9 @@ class Module(MgrModule):
                 self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                     id_, osd_hostname, osd_dev_node))
                 self.metrics.set('disk_occupation', 1, (
-                    osd_hostname,
+                    "osd.{0}".format(id_),
                     osd_dev_node,
-                    "osd.{0}".format(id_)
+                    osd_hostname
                 ))
             else:
                 self.log.info("Missing dev node metadata for osd {0}, skipping "
@@ -526,9 +538,15 @@ class Module(MgrModule):
             self.metrics.append(
                 'rgw_metadata',
                 1,
-                (service_id, hostname, version)
+                ('{}.{}'.format(service_type, service_id), hostname, version)
             )
 
+    def get_num_objects(self):
+        pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
+        for obj in NUM_OBJECTS:
+            stat = 'num_objects_{}'.format(obj)
+            self.metrics.set(stat, pg_sum[stat])
+
     def collect(self):
         self.get_health()
         self.get_df()
@@ -537,24 +555,47 @@ class Module(MgrModule):
         self.get_quorum_status()
         self.get_metadata_and_osd_status()
         self.get_pg_status()
+        self.get_num_objects()
 
         for daemon, counters in self.get_all_perf_counters().items():
             for path, counter_info in counters.items():
+                # Skip histograms; they are represented by long running avgs
                 stattype = self._stattype_to_str(counter_info['type'])
-                # XXX simplify first effort: no histograms
-                # averages are already collapsed to one value for us
                 if not stattype or stattype == 'histogram':
                     self.log.debug('ignoring %s, type %s' % (path, stattype))
                     continue
 
-                self.metrics.add_metric(path, Metric(
+                # Get the value of the counter
+                value = self._perfvalue_to_value(counter_info['type'], counter_info['value'])
+
+                # Represent the long running avgs as sum/count pairs
+                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
+                    _path = path + '_sum'
+                    self.metrics.add_metric(_path, Metric(
+                        stattype,
+                        _path,
+                        counter_info['description'] + ' Total',
+                        ("ceph_daemon",),
+                    ))
+                    self.metrics.append(_path, value, (daemon,))
+
+                    _path = path + '_count'
+                    self.metrics.add_metric(_path, Metric(
+                        'counter',
+                        _path,
+                        counter_info['description'] + ' Count',
+                        ("ceph_daemon",),
+                    ))
+                    self.metrics.append(_path, counter_info['count'], (daemon,))
+                else:
+                    self.metrics.add_metric(path, Metric(
                         stattype,
                         path,
                         counter_info['description'],
                         ("ceph_daemon",),
                     ))
+                    self.metrics.append(path, value, (daemon,))
 
-                self.metrics.append(path, counter_info['value'], (daemon,))
         # It is sufficient to reset the pending metrics once per scrape
         self.metrics.reset()
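The prometheus module hunks above do three things: every daemon metric now carries a uniform ceph_daemon label of the form "<type>.<id>" (mon.a, osd.0, rgw.gateway), new num_objects_{degraded,misplaced,unfound} gauges are fed from the pg summary, and long-running averages are exported as separate _sum/_count series instead of one pre-collapsed value. With the sum/count split, a scrape consumer recovers the mean by dividing deltas between scrapes. A minimal sketch of that consumer-side arithmetic, with hypothetical metric names and sample values:

    # Two scrapes of a hypothetical long-running-average pair.
    prev = {'op_latency_sum': 120.0, 'op_latency_count': 600}
    curr = {'op_latency_sum': 150.0, 'op_latency_count': 700}

    d_sum = curr['op_latency_sum'] - prev['op_latency_sum']        # 30.0
    d_count = curr['op_latency_count'] - prev['op_latency_count']  # 100
    mean = d_sum / d_count if d_count else 0.0                     # 0.3 per op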
 
index dc8b39b6e6e3c4dec4363f47810d254efbec8f6c..9c731685e334f297f1f21dfd860c9ad755ad9df1 100644 (file)
@@ -12,7 +12,7 @@ OSD_IMPLEMENTED_COMMANDS = [
 # Valid values for the 'var' argument to 'ceph osd pool set'
 POOL_PROPERTIES_1 = [
     'size', 'min_size', 'crash_replay_interval', 'pg_num',
-    'crush_rule', 'hashpspool',
+    'crush_rule', 'hashpspool', 'auid',
 ]
 
 POOL_PROPERTIES_2 = [
@@ -86,6 +86,7 @@ def pool_update_commands(pool_name, args):
                 'prefix': 'osd pool set',
                 'pool': pool_name,
                 'var': var,
+                'val': args[var],
             })
 
     return commands
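The one-line fix above supplies the previously dropped value: each generated 'osd pool set' command now carries both the variable name and the value to set. Illustrative shape of one emitted command dictionary (the pool name and value here are made up):

    args = {'size': 3}            # hypothetical request arguments
    var = 'size'
    command = {
        'prefix': 'osd pool set',
        'pool': 'rbd',            # hypothetical pool name
        'var': var,
        'val': args[var],         # the key this patch adds
    }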
index 6ce610b881f4d06d98ae0e25fca3e84721dfeb47..5125253e4667b246eff95d5a0e40a4933d662d52 100644 (file)
@@ -177,11 +177,10 @@ class CommandsRequest(object):
                 self.finished
             ),
             'waiting': map(
-                lambda x: {
-                    'command': x.command,
-                    'outs': x.outs,
-                    'outb': x.outb,
-                },
+                lambda x: map(
+                    lambda y: common.humanify_command(y),
+                    x
+                ),
                 self.waiting
             ),
             'failed': map(
index 113180191dc9b4a02cbd99cd396eb21ef2d9f6cc..baa4af4e4a87d39d0ae3f9b8b1988a9d22bd9ac0 100644 (file)
@@ -226,6 +226,17 @@ cdef extern from "rados/librados.h" nogil:
     int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t * snaps, int maxlen)
     int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id, time_t * t)
 
+    int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+                                            rados_snap_t *snapid)
+    int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+                                            rados_snap_t snapid)
+    int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+                                                   rados_snap_t snap_seq,
+                                                   rados_snap_t *snap,
+                                                   int num_snaps)
+    int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io, const char *oid,
+                                              rados_snap_t snapid)
+
     int rados_lock_exclusive(rados_ioctx_t io, const char * oid, const char * name,
                              const char * cookie, const char * desc,
                              timeval * duration, uint8_t flags)
@@ -3115,6 +3126,101 @@ returned %d, but should return zero on success." % (self.name, ret))
         if ret != 0:
             raise make_ex(ret, "Failed to rollback %s" % oid)
 
+    def create_self_managed_snap(self):
+        """
+        Creates a self-managed snapshot
+
+        :returns: snap id on success
+
+        :raises: :class:`Error`
+        """
+        self.require_ioctx_open()
+        cdef:
+            rados_snap_t _snap_id
+        with nogil:
+            ret = rados_ioctx_selfmanaged_snap_create(self.io, &_snap_id)
+        if ret != 0:
+            raise make_ex(ret, "Failed to create self-managed snapshot")
+        return int(_snap_id)
+
+    @requires(('snap_id', int))
+    def remove_self_managed_snap(self, snap_id):
+        """
+        Removes a self-managed snapshot
+
+        :param snap_id: the id of the snapshot
+        :type snap_id: int
+
+        :raises: :class:`TypeError`
+        :raises: :class:`Error`
+        """
+        self.require_ioctx_open()
+        cdef:
+            rados_snap_t _snap_id = snap_id
+        with nogil:
+            ret = rados_ioctx_selfmanaged_snap_remove(self.io, _snap_id)
+        if ret != 0:
+            raise make_ex(ret, "Failed to remove self-managed snapshot")
+
+    def set_self_managed_snap_write(self, snaps):
+        """
+        Updates the write context to the specified self-managed
+        snapshot ids.
+
+        :param snaps: all associated self-managed snapshot ids
+        :type snaps: list
+
+        :raises: :class:`TypeError`
+        :raises: :class:`Error`
+        """
+        self.require_ioctx_open()
+        sorted_snaps = []
+        snap_seq = 0
+        if snaps:
+            sorted_snaps = sorted([int(x) for x in snaps], reverse=True)
+            snap_seq = sorted_snaps[0]
+
+        cdef:
+            rados_snap_t _snap_seq = snap_seq
+            rados_snap_t *_snaps = NULL
+            int _num_snaps = len(sorted_snaps)
+        try:
+            _snaps = <rados_snap_t *>malloc(_num_snaps * sizeof(rados_snap_t))
+            for i in range(len(sorted_snaps)):
+                _snaps[i] = sorted_snaps[i]
+            with nogil:
+                ret = rados_ioctx_selfmanaged_snap_set_write_ctx(self.io,
+                                                                 _snap_seq,
+                                                                 _snaps,
+                                                                 _num_snaps)
+            if ret != 0:
+                raise make_ex(ret, "Failed to update snapshot write context")
+        finally:
+            free(_snaps)
+
+    @requires(('oid', str_type), ('snap_id', int))
+    def rollback_self_managed_snap(self, oid, snap_id):
+        """
+        Rolls a specific object back to a self-managed snapshot revision
+
+        :param oid: the name of the object
+        :type oid: str
+        :param snap_id: the id of the snapshot
+        :type snap_id: int
+
+        :raises: :class:`TypeError`
+        :raises: :class:`Error`
+        """
+        self.require_ioctx_open()
+        oid = cstr(oid, 'oid')
+        cdef:
+            char *_oid = oid
+            rados_snap_t _snap_id = snap_id
+        with nogil:
+            ret = rados_ioctx_selfmanaged_snap_rollback(self.io, _oid, _snap_id)
+        if ret != 0:
+            raise make_ex(ret, "Failed to rollback %s" % oid)
+
     def get_last_version(self):
         """
         Return the version of the last object read or written to.
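Together, the four new Ioctx methods cover the full self-managed snapshot lifecycle from Python. A minimal usage sketch, assuming a reachable cluster; the conffile path, pool name, and object data are illustrative, not part of the patch:

    import rados

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')  # hypothetical path
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')  # hypothetical pool
    try:
        snap_id = ioctx.create_self_managed_snap()
        # Tag subsequent writes with the snapshot context.
        ioctx.set_self_managed_snap_write([snap_id])
        ioctx.write_full('obj', b'after-snapshot')
        # Roll the object back to the snapshot, then drop the snapshot.
        ioctx.rollback_self_managed_snap('obj', snap_id)
        ioctx.remove_self_managed_snap(snap_id)
    finally:
        ioctx.close()
        cluster.shutdown()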
index df90e3ab59b698fe4a9f7f0c3a14e0314460559a..4738b3c197415cbdb919a7eca7bbb4747d3a04b0 100644 (file)
@@ -233,6 +233,7 @@ cdef extern from "rbd/librbd.h" nogil:
     int rbd_get_id(rbd_image_t image, char *id, size_t id_len)
     int rbd_get_block_name_prefix(rbd_image_t image, char *prefix,
                                   size_t prefix_len)
+    int64_t rbd_get_data_pool_id(rbd_image_t image)
     int rbd_get_parent_info2(rbd_image_t image,
                              char *parent_poolname, size_t ppoolnamelen,
                              char *parent_name, size_t pnamelen,
@@ -519,7 +520,9 @@ def cstr(val, name, encoding="utf-8", opt=False):
         return None
     if isinstance(val, bytes):
         return val
-    elif isinstance(val, unicode):
+    elif isinstance(val, str):
+        return val.encode(encoding)
+    elif sys.version_info < (3, 0) and isinstance(val, unicode):
         return val.encode(encoding)
     else:
         raise InvalidArgument('%s must be a string' % name)
@@ -704,6 +707,7 @@ class RBD(object):
         :raises: :class:`FunctionNotSupported`
         """
         name = cstr(name, 'name')
+        data_pool = cstr(data_pool, 'data_pool', opt=True)
         cdef:
             rados_ioctx_t _ioctx = convert_ioctx(ioctx)
             char *_name = name
@@ -787,6 +791,7 @@ class RBD(object):
         p_snapname = cstr(p_snapname, 'p_snapname')
         p_name = cstr(p_name, 'p_name')
         c_name = cstr(c_name, 'c_name')
+        data_pool = cstr(data_pool, 'data_pool', opt=True)
         cdef:
             rados_ioctx_t _p_ioctx = convert_ioctx(p_ioctx)
             rados_ioctx_t _c_ioctx = convert_ioctx(c_ioctx)
@@ -976,8 +981,8 @@ class RBD(object):
             'id'          : decode_cstr(c_info.id),
             'name'        : decode_cstr(c_info.name),
             'source'      : __source_string[c_info.source],
-            'deletion_time' : datetime.fromtimestamp(c_info.deletion_time),
-            'deferment_end_time' : datetime.fromtimestamp(c_info.deferment_end_time)
+            'deletion_time' : datetime.utcfromtimestamp(c_info.deletion_time),
+            'deferment_end_time' : datetime.utcfromtimestamp(c_info.deferment_end_time)
             }
         rbd_trash_get_cleanup(&c_info)
         return info
@@ -1286,7 +1291,7 @@ cdef class MirrorImageStatusIterator(object):
                         },
                     'state'       : self.images[i].state,
                     'description' : decode_cstr(self.images[i].description),
-                    'last_update' : datetime.fromtimestamp(self.images[i].last_update),
+                    'last_update' : datetime.utcfromtimestamp(self.images[i].last_update),
                     'up'          : self.images[i].up,
                     }
             if self.size < self.max_read:
@@ -1549,6 +1554,14 @@ cdef class Image(object):
         finally:
             free(prefix)
 
+    def data_pool_id(self):
+        """
+        Get the id of the pool where the data of this RBD image is stored.
+
+        :returns: int - the pool id
+        """
+        return rbd_get_data_pool_id(self.image)
+
     def parent_info(self):
         """
         Get information about a cloned image's parent (if any)
@@ -1737,6 +1750,7 @@ cdef class Image(object):
         :raises: :class:`ArgumentOutOfRange`
         """
         dest_name = cstr(dest_name, 'dest_name')
+        data_pool = cstr(data_pool, 'data_pool', opt=True)
         cdef:
             rados_ioctx_t _dest_ioctx = convert_ioctx(dest_ioctx)
             char *_dest_name = dest_name
@@ -1951,7 +1965,7 @@ cdef class Image(object):
             ret = rbd_snap_get_timestamp(self.image, _snap_id, &timestamp)
         if ret != 0:
             raise make_ex(ret, 'error getting snapshot timestamp for image: %s, snap_id: %d' % (self.name, snap_id))
-        return datetime.fromtimestamp(timestamp.tv_sec)
+        return datetime.utcfromtimestamp(timestamp.tv_sec)
 
     def remove_snap_limit(self):
         """
@@ -2180,7 +2194,7 @@ written." % (self.name, ret, length))
             ret = rbd_get_create_timestamp(self.image, &timestamp)
         if ret != 0:
             raise make_ex(ret, 'error getting create timestamp for image: %s' % (self.name))
-        return datetime.fromtimestamp(timestamp.tv_sec)
+        return datetime.utcfromtimestamp(timestamp.tv_sec)
 
     def flatten(self):
         """
@@ -2508,7 +2522,7 @@ written." % (self.name, ret, length))
                 },
             'state'       : c_status.state,
             'description' : decode_cstr(c_status.description),
-            'last_update' : datetime.fromtimestamp(c_status.last_update),
+            'last_update' : datetime.utcfromtimestamp(c_status.last_update),
             'up'          : c_status.up,
             }
         free(c_status.name)
@@ -2952,8 +2966,8 @@ cdef class TrashIterator(object):
                 'id'          : decode_cstr(self.entries[i].id),
                 'name'        : decode_cstr(self.entries[i].name),
                 'source'      : TrashIterator.__source_string[self.entries[i].source],
-                'deletion_time' : datetime.fromtimestamp(self.entries[i].deletion_time),
-                'deferment_end_time' : datetime.fromtimestamp(self.entries[i].deferment_end_time)
+                'deletion_time' : datetime.utcfromtimestamp(self.entries[i].deletion_time),
+                'deferment_end_time' : datetime.utcfromtimestamp(self.entries[i].deferment_end_time)
                 }
 
     def __dealloc__(self):
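Beyond the Python 3 string handling and the new data_pool_id() accessor, the rbd.pyx hunks switch every timestamp conversion from fromtimestamp() to utcfromtimestamp(): the naive datetimes returned for trash entries, mirror status, and snapshot/create timestamps are now UTC wall-clock values instead of local time. A small sketch of the behavioural difference (the epoch value is arbitrary):

    from datetime import datetime

    ts = 1531900800  # arbitrary epoch seconds
    old_style = datetime.fromtimestamp(ts)     # local wall time (old behaviour)
    new_style = datetime.utcfromtimestamp(ts)  # UTC wall time (new behaviour)
    # Both are naive (tzinfo is None); they differ by the host's UTC offset.
    print(old_style, new_style)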
index 71f32a69528fb3921669a0085c1f47160111c9b2..614bda36fec7edc51f96f52899c75ecab7161491 100644 (file)
@@ -24,10 +24,10 @@ using namespace std;
 
 #define SWIFT_GROUP_ALL_USERS ".r:*"
 
-static int parse_list(const std::string& uid_list,
+static int parse_list(const char* uid_list,
                       std::vector<std::string>& uids)           /* out */
 {
-  char *s = strdup(uid_list.c_str());
+  char *s = strdup(uid_list);
   if (!s) {
     return -ENOMEM;
   }
@@ -177,8 +177,8 @@ int RGWAccessControlPolicy_SWIFT::add_grants(RGWRados* const store,
 int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store,
                                          const rgw_user& id,
                                          const std::string& name,
-                                         const std::string& read_list,
-                                         const std::string& write_list,
+                                         const char* read_list,
+                                         const char* write_list,
                                          uint32_t& rw_mask)
 {
   acl.create_default(id, name);
@@ -186,7 +186,7 @@ int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store,
   owner.set_name(name);
   rw_mask = 0;
 
-  if (read_list.size()) {
+  if (read_list) {
     std::vector<std::string> uids;
     int r = parse_list(read_list, uids);
     if (r < 0) {
@@ -203,7 +203,7 @@ int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store,
     }
     rw_mask |= SWIFT_PERM_READ;
   }
-  if (write_list.size()) {
+  if (write_list) {
     std::vector<std::string> uids;
     int r = parse_list(write_list, uids);
     if (r < 0) {
index 87bdb608f4e2560eaee153469322394b24f50eb8..f5b4558da88662febbfbab8db60c53bddc6b4497 100644 (file)
@@ -28,8 +28,8 @@ public:
   int create(RGWRados *store,
              const rgw_user& id,
              const std::string& name,
-             const std::string& read_list,
-             const std::string& write_list,
+             const char* read_list,
+             const char* write_list,
              uint32_t& rw_mask);
   void filter_merge(uint32_t mask, RGWAccessControlPolicy_SWIFT *policy);
   void to_str(std::string& read, std::string& write);
index 282621b67a73517bea2638d7a32f0548754ce2be..350875984125895fa3aa0e19535b5525340f4ad1 100644 (file)
@@ -219,18 +219,21 @@ void usage()
   cout << "   --max-buckets             max number of buckets for a user\n";
   cout << "   --admin                   set the admin flag on the user\n";
   cout << "   --system                  set the system flag on the user\n";
-  cout << "   --bucket=<bucket>\n";
-  cout << "   --pool=<pool>\n";
-  cout << "   --object=<object>\n";
-  cout << "   --date=<date>\n";
-  cout << "   --start-date=<date>\n";
-  cout << "   --end-date=<date>\n";
-  cout << "   --bucket-id=<bucket-id>\n";
-  cout << "   --shard-id=<shard-id>     optional for mdlog list\n";
+  cout << "   --bucket=<bucket>         Specify the bucket name. Also used by the quota command.\n";
+  cout << "   --pool=<pool>             Specify the pool name. Also used to scan for leaked rados objects.\n";
+  cout << "   --object=<object>         object name\n";
+  cout << "   --date=<date>             date in the format yyyy-mm-dd\n";
+  cout << "   --start-date=<date>       start date in the format yyyy-mm-dd\n";
+  cout << "   --end-date=<date>         end date in the format yyyy-mm-dd\n";
+  cout << "   --bucket-id=<bucket-id>   bucket id\n";
+  cout << "   --shard-id=<shard-id>     optional for: \n";
+  cout << "                               mdlog list\n";
+  cout << "                               data sync status\n";
   cout << "                             required for: \n";
   cout << "                               mdlog trim\n";
   cout << "                               replica mdlog get/delete\n";
   cout << "                               replica datalog get/delete\n";
+  cout << "   --max-entries=<entries>   max entries for listing operations\n";
   cout << "   --metadata-key=<key>      key to retrieve metadata from with metadata get\n";
   cout << "   --remote=<remote>         zone or zonegroup id of remote gateway\n";
   cout << "   --period=<id>             period id\n";
@@ -356,6 +359,7 @@ enum {
   OPT_BUCKET_STATS,
   OPT_BUCKET_CHECK,
   OPT_BUCKET_SYNC_STATUS,
+  OPT_BUCKET_SYNC_MARKERS,
   OPT_BUCKET_SYNC_INIT,
   OPT_BUCKET_SYNC_RUN,
   OPT_BUCKET_SYNC_DISABLE,
@@ -619,6 +623,8 @@ static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_
     if (strcmp(prev_cmd, "sync") == 0) {
       if (strcmp(cmd, "status") == 0)
         return OPT_BUCKET_SYNC_STATUS;
+      if (strcmp(cmd, "markers") == 0)
+        return OPT_BUCKET_SYNC_MARKERS;
       if (strcmp(cmd, "init") == 0)
         return OPT_BUCKET_SYNC_INIT;
       if (strcmp(cmd, "run") == 0)
@@ -1941,13 +1947,16 @@ static void get_md_sync_status(list<string>& status)
   int num_full = 0;
   int num_inc = 0;
   int total_shards = 0;
+  set<int> shards_behind_set;
 
   for (auto marker_iter : sync_status.sync_markers) {
     full_total += marker_iter.second.total_entries;
     total_shards++;
+    int shard_id = marker_iter.first;
     if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
       num_full++;
       full_complete += marker_iter.second.pos;
+      shards_behind_set.insert(shard_id);
     } else {
       full_complete += marker_iter.second.total_entries;
     }
@@ -1999,6 +2008,7 @@ static void get_md_sync_status(list<string>& status)
       if (local_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync &&
           master_marker > local_iter.second.marker) {
         shards_behind[shard_id] = local_iter.second.marker;
+        shards_behind_set.insert(shard_id);
       }
     }
   }
@@ -2008,6 +2018,8 @@ static void get_md_sync_status(list<string>& status)
     push_ss(ss, status) << "metadata is caught up with master";
   } else {
     push_ss(ss, status) << "metadata is behind on " << total_behind << " shards";
+
+    push_ss(ss, status) << "behind shards: [" << shards_behind_set << "]";
 
     map<int, rgw_mdlog_shard_data> master_pos;
     ret = sync.read_master_log_shards_next(sync_status.sync_info.period, shards_behind, &master_pos);
@@ -2070,6 +2082,13 @@ static void get_data_sync_status(const string& source_zone, list<string>& status
     return;
   }
 
+  set<int> recovering_shards;
+  ret = sync.read_recovering_shards(sync_status.sync_info.num_shards, recovering_shards);
+  if (ret < 0 && ret != -ENOENT) {
+    push_ss(ss, status, tab) << string("failed to read recovering shards: ") + cpp_strerror(-ret);
+    return;
+  }
+
   string status_str;
   switch (sync_status.sync_info.state) {
     case rgw_data_sync_info::StateInit:
@@ -2093,13 +2112,16 @@ static void get_data_sync_status(const string& source_zone, list<string>& status
   int num_full = 0;
   int num_inc = 0;
   int total_shards = 0;
+  set<int> shards_behind_set;
 
   for (auto marker_iter : sync_status.sync_markers) {
+    int shard_id = marker_iter.first;
     full_total += marker_iter.second.total_entries;
     total_shards++;
     if (marker_iter.second.state == rgw_data_sync_marker::SyncState::FullSync) {
       num_full++;
       full_complete += marker_iter.second.pos;
+      shards_behind_set.insert(shard_id);
     } else {
       full_complete += marker_iter.second.total_entries;
     }
@@ -2147,15 +2169,19 @@ static void get_data_sync_status(const string& source_zone, list<string>& status
     if (local_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync &&
         master_marker > local_iter.second.marker) {
       shards_behind[shard_id] = local_iter.second.marker;
+      shards_behind_set.insert(shard_id);
     }
   }
 
   int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc);
-  if (total_behind == 0) {
+  int total_recovering = recovering_shards.size();
+  if (total_behind == 0 && total_recovering == 0) {
     push_ss(ss, status, tab) << "data is caught up with source";
-  } else {
+  } else if (total_behind > 0) {
     push_ss(ss, status, tab) << "data is behind on " << total_behind << " shards";
 
+    push_ss(ss, status, tab) << "behind shards: " << "[" << shards_behind_set << "]" ;
+
     map<int, rgw_datalog_shard_data> master_pos;
     ret = sync.read_source_log_shards_next(shards_behind, &master_pos);
     if (ret < 0) {
@@ -2181,6 +2207,11 @@ static void get_data_sync_status(const string& source_zone, list<string>& status
     }
   }
 
+  if (total_recovering > 0) {
+    push_ss(ss, status, tab) << total_recovering << " shards are recovering";
+    push_ss(ss, status, tab) << "recovering shards: " << "[" << recovering_shards << "]";
+  }
+
   flush_ss(ss, status);
 }
 
@@ -2234,6 +2265,154 @@ static void sync_status(Formatter *formatter)
   tab_dump("data sync", width, data_status);
 }
 
+struct indented {
+  int w; // indent width
+  boost::string_view header;
+  indented(int w, boost::string_view header = "") : w(w), header(header) {}
+};
+std::ostream& operator<<(std::ostream& out, const indented& h) {
+  return out << std::setw(h.w) << h.header << std::setw(1) << ' ';
+}
+
+static int remote_bilog_markers(RGWRados *store, const RGWZone& source,
+                                RGWRESTConn *conn, const RGWBucketInfo& info,
+                                BucketIndexShardsManager *markers)
+{
+  const auto instance_key = info.bucket.get_key();
+  const rgw_http_param_pair params[] = {
+    { "type" , "bucket-index" },
+    { "bucket-instance", instance_key.c_str() },
+    { "info" , nullptr },
+    { nullptr, nullptr }
+  };
+  rgw_bucket_index_marker_info result;
+  int r = conn->get_json_resource("/admin/log/", params, result);
+  if (r < 0) {
+    lderr(store->ctx()) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  r = markers->from_string(result.max_marker, -1);
+  if (r < 0) {
+    lderr(store->ctx()) << "failed to decode remote log markers" << dendl;
+    return r;
+  }
+  return 0;
+}
+
+static int bucket_source_sync_status(RGWRados *store, const RGWZone& zone,
+                                     const RGWZone& source, RGWRESTConn *conn,
+                                     const RGWBucketInfo& bucket_info,
+                                     int width, std::ostream& out)
+{
+  out << indented{width, "source zone"} << source.id << " (" << source.name << ")\n";
+
+  // syncing from this zone?
+  if (!zone.syncs_from(source.id)) {
+    out << indented{width} << "not in sync_from\n";
+    return 0;
+  }
+  std::vector<rgw_bucket_shard_sync_info> status;
+  int r = rgw_bucket_sync_status(store, source.id, bucket_info, &status);
+  if (r < 0) {
+    lderr(store->ctx()) << "failed to read bucket sync status: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  int num_full = 0;
+  int num_inc = 0;
+  uint64_t full_complete = 0;
+  const int total_shards = status.size();
+
+  using BucketSyncState = rgw_bucket_shard_sync_info::SyncState;
+  for (int shard_id = 0; shard_id < total_shards; shard_id++) {
+    auto& m = status[shard_id];
+    if (m.state == BucketSyncState::StateFullSync) {
+      num_full++;
+      full_complete += m.full_marker.count;
+    } else if (m.state == BucketSyncState::StateIncrementalSync) {
+      num_inc++;
+    }
+  }
+
+  out << indented{width} << "full sync: " << num_full << "/" << total_shards << " shards\n";
+  if (num_full > 0) {
+    out << indented{width} << "full sync: " << full_complete << " objects completed\n";
+  }
+  out << indented{width} << "incremental sync: " << num_inc << "/" << total_shards << " shards\n";
+
+  BucketIndexShardsManager remote_markers;
+  r = remote_bilog_markers(store, source, conn, bucket_info, &remote_markers);
+  if (r < 0) {
+    lderr(store->ctx()) << "failed to read remote log: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  std::set<int> shards_behind;
+  for (auto& r : remote_markers.get()) {
+    auto shard_id = r.first;
+    auto& m = status[shard_id];
+    if (r.second.empty()) {
+      continue; // empty bucket index shard
+    }
+    auto pos = BucketIndexShardsManager::get_shard_marker(m.inc_marker.position);
+    if (m.state != BucketSyncState::StateIncrementalSync || pos != r.second) {
+      shards_behind.insert(shard_id);
+    }
+  }
+  if (shards_behind.empty()) {
+    out << indented{width} << "bucket is caught up with source\n";
+  } else {
+    out << indented{width} << "bucket is behind on " << shards_behind.size() << " shards\n";
+    out << indented{width} << "behind shards: [" << shards_behind << "]\n" ;
+  }
+  return 0;
+}
+
+static int bucket_sync_status(RGWRados *store, const RGWBucketInfo& info,
+                              const std::string& source_zone_id,
+                              std::ostream& out)
+{
+  RGWRealm& realm = store->realm;
+  RGWZoneGroup& zonegroup = store->get_zonegroup();
+  RGWZone& zone = store->get_zone();
+  constexpr int width = 15;
+
+  out << indented{width, "realm"} << realm.get_id() << " (" << realm.get_name() << ")\n";
+  out << indented{width, "zonegroup"} << zonegroup.get_id() << " (" << zonegroup.get_name() << ")\n";
+  out << indented{width, "zone"} << zone.id << " (" << zone.name << ")\n";
+  out << indented{width, "bucket"} << info.bucket << "\n\n";
+
+  if (!info.datasync_flag_enabled()) {
+    out << "Sync is disabled for bucket " << info.bucket.name << '\n';
+    return 0;
+  }
+
+  if (!source_zone_id.empty()) {
+    auto z = zonegroup.zones.find(source_zone_id);
+    if (z == zonegroup.zones.end()) {
+      lderr(store->ctx()) << "Source zone not found in zonegroup "
+          << zonegroup.get_name() << dendl;
+      return -EINVAL;
+    }
+    auto c = store->zone_conn_map.find(source_zone_id);
+    if (c == store->zone_conn_map.end()) {
+      lderr(store->ctx()) << "No connection to zone " << z->second.name << dendl;
+      return -EINVAL;
+    }
+    return bucket_source_sync_status(store, zone, z->second, c->second,
+                                     info, width, out);
+  }
+
+  for (const auto& z : zonegroup.zones) {
+    auto c = store->zone_conn_map.find(z.second.id);
+    if (c != store->zone_conn_map.end()) {
+      bucket_source_sync_status(store, zone, z.second, c->second,
+                                info, width, out);
+    }
+  }
+  return 0;
+}
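bucket_source_sync_status() decides per shard whether the bucket is caught up by comparing the local incremental-sync position against the remote bilog's maximum marker. The same comparison as a Python sketch; the state string is a stand-in for the C++ enum:

    # A shard counts as behind unless it is in incremental sync and its
    # local position matches the remote bilog marker.
    def shards_behind(local_status, remote_markers):
        behind = set()
        for shard_id, remote_pos in remote_markers.items():
            if not remote_pos:
                continue  # empty bucket index shard
            state, local_pos = local_status[shard_id]
            if state != 'incremental-sync' or local_pos != remote_pos:
                behind.add(shard_id)
        return behind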
+
 static void parse_tier_config_param(const string& s, map<string, string, ltstr_nocase>& out)
 {
   list<string> confs;
@@ -2939,14 +3118,69 @@ int main(int argc, const char **argv)
                         OPT_REALM_RENAME, OPT_REALM_SET,
                         OPT_REALM_DEFAULT, OPT_REALM_PULL};
 
+  std::set<int> readonly_ops_list = {
+                        OPT_USER_INFO,
+                        OPT_USER_STATS,
+                        OPT_BUCKETS_LIST,
+                        OPT_BUCKET_LIMIT_CHECK,
+                        OPT_BUCKET_STATS,
+                        OPT_BUCKET_SYNC_STATUS,
+                        OPT_LOG_LIST,
+                        OPT_LOG_SHOW,
+                        OPT_USAGE_SHOW,
+                        OPT_OBJECT_STAT,
+                        OPT_BI_GET,
+                        OPT_BI_LIST,
+                        OPT_OLH_GET,
+                        OPT_OLH_READLOG,
+                        OPT_GC_LIST,
+                        OPT_LC_LIST,
+                        OPT_ORPHANS_LIST_JOBS,
+                        OPT_ZONEGROUP_GET,
+                        OPT_ZONEGROUP_LIST,
+                        OPT_ZONEGROUP_PLACEMENT_LIST,
+                        OPT_ZONE_GET,
+                        OPT_ZONE_LIST,
+                        OPT_ZONE_PLACEMENT_LIST,
+                        OPT_METADATA_GET,
+                        OPT_METADATA_LIST,
+                        OPT_METADATA_SYNC_STATUS,
+                        OPT_MDLOG_LIST,
+                        OPT_MDLOG_STATUS,
+                        OPT_SYNC_ERROR_LIST,
+                        OPT_BILOG_LIST,
+                        OPT_BILOG_STATUS,
+                        OPT_DATA_SYNC_STATUS,
+                        OPT_DATALOG_LIST,
+                        OPT_DATALOG_STATUS,
+                        OPT_OPSTATE_LIST,
+                        OPT_REPLICALOG_GET,
+                        OPT_REALM_GET,
+                        OPT_REALM_GET_DEFAULT,
+                        OPT_REALM_LIST,
+                        OPT_REALM_LIST_PERIODS,
+                        OPT_PERIOD_GET,
+                        OPT_PERIOD_GET_CURRENT,
+                        OPT_PERIOD_LIST,
+                        OPT_GLOBAL_QUOTA_GET,
+                        OPT_SYNC_STATUS,
+                        OPT_ROLE_GET,
+                        OPT_ROLE_LIST,
+                        OPT_ROLE_POLICY_LIST,
+                        OPT_ROLE_POLICY_GET,
+                        OPT_RESHARD_LIST,
+                        OPT_RESHARD_STATUS,
+  };
 
   bool raw_storage_op = (raw_storage_ops_list.find(opt_cmd) != raw_storage_ops_list.end() ||
                          raw_period_update);
+  bool need_cache = readonly_ops_list.find(opt_cmd) == readonly_ops_list.end();
 
   if (raw_storage_op) {
     store = RGWStoreManager::get_raw_storage(g_ceph_context);
   } else {
-    store = RGWStoreManager::get_storage(g_ceph_context, false, false, false, false, false);
+    store = RGWStoreManager::get_storage(g_ceph_context, false, false, false, false, false,
+      need_cache && g_conf->rgw_cache_enabled);
   }
   if (!store) {
     cerr << "couldn't init storage provider" << std::endl;
@@ -6433,34 +6667,53 @@ next:
     }
 
     rgw_data_sync_status sync_status;
-    ret = sync.read_sync_status(&sync_status);
-    if (ret < 0 && ret != -ENOENT) {
-      cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl;
-      return -ret;
-    }
+    if (specified_shard_id) {
+      set<string> pending_buckets;
+      set<string> recovering_buckets;
+      rgw_data_sync_marker sync_marker;
+      ret = sync.read_shard_status(shard_id, pending_buckets, recovering_buckets, &sync_marker, 
+                                   max_entries_specified ? max_entries : 20);
+      if (ret < 0 && ret != -ENOENT) {
+        cerr << "ERROR: sync.read_shard_status() returned ret=" << ret << std::endl;
+        return -ret;
+      }
+      formatter->open_object_section("summary");
+      encode_json("shard_id", shard_id, formatter);
+      encode_json("marker", sync_marker, formatter);
+      encode_json("pending_buckets", pending_buckets, formatter);
+      encode_json("recovering_buckets", recovering_buckets, formatter);
+      formatter->close_section();
+      formatter->flush(cout);
+    } else {
+      ret = sync.read_sync_status(&sync_status);
+      if (ret < 0 && ret != -ENOENT) {
+        cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl;
+        return -ret;
+      }
 
-    formatter->open_object_section("summary");
-    encode_json("sync_status", sync_status, formatter);
+      formatter->open_object_section("summary");
+      encode_json("sync_status", sync_status, formatter);
 
-    uint64_t full_total = 0;
-    uint64_t full_complete = 0;
+      uint64_t full_total = 0;
+      uint64_t full_complete = 0;
 
-    for (auto marker_iter : sync_status.sync_markers) {
-      full_total += marker_iter.second.total_entries;
-      if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
-        full_complete += marker_iter.second.pos;
-      } else {
-        full_complete += marker_iter.second.total_entries;
+      for (auto marker_iter : sync_status.sync_markers) {
+        full_total += marker_iter.second.total_entries;
+        if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
+          full_complete += marker_iter.second.pos;
+        } else {
+          full_complete += marker_iter.second.total_entries;
+        }
       }
-    }
 
-    formatter->open_object_section("full_sync");
-    encode_json("total", full_total, formatter);
-    encode_json("complete", full_complete, formatter);
-    formatter->close_section();
-    formatter->close_section();
+      formatter->open_object_section("full_sync");
+      encode_json("total", full_total, formatter);
+      encode_json("complete", full_complete, formatter);
+      formatter->close_section();
+      formatter->close_section();
 
-    formatter->flush(cout);
+      formatter->flush(cout);
+    }
   }
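With --shard-id, 'data sync status' now emits a per-shard summary rather than the zone-wide rollup. Based on the encode_json calls above, the output is shaped roughly like the following Python dict; all values are illustrative:

    summary = {
        'shard_id': 17,                   # the requested shard
        'marker': {},                     # serialized rgw_data_sync_marker
        'pending_buckets': ['bucket-a'],  # up to max_entries (default 20)
        'recovering_buckets': [],
    }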
 
   if (opt_cmd == OPT_DATA_SYNC_INIT) {
@@ -6568,9 +6821,23 @@ next:
     ret = set_bucket_sync_enabled(store, opt_cmd, tenant, bucket_name);
     if (ret < 0)
       return -ret;
-}
+  }
 
   if (opt_cmd == OPT_BUCKET_SYNC_STATUS) {
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    RGWBucketInfo bucket_info;
+    rgw_bucket bucket;
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
+    if (ret < 0) {
+      return -ret;
+    }
+    bucket_sync_status(store, bucket_info, source_zone, std::cout);
+  }
+
+  if (opt_cmd == OPT_BUCKET_SYNC_MARKERS) {
     if (source_zone.empty()) {
       cerr << "ERROR: source zone not specified" << std::endl;
       return EINVAL;
index cf4224a1f0e1362bf766400824025c377ff21ded..66ed0ed708449ed2c6ea14b87f50f9f957a6aa63 100644 (file)
@@ -191,14 +191,20 @@ class Connection {
   friend void intrusive_ptr_release(Connection *c) { c->put(); }
 };
 
-
 class AsioFrontend {
   RGWProcessEnv env;
   RGWFrontendConfig* conf;
   boost::asio::io_service service;
 
-  tcp::acceptor acceptor;
-  tcp::socket peer_socket;
+  struct Listener {
+    tcp::endpoint endpoint;
+    tcp::acceptor acceptor;
+    tcp::socket socket;
+
+    Listener(boost::asio::io_service& service)
+      : acceptor(service), socket(service) {}
+  };
+  std::vector<Listener> listeners;
 
   std::vector<std::thread> threads;
   Pauser pauser;
@@ -206,11 +212,11 @@ class AsioFrontend {
 
   CephContext* ctx() const { return env.store->ctx(); }
 
-  void accept(boost::system::error_code ec);
+  void accept(Listener& listener, boost::system::error_code ec);
 
  public:
   AsioFrontend(const RGWProcessEnv& env, RGWFrontendConfig* conf)
-    : env(env), conf(conf), acceptor(service), peer_socket(service) {}
+    : env(env), conf(conf) {}
 
   int init();
   int run();
@@ -220,64 +226,104 @@ class AsioFrontend {
   void unpause(RGWRados* store, rgw_auth_registry_ptr_t);
 };
 
-int AsioFrontend::init()
+unsigned short parse_port(const char *input, boost::system::error_code& ec)
+{
+  char *end = nullptr;
+  auto port = std::strtoul(input, &end, 10);
+  if (port > std::numeric_limits<unsigned short>::max()) {
+    ec.assign(ERANGE, boost::system::system_category());
+  } else if (port == 0 && end == input) {
+    ec.assign(EINVAL, boost::system::system_category());
+  }
+  return port;
+}
+
+tcp::endpoint parse_endpoint(BOOST_ASIO_STRING_VIEW_PARAM input,
+                             boost::system::error_code& ec)
 {
-  std::string port_str;
-  conf->get_val("port", "80", &port_str);
+  tcp::endpoint endpoint;
 
-  unsigned short port;
-  boost::asio::ip::address addr; // default to 'any'
+  auto colon = input.find(':');
+  if (colon != input.npos) {
+    auto port_str = input.substr(colon + 1);
+    endpoint.port(parse_port(port_str.data(), ec));
+  } else {
+    endpoint.port(80);
+  }
+  if (!ec) {
+    auto addr = input.substr(0, colon);
+    endpoint.address(boost::asio::ip::make_address(addr, ec));
+  }
+  return endpoint;
+}
+
+int AsioFrontend::init()
+{
   boost::system::error_code ec;
+  auto& config = conf->get_config_map();
 
-  auto colon = port_str.find(':');
-  if (colon != port_str.npos) {
-    addr = boost::asio::ip::make_address(port_str.substr(0, colon), ec);
+  // parse endpoints
+  auto range = config.equal_range("port");
+  for (auto i = range.first; i != range.second; ++i) {
+    auto port = parse_port(i->second.c_str(), ec);
     if (ec) {
-      lderr(ctx()) << "failed to parse address '" << port_str << "': " << ec.message() << dendl;
+      lderr(ctx()) << "failed to parse port=" << i->second << dendl;
       return -ec.value();
     }
-    port = std::stoul(port_str.substr(colon + 1), nullptr, 0);
-  } else {
-    port = std::stoul(port_str, nullptr, 0);
+    listeners.emplace_back(service);
+    listeners.back().endpoint.port(port);
   }
 
-  tcp::endpoint ep = {addr, port};
-  ldout(ctx(), 4) << "frontend listening on " << ep << dendl;
-
-  acceptor.open(ep.protocol(), ec);
-  if (ec) {
-    lderr(ctx()) << "failed to open socket: " << ec.message() << dendl;
-    return -ec.value();
+  range = config.equal_range("endpoint");
+  for (auto i = range.first; i != range.second; ++i) {
+    auto endpoint = parse_endpoint(i->second, ec);
+    if (ec) {
+      lderr(ctx()) << "failed to parse endpoint=" << i->second << dendl;
+      return -ec.value();
+    }
+    listeners.emplace_back(service);
+    listeners.back().endpoint = endpoint;
   }
-  acceptor.set_option(tcp::acceptor::reuse_address(true));
-  acceptor.bind(ep, ec);
-  if (ec) {
-    lderr(ctx()) << "failed to bind address " << ep <<
-        ": " << ec.message() << dendl;
-    return -ec.value();
+
+  // start listeners
+  for (auto& l : listeners) {
+    l.acceptor.open(l.endpoint.protocol(), ec);
+    if (ec) {
+      lderr(ctx()) << "failed to open socket: " << ec.message() << dendl;
+      return -ec.value();
+    }
+    l.acceptor.set_option(tcp::acceptor::reuse_address(true));
+    l.acceptor.bind(l.endpoint, ec);
+    if (ec) {
+      lderr(ctx()) << "failed to bind address " << l.endpoint
+          << ": " << ec.message() << dendl;
+      return -ec.value();
+    }
+    l.acceptor.listen(boost::asio::socket_base::max_connections);
+    l.acceptor.async_accept(l.socket,
+                            [this, &l] (boost::system::error_code ec) {
+                              accept(l, ec);
+                            });
+
+    ldout(ctx(), 4) << "frontend listening on " << l.endpoint << dendl;
   }
-  acceptor.listen(boost::asio::socket_base::max_connections);
-  acceptor.async_accept(peer_socket,
-                        [this] (boost::system::error_code ec) {
-                          return accept(ec);
-                        });
   return 0;
 }
 
-void AsioFrontend::accept(boost::system::error_code ec)
+void AsioFrontend::accept(Listener& l, boost::system::error_code ec)
 {
-  if (!acceptor.is_open()) {
+  if (!l.acceptor.is_open()) {
     return;
   } else if (ec == boost::asio::error::operation_aborted) {
     return;
   } else if (ec) {
     throw ec;
   }
-  auto socket = std::move(peer_socket);
-  acceptor.async_accept(peer_socket,
-                        [this] (boost::system::error_code ec) {
-                          return accept(ec);
-                        });
+  auto socket = std::move(l.socket);
+  l.acceptor.async_accept(l.socket,
+                          [this, &l] (boost::system::error_code ec) {
+                            accept(l, ec);
+                          });
 
   boost::intrusive_ptr<Connection> conn{new Connection(env, std::move(socket))};
   conn->on_connect();
@@ -313,7 +359,10 @@ void AsioFrontend::stop()
   going_down = true;
 
   boost::system::error_code ec;
-  acceptor.close(ec);
+  // close all listeners
+  for (auto& listener : listeners) {
+    listener.acceptor.close(ec);
+  }
 
   // unblock the run() threads
   service.stop();
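The frontend now builds one Listener per port= or endpoint= option instead of a single acceptor. The endpoint grammar handled by parse_endpoint() above is "<address>[:<port>]" with the port defaulting to 80; a compact Python equivalent (note it splits at the first colon, as the C++ does, so bare IPv6 literals are out of scope):

    def parse_endpoint(value, default_port=80):
        addr, sep, port = value.partition(':')  # split at the first ':'
        return addr, int(port) if sep else default_port

    assert parse_endpoint('192.168.0.10:8000') == ('192.168.0.10', 8000)
    assert parse_endpoint('192.168.0.10') == ('192.168.0.10', 80)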
index 65a8b034f97932b121b68709eebefe90ac067d5e..e18749060fef75e2ad8c8f257338c1b2513096d2 100644 (file)
@@ -385,7 +385,47 @@ void rgw::auth::RemoteApplier::to_str(std::ostream& out) const
       << ", is_admin=" << info.is_admin << ")";
 }
 
+void rgw::auth::ImplicitTenants::recompute_value(const md_config_t *c)
+{
+  std::string s = c->get_val<std::string>("rgw_keystone_implicit_tenants");
+  int v = 0;
+  if (boost::iequals(s, "both")
+    || boost::iequals(s, "true")
+    || boost::iequals(s, "1")) {
+    v = IMPLICIT_TENANTS_S3|IMPLICIT_TENANTS_SWIFT;
+  } else if (boost::iequals(s, "0")
+    || boost::iequals(s, "none")
+    || boost::iequals(s, "false")) {
+    v = 0;
+  } else if (boost::iequals(s, "s3")) {
+    v = IMPLICIT_TENANTS_S3;
+  } else if (boost::iequals(s, "swift")) {
+    v = IMPLICIT_TENANTS_SWIFT;
+  } else {  /* "" (and anything else) */
+    v = IMPLICIT_TENANTS_BAD;
+    // assert(0);
+  }
+  saved = v;
+}
+
+const char **rgw::auth::ImplicitTenants::get_tracked_conf_keys() const
+{
+  static const char *keys[] = {
+    "rgw_keystone_implicit_tenants",
+  NULL };
+  return keys;
+}
+
+void rgw::auth::ImplicitTenants::handle_conf_change(const struct md_config_t *c,
+       const std::set <std::string> &changed)
+{
+  if (changed.count("rgw_keystone_implicit_tenants")) {
+    recompute_value(c);
+  }
+}
+
 void rgw::auth::RemoteApplier::create_account(const rgw_user& acct_user,
+                                              bool implicit_tenant,
                                               RGWUserInfo& user_info) const      /* out */
 {
   rgw_user new_acct_user = acct_user;
@@ -397,7 +437,7 @@ void rgw::auth::RemoteApplier::create_account(const rgw_user& acct_user,
 
   /* An upper layer may enforce creating new accounts within their own
    * tenants. */
-  if (new_acct_user.tenant.empty() && implicit_tenants) {
+  if (new_acct_user.tenant.empty() && implicit_tenant) {
     new_acct_user.tenant = new_acct_user.id;
   }
 
@@ -420,6 +460,9 @@ void rgw::auth::RemoteApplier::load_acct_info(RGWUserInfo& user_info) const
    * that belongs to the authenticated identity. Another policy may be
    * applied by using a RGWThirdPartyAccountAuthApplier decorator. */
   const rgw_user& acct_user = info.acct_user;
+  auto implicit_value = implicit_tenant_context.get_value();
+  bool implicit_tenant = implicit_value.implicit_tenants_for_(implicit_tenant_bit);
+  bool split_mode = implicit_value.is_split_mode();
 
   /* Normally, empty "tenant" field of acct_user means the authenticated
    * identity has the legacy, global tenant. However, due to inclusion
@@ -431,8 +474,16 @@ void rgw::auth::RemoteApplier::load_acct_info(RGWUserInfo& user_info) const
    * the wiser.
    * If that fails, we look up in the requested (possibly empty) tenant.
    * If that fails too, we create the account within the global or separated
-   * namespace depending on rgw_keystone_implicit_tenants. */
-  if (acct_user.tenant.empty()) {
+   * namespace depending on rgw_keystone_implicit_tenants.
+   * For compatibility with previous versions of ceph, it is possible
+   * to enable implicit_tenants for only s3 or only swift.
+   * in this mode ("split_mode"), we must constrain the id lookups to
+   * only use the identifier space that would be used if the id were
+   * to be created. */
+
+  if (split_mode && !implicit_tenant)
+       ;       /* suppress lookup for id used by "other" protocol */
+  else if (acct_user.tenant.empty()) {
     const rgw_user tenanted_uid(acct_user.id, acct_user.id);
 
     if (rgw_get_user_info_by_uid(store, tenanted_uid, user_info) >= 0) {
@@ -441,11 +492,16 @@ void rgw::auth::RemoteApplier::load_acct_info(RGWUserInfo& user_info) const
     }
   }
 
-  if (rgw_get_user_info_by_uid(store, acct_user, user_info) < 0) {
-    ldout(cct, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl;
-    create_account(acct_user, user_info);
+  if (split_mode && implicit_tenant)
+       ;       /* suppress lookup for id used by "other" protocol */
+  else if (rgw_get_user_info_by_uid(store, acct_user, user_info) >= 0) {
+      /* Succeeded. */
+      return;
   }
 
+  ldout(cct, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl;
+  create_account(acct_user, implicit_tenant, user_info);
+
   /* Succeeded if we are here (create_account() hasn't thrown). */
 }
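rgw_keystone_implicit_tenants is no longer treated as a plain boolean: recompute_value() maps it onto per-protocol flag bits, re-evaluated on configuration change, and "split mode" (exactly one of s3/swift enabled) constrains which identifier namespace is searched during account lookup. A Python rendering of the value mapping, mirroring the case-insensitive comparisons above:

    S3, SWIFT, BAD = 2, 1, -1  # flag values mirror the C++ enum

    def implicit_tenant_flags(value):
        v = value.lower()
        if v in ('both', 'true', '1'):
            return S3 | SWIFT
        if v in ('0', 'none', 'false'):
            return 0
        if v == 's3':
            return S3
        if v == 'swift':
            return SWIFT
        return BAD  # the empty string and anything else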
 
index 168498d033316004a6615d3c04ba626405ab5480..bb7e7573abec4b2b6a74bb904aa7ba108e5577aa 100644 (file)
@@ -344,6 +344,43 @@ protected:
  * Each new Strategy should be exposed to it. */
 class StrategyRegistry;
 
+class ImplicitTenants: public md_config_obs_t {
+public:
+  enum implicit_tenant_flag_bits {IMPLICIT_TENANTS_SWIFT=1,
+       IMPLICIT_TENANTS_S3=2, IMPLICIT_TENANTS_BAD = -1, };
+private:
+  int saved;
+  void recompute_value(const md_config_t *);
+  class ImplicitTenantValue {
+    friend class ImplicitTenants;
+    int v;
+    ImplicitTenantValue(int v) : v(v) {};
+  public:
+    bool inline is_split_mode()
+    {
+      assert(v != IMPLICIT_TENANTS_BAD);
+      return v == IMPLICIT_TENANTS_SWIFT || v == IMPLICIT_TENANTS_S3;
+    }
+    bool inline implicit_tenants_for_(const implicit_tenant_flag_bits bit)
+    {
+      assert(v != IMPLICIT_TENANTS_BAD);
+      return !!(v&bit);
+    }
+  };
+public:
+  ImplicitTenants(md_config_t &c) { recompute_value(&c);}
+  ImplicitTenantValue get_value() {
+    return ImplicitTenantValue(saved);
+  }
+private:
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const struct md_config_t *conf,
+    const std::set <std::string> &changed) override;
+};
+
+std::tuple<bool,bool> implicit_tenants_enabled_for_swift(CephContext * const cct);
+std::tuple<bool,bool> implicit_tenants_enabled_for_s3(CephContext * const cct);
+
 /* rgw::auth::RemoteApplier targets those authentication engines which don't
  * need to ask the RADOS store while performing the auth process. Instead,
  * they obtain credentials from an external source like Keystone or LDAP.
@@ -396,9 +433,11 @@ protected:
   const acl_strategy_t extra_acl_strategy;
 
   const AuthInfo info;
-  const bool implicit_tenants;
+  rgw::auth::ImplicitTenants& implicit_tenant_context;
+  const rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit;
 
   virtual void create_account(const rgw_user& acct_user,
+                              bool implicit_tenant,
                               RGWUserInfo& user_info) const;          /* out */
 
 public:
@@ -406,12 +445,14 @@ public:
                 RGWRados* const store,
                 acl_strategy_t&& extra_acl_strategy,
                 const AuthInfo& info,
-                const bool implicit_tenants)
+               rgw::auth::ImplicitTenants& implicit_tenant_context,
+                rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit)
     : cct(cct),
       store(store),
       extra_acl_strategy(std::move(extra_acl_strategy)),
       info(info),
-      implicit_tenants(implicit_tenants) {
+      implicit_tenant_context(implicit_tenant_context),
+      implicit_tenant_bit(implicit_tenant_bit) {
   }
 
   uint32_t get_perms_from_aclspec(const aclspec_t& aclspec) const override;
index 08a93c73dac2f4c9bb52ad87a2b6135c209ab85d..494fbfe5e4fde23a813f205e00ac9cf293f7c92d 100644 (file)
@@ -35,9 +35,11 @@ class StrategyRegistry {
     s3_main_strategy_plain_t s3_main_strategy_plain;
     s3_main_strategy_boto2_t s3_main_strategy_boto2;
 
-    s3_main_strategy_t(CephContext* const cct, RGWRados* const store)
-      : s3_main_strategy_plain(cct, store),
-        s3_main_strategy_boto2(cct, store) {
+    s3_main_strategy_t(CephContext* const cct,
+                      ImplicitTenants& implicit_tenant_context,
+                      RGWRados* const store)
+      : s3_main_strategy_plain(cct, implicit_tenant_context, store),
+        s3_main_strategy_boto2(cct, implicit_tenant_context, store) {
       add_engine(Strategy::Control::SUFFICIENT, s3_main_strategy_plain);
       add_engine(Strategy::Control::FALLBACK, s3_main_strategy_boto2);
     }
@@ -55,10 +57,11 @@ class StrategyRegistry {
 
 public:
   StrategyRegistry(CephContext* const cct,
+                   ImplicitTenants& implicit_tenant_context,
                    RGWRados* const store)
-    : s3_main_strategy(cct, store),
-      s3_post_strategy(cct, store),
-      swift_strategy(cct, store) {
+    : s3_main_strategy(cct, implicit_tenant_context, store),
+      s3_post_strategy(cct, implicit_tenant_context, store),
+      swift_strategy(cct, implicit_tenant_context, store) {
   }
 
   const s3_main_strategy_t& get_s3_main() const {
@@ -75,8 +78,9 @@ public:
 
   static std::shared_ptr<StrategyRegistry>
   create(CephContext* const cct,
+         ImplicitTenants& implicit_tenant_context,
          RGWRados* const store) {
-    return std::make_shared<StrategyRegistry>(cct, store);
+    return std::make_shared<StrategyRegistry>(cct, implicit_tenant_context, store);
   }
 };
 
index 4508e8131f2fa0ba12e8c60039e275e069d1360a..0904e825520e8ed49372d78fa513c1985eb2585d 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <boost/container/small_vector.hpp>
 #include <boost/utility/string_view.hpp>
+#include <boost/algorithm/string/trim_all.hpp>
 
 #define dout_context g_ceph_context
 #define dout_subsys ceph_subsys_rgw
@@ -620,7 +621,8 @@ get_v4_canonical_headers(const req_info& info,
   std::string canonical_hdrs;
   for (const auto& header : canonical_hdrs_map) {
     const boost::string_view& name = header.first;
-    const std::string& value = header.second;
+    std::string value = header.second;
+    boost::trim_all<std::string>(value);
 
     canonical_hdrs.append(name.data(), name.length())
                   .append(":", std::strlen(":"))
index ca84672d89dd2620d8d73696238247299075acf7..2a875d4798f021d2251d9087c2502b1379d70b94 100644 (file)
@@ -36,6 +36,7 @@ class ExternalAuthStrategy : public rgw::auth::Strategy,
                              public rgw::auth::RemoteApplier::Factory {
   typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;
   RGWRados* const store;
+  rgw::auth::ImplicitTenants& implicit_tenant_context;
 
   using keystone_config_t = rgw::keystone::CephCtxConfig;
   using keystone_cache_t = rgw::keystone::TokenCache;
@@ -51,7 +52,8 @@ class ExternalAuthStrategy : public rgw::auth::Strategy,
                             ) const override {
     auto apl = rgw::auth::add_sysreq(cct, store, s,
       rgw::auth::RemoteApplier(cct, store, std::move(acl_alg), info,
-                               cct->_conf->rgw_keystone_implicit_tenants));
+                               implicit_tenant_context,
+                               rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3));
     /* TODO(rzarzynski): replace with static_ptr. */
     return aplptr_t(new decltype(apl)(std::move(apl)));
   }
@@ -59,8 +61,10 @@ class ExternalAuthStrategy : public rgw::auth::Strategy,
 public:
   ExternalAuthStrategy(CephContext* const cct,
                        RGWRados* const store,
+                       rgw::auth::ImplicitTenants& implicit_tenant_context,
                        AWSEngine::VersionAbstractor* const ver_abstractor)
     : store(store),
+      implicit_tenant_context(implicit_tenant_context),
       ldap_engine(cct, store, *ver_abstractor,
                   static_cast<rgw::auth::RemoteApplier::Factory*>(this)) {
 
@@ -116,12 +120,13 @@ class AWSAuthStrategy : public rgw::auth::Strategy,
 
 public:
   AWSAuthStrategy(CephContext* const cct,
+                  rgw::auth::ImplicitTenants& implicit_tenant_context,
                   RGWRados* const store)
     : store(store),
       ver_abstractor(cct),
       anonymous_engine(cct,
                        static_cast<rgw::auth::LocalApplier::Factory*>(this)),
-      external_engines(cct, store, &ver_abstractor),
+      external_engines(cct, store, implicit_tenant_context, &ver_abstractor),
       local_engine(cct, store, ver_abstractor,
                    static_cast<rgw::auth::LocalApplier::Factory*>(this)) {
     /* The anynoymous auth. */
index c60d72953665ba9a1f221719485ef86fbe9cecdf..fee86e5b97b31bdbc1dc1e446bf6eb7eeab07e1d 100644 (file)
@@ -129,14 +129,16 @@ void ObjectCache::put(const string& name, ObjectCacheInfo& info, rgw_cache_entry
 
   ldout(cct, 10) << "cache put: name=" << name << " info.flags=0x"
                  << std::hex << info.flags << std::dec << dendl;
-  map<string, ObjectCacheEntry>::iterator iter = cache_map.find(name);
-  if (iter == cache_map.end()) {
-    ObjectCacheEntry entry;
+
+  const std::pair<std::map<std::string,
+                          ObjectCacheEntry>::iterator, bool>& emp_pair
+    = cache_map.emplace(name, ObjectCacheEntry{});
+  ObjectCacheEntry& entry = emp_pair.first->second;
+  bool inserted = emp_pair.second;
+  entry.info.time_added = ceph::coarse_mono_clock::now();
+  if (inserted) {
     entry.lru_iter = lru.end();
-    cache_map.insert(pair<string, ObjectCacheEntry>(name, entry));
-    iter = cache_map.find(name);
   }
-  ObjectCacheEntry& entry = iter->second;
   ObjectCacheInfo& target = entry.info;
 
   invalidate_lru(entry);
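The rewrite replaces the find/insert/find sequence with one emplace, so the map is probed once and time_added is refreshed on every put (which pairs with the header change below that stops defaulting time_added at construction). The same insert-or-get pattern in Python, for comparison:

    cache_map = {}

    def put(name, now):
        entry = cache_map.setdefault(name, {'lru_iter': None})  # single lookup
        entry['time_added'] = now  # refreshed whether or not we just inserted
        return entry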
index 023db75ff76c23cf3775ddf9bb9b995d15ea4308..e751d637dcb6f8ac9c91124ecf828df80626537a 100644 (file)
@@ -57,7 +57,7 @@ struct ObjectCacheInfo {
   map<string, bufferlist> rm_xattrs;
   ObjectMetaInfo meta;
   obj_version version = {};
-  ceph::coarse_mono_time time_added = ceph::coarse_mono_clock::now();
+  ceph::coarse_mono_time time_added;
 
   ObjectCacheInfo() = default;
 
index fd99584aafc7cbb4faad4edcfdaac262b966b39b..c3f585cfc21863624d3212eace00d358980578aa 100644 (file)
@@ -56,7 +56,7 @@ size_t RGWCivetWeb::read_data(char *buf, size_t len)
     return 0;
   }
   for (c = 0; c < len; c += ret) {
-    ret = mg_read(conn, buf, len);
+    ret = mg_read(conn, buf+c, len-c);
     if (ret < 0) {
       throw rgw::io::Exception(EIO, std::system_category());
     }
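The mg_read() fix matters: previously every retry wrote to the start of the buffer and requested the full length, corrupting data on short reads. With buf+c and len-c the loop accumulates correctly. The same read-exactly pattern in Python, where stream is any object with a read(n) method:

    def read_exact(stream, length):
        buf = bytearray()
        while len(buf) < length:
            chunk = stream.read(length - len(buf))  # only what's still missing
            if not chunk:
                break  # EOF
            buf += chunk  # append after what's already been read
        return bytes(buf)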
index 9b405760feb4d99db07b05ff65fbefd36c256ce7..33e00144647ee8a337f01b9c27ad9dff940fe3a4 100644 (file)
@@ -50,7 +50,6 @@ int RGWCivetWebFrontend::process(struct mg_connection*  const conn)
 int RGWCivetWebFrontend::run()
 {
   auto& conf_map = conf->get_config_map();
-  string port_str;
 
   set_conf_default(conf_map, "num_threads",
                    std::to_string(g_conf->rgw_thread_pool_size));
@@ -59,15 +58,29 @@ int RGWCivetWebFrontend::run()
   set_conf_default(conf_map, "validate_http_method", "no");
   set_conf_default(conf_map, "canonicalize_url_path", "no");
   set_conf_default(conf_map, "enable_auth_domain_check", "no");
-  conf->get_val("port", "80", &port_str);
-  std::replace(port_str.begin(), port_str.end(), '+', ',');
-  conf_map["listening_ports"] = port_str;
+
+  std::string listening_ports;
+  // support multiple port= entries
+  auto range = conf_map.equal_range("port");
+  for (auto p = range.first; p != range.second; ++p) {
+    std::string port_str = p->second;
+    // support port= entries with multiple values
+    std::replace(port_str.begin(), port_str.end(), '+', ',');
+    if (!listening_ports.empty()) {
+      listening_ports.append(1, ',');
+    }
+    listening_ports.append(port_str);
+  }
+  if (listening_ports.empty()) {
+    listening_ports = "80";
+  }
+  conf_map.emplace("listening_ports", std::move(listening_ports));
 
   /* Set run_as_user. This will cause civetweb to invoke setuid() and setgid()
    * based on pw_uid and pw_gid obtained from pw_name. */
   std::string uid_string = g_ceph_context->get_set_uid_string();
   if (! uid_string.empty()) {
-    conf_map["run_as_user"] = std::move(uid_string);
+    conf_map.emplace("run_as_user", std::move(uid_string));
   }
 
   /* Prepare options for CivetWeb. */
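
Since the frontend config is now a multimap (see the rgw_frontend.h hunks below), both repeated port= entries and the legacy '+'-separated form are honored. A standalone sketch of the joining logic, assuming the same conventions as the hunk above (the "80" fallback mirrors the default in the diff):

    #include <algorithm>
    #include <map>
    #include <string>

    // Join every "port" entry into the single comma-separated list civetweb
    // expects, e.g. { {"port","80"}, {"port","443s+8443s"} } -> "80,443s,8443s".
    std::string build_listening_ports(const std::multimap<std::string, std::string>& conf) {
      std::string out;
      auto range = conf.equal_range("port");
      for (auto p = range.first; p != range.second; ++p) {
        std::string v = p->second;
        std::replace(v.begin(), v.end(), '+', ','); // legacy separator
        if (!out.empty()) out += ',';
        out += v;
      }
      return out.empty() ? "80" : out; // default HTTP port
    }
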
index 4523f2d61ec3be20ce953822d62489d43582ecb9..281c8545fa5ef4992ecfbff1cfa8a154da36052a 100644 (file)
@@ -1099,7 +1099,7 @@ struct rgw_bucket {
 WRITE_CLASS_ENCODER(rgw_bucket)
 
 inline ostream& operator<<(ostream& out, const rgw_bucket &b) {
-  out << b.name << "[" << b.marker << "])";
+  out << b.name << "[" << b.marker << "]";
   return out;
 }
 
index 7c103bfa643fc3e75fc65df254bd25cb517804dd..152aead410ece4ee82bf2fd029bd84cabe66ec6e 100644 (file)
@@ -553,7 +553,7 @@ int RGWCoroutinesManager::run(list<RGWCoroutinesStack *>& stacks)
     while (blocked_count - interval_wait_count >= ops_window) {
       ret = completion_mgr->get_next((void **)&blocked_stack);
       if (ret < 0) {
-       ldout(cct, 0) << "ERROR: failed to clone shard, completion_mgr.get_next() returned ret=" << ret << dendl;
+       ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
       }
       handle_unblocked_stack(context_stacks, scheduled_stacks, blocked_stack, &blocked_count);
     }
@@ -565,7 +565,7 @@ int RGWCoroutinesManager::run(list<RGWCoroutinesStack *>& stacks)
     while (scheduled_stacks.empty() && blocked_count > 0) {
       ret = completion_mgr->get_next((void **)&blocked_stack);
       if (ret < 0) {
-       ldout(cct, 0) << "ERROR: failed to clone shard, completion_mgr.get_next() returned ret=" << ret << dendl;
+        ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
       }
       if (going_down) {
        ldout(cct, 5) << __func__ << "(): was stopped, exiting" << dendl;
index edffa8061180c32092a940e9ae79c6d1b4d73f04..5fd8403178ada70e662ad09fd75a57cc620e4958 100644 (file)
@@ -249,10 +249,10 @@ int RGWRadosSetOmapKeysCR::request_complete()
 RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(RGWRados *_store,
                       const rgw_raw_obj& _obj,
                       const string& _marker,
-                      map<string, bufferlist> *_entries, int _max_entries) : RGWSimpleCoroutine(_store->ctx()),
+                      std::set<std::string> *_entries, int _max_entries) : RGWSimpleCoroutine(_store->ctx()),
                                                 store(_store),
                                                 marker(_marker),
-                                                entries(_entries), max_entries(_max_entries), rval(0),
+                                                entries(_entries), max_entries(_max_entries),
                                                 obj(_obj), cn(NULL)
 {
   set_description() << "set omap keys dest=" << obj << " marker=" << marker;
@@ -268,12 +268,21 @@ int RGWRadosGetOmapKeysCR::send_request() {
   set_status() << "send request";
 
   librados::ObjectReadOperation op;
-  op.omap_get_vals2(marker, max_entries, entries, nullptr, &rval);
+  op.omap_get_keys2(marker, max_entries, entries, nullptr, nullptr);
 
   cn = stack->create_completion_notifier();
   return ref.ioctx.aio_operate(ref.oid, cn->completion(), &op, NULL);
 }
 
+int RGWRadosGetOmapKeysCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
 RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(RGWRados *_store,
                       const rgw_raw_obj& _obj,
                       const set<string>& _keys) : RGWSimpleCoroutine(_store->ctx()),
index cb0ba580897328069d84b7cab7bfb3fe23ad6dd2..f4b9ea5fcdfcc4aa0d9d6fd74636598c6671da7f 100644 (file)
@@ -410,10 +410,9 @@ class RGWRadosGetOmapKeysCR : public RGWSimpleCoroutine {
   RGWRados *store;
 
   string marker;
-  map<string, bufferlist> *entries;
+  std::set<std::string> *entries;
   int max_entries;
 
-  int rval;
   rgw_rados_ref ref;
 
   rgw_raw_obj obj;
@@ -424,13 +423,10 @@ public:
   RGWRadosGetOmapKeysCR(RGWRados *_store,
                      const rgw_raw_obj& _obj,
                      const string& _marker,
-                     map<string, bufferlist> *_entries, int _max_entries);
+                     std::set<std::string> *_entries, int _max_entries);
 
   int send_request() override;
-
-  int request_complete() override {
-    return rval;
-  }
+  int request_complete() override;
 };
 
 class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine {
index a3af8459a35d33d93cc32f1e0d48a1d60d817053..703bdd7ee25e9cc0d700dd374595956f13a43ea9 100644 (file)
@@ -152,6 +152,40 @@ bool RGWReadDataSyncStatusMarkersCR::spawn_next()
   return true;
 }
 
+class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWDataSyncEnv *env;
+
+  uint64_t max_entries;
+  int num_shards;
+  int shard_id{0};
+
+  string marker;
+  map<int, std::set<std::string>> &entries_map;
+
+ public:
+  RGWReadDataSyncRecoveringShardsCR(RGWDataSyncEnv *env, uint64_t _max_entries, int _num_shards,
+      map<int, std::set<std::string>>& _entries_map)
+    : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS), env(env),
+    max_entries(_max_entries), num_shards(_num_shards), entries_map(_entries_map)
+  {}
+  bool spawn_next() override;
+};
+
+bool RGWReadDataSyncRecoveringShardsCR::spawn_next()
+{
+  if (shard_id >= num_shards)
+    return false;
+  string error_oid = RGWDataSyncStatusManager::shard_obj_name(env->source_zone, shard_id) + ".retry";
+  spawn(new RGWRadosGetOmapKeysCR(env->store, rgw_raw_obj(env->store->get_zone_params().log_pool, error_oid),
+                                  marker, &entries_map[shard_id], max_entries), false);
+
+  ++shard_id;
+  return true;
+}
+
 class RGWReadDataSyncStatusCoroutine : public RGWCoroutine {
   RGWDataSyncEnv *sync_env;
   rgw_data_sync_status *sync_status;
@@ -654,6 +688,34 @@ int RGWRemoteDataLog::read_sync_status(rgw_data_sync_status *sync_status)
   return ret;
 }
 
+int RGWRemoteDataLog::read_recovering_shards(const int num_shards, set<int>& recovering_shards)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
+  RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.set_threaded();
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  map<int, std::set<std::string>> entries_map;
+  uint64_t max_entries{1};
+  ret = crs.run(new RGWReadDataSyncRecoveringShardsCR(&sync_env_local, max_entries, num_shards, entries_map));
+  http_manager.stop();
+
+  if (ret == 0) {
+    for (const auto& entry : entries_map) {
+      if (entry.second.size() != 0) {
+        recovering_shards.insert(entry.first);
+      }
+    }
+  }
+
+  return ret;
+}
+
 int RGWRemoteDataLog::init_sync_status(int num_shards)
 {
   rgw_data_sync_status sync_status;
@@ -1045,8 +1107,8 @@ class RGWDataSyncShardCR : public RGWCoroutine {
   uint32_t shard_id;
   rgw_data_sync_marker sync_marker;
 
-  map<string, bufferlist> entries;
-  map<string, bufferlist>::iterator iter;
+  std::set<std::string> entries;
+  std::set<std::string>::iterator iter;
 
   string oid;
 
@@ -1087,7 +1149,7 @@ class RGWDataSyncShardCR : public RGWCoroutine {
 
   string error_oid;
   RGWOmapAppend *error_repo;
-  map<string, bufferlist> error_entries;
+  std::set<std::string> error_entries;
   string error_marker;
   int max_error_entries;
 
@@ -1205,20 +1267,20 @@ public:
         }
         iter = entries.begin();
         for (; iter != entries.end(); ++iter) {
-          ldout(sync_env->cct, 20) << __func__ << ": full sync: " << iter->first << dendl;
+          ldout(sync_env->cct, 20) << __func__ << ": full sync: " << *iter << dendl;
           total_entries++;
-          if (!marker_tracker->start(iter->first, total_entries, real_time())) {
-            ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << iter->first << ". Duplicate entry?" << dendl;
+          if (!marker_tracker->start(*iter, total_entries, real_time())) {
+            ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << *iter << ". Duplicate entry?" << dendl;
           } else {
             // fetch remote and write locally
-            yield spawn(new RGWDataSyncSingleEntryCR(sync_env, iter->first, iter->first, marker_tracker, error_repo, false), false);
+            yield spawn(new RGWDataSyncSingleEntryCR(sync_env, *iter, *iter, marker_tracker, error_repo, false), false);
             if (retcode < 0) {
               lease_cr->go_down();
               drain_all();
               return set_cr_error(retcode);
             }
           }
-          sync_marker.marker = iter->first;
+          sync_marker.marker = *iter;
         }
       } while ((int)entries.size() == max_entries);
 
@@ -1285,9 +1347,9 @@ public:
         ldout(sync_env->cct, 20) << __func__ << "(): read error repo, got " << error_entries.size() << " entries" << dendl;
         iter = error_entries.begin();
         for (; iter != error_entries.end(); ++iter) {
-          ldout(sync_env->cct, 20) << __func__ << "(): handle error entry: " << iter->first << dendl;
-          spawn(new RGWDataSyncSingleEntryCR(sync_env, iter->first, iter->first, nullptr /* no marker tracker */, error_repo, true), false);
-          error_marker = iter->first;
+          error_marker = *iter;
+          ldout(sync_env->cct, 20) << __func__ << "(): handle error entry: " << error_marker << dendl;
+          spawn(new RGWDataSyncSingleEntryCR(sync_env, error_marker, error_marker, nullptr /* no marker tracker */, error_repo, true), false);
         }
         if ((int)error_entries.size() != max_error_entries) {
           if (error_marker.empty() && error_entries.empty()) {
@@ -1757,30 +1819,16 @@ int RGWRemoteBucketLog::init(const string& _source_zone, RGWRESTConn *_conn,
   return 0;
 }
 
-struct bucket_index_marker_info {
-  string bucket_ver;
-  string master_ver;
-  string max_marker;
-  bool syncstopped{false};
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
-    JSONDecoder::decode_json("master_ver", master_ver, obj);
-    JSONDecoder::decode_json("max_marker", max_marker, obj);
-    JSONDecoder::decode_json("syncstopped", syncstopped, obj);
-  }
-};
-
 class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine {
   RGWDataSyncEnv *sync_env;
   const string instance_key;
 
-  bucket_index_marker_info *info;
+  rgw_bucket_index_marker_info *info;
 
 public:
   RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncEnv *_sync_env,
                                   const rgw_bucket_shard& bs,
-                                  bucket_index_marker_info *_info)
+                                  rgw_bucket_index_marker_info *_info)
     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
       instance_key(bs.get_key()), info(_info) {}
 
@@ -1793,7 +1841,7 @@ public:
                                        { NULL, NULL } };
 
         string p = "/admin/log/";
-        call(new RGWReadRESTResourceCR<bucket_index_marker_info>(sync_env->cct, sync_env->conn, sync_env->http_manager, p, pairs, info));
+        call(new RGWReadRESTResourceCR<rgw_bucket_index_marker_info>(sync_env->cct, sync_env->conn, sync_env->http_manager, p, pairs, info));
       }
       if (retcode < 0) {
         return set_cr_error(retcode);
@@ -1812,7 +1860,7 @@ class RGWInitBucketShardSyncStatusCoroutine : public RGWCoroutine {
 
   rgw_bucket_shard_sync_info& status;
 
-  bucket_index_marker_info info;
+  rgw_bucket_index_marker_info info;
 public:
   RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncEnv *_sync_env,
                                         const rgw_bucket_shard& bs,
@@ -1936,6 +1984,171 @@ int RGWReadBucketSyncStatusCoroutine::operate()
   }
   return 0;
 }
+
+#define OMAP_READ_MAX_ENTRIES 10
+class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  RGWRados *store;
+  
+  const int shard_id;
+  int max_entries;
+
+  set<string>& recovering_buckets;
+  string marker;
+  string error_oid;
+
+  set<string> error_entries;
+  int max_omap_entries;
+  int count;
+
+public:
+  RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncEnv *_sync_env, const int _shard_id,
+                                      set<string>& _recovering_buckets, const int _max_entries) 
+  : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+  store(sync_env->store), shard_id(_shard_id), max_entries(_max_entries),
+  recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES)
+  {
+    error_oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id) + ".retry";
+  }
+
+  int operate() override;
+};
+
+int RGWReadRecoveringBucketShardsCoroutine::operate()
+{
+  reenter(this){
+    //read recovering bucket shards
+    count = 0;
+    do {
+      yield call(new RGWRadosGetOmapKeysCR(store, rgw_raw_obj(store->get_zone_params().log_pool, error_oid), 
+            marker, &error_entries, max_omap_entries));
+
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldout(sync_env->cct, 0) << "failed to read recovering bucket shards with " 
+          << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      if (error_entries.empty()) {
+        break;
+      }
+
+      count += error_entries.size();
+      marker = *error_entries.rbegin();
+      recovering_buckets.insert(error_entries.begin(), error_entries.end());
+    } while ((int)error_entries.size() == max_omap_entries && count < max_entries);
+  
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
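
The coroutine above pages through the shard's ".retry" omap object: fetch up to max_omap_entries keys after the marker, advance the marker to the last key of the batch, and stop when a batch comes back short. A minimal sketch of that pagination invariant (fetch is a hypothetical stand-in for RGWRadosGetOmapKeysCR, returning up to max ordered keys strictly greater than the marker):

    #include <functional>
    #include <set>
    #include <string>

    void read_all_keys(
        const std::function<std::set<std::string>(const std::string&, int)>& fetch,
        int max, std::set<std::string>& out) {
      std::string marker;
      for (;;) {
        std::set<std::string> batch = fetch(marker, max);
        if (batch.empty()) break;
        marker = *batch.rbegin();           // resume after the last key seen
        out.insert(batch.begin(), batch.end());
        if ((int)batch.size() < max) break; // short batch: no more keys
      }
    }
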
+class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  RGWRados *store;
+  
+  const int shard_id;
+  int max_entries;
+
+  set<string>& pending_buckets;
+  string marker;
+  string status_oid;
+
+  rgw_data_sync_marker* sync_marker;
+  int count;
+
+  list<rgw_data_change_log_entry> log_entries;
+  bool truncated;
+
+public:
+  RGWReadPendingBucketShardsCoroutine(RGWDataSyncEnv *_sync_env, const int _shard_id,
+                                      set<string>& _pending_buckets,
+                                      rgw_data_sync_marker* _sync_marker, const int _max_entries) 
+  : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+  store(sync_env->store), shard_id(_shard_id), max_entries(_max_entries),
+  pending_buckets(_pending_buckets), sync_marker(_sync_marker)
+  {
+    status_oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id);
+  }
+
+  int operate() override;
+};
+
+int RGWReadPendingBucketShardsCoroutine::operate()
+{
+  reenter(this){
+    //read sync status marker
+    using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+    yield call(new CR(sync_env->async_rados, store, 
+                      rgw_raw_obj(store->get_zone_params().log_pool, status_oid),
+                      sync_marker));
+    if (retcode < 0) {
+      ldout(sync_env->cct, 0) << "failed to read sync status marker with "
+        << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+
+    //read pending bucket shards
+    marker = sync_marker->marker;
+    count = 0;
+    do {
+      yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, &marker, &log_entries, &truncated));
+
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldout(sync_env->cct, 0) << "failed to read remote data log info with "
+          << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      if (log_entries.empty()) {
+        break;
+      }
+
+      count += log_entries.size();
+      for (const auto& entry : log_entries) {
+        pending_buckets.insert(entry.entry.key);
+      }
+    } while (truncated && count < max_entries);
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+int RGWRemoteDataLog::read_shard_status(int shard_id, set<string>& pending_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
+  RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.set_threaded();
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  list<RGWCoroutinesStack *> stacks;
+  RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(store->ctx(), &crs);
+  recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sync_env_local, shard_id, recovering_buckets, max_entries));
+  stacks.push_back(recovering_stack);
+  RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(store->ctx(), &crs);
+  pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sync_env_local, shard_id, pending_buckets, sync_marker, max_entries));
+  stacks.push_back(pending_stack);
+  ret = crs.run(stacks);
+  http_manager.stop();
+  return ret;
+}
+
 RGWCoroutine *RGWRemoteBucketLog::read_sync_status_cr(rgw_bucket_shard_sync_info *sync_status)
 {
   return new RGWReadBucketSyncStatusCoroutine(&sync_env, bs, sync_status);
@@ -2313,7 +2526,7 @@ public:
         sync_status = retcode;
       }
       if (!error_ss.str().empty()) {
-        yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data", error_ss.str(), -retcode, "failed to sync object"));
+        yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object: ") + cpp_strerror(-sync_status)));
       }
 done:
       if (sync_status == 0) {
@@ -3013,18 +3226,12 @@ class RGWCollectBucketSyncStatusCR : public RGWShardCollectCR {
 };
 
 int rgw_bucket_sync_status(RGWRados *store, const std::string& source_zone,
-                           const rgw_bucket& bucket,
+                           const RGWBucketInfo& bucket_info,
                            std::vector<rgw_bucket_shard_sync_info> *status)
 {
-  // read the bucket instance info for num_shards
-  RGWObjectCtx ctx(store);
-  RGWBucketInfo info;
-  int ret = store->get_bucket_instance_info(ctx, bucket, info, nullptr, nullptr);
-  if (ret < 0) {
-    return ret;
-  }
+  const auto num_shards = bucket_info.num_shards;
   status->clear();
-  status->resize(std::max<size_t>(1, info.num_shards));
+  status->resize(std::max<size_t>(1, num_shards));
 
   RGWDataSyncEnv env;
   RGWSyncModuleInstanceRef module; // null sync module
@@ -3032,8 +3239,8 @@ int rgw_bucket_sync_status(RGWRados *store, const std::string& source_zone,
            nullptr, nullptr, source_zone, module, nullptr);
 
   RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
-  return crs.run(new RGWCollectBucketSyncStatusCR(store, &env, info.num_shards,
-                                                  bucket, status));
+  return crs.run(new RGWCollectBucketSyncStatusCR(store, &env, num_shards,
+                                                  bucket_info.bucket, status));
 }
 
 
index 8e876b9c97d77a0c66f026414fb686b7ecf79fd1..a85a155aa109abe4c3b90dcfe7943f4f158f09ba 100644 (file)
@@ -128,7 +128,19 @@ struct rgw_data_sync_marker {
   }
 
   void dump(Formatter *f) const {
-    encode_json("state", (int)state, f);
+    const char *s{nullptr};
+    switch ((SyncState)state) {
+      case FullSync:
+        s = "full-sync";
+        break;
+      case IncrementalSync:
+        s = "incremental-sync";
+        break;
+      default:
+        s = "unknown";
+        break;
+    }
+    encode_json("status", s, f);
     encode_json("marker", marker, f);
     encode_json("next_step_marker", next_step_marker, f);
     encode_json("total_entries", total_entries, f);
@@ -136,9 +148,13 @@ struct rgw_data_sync_marker {
     encode_json("timestamp", utime_t(timestamp), f);
   }
   void decode_json(JSONObj *obj) {
-    int s;
-    JSONDecoder::decode_json("state", s, obj);
-    state = s;
+    std::string s;
+    JSONDecoder::decode_json("status", s, obj);
+    if (s == "full-sync") {
+      state = FullSync;
+    } else if (s == "incremental-sync") {
+      state = IncrementalSync;
+    }
     JSONDecoder::decode_json("marker", marker, obj);
     JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
     JSONDecoder::decode_json("total_entries", total_entries, obj);
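
With the new encoding, a dumped rgw_data_sync_marker might look like the following (values illustrative; fields not shown in these hunks are elided). On decode, an unrecognized status string simply leaves state unchanged:

    {
      "status": "incremental-sync",
      "marker": "1_1531900306.123456_42.1",
      "next_step_marker": "",
      "total_entries": 128,
      "timestamp": "2018-07-18 07:51:46.000000Z"
    }
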
@@ -274,6 +290,8 @@ public:
   int read_source_log_shards_info(map<int, RGWDataChangesLogInfo> *shards_info);
   int read_source_log_shards_next(map<int, string> shard_markers, map<int, rgw_datalog_shard_data> *result);
   int read_sync_status(rgw_data_sync_status *sync_status);
+  int read_recovering_shards(const int num_shards, set<int>& recovering_shards);
+  int read_shard_status(int shard_id, set<string>& lagging_buckets,set<string>& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries);
   int init_sync_status(int num_shards);
   int run_sync(int num_shards);
 
@@ -323,6 +341,14 @@ public:
   int read_sync_status(rgw_data_sync_status *sync_status) {
     return source_log.read_sync_status(sync_status);
   }
+
+  int read_recovering_shards(const int num_shards, set<int>& recovering_shards) {
+    return source_log.read_recovering_shards(num_shards, recovering_shards);
+  }
+
+  int read_shard_status(int shard_id, set<string>& lagging_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) {
+    return source_log.read_shard_status(shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries);
+  }
   int init_sync_status() { return source_log.init_sync_status(num_shards); }
 
   int read_log_info(rgw_datalog_info *log_info) {
@@ -440,6 +466,20 @@ struct rgw_bucket_shard_sync_info {
 };
 WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info)
 
+struct rgw_bucket_index_marker_info {
+  string bucket_ver;
+  string master_ver;
+  string max_marker;
+  bool syncstopped{false};
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
+    JSONDecoder::decode_json("master_ver", master_ver, obj);
+    JSONDecoder::decode_json("max_marker", max_marker, obj);
+    JSONDecoder::decode_json("syncstopped", syncstopped, obj);
+  }
+};
+
 
 class RGWRemoteBucketLog : public RGWCoroutinesManager {
   RGWRados *store;
@@ -522,7 +562,7 @@ public:
 
 /// read the sync status of all bucket shards from the given source zone
 int rgw_bucket_sync_status(RGWRados *store, const std::string& source_zone,
-                           const rgw_bucket& bucket,
+                           const RGWBucketInfo& bucket_info,
                            std::vector<rgw_bucket_shard_sync_info> *status);
 
 class RGWDefaultSyncModule : public RGWSyncModule {
index 38642110f94908a45ff3862a0bcf1d3238db32db..9795f7e6dcc9d24cf2d2eceb3ecb0ad4e973e4f3 100644 (file)
@@ -23,9 +23,9 @@
 #include "rgw_auth_s3.h"
 #include "rgw_user.h"
 #include "rgw_bucket.h"
-
 #include "rgw_file.h"
 #include "rgw_lib_frontend.h"
+#include "common/errno.h"
 
 #include <atomic>
 
@@ -951,6 +951,11 @@ namespace rgw {
   }
 
   RGWFileHandle::~RGWFileHandle() {
+    /* !recycle case, handle may STILL be in handle table, BUT
+     * the partition lock is not held in this path */
+    if (fh_hook.is_linked()) {
+      fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_LOCK);
+    }
     /* cond-unref parent */
     if (parent && (! parent->is_mount())) {
       /* safe because if parent->unref causes its deletion,
@@ -1487,7 +1492,7 @@ namespace rgw {
 
     op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs,
                                  (delete_at ? *delete_at : real_time()),
-                                if_match, if_nomatch);
+                                if_match, if_nomatch);
     if (op_ret != 0) {
       /* revert attr updates */
       rgw_fh->set_mtime(omtime);
@@ -1611,16 +1616,25 @@ int rgw_statfs(struct rgw_fs *rgw_fs,
               struct rgw_statvfs *vfs_st, uint32_t flags)
 {
   RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+  struct rados_cluster_stat_t stats;
+
+  RGWGetClusterStatReq req(fs->get_context(), fs->get_user(), stats);
+  int rc = rgwlib.get_fe()->execute_req(&req);
+  if (rc < 0) {
+    lderr(fs->get_context()) << "ERROR: getting total cluster usage: "
+                             << cpp_strerror(-rc) << dendl;
+    return rc;
+  }
 
-  /* XXX for now, just publish a huge capacity and
-   * limited utiliztion */
-  vfs_st->f_bsize = 1024*1024 /* 1M */;
-  vfs_st->f_frsize = 1024;    /* minimal allocation unit (who cares) */
-  vfs_st->f_blocks = UINT64_MAX;
-  vfs_st->f_bfree = UINT64_MAX;
-  vfs_st->f_bavail = UINT64_MAX;
-  vfs_st->f_files = 1024; /* object count, do we have an est? */
-  vfs_st->f_ffree = UINT64_MAX;
+  // Set block size to 1 MiB.
+  constexpr uint32_t CEPH_BLOCK_SHIFT = 20;
+  vfs_st->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+  vfs_st->f_frsize = 1 << CEPH_BLOCK_SHIFT;
+  vfs_st->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
+  vfs_st->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
+  vfs_st->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
+  vfs_st->f_files = stats.num_objects;
+  vfs_st->f_ffree = -1;
   vfs_st->f_fsid[0] = fs->get_fsid();
   vfs_st->f_fsid[1] = fs->get_fsid();
   vfs_st->f_flag = 0;
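
The conversion above works in units: rados_cluster_stat_t reports KiB (2^10 bytes) while the statvfs block size is set to 1 MiB (2^20 bytes), hence the shift by 20 - 10 = 10 bits. A small worked check:

    #include <cstdint>

    // Convert a KiB count from cluster stats into 1 MiB statvfs blocks.
    constexpr std::uint64_t kib_to_mib_blocks(std::uint64_t kb) {
      return kb >> (20 - 10);
    }

    static_assert(kib_to_mib_blocks(10485760) == 10240,
                  "10 GiB worth of KiB is 10240 one-MiB blocks");
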
@@ -1856,7 +1870,7 @@ int rgw_open(struct rgw_fs *rgw_fs,
 {
   RGWFileHandle* rgw_fh = get_rgwfh(fh);
 
-  /* XXX 
+  /* XXX
    * need to track specific opens--at least read opens and
    * a write open;  we need to know when a write open is returned,
    * that closes a write transaction
index 7b7b8aa4d35278d297e9b5a89ce9afab400b29de..bfb95be737abe29216d666778c805f9fce0765ee 100644 (file)
@@ -2528,6 +2528,49 @@ public:
 
 }; /* RGWSetAttrsRequest */
 
+/*
+ * Send request to get the rados cluster stats
+ */
+class RGWGetClusterStatReq : public RGWLibRequest,
+        public RGWGetClusterStat {
+public:
+  struct rados_cluster_stat_t& stats_req;
+  RGWGetClusterStatReq(CephContext* _cct,RGWUserInfo *_user,
+                       rados_cluster_stat_t& _stats):
+  RGWLibRequest(_cct, _user), stats_req(_stats){
+    op = this;
+  }
+
+  int op_init() override {
+    // assign store, s, and dialect_handler
+    RGWObjectCtx* rados_ctx
+      = static_cast<RGWObjectCtx*>(get_state()->obj_ctx);
+    // framework promises to call op_init after parent init
+    assert(rados_ctx);
+    RGWOp::init(rados_ctx->store, get_state(), this);
+    op = this; // assign self as op: REQUIRED
+    return 0;
+  }
+
+  int header_init() override {
+    struct req_state* s = get_state();
+    s->info.method = "GET";
+    s->op = OP_GET;
+    s->user = user;
+    return 0;
+  }
+
+  int get_params() override { return 0; }
+  bool only_bucket() override { return false; }
+  void send_response() override {
+    stats_req.kb = stats_op.kb;
+    stats_req.kb_avail = stats_op.kb_avail;
+    stats_req.kb_used = stats_op.kb_used;
+    stats_req.num_objects = stats_op.num_objects;
+  }
+}; /* RGWGetClusterStatReq */
+
+
 } /* namespace rgw */
 
 #endif /* RGW_FILE_H */
index a047bcd378019bd8f140b21c5f8851cc1a434c2d..1d61a8fead10269054c2d53c368cb698ca6d73d6 100644 (file)
@@ -13,7 +13,7 @@
 #define dout_subsys ceph_subsys_rgw
 
 int RGWFrontendConfig::parse_config(const string& config,
-                                   map<string, string>& config_map)
+                                   std::multimap<string, string>& config_map)
 {
   list<string> config_list;
   get_str_list(config, " ", config_list);
@@ -33,7 +33,7 @@ int RGWFrontendConfig::parse_config(const string& config,
     ssize_t pos = entry.find('=');
     if (pos < 0) {
       dout(0) << "framework conf key: " << entry << dendl;
-      config_map[entry] = "";
+      config_map.emplace(std::move(entry), "");
       continue;
     }
 
@@ -44,7 +44,7 @@ int RGWFrontendConfig::parse_config(const string& config,
     }
 
     dout(0) << "framework conf key: " << key << ", val: " << val << dendl;
-    config_map[key] = val;
+    config_map.emplace(std::move(key), std::move(val));
   }
 
   return 0;
@@ -53,7 +53,7 @@ int RGWFrontendConfig::parse_config(const string& config,
 bool RGWFrontendConfig::get_val(const string& key, const string& def_val,
                                string *out)
 {
-  map<string, string>::iterator iter = config_map.find(key);
+  auto iter = config_map.find(key);
   if (iter == config_map.end()) {
     *out = def_val;
     return false;
index 76225e91ab6eafd04d419518fcd338cb66ff96e0..2ee4a9b31c23db47025e86a35318717871554375 100644 (file)
 
 class RGWFrontendConfig {
   std::string config;
-  std::map<std::string, std::string> config_map;
+  std::multimap<std::string, std::string> config_map;
   std::string framework;
 
   int parse_config(const std::string& config,
-                   std::map<std::string, std::string>& config_map);
+                   std::multimap<std::string, std::string>& config_map);
 
 public:
   RGWFrontendConfig(const std::string& config)
@@ -54,7 +54,7 @@ public:
     return config;
   }
 
-  std::map<std::string, std::string>& get_config_map() {
+  std::multimap<std::string, std::string>& get_config_map() {
     return config_map;
   }
 
@@ -97,11 +97,11 @@ class RGWCivetWebFrontend : public RGWFrontend {
   struct mg_context* ctx;
   RGWMongooseEnv env;
 
-  void set_conf_default(std::map<std::string, std::string>& m,
+  void set_conf_default(std::multimap<std::string, std::string>& m,
                         const std::string& key,
                        const std::string& def_val) {
     if (m.find(key) == std::end(m)) {
-      m[key] = def_val;
+      m.emplace(key, def_val);
     }
   }
 
@@ -246,11 +246,16 @@ public:
 class RGWFrontendPauser : public RGWRealmReloader::Pauser {
   std::list<RGWFrontend*> &frontends;
   RGWRealmReloader::Pauser* pauser;
+  rgw::auth::ImplicitTenants& implicit_tenants;
 
  public:
   RGWFrontendPauser(std::list<RGWFrontend*> &frontends,
+                    rgw::auth::ImplicitTenants& implicit_tenants,
                     RGWRealmReloader::Pauser* pauser = nullptr)
-    : frontends(frontends), pauser(pauser) {}
+    : frontends(frontends),
+      pauser(pauser),
+      implicit_tenants(implicit_tenants) {
+  }
 
   void pause() override {
     for (auto frontend : frontends)
@@ -262,7 +267,7 @@ class RGWFrontendPauser : public RGWRealmReloader::Pauser {
     /* Initialize the registry of auth strategies which will coordinate
      * the dynamic reconfiguration. */
     auto auth_registry = \
-      rgw::auth::StrategyRegistry::create(g_ceph_context, store);
+      rgw::auth::StrategyRegistry::create(g_ceph_context, implicit_tenants, store);
 
     for (auto frontend : frontends)
       frontend->unpause_with_new_config(store, auth_registry);
index 72c8a14f1db9d06c92884aa63b8b7038b8151168..61146748d635f58be05ec10e961110341c196443 100644 (file)
@@ -350,8 +350,13 @@ static curl_slist *headers_to_slist(param_vec_t& headers)
       }
     }
 
-    val.append(": ");
-    val.append(p.second);
+    // curl won't send headers with empty values unless it ends with a ; instead
+    if (p.second.empty()) {
+      val.append(1, ';');
+    } else {
+      val.append(": ");
+      val.append(p.second);
+    }
     h = curl_slist_append(h, val.c_str());
   }
 
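
Per the comment above: libcurl treats a custom header whose value is empty ("Name:") as a request to suppress that header, and the documented escape hatch is to end the name with a semicolon, which sends the header with an empty value. A sketch of the same trick in isolation (append_header is hypothetical; curl_slist_append is the real libcurl call):

    #include <curl/curl.h>
    #include <string>

    curl_slist* append_header(curl_slist* h, const std::string& name,
                              const std::string& value) {
      std::string line = name;
      if (value.empty()) {
        line += ';';            // "X-Amz-Meta-Foo;" -> sent with empty value
      } else {
        line += ": " + value;   // normal "Name: value" form
      }
      return curl_slist_append(h, line.c_str());
    }
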
index 675db22c21603b80d9032af5f9c8b979d94dc46d..9f31db2eaad7b73e3feb7592d139abf331170b77 100644 (file)
@@ -421,7 +421,7 @@ static const actpair actpairs[] =
  { "s3:GetObjectVersionTagging", s3GetObjectVersionTagging},
  { "s3:GetReplicationConfiguration", s3GetReplicationConfiguration },
  { "s3:ListAllMyBuckets", s3ListAllMyBuckets },
- { "s3:ListBucketMultiPartUploads", s3ListBucketMultiPartUploads },
+ { "s3:ListBucketMultipartUploads", s3ListBucketMultipartUploads },
  { "s3:ListBucket", s3ListBucket },
  { "s3:ListBucketVersions", s3ListBucketVersions },
  { "s3:ListMultipartUploadParts", s3ListMultipartUploadParts },
@@ -1325,8 +1325,8 @@ const char* action_bit_string(uint64_t action) {
   case s3ListAllMyBuckets:
     return "s3:ListAllMyBuckets";
 
-  case s3ListBucketMultiPartUploads:
-    return "s3:ListBucketMultiPartUploads";
+  case s3ListBucketMultipartUploads:
+    return "s3:ListBucketMultipartUploads";
 
   case s3GetAccelerateConfiguration:
     return "s3:GetAccelerateConfiguration";
index 032840151a6ff083a2c6015d8127d306f6d0a201..8791861a54d16cbce186b4b898312e5310939848 100644 (file)
@@ -60,7 +60,7 @@ static constexpr std::uint64_t s3DeleteBucket = 1ULL << 15;
 static constexpr std::uint64_t s3ListBucket = 1ULL << 16;
 static constexpr std::uint64_t s3ListBucketVersions = 1ULL << 17;
 static constexpr std::uint64_t s3ListAllMyBuckets = 1ULL << 18;
-static constexpr std::uint64_t s3ListBucketMultiPartUploads = 1ULL << 19;
+static constexpr std::uint64_t s3ListBucketMultipartUploads = 1ULL << 19;
 static constexpr std::uint64_t s3GetAccelerateConfiguration = 1ULL << 20;
 static constexpr std::uint64_t s3PutAccelerateConfiguration = 1ULL << 21;
 static constexpr std::uint64_t s3GetBucketAcl = 1ULL << 22;
@@ -109,7 +109,7 @@ inline int op_to_perm(std::uint64_t op) {
   case s3GetObjectVersionTagging:
   case s3ListAllMyBuckets:
   case s3ListBucket:
-  case s3ListBucketMultiPartUploads:
+  case s3ListBucketMultipartUploads:
   case s3ListBucketVersions:
   case s3ListMultipartUploadParts:
     return RGW_PERM_READ;
index 97a29342d11ee12aca6ceb02a0c5eeace05351ab..7fa3b8e1a05eed8c6038c14f70701deac9f70900 100644 (file)
@@ -60,6 +60,8 @@ bool LCMPExpiration_S3::xml_end(const char *el) {
 bool RGWLifecycleConfiguration_S3::xml_end(const char *el) {
   XMLObjIter iter = find("Rule");
   LCRule_S3 *rule = static_cast<LCRule_S3 *>(iter.get_next());
+  if (!rule)
+    return false;
   while (rule) {
     add_rule(rule);
     rule = static_cast<LCRule_S3 *>(iter.get_next());
index 8c0814a4480c826aa8c87eaf9fbfbe6c4637aa08..ad5848924dbe4a591ec3b67cea14f1416d6dc1aa 100644 (file)
@@ -317,7 +317,7 @@ int main(int argc, const char **argv)
 
   RGWRados *store = RGWStoreManager::get_storage(g_ceph_context,
       g_conf->rgw_enable_gc_threads, g_conf->rgw_enable_lc_threads, g_conf->rgw_enable_quota_threads,
-      g_conf->rgw_run_sync_thread, g_conf->rgw_dynamic_resharding);
+      g_conf->rgw_run_sync_thread, g_conf->rgw_dynamic_resharding, g_conf->rgw_cache_enabled);
   if (!store) {
     mutex.Lock();
     init_timer.cancel_all_events();
@@ -422,8 +422,10 @@ int main(int argc, const char **argv)
 
   /* Initialize the registry of auth strategies which will coordinate
    * the dynamic reconfiguration. */
+  rgw::auth::ImplicitTenants implicit_tenant_context{*g_conf};
+  g_conf->add_observer(&implicit_tenant_context);
   auto auth_registry = \
-    rgw::auth::StrategyRegistry::create(g_ceph_context, store);
+    rgw::auth::StrategyRegistry::create(g_ceph_context, implicit_tenant_context, store);
 
   /* Header custom behavior */
   rest.register_x_headers(g_conf->rgw_log_http_headers);
@@ -535,7 +537,7 @@ int main(int argc, const char **argv)
 
   // add a watcher to respond to realm configuration changes
   RGWPeriodPusher pusher(store);
-  RGWFrontendPauser pauser(fes, &pusher);
+  RGWFrontendPauser pauser(fes, implicit_tenant_context, &pusher);
   RGWRealmReloader reloader(store, service_map_meta, &pauser);
 
   RGWRealmWatcher realm_watcher(g_ceph_context, store->realm);
@@ -586,6 +588,7 @@ int main(int argc, const char **argv)
   rgw_tools_cleanup();
   rgw_shutdown_resolver();
   rgw::curl::cleanup_curl();
+  g_conf->remove_observer(&implicit_tenant_context);
 
   rgw_perf_stop(g_ceph_context);
 
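
The add_observer()/remove_observer() calls above must stay paired across every exit path of main(). A hypothetical RAII guard (not part of this change) shows one way to make that pairing automatic:

    // Hypothetical helper: registers the observer on construction and always
    // unregisters it on scope exit, so teardown cannot be forgotten.
    template <typename Conf, typename Obs>
    class ObserverGuard {
      Conf* conf;
      Obs* obs;
    public:
      ObserverGuard(Conf* c, Obs* o) : conf(c), obs(o) { conf->add_observer(obs); }
      ~ObserverGuard() { conf->remove_observer(obs); }
      ObserverGuard(const ObserverGuard&) = delete;
      ObserverGuard& operator=(const ObserverGuard&) = delete;
    };
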
index f20cbd9b862bc877599a4d29669bd697486c0a71..98a6db4703f2caa1471757c95ff06ec00a5dc096 100644 (file)
@@ -76,41 +76,42 @@ static int forward_request_to_master(struct req_state *s, obj_version *objv, RGW
 
 static MultipartMetaFilter mp_filter;
 
-static int parse_range(const char *range, off_t& ofs, off_t& end, bool *partial_content)
+int RGWGetObj::parse_range(void)
 {
   int r = -ERANGE;
-  string s(range);
+  string rs(range_str);
   string ofs_str;
   string end_str;
 
-  *partial_content = false;
+  ignore_invalid_range = s->cct->_conf->rgw_ignore_get_invalid_range;
+  partial_content = false;
 
-  size_t pos = s.find("bytes=");
+  size_t pos = rs.find("bytes=");
   if (pos == string::npos) {
     pos = 0;
-    while (isspace(s[pos]))
+    while (isspace(rs[pos]))
       pos++;
     int end = pos;
-    while (isalpha(s[end]))
+    while (isalpha(rs[end]))
       end++;
-    if (strncasecmp(s.c_str(), "bytes", end - pos) != 0)
+    if (strncasecmp(rs.c_str(), "bytes", end - pos) != 0)
       return 0;
-    while (isspace(s[end]))
+    while (isspace(rs[end]))
       end++;
-    if (s[end] != '=')
+    if (rs[end] != '=')
       return 0;
-    s = s.substr(end + 1);
+    rs = rs.substr(end + 1);
   } else {
-    s = s.substr(pos + 6); /* size of("bytes=")  */
+    rs = rs.substr(pos + 6); /* size of("bytes=")  */
   }
-  pos = s.find('-');
+  pos = rs.find('-');
   if (pos == string::npos)
     goto done;
 
-  *partial_content = true;
+  partial_content = true;
 
-  ofs_str = s.substr(0, pos);
-  end_str = s.substr(pos + 1);
+  ofs_str = rs.substr(0, pos);
+  end_str = rs.substr(pos + 1);
   if (end_str.length()) {
     end = atoll(end_str.c_str());
     if (end < 0)
@@ -127,8 +128,18 @@ static int parse_range(const char *range, off_t& ofs, off_t& end, bool *partial_
   if (end >= 0 && end < ofs)
     goto done;
 
-  r = 0;
+  range_parsed = true;
+  return 0;
+
 done:
+  if (ignore_invalid_range) {
+    partial_content = false;
+    ofs = 0;
+    end = -1;
+    range_parsed = false; // allow retry
+    r = 0;
+  }
+
   return r;
 }
 
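
The rewritten parser preserves the HTTP Range semantics but routes every failure through done:, where rgw_ignore_get_invalid_range can downgrade a malformed header to a full-object read instead of an error. Illustrative outcomes, assuming ofs/end start out as 0/-1:

    Range: bytes=0-99   ->  r=0, partial_content=true, ofs=0,   end=99
    Range: bytes=500-   ->  r=0, partial_content=true, ofs=500, end=-1 (to EOF)
    Range: bytes=junk   ->  r=-ERANGE; with rgw_ignore_get_invalid_range=true
                            instead r=0, partial_content=false, ofs=0, end=-1
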
@@ -1588,16 +1599,13 @@ bool RGWGetObj::prefetch_data()
   bool prefetch_first_chunk = true;
   range_str = s->info.env->get("HTTP_RANGE");
 
-  if(range_str) {
-    int r = parse_range(range_str, ofs, end, &partial_content);
-    /* error on parsing the range, stop prefetch and will fail in execte() */
+  if (range_str) {
+    int r = parse_range();
+    /* error on parsing the range, stop prefetch and will fail in execute() */
     if (r < 0) {
-      range_parsed = false;
-      return false;
-    } else {
-      range_parsed = true;
+      return false; /* range_parsed==false */
     }
-    /* range get goes to shadown objects, stop prefetch */
+    /* range get goes to shadow objects, stop prefetch */
     if (ofs >= s->cct->_conf->rgw_max_chunk_size) {
       prefetch_first_chunk = false;
     }
@@ -1605,6 +1613,7 @@ bool RGWGetObj::prefetch_data()
 
   return get_data && prefetch_first_chunk;
 }
+
 void RGWGetObj::pre_exec()
 {
   rgw_bucket_object_pre_exec(s);
@@ -1684,7 +1693,9 @@ void RGWGetObj::execute()
   {
     attr_iter = attrs.find(RGW_ATTR_CRYPT_MODE);
     if (attr_iter != attrs.end() && attr_iter->second.to_str() == "SSE-C-AES256") {
-      op_ret = -ERR_INVALID_REQUEST;
+      ldout(s->cct, 0) << "ERROR: torrents are not supported for objects "
+          "encrypted with SSE-C" << dendl;
+      op_ret = -EINVAL;
       goto done_err;
     }
     torrent.init(s, store);
@@ -1810,9 +1821,9 @@ done_err:
 int RGWGetObj::init_common()
 {
   if (range_str) {
-    /* range parsed error when prefetch*/
+    /* range parsed error when prefetch */
     if (!range_parsed) {
-      int r = parse_range(range_str, ofs, end, &partial_content);
+      int r = parse_range();
       if (r < 0)
         return r;
     }
@@ -2274,6 +2285,7 @@ int RGWListBucket::parse_max_keys()
     char *endptr;
     max = strtol(max_keys.c_str(), &endptr, 10);
     if (endptr) {
+      if (endptr == max_keys.c_str()) return -EINVAL;
       while (*endptr && isspace(*endptr)) // ignore white space
         endptr++;
       if (*endptr) {
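
The added check relies on a strtol() guarantee: when no digits are consumed, endptr is left pointing at the start of the input, so "max-keys=abc" no longer parses silently as 0. A self-contained sketch of strict integer parsing along the same lines (parse_count is hypothetical):

    #include <cerrno>
    #include <cstdlib>
    #include <string>

    // Parse a non-negative integer strictly: reject empty input, non-digit
    // prefixes, and trailing garbage (other than whitespace).
    bool parse_count(const std::string& s, long& out) {
      errno = 0;
      char* endptr = nullptr;
      long v = std::strtol(s.c_str(), &endptr, 10);
      if (endptr == s.c_str()) return false;   // no digits consumed, e.g. "abc"
      while (*endptr == ' ' || *endptr == '\t') endptr++;
      if (*endptr != '\0') return false;       // trailing junk, e.g. "12x"
      if (errno == ERANGE || v < 0) return false;
      out = v;
      return true;
    }
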
@@ -3421,7 +3433,7 @@ void RGWPutObj::execute()
       op_ret = -ENOENT;
       goto done;
     }
-    lst = astate->size - 1;
+    lst = astate->accounted_size - 1;
   } else {
     lst = copy_source_range_lst;
   }
@@ -4993,7 +5005,7 @@ void RGWDeleteLC::execute()
       }
     }
   op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &s->bucket_info.objv_tracker);
-  string shard_id = s->bucket.name + ':' +s->bucket.bucket_id;
+  string shard_id = s->bucket.tenant + ':' + s->bucket.name + ':' + s->bucket.bucket_id;
   pair<string, int> entry(shard_id, lc_uninitial);
   string oid; 
   get_lc_oid(s, oid);
@@ -5732,7 +5744,7 @@ void RGWListMultipart::execute()
 int RGWListBucketMultiparts::verify_permission()
 {
   if (!verify_bucket_permission(s,
-                               rgw::IAM::s3ListBucketMultiPartUploads))
+                               rgw::IAM::s3ListBucketMultipartUploads))
     return -EACCES;
 
   return 0;
@@ -7003,3 +7015,10 @@ void RGWDeleteBucketPolicy::execute()
       return op_ret;
     });
 }
+
+void RGWGetClusterStat::execute()
+{
+  op_ret = this->store->get_rados_handle()->cluster_stat(stats_op);
+}
+
+
index 0a127d2a9a46c2491a311d5c5c8e25f987a4426f..7800692113996b345a32748178f4e25b6adbd079 100644 (file)
@@ -204,6 +204,7 @@ protected:
   map<string, bufferlist> attrs;
   bool get_data;
   bool partial_content;
+  bool ignore_invalid_range;
   bool range_parsed;
   bool skip_manifest;
   bool skip_decrypt{false};
@@ -257,9 +258,11 @@ public:
   void set_get_data(bool get_data) {
     this->get_data = get_data;
   }
+
   int verify_permission() override;
   void pre_exec() override;
   void execute() override;
+  int parse_range();
   int read_user_manifest_part(
     rgw_bucket& bucket,
     const rgw_bucket_dir_entry& ent,
@@ -2193,4 +2196,22 @@ public:
   virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; }
 };
 
+class RGWGetClusterStat : public RGWOp {
+protected:
+  struct rados_cluster_stat_t stats_op;
+public:
+  RGWGetClusterStat() {}
+
+  void init(RGWRados *store, struct req_state *s, RGWHandler *h) override {
+    RGWOp::init(store, s, h);
+  }
+  int verify_permission() override {return 0;}
+  virtual void send_response() = 0;
+  virtual int get_params() = 0;
+  void execute() override;
+  virtual const string name() { return "get_cluster_stat"; }
+};
+
+
+
 #endif /* CEPH_RGW_OP_H */
index 2c811a358ba1a4f492168097218b8282d884cd24..29d3a0052bbe6b25adc095ec23c26eb30c3f6936 100644 (file)
@@ -95,9 +95,11 @@ int RGWPeriodPuller::pull(const std::string& period_id, RGWPeriod& period)
       return r;
     }
     // reflect period objects if this is the latest version
-    r = period.reflect();
-    if (r < 0) {
-      return r;
+    if (store->realm.get_current_period() == period_id) {
+      r = period.reflect();
+      if (r < 0) {
+        return r;
+      }
     }
     ldout(store->ctx(), 14) << "period " << period_id
         << " pulled and written to local storage" << dendl;
index fd30c347b99664d8e13da90e9918a9170a655c3a..100f77a699bc0310cfb1b0684cfd7dc174f9dedf 100644 (file)
@@ -170,6 +170,14 @@ int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, b
   int r = rados->ioctx_create(pool.name.c_str(), ioctx);
   if (r == -ENOENT && create) {
     r = rados->pool_create(pool.name.c_str());
+    if (r == -ERANGE) {
+      dout(0)
+        << __func__
+        << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
+        << " (this can be due to a pool or placement group misconfiguration, e.g."
+        << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
+        << dendl;
+    }
     if (r < 0 && r != -EEXIST) {
       return r;
     }
@@ -4861,28 +4869,10 @@ void RGWRados::pick_control_oid(const string& key, string& notify_oid)
   notify_oid.append(buf);
 }
 
-int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx&  io_ctx)
+int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
 {
-  librados::Rados *rad = get_rados_handle();
-  int r = rgw_init_ioctx(rad, pool, io_ctx);
-  if (r != -ENOENT)
-    return r;
-
-  if (!pools_initialized)
-    return r;
-
-  r = rad->pool_create(pool.name.c_str());
-  if (r < 0 && r != -EEXIST)
-    return r;
-
-  r = rgw_init_ioctx(rad, pool, io_ctx);
-  if (r < 0)
-    return r;
-
-  r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
-  if (r < 0 && r != -EOPNOTSUPP)
-    return r;
-  return 0;
+  constexpr bool create = true; // create the pool if it doesn't exist
+  return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
 }
 
 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
@@ -5767,32 +5757,9 @@ done:
  */
 int RGWRados::create_pool(const rgw_pool& pool)
 {
-  int ret = 0;
-
-  librados::Rados *rad = get_rados_handle();
-  ret = rad->pool_create(pool.name.c_str(), 0);
-  if (ret == -EEXIST)
-    ret = 0;
-  else if (ret == -ERANGE) {
-    ldout(cct, 0)
-      << __func__
-      << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
-      << " (this can be due to a pool or placement group misconfiguration, e.g."
-      << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
-      << dendl;
-  }
-  if (ret < 0)
-    return ret;
-
   librados::IoCtx io_ctx;
-  ret = rad->ioctx_create(pool.name.c_str(), io_ctx);
-  if (ret < 0)
-    return ret;
-
-  ret = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
-  if (ret < 0 && ret != -EOPNOTSUPP)
-    return ret;
-  return 0;
+  constexpr bool create = true;
+  return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
 }
 
 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
@@ -7314,12 +7281,44 @@ bool RGWRados::aio_completed(void *handle)
   return c->is_safe();
 }
 
+// PutObj filter that buffers data so we don't try to compress tiny blocks.
+// libcurl reads in 16k at a time, and we need at least 64k to get a good
+// compression ratio
+class RGWPutObj_Buffer : public RGWPutObj_Filter {
+  const unsigned buffer_size;
+  bufferlist buffer;
+ public:
+  RGWPutObj_Buffer(RGWPutObjDataProcessor* next, unsigned buffer_size)
+    : RGWPutObj_Filter(next), buffer_size(buffer_size) {
+    assert(ISP2(buffer_size)); // must be power of 2
+  }
+
+  int handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj,
+                  bool *again) override {
+    if (*again || !bl.length()) {
+      // flush buffered data
+      return RGWPutObj_Filter::handle_data(buffer, ofs, phandle, pobj, again);
+    }
+    // transform offset to the beginning of the buffer
+    ofs = ofs - buffer.length();
+    buffer.claim_append(bl);
+    if (buffer.length() < buffer_size) {
+      *again = false; // don't come back until there's more data
+      return 0;
+    }
+    const auto count = P2ALIGN(buffer.length(), buffer_size);
+    buffer.splice(0, count, &bl);
+    return RGWPutObj_Filter::handle_data(bl, ofs, phandle, pobj, again);
+  }
+};
+
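
ISP2 and P2ALIGN in the new filter are the usual power-of-two bit tricks: P2ALIGN(x, a) rounds x down to a multiple of a by masking, so with the 512 KiB buffer_size chosen below, 600 KiB of buffered data flushes 512 KiB downstream and retains 88 KiB. A self-contained check (is_p2/p2align are stand-ins for the Ceph macros):

    #include <cassert>
    #include <cstdint>

    constexpr bool is_p2(std::uint64_t x) { return x && (x & (x - 1)) == 0; }
    constexpr std::uint64_t p2align(std::uint64_t x, std::uint64_t a) {
      return x & ~(a - 1); // round down to a multiple of a (a must be 2^n)
    }

    int main() {
      constexpr std::uint64_t buffer_size = 512 * 1024;
      static_assert(is_p2(buffer_size), "buffer_size must be a power of 2");
      std::uint64_t buffered = 600 * 1024;                    // 600 KiB pending
      std::uint64_t flush = p2align(buffered, buffer_size);   // 512 KiB to flush
      assert(flush == 512 * 1024 && buffered - flush == 88 * 1024);
      return 0;
    }
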
 class RGWRadosPutObj : public RGWGetDataCB
 {
   CephContext* cct;
   rgw_obj obj;
   RGWPutObjDataProcessor *filter;
   boost::optional<RGWPutObj_Compress>& compressor;
+  boost::optional<RGWPutObj_Buffer> buffering;
   CompressorRef& plugin;
   RGWPutObjProcessor_Atomic *processor;
   RGWOpStateSingleOp *opstate;
@@ -7365,7 +7364,9 @@ public:
     if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
       //do not compress if object is encrypted
       compressor = boost::in_place(cct, plugin, filter);
-      filter = &*compressor;
+      constexpr unsigned buffer_size = 512 * 1024;
+      buffering = boost::in_place(&*compressor, buffer_size);
+      filter = &*buffering;
     }
     return 0;
   }
@@ -7437,6 +7438,11 @@ public:
     return 0;
   }
 
+  int flush() {
+    bufferlist bl;
+    return put_data_and_throttle(filter, bl, 0, false);
+  }
+
   bufferlist& get_extra_data() { return extra_data_bl; }
 
   map<string, bufferlist>& get_attrs() { return src_attrs; }
@@ -7866,6 +7872,10 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
   if (ret < 0) {
     goto set_err_state;
   }
+  ret = cb.flush();
+  if (ret < 0) {
+    goto set_err_state;
+  }
   if (compressor && compressor->is_compressed()) {
     bufferlist tmp;
     RGWCompressionInfo cs_info;
@@ -13924,14 +13934,14 @@ uint64_t RGWRados::next_bucket_id()
   return ++max_bucket_id;
 }
 
-RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
+RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
+                                                bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
 {
-  int use_cache = cct->_conf->rgw_cache_enabled;
   RGWRados *store = NULL;
   if (!use_cache) {
     store = new RGWRados;
   } else {
-    store = new RGWCache<RGWRados>; 
+    store = new RGWCache<RGWRados>;
   }
 
   if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
index a5ba0325a606fbecf16267092a01fb876d0cff2e..c0f23825da7dfb935f3403bfa2b71584bb85a975 100644 (file)
@@ -1392,7 +1392,7 @@ struct RGWZone {
 
   bool is_read_only() { return read_only; }
 
-  bool syncs_from(const string& zone_id) {
+  bool syncs_from(const string& zone_id) const {
     return (sync_from_all || sync_from.find(zone_id) != sync_from.end());
   }
 };
@@ -3765,16 +3765,17 @@ public:
 class RGWStoreManager {
 public:
   RGWStoreManager() {}
-  static RGWRados *get_storage(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread) {
+  static RGWRados *get_storage(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads,
+                              bool run_sync_thread, bool run_reshard_thread, bool use_cache = true) {
     RGWRados *store = init_storage_provider(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread,
-                                           run_reshard_thread);
+                                           run_reshard_thread, use_cache);
     return store;
   }
   static RGWRados *get_raw_storage(CephContext *cct) {
     RGWRados *store = init_raw_storage_provider(cct);
     return store;
   }
-  static RGWRados *init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread);
+  static RGWRados *init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_metadata_cache);
   static RGWRados *init_raw_storage_provider(CephContext *cct);
   static void close_storage(RGWRados *store);
 
index 2bae84c1ecbd296fd56e1ac7ab8ee59f52d87d91..af4a72c5dc8b2079c57a58ceaff998f76f1a5b10 100644 (file)
@@ -105,7 +105,9 @@ void RGWRealmReloader::reload()
                                          cct->_conf->rgw_enable_lc_threads,
                                          cct->_conf->rgw_enable_quota_threads,
                                          cct->_conf->rgw_run_sync_thread,
-                                         cct->_conf->rgw_dynamic_resharding);
+                                         cct->_conf->rgw_dynamic_resharding,
+                                        cct->_conf->rgw_cache_enabled
+      );
 
     ldout(cct, 1) << "Creating new store" << dendl;
 
index 60267a4509e9bbeb8f2dcceb9849e4e74e67a2ec..80a886ec5d118ba6cf2271a80a632d14f3000828 100644 (file)
@@ -883,20 +883,9 @@ int RGWGetObj_ObjStore::get_params()
     get_data &= (!rgwx_stat);
   }
 
-  /* start gettorrent */
-  bool is_torrent = s->info.args.exists(GET_TORRENT);
-  bool torrent_flag = s->cct->_conf->rgw_torrent_flag;
-  if (torrent_flag && is_torrent)
-  {
-    int ret = 0;
-    ret = torrent.get_params();
-    if (ret < 0)
-    {
-      return ret;
-    }
+  if (s->info.args.exists(GET_TORRENT)) {
+    return torrent.get_params();
   }
-  /* end gettorrent */
-
   return 0;
 }
 
index 47d4e3e7a8141446da63082ea59f9d0249118f12..6824fd791ba7b180d5ec3f1681fd6dddbb61e8e3 100644 (file)
@@ -906,7 +906,15 @@ void RGWOp_BILog_Status::execute()
     return;
   }
 
-  http_ret = rgw_bucket_sync_status(store, source_zone, bucket, &status);
+  // read the bucket instance info for num_shards
+  RGWObjectCtx ctx(store);
+  RGWBucketInfo info;
+  http_ret = store->get_bucket_instance_info(ctx, bucket, info, nullptr, nullptr);
+  if (http_ret < 0) {
+    ldout(s->cct, 4) << "failed to read bucket info: " << cpp_strerror(http_ret) << dendl;
+    return;
+  }
+  http_ret = rgw_bucket_sync_status(store, source_zone, info, &status);
 }
 
 void RGWOp_BILog_Status::send_response()
index daa50dd6026ae5779ca18020057b7d7140aff8e6..b3e5d41060b2aded0b55882b85be69f4037f0549 100644 (file)
@@ -3828,6 +3828,7 @@ AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s,
         case RGW_OP_PUT_BUCKET_POLICY:
         case RGW_OP_PUT_OBJ_TAGGING:
         case RGW_OP_PUT_LC:
+        case RGW_OP_SET_REQUEST_PAYMENT:
           break;
         default:
           dout(10) << "ERROR: AWS4 completion for this operation NOT IMPLEMENTED" << dendl;
index e8afaa3fa92a87302b8104a3a37c1a50eb6f9ad5..506f5f4c57b4eee90d75bb9f8692890717484aea 100644 (file)
@@ -891,14 +891,18 @@ public:
 };
 
 
+#if 0
 class S3AuthFactory : public rgw::auth::RemoteApplier::Factory,
                       public rgw::auth::LocalApplier::Factory {
   typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;
   RGWRados* const store;
+  ImplicitTenants& implicit_tenant_context;
 
 public:
-  S3AuthFactory(RGWRados* const store)
-    : store(store) {
+  S3AuthFactory(RGWRados* const store,
+               ImplicitTenants& implicit_tenant_context)
+    : store(store),
+      implicit_tenant_context(implicit_tenant_context) {
   }
 
   aplptr_t create_apl_remote(CephContext* const cct,
@@ -908,7 +912,8 @@ public:
                             ) const override {
     return aplptr_t(
       new rgw::auth::RemoteApplier(cct, store, std::move(acl_alg), info,
-                                   cct->_conf->rgw_keystone_implicit_tenants));
+                                   implicit_tenant_context,
+                                   rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3));
   }
 
   aplptr_t create_apl_local(CephContext* const cct,
@@ -919,6 +924,7 @@ public:
         new rgw::auth::LocalApplier(cct, user_info, subuser));
   }
 };
+#endif
 
 
 } /* namespace s3 */
index 9dd40af9ce5d636b528adb2f8fad54a91b6edd7b..eab2420c2d377e0eb6e2690169cbc60ee4eed42d 100644 (file)
@@ -571,20 +571,12 @@ static int get_swift_container_settings(req_state * const s,
                                         RGWCORSConfiguration * const cors_config,
                                         bool * const has_cors)
 {
-  string read_list, write_list;
-
-  const char * const read_attr = s->info.env->get("HTTP_X_CONTAINER_READ");
-  if (read_attr) {
-    read_list = read_attr;
-  }
-  const char * const write_attr = s->info.env->get("HTTP_X_CONTAINER_WRITE");
-  if (write_attr) {
-    write_list = write_attr;
-  }
+  const char * const read_list = s->info.env->get("HTTP_X_CONTAINER_READ");
+  const char * const write_list = s->info.env->get("HTTP_X_CONTAINER_WRITE");
 
   *has_policy = false;
 
-  if (read_attr || write_attr) {
+  if (read_list || write_list) {
     RGWAccessControlPolicy_SWIFT swift_policy(s->cct);
     const auto r = swift_policy.create(store,
                                        s->user->user_id,
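get_swift_container_settings() now keeps the raw header pointers instead of
copying them into intermediate strings; presence is tested on the pointers
themselves. A minimal sketch of the simplified shape, assuming the rest of
the function is unchanged:

    const char* const read_list  = s->info.env->get("HTTP_X_CONTAINER_READ");
    const char* const write_list = s->info.env->get("HTTP_X_CONTAINER_WRITE");
    if (read_list || write_list) {
      // build the RGWAccessControlPolicy_SWIFT only when a header was sent
    }
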
index 84194bfa8c0347c48c69a5b8bf4aa44ee0189451..ee526e3c03a41fdc80e1d834dbebf3c192d47c58 100644 (file)
@@ -212,7 +212,7 @@ void RGWOp_User_Modify::execute()
   bool gen_key;
   bool suspended;
   bool system;
-
+  bool email_set;
   bool quota_set;
   int32_t max_buckets;
 
@@ -222,7 +222,7 @@ void RGWOp_User_Modify::execute()
   rgw_user uid(uid_str);
 
   RESTArgs::get_string(s, "display-name", display_name, &display_name);
-  RESTArgs::get_string(s, "email", email, &email);
+  RESTArgs::get_string(s, "email", email, &email, &email_set);
   RESTArgs::get_string(s, "access-key", access_key, &access_key);
   RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
   RESTArgs::get_string(s, "user-caps", caps, &caps);
@@ -241,7 +241,10 @@ void RGWOp_User_Modify::execute()
 
   op_state.set_user_id(uid);
   op_state.set_display_name(display_name);
-  op_state.set_user_email(email);
+
+  if (email_set)
+    op_state.set_user_email(email);
+
   op_state.set_caps(caps);
   op_state.set_access_key(access_key);
   op_state.set_secret_key(secret_key);
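Previously the user-modify handler unconditionally overwrote the stored
email, so omitting the email parameter cleared it. The hunk switches to the
RESTArgs::get_string() overload that also reports whether the parameter was
present. Sketch, with the signature as used above:

    bool email_set;
    RESTArgs::get_string(s, "email", email, &email, &email_set);
    if (email_set)
      op_state.set_user_email(email);   // otherwise leave the address alone
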
index cc508202db855fd5e07be02eec58d6df0026d48f..f778e361166a0998d7d76b11c94d130c20a82dcb 100644 (file)
@@ -164,6 +164,7 @@ class DefaultStrategy : public rgw::auth::Strategy,
                         public rgw::auth::LocalApplier::Factory,
                         public rgw::auth::swift::TempURLApplier::Factory {
   RGWRados* const store;
+  ImplicitTenants& implicit_tenant_context;
 
   /* The engines. */
   const rgw::auth::swift::TempURLEngine tempurl_engine;
@@ -192,7 +193,8 @@ class DefaultStrategy : public rgw::auth::Strategy,
       rgw::auth::add_3rdparty(store, s->account_name,
         rgw::auth::add_sysreq(cct, store, s,
           rgw::auth::RemoteApplier(cct, store, std::move(extra_acl_strategy), info,
-                                   cct->_conf->rgw_keystone_implicit_tenants)));
+                                   implicit_tenant_context,
+                                   rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_SWIFT)));
     /* TODO(rzarzynski): replace with static_ptr. */
     return aplptr_t(new decltype(apl)(std::move(apl)));
   }
@@ -220,8 +222,10 @@ class DefaultStrategy : public rgw::auth::Strategy,
 
 public:
   DefaultStrategy(CephContext* const cct,
+                  ImplicitTenants& implicit_tenant_context,
                   RGWRados* const store)
     : store(store),
+      implicit_tenant_context(implicit_tenant_context),
       tempurl_engine(cct,
                      store,
                      static_cast<rgw::auth::swift::TempURLApplier::Factory*>(this)),
index d45d9a92e3302c751042203ca33d0b0cc5c23bf9..bd164b6231547f1483cfbc4a3efc5b5c1b745c42 100644 (file)
@@ -1310,8 +1310,8 @@ class RGWMetaSyncShardCR : public RGWCoroutine {
   string max_marker;
   const std::string& period_marker; //< max marker stored in next period
 
-  map<string, bufferlist> entries;
-  map<string, bufferlist>::iterator iter;
+  std::set<std::string> entries;
+  std::set<std::string>::iterator iter;
 
   string oid;
 
@@ -1502,20 +1502,20 @@ public:
         }
         iter = entries.begin();
         for (; iter != entries.end(); ++iter) {
-          ldout(sync_env->cct, 20) << __func__ << ": full sync: " << iter->first << dendl;
+          marker = *iter;
+          ldout(sync_env->cct, 20) << __func__ << ": full sync: " << marker << dendl;
           total_entries++;
-          if (!marker_tracker->start(iter->first, total_entries, real_time())) {
-            ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << iter->first << ". Duplicate entry?" << dendl;
+          if (!marker_tracker->start(marker, total_entries, real_time())) {
+            ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << marker << ". Duplicate entry?" << dendl;
           } else {
             // fetch remote and write locally
             yield {
-              RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, iter->first, iter->first, MDLOG_STATUS_COMPLETE, marker_tracker), false);
+              RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker), false);
               // stack_to_pos holds a reference to the stack
-              stack_to_pos[stack] = iter->first;
-              pos_to_prev[iter->first] = marker;
+              stack_to_pos[stack] = marker;
+              pos_to_prev[marker] = marker;
             }
           }
-          marker = iter->first;
         }
         collect_children();
       } while ((int)entries.size() == max_entries && can_adjust_marker);
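During full sync RGWMetaSyncShardCR only needs the omap key names, so
entries shrinks from a map<string, bufferlist> to a std::set<std::string>,
and marker is now assigned from the iterator before the entry is handled
rather than after. A compressed sketch of the new loop body (coroutine
scaffolding omitted):

    for (iter = entries.begin(); iter != entries.end(); ++iter) {
      marker = *iter;   // marker advances up front now
      // marker_tracker->start(marker, ...), then spawn
      // RGWMetaSyncSingleEntryCR(sync_env, marker, marker, ...)
    }
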
index 6c8c65c5a14cc3f8f8e6cded4cacdbb89e89636f..0c30c91ed847874418ef1f9a76e33356fca9f019 100644 (file)
@@ -306,6 +306,11 @@ struct es_obj_metadata {
       } else if (name == "x-amz-tagging") {
         auto tags_bl = val.begin();
         ::decode(obj_tags, tags_bl);
+      } else if (name == "compression") {
+        RGWCompressionInfo cs_info;
+        auto vals_bl = val.begin();
+        decode(cs_info, vals_bl);
+        out_attrs[name] = cs_info.compression_type;
       } else {
         if (name != "pg_ver" &&
             name != "source_zone" &&
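The elasticsearch sync module learns about the compression attribute: the
stored RGWCompressionInfo is decoded and only its compression_type string is
indexed. Sketch of the new branch, matching the hunk:

    } else if (name == "compression") {
      RGWCompressionInfo cs_info;
      auto vals_bl = val.begin();
      decode(cs_info, vals_bl);
      out_attrs[name] = cs_info.compression_type;  // index the algorithm name only
    }
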
index ad85148b3308e7b6ce30829d41b9e66da35aca62..e132f28c15c6fc7741df3fa821d785aeb7bf5c64 100644 (file)
@@ -59,27 +59,21 @@ int seed::get_torrent_file(RGWRados::Object::Read &read_op,
   }
 
   string oid, key;
-  map<string, bufferlist> m;
-  set<string> obj_key;
   get_obj_bucket_and_oid_loc(obj, oid, key);
-  ldout(s->cct, 0) << "NOTICE: head obj oid= " << oid << dendl;
+  ldout(s->cct, 20) << "NOTICE: head obj oid= " << oid << dendl;
 
-  obj_key.insert(RGW_OBJ_TORRENT);
-  const int op_ret = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
-  if (op_ret < 0)
-  {
-    ldout(s->cct, 0) << "ERROR: failed to omap_get_vals_by_keys op_ret = "
-                     << op_ret << dendl;
-    return op_ret;
+  const set<string> obj_key{RGW_OBJ_TORRENT};
+  map<string, bufferlist> m;
+  const int r = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
+  if (r < 0) {
+    ldout(s->cct, 0) << "ERROR: omap_get_vals_by_keys failed: " << r << dendl;
+    return r;
   }
-
-  map<string, bufferlist>::iterator iter;
-  for (iter = m.begin(); iter != m.end(); ++iter)
-  {
-    bufferlist bl_tmp = iter->second;
-    char *pbuff = bl_tmp.c_str();
-    bl.append(pbuff, bl_tmp.length());
+  if (m.size() != 1) {
+    ldout(s->cct, 0) << "ERROR: omap key " RGW_OBJ_TORRENT " not found" << dendl;
+    return -EINVAL;
   }
+  bl.append(std::move(m.begin()->second));
   dencode.bencode_end(bl);
 
   bl_data = bl;
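get_torrent_file() is tightened: it fetches exactly the RGW_OBJ_TORRENT omap
key, treats anything other than a single result as an error, and appends the
value by move instead of copying through a raw char buffer. A minimal sketch
under those assumptions:

    const std::set<std::string> obj_key{RGW_OBJ_TORRENT};
    std::map<std::string, bufferlist> m;
    const int r = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
    if (r < 0)
      return r;          // omap read failed outright
    if (m.size() != 1)
      return -EINVAL;    // torrent key missing
    bl.append(std::move(m.begin()->second));
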
index 7fe88421805f13b5f1e08b1dd7d8365f2d6ceb01..ae8fe1fff22bc8f6ce22973583a33cedf29d2a80 100644 (file)
@@ -2376,7 +2376,7 @@ int RGWUserAdminOp_User::info(RGWRados *store, RGWUserAdminOpState& op_state,
   RGWStorageStats *arg_stats = NULL;
   if (op_state.fetch_stats) {
     int ret = store->get_user_stats(info.user_id, stats);
-    if (ret < 0) {
+    if (ret < 0 && ret != -ENOENT) {
       return ret;
     }
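A freshly created user has no stats object yet, so get_user_stats() may
return -ENOENT; that case no longer fails the whole info() call:

    int ret = store->get_user_stats(info.user_id, stats);
    if (ret < 0 && ret != -ENOENT) {   // a missing stats object is tolerated
      return ret;
    }
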
 
diff --git a/ceph/src/script/build-integration-branch b/ceph/src/script/build-integration-branch
new file mode 100755 (executable)
index 0000000..3189d85
--- /dev/null
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import json
+import os
+import requests
+from subprocess import call
+import sys
+import time
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
+
+label = sys.argv[1]
+repo = "ceph/ceph"
+
+with open(os.environ['HOME'] + '/.github_token', 'r') as myfile:
+    token = myfile.readline().strip()
+
+# get prs
+baseurl = urljoin('https://api.github.com',
+                  'repos/{repo}/issues?labels={label}&access_token={token}')
+url = baseurl.format(
+    label=label,
+    repo=repo,
+    token=token)
+r = requests.get(url)
+assert(r.ok)
+j = json.loads(r.text or r.content)
+print("--- found %d issues tagged with %s" % (len(j), label))
+
+prs = []
+prtext = []
+for issue in j:
+    if 'pull_request' not in issue:
+        continue
+    r = requests.get(issue['pull_request']['url'] + '?access_token=' + token)
+    pr = json.loads(r.text or r.content)
+    prs.append(pr)
+    prtext.append(pr['html_url'] + ' - ' + pr['title'])
+print("--- queried %s prs" % len(prs))
+
+# name branch
+TIME_FORMAT = '%Y-%m-%d-%H%M'
+branch = label + "-" + time.strftime(TIME_FORMAT, time.localtime())
+print("branch %s" % branch)
+
+# assemble
+print('--- creating branch %s' % branch)
+r = call(['git', 'checkout', '-b', branch])
+assert not r
+for pr in prs:
+    print('--- pr %d --- pulling %s branch %s' % (
+        pr['number'],
+        pr['head']['repo']['clone_url'],
+        pr['head']['ref']))
+    r = call(['git', 'pull', '--no-edit',
+              pr['head']['repo']['clone_url'],
+              pr['head']['ref']])
+    assert not r
+print('--- done. these PRs were included:')
+print('\n'.join(prtext).encode('ascii', errors='ignore').decode())
+print('--- perhaps you want to: make && ctest -j12 && git push ci %s' % branch)
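For reference: the script takes the integration label as its only argument,
reads a GitHub API token from ~/.github_token, and merges every open PR
carrying that label into a local branch named <label>-YYYY-MM-DD-HHMM.
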
index f3999e3b0114abfdc56cae6c5501a03e470a8c71..aad733c9a4f5689ab829a9008fe2e47c4b542f58 100644 (file)
@@ -568,7 +568,7 @@ if(WITH_RBD)
   endif(FREEBSD)
 endif(WITH_RBD)
 if(WITH_RADOSGW)
-  add_dependencies(tests radosgw-admin)
+  add_dependencies(tests radosgw radosgw-admin)
 endif(WITH_RADOSGW)
 if(NOT FREEBSD)
   add_dependencies(tests ceph-detect-init)
index 332f2e4f9a7108d66c3cc884e6a5c10933dec007..38eafe9a3558e6efeb6787fc14b992b6e511056e 100644 (file)
@@ -15,28 +15,28 @@ type 3 root
 host host0 {
        id -1           # do not change unnecessarily
        # weight 1.000
-       alg straw
+       alg straw2
        hash 0  # rjenkins1
        item device0 weight 1.000
 }
 host host1 {
        id -2           # do not change unnecessarily
        # weight 1.000
-       alg straw
+       alg straw2
        hash 0  # rjenkins1
        item device1 weight 1.000
 }
 host host2 {
        id -5           # do not change unnecessarily
        # weight 1.000
-       alg straw
+       alg straw2
        hash 0  # rjenkins1
        item device2 weight 1.000
 }
 rack rack0 {
        id -3           # do not change unnecessarily
        # weight 3.000
-       alg straw
+       alg straw2
        hash 0  # rjenkins1
        item host0 weight 1.000
        item host1 weight 1.000
@@ -45,7 +45,7 @@ rack rack0 {
 root root {
        id -4           # do not change unnecessarily
        # weight 4.000
-       alg straw
+       alg straw2
        hash 0  # rjenkins1
        item rack0 weight 4.000
 }
index dbe8e5bba7e6cac4fefece428c3b876b9ce7e3b0..1172aefb3ff2165c0e86881fdfd77edff0764419 100644 (file)
@@ -45,7 +45,7 @@
               "type_id": 1,
               "type_name": "host",
               "weight": 65536,
-              "alg": "straw",
+              "alg": "straw2",
               "hash": "rjenkins1",
               "items": [
                   {
@@ -61,7 +61,7 @@
               "type_id": 1,
               "type_name": "host",
               "weight": 65536,
-              "alg": "straw",
+              "alg": "straw2",
               "hash": "rjenkins1",
               "items": [
                   {
@@ -77,7 +77,7 @@
               "type_id": 2,
               "type_name": "rack",
               "weight": 196608,
-              "alg": "straw",
+              "alg": "straw2",
               "hash": "rjenkins1",
               "items": [
                   {
               "type_id": 3,
               "type_name": "root",
               "weight": 262144,
-              "alg": "straw",
+              "alg": "straw2",
               "hash": "rjenkins1",
               "items": [
                   {
               "type_id": 1,
               "type_name": "host",
               "weight": 65536,
-              "alg": "straw",
+              "alg": "straw2",
               "hash": "rjenkins1",
               "items": [
                   {
           "profile": "argonaut",
           "optimal_tunables": 0,
           "legacy_tunables": 1,
-          "minimum_required_version": "argonaut",
+          "minimum_required_version": "hammer",
           "require_feature_tunables": 0,
           "require_feature_tunables2": 0,
           "has_v2_rules": 0,
           "require_feature_tunables3": 0,
           "has_v3_rules": 0,
-          "has_v4_buckets": 0,
+          "has_v4_buckets": 1,
           "require_feature_tunables5": 0,
           "has_v5_rules": 0
       },
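The tunables changes follow from the straw-to-straw2 conversion above:
straw2 is a CRUSH v4 bucket type, so the map now reports has_v4_buckets and
raises the minimum required version to hammer, even though the profile
otherwise remains legacy/argonaut.
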
index 28150cac7110df7fde8752aa557f9ca863d98093..f847c7677eb0736d07807e09e8b8098dd67512ac 100644 (file)
      --max-buckets             max number of buckets for a user
      --admin                   set the admin flag on the user
      --system                  set the system flag on the user
-     --bucket=<bucket>
-     --pool=<pool>
-     --object=<object>
-     --date=<date>
-     --start-date=<date>
-     --end-date=<date>
-     --bucket-id=<bucket-id>
-     --shard-id=<shard-id>     optional for mdlog list
+     --bucket=<bucket>         Specify the bucket name. Also used by the quota command.
+     --pool=<pool>             Specify the pool name. Also used to scan for leaked rados objects.
+     --object=<object>         object name
+     --date=<date>             date in the format yyyy-mm-dd
+     --start-date=<date>       start date in the format yyyy-mm-dd
+     --end-date=<date>         end date in the format yyyy-mm-dd
+     --bucket-id=<bucket-id>   bucket id
+     --shard-id=<shard-id>     optional for: 
+                                 mdlog list
+                                 data sync status
                                required for: 
                                  mdlog trim
                                  replica mdlog get/delete
                                  replica datalog get/delete
+     --max-entries=<entries>   max entries for listing operations
      --metadata-key=<key>      key to retrieve metadata from with metadata get
      --remote=<remote>         zone or zonegroup id of remote gateway
      --period=<id>             period id
index cbd9dd30a6b083579fadb95c55217e45afe5b11d..d1e45aced1a8e1ca29c15195523aa57f64607479 100644 (file)
@@ -1024,7 +1024,7 @@ TEST(CrushWrapper, choose_args_compat) {
   crush_choose_arg choose_args[maxbuckets];
   memset(choose_args, '\0', sizeof(crush_choose_arg) * maxbuckets);
   choose_args[-1-id].ids_size = 0;
-  choose_args[-1-id].weight_set_size = 1;
+  choose_args[-1-id].weight_set_positions = 1;
   choose_args[-1-id].weight_set = &weight_set;
   crush_choose_arg_map arg_map;
   arg_map.size = c.get_max_buckets();
@@ -1042,7 +1042,7 @@ TEST(CrushWrapper, choose_args_compat) {
     CrushWrapper c_new;
     c_new.decode(i);
     ASSERT_EQ(1u, c_new.choose_args.size());
-    ASSERT_EQ(1u, c_new.choose_args[caid].args[-1-id].weight_set_size);
+    ASSERT_EQ(1u, c_new.choose_args[caid].args[-1-id].weight_set_positions);
     ASSERT_EQ(weights, c_new.choose_args[caid].args[-1-id].weight_set[0].weights[0]);
     ASSERT_EQ(weight, c_new.get_bucket_item_weightf(id, 0));
   }
index 7002e12283931cb5fe748051822ff64060ae9355..4ca390f6405a1f88669da9e718f16c509bf3d891 100644 (file)
@@ -1,6 +1,16 @@
-# unittest_mds_types
-add_executable(unittest_mds_types
-  mds_types.cc
+if(${WITH_CEPHFS})
+
+  # unittest_mds_types
+  add_executable(unittest_mds_types
+    mds_types.cc
+    )
+  add_ceph_unittest(unittest_mds_types ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_mds_types)
+  target_link_libraries(unittest_mds_types global)
+
+  add_executable(ceph_test_trim_caps
+    test_trim_caps.cc
   )
-add_ceph_unittest(unittest_mds_types ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_mds_types)
-target_link_libraries(unittest_mds_types global)
+  target_link_libraries(ceph_test_trim_caps ceph-common cephfs)
+  install(TARGETS ceph_test_trim_caps DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+endif(${WITH_CEPHFS})
diff --git a/ceph/src/test/fs/test_trim_caps.cc b/ceph/src/test/fs/test_trim_caps.cc
new file mode 100644 (file)
index 0000000..a7fd814
--- /dev/null
@@ -0,0 +1,83 @@
+#define _FILE_OFFSET_BITS 64
+#include <features.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include <unistd.h>
+#include <include/cephfs/libcephfs.h>
+
+int main(int argc, char *argv[]) 
+{
+       char buf;
+       int pipefd[2];
+       int rc = pipe(pipefd);
+       assert(rc >= 0);
+
+       pid_t pid = fork();
+       assert(pid >= 0);
+       if (pid == 0)
+               close(pipefd[1]);
+       else
+               close(pipefd[0]);
+
+       struct ceph_mount_info *cmount = NULL;
+
+       ceph_create(&cmount, "admin");
+       ceph_conf_read_file(cmount, NULL);
+
+       int ret = ceph_mount(cmount, NULL);
+       assert(ret >= 0);
+
+       if (pid == 0) {
+               ret = read(pipefd[0], &buf, 1);
+               assert(ret == 1);
+
+               ret = ceph_rename(cmount, "1", "3");
+               assert(ret >= 0);
+
+               ret = ceph_rename(cmount, "2", "1");
+               assert(ret >= 0);
+
+               ceph_unmount(cmount);
+               printf("child exits\n");
+       } else {
+               ret = ceph_mkdirs(cmount, "1/2", 0755);
+               assert(ret >= 0);
+
+               struct ceph_statx stx;
+               ret = ceph_statx(cmount, "1", &stx, 0, 0);
+               assert(ret >= 0);
+               uint64_t orig_ino = stx.stx_ino;
+
+
+               ret = ceph_mkdir(cmount, "2", 0755);
+               assert(ret >= 0);
+
+               ret = write(pipefd[1], &buf, 1);
+               assert(ret == 1);
+
+               int wstatus;
+               ret = waitpid(pid, &wstatus, 0);
+               assert(ret >= 0);
+               assert(wstatus == 0);
+
+               // make origin '1' no parent dentry
+               ret = ceph_statx(cmount, "1", &stx, 0, 0);
+               assert(ret >= 0);
+               assert(orig_ino != stx.stx_ino);
+
+               // move root inode's cap_item to tail of session->caps
+               ret = ceph_statx(cmount, ".", &stx, 0, 0);
+               assert(ret >= 0);
+
+               printf("waiting for crash\n");
+       sleep(60);
+       }
+       return 0;
+}
index 30acadb9f7ed0cb7edefeeb7e253dec10e0eadf8..535998e3d758180fddf0b1e243efdb7004bcb5c7 100644 (file)
@@ -614,6 +614,68 @@ TEST(LibCephFS, LstatSlashdot) {
   ceph_shutdown(cmount);
 }
 
+TEST(LibCephFS, StatDirNlink) {
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+  ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(ceph_mount(cmount, NULL), 0);
+
+  char test_dir1[256];
+  sprintf(test_dir1, "dir1_symlinks_%d", getpid());
+  ASSERT_EQ(ceph_mkdir(cmount, test_dir1, 0700), 0);
+
+  int fd = ceph_open(cmount, test_dir1, O_DIRECTORY|O_RDONLY, 0);
+  ASSERT_GT(fd, 0);
+  struct ceph_statx stx;
+  ASSERT_EQ(ceph_fstatx(cmount, fd, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+  ASSERT_EQ(stx.stx_nlink, 2);
+
+  {
+    char test_dir2[256];
+    sprintf(test_dir2, "%s/.", test_dir1);
+    ASSERT_EQ(ceph_statx(cmount, test_dir2, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 2);
+  }
+
+  {
+    char test_dir2[256];
+    sprintf(test_dir2, "%s/1", test_dir1);
+    ASSERT_EQ(ceph_mkdir(cmount, test_dir2, 0700), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir2, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 2);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 3);
+    sprintf(test_dir2, "%s/2", test_dir1);
+    ASSERT_EQ(ceph_mkdir(cmount, test_dir2, 0700), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 4);
+    sprintf(test_dir2, "%s/1/1", test_dir1);
+    ASSERT_EQ(ceph_mkdir(cmount, test_dir2, 0700), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 4);
+    ASSERT_EQ(ceph_rmdir(cmount, test_dir2), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 4);
+    sprintf(test_dir2, "%s/1", test_dir1);
+    ASSERT_EQ(ceph_rmdir(cmount, test_dir2), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 3);
+    sprintf(test_dir2, "%s/2", test_dir1);
+    ASSERT_EQ(ceph_rmdir(cmount, test_dir2), 0);
+    ASSERT_EQ(ceph_statx(cmount, test_dir1, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+    ASSERT_EQ(stx.stx_nlink, 2);
+  }
+
+  ASSERT_EQ(ceph_rmdir(cmount, test_dir1), 0);
+  ASSERT_EQ(ceph_fstatx(cmount, fd, &stx, CEPH_STATX_NLINK, AT_SYMLINK_NOFOLLOW), 0);
+  ASSERT_EQ(stx.stx_nlink, 0);
+
+  ceph_close(cmount, fd);
+
+  ceph_shutdown(cmount);
+}
+
 TEST(LibCephFS, DoubleChmod) {
 
   struct ceph_mount_info *cmount;
index d795b84683fc50d2a50b37e3a3f7bd50b2e3f26f..142622597386eaa510bfba80d79e430775268dfa 100644 (file)
@@ -235,6 +235,9 @@ TEST(LibRadosAio, PoolQuotaPP) {
   }
   ASSERT_LT(n, 1024);
 
+  // make sure we have latest map that marked the pool full
+  test_data.m_cluster.wait_for_latest_osdmap();
+
   // make sure we block without FULL_TRY
   {
     ObjectWriteOperation op;
index db90ac80c3aa6b498e4768fb3b44b9744bfbe2cb..35a5423f79cd29cd3288962da2f7a978bd941571 100644 (file)
@@ -1002,8 +1002,8 @@ TEST_F(LibRadosList, EnumerateObjects) {
 
   // Ensure a non-power-of-two PG count to avoid only
   // touching the easy path.
-  std::string err_str = set_pg_num(&s_cluster, pool_name, 11);
-  ASSERT_TRUE(err_str.empty());
+  ASSERT_TRUE(set_pg_num(&s_cluster, pool_name, 11).empty());
+  ASSERT_TRUE(set_pgp_num(&s_cluster, pool_name, 11).empty());
 
   std::set<std::string> saw_obj;
   rados_object_list_cursor c = rados_object_list_begin(ioctx);
@@ -1050,8 +1050,8 @@ TEST_F(LibRadosList, EnumerateObjectsSplit) {
 
   // Ensure a non-power-of-two PG count to avoid only
   // touching the easy path.
-  std::string err_str = set_pg_num(&s_cluster, pool_name, 11);
-  ASSERT_TRUE(err_str.empty());
+  ASSERT_TRUE(set_pg_num(&s_cluster, pool_name, 11).empty());
+  ASSERT_TRUE(set_pgp_num(&s_cluster, pool_name, 11).empty());
 
   rados_object_list_cursor begin = rados_object_list_begin(ioctx);
   rados_object_list_cursor end = rados_object_list_end(ioctx);
index c276d33ff7973688b97daeff0485ec39e8e8e9a2..2d4185289f2f2b4f27791b5b124ed3ab1da5692c 100644 (file)
@@ -101,34 +101,67 @@ int rados_pool_set(
   return ret;
 }
 
-}
-
-std::string set_pg_num(
-    rados_t *cluster, const std::string &pool_name, uint32_t pg_num)
-{
-  // Wait for 'creating' to clear
-  int r = wait_for_healthy(cluster);
-  if (r != 0) {
-    goto err;
+struct pool_op_error : std::exception {
+  string msg;
+  pool_op_error(const std::string& pool_name,
+               const std::string& func_name,
+               int err) {
+    std::ostringstream oss;
+    oss << func_name << "(" << pool_name << ") failed with error " << err;
+    msg = oss.str();
   }
-
-  // Adjust pg_num
-  r = rados_pool_set(cluster, pool_name, "pg_num", stringify(pg_num));
-  if (r != 0) {
-    goto err;
+  const char* what() const noexcept override {
+    return msg.c_str();
   }
+};
 
-  // Wait for 'creating' to clear
-  r = wait_for_healthy(cluster);
-  if (r != 0) {
-    goto err;
+template<typename Func>
+std::string with_healthy_cluster(rados_t* cluster,
+                                const std::string& pool_name,
+                                Func&& func)
+{
+  try {
+    // Wait for 'creating/backfilling' to clear
+    int r = wait_for_healthy(cluster);
+    if (r != 0) {
+      throw pool_op_error{pool_name, "wait_for_healthy", r};
+    }
+    func();
+    // Wait for 'creating/backfilling' to clear
+    r = wait_for_healthy(cluster);
+    if (r != 0) {
+      throw pool_op_error{pool_name, "wait_for_healthy", r};
+    }
+  } catch (const pool_op_error& e) {
+    rados_shutdown(*cluster);
+    return e.what();
   }
-
   return "";
+}
+}
+
+std::string set_pg_num(
+    rados_t *cluster, const std::string &pool_name, uint32_t pg_num)
+{
+  return with_healthy_cluster(cluster, pool_name, [&] {
+    // Adjust pg_num
+    int r = rados_pool_set(cluster, pool_name, "pg_num",
+                           stringify(pg_num));
+    if (r != 0) {
+      throw pool_op_error{pool_name, "set_pg_num", r};
+    }
+  });
+}
 
-err:
-  rados_shutdown(*cluster);
-  std::ostringstream oss;
-  oss << __func__ << "(" << pool_name << ") failed with error " << r;
-  return oss.str();
+std::string set_pgp_num(
+    rados_t *cluster, const std::string &pool_name, uint32_t pgp_num)
+{
+  return with_healthy_cluster(cluster, pool_name, [&] {
+    // Adjust pgp_num
+    int r = rados_pool_set(cluster, pool_name, "pgp_num",
+                          stringify(pgp_num));
+    if (r != 0) {
+      throw pool_op_error{pool_name, "set_pgp_num", r};
+    }
+  });
 }
index f3f1bdc494f10b55136e917c19be3ae0fb30afd7..71ef9de2cf5fc28d33e1dbba23d26b55e6b4f570 100644 (file)
@@ -5,3 +5,5 @@
 
 std::string set_pg_num(
     rados_t *cluster, const std::string &pool_name, uint32_t pg_num);
+std::string set_pgp_num(
+    rados_t *cluster, const std::string &pool_name, uint32_t pgp_num);
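With the helper split in two, tests that resize a pool adjust pg_num and
pgp_num together, as the enumeration tests above now do. Sketch of the
expected call pattern, assuming a healthy test cluster handle:

    ASSERT_TRUE(set_pg_num(&s_cluster, pool_name, 11).empty());
    ASSERT_TRUE(set_pgp_num(&s_cluster, pool_name, 11).empty());
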
index e318cc3916b596f4abfdc36e2d605eda7c11d181..a20828a81e4f59f2bc5fe5388195a1b664dc5663 100644 (file)
@@ -288,6 +288,12 @@ public:
     }
   }
 
+  void expect_writeback_cache_enabled(MockReplayImageCtx &mock_image_ctx,
+                                      bool enabled) {
+    EXPECT_CALL(mock_image_ctx, is_writeback_cache_enabled())
+      .WillRepeatedly(Return(enabled));
+  }
+
   void when_process(MockJournalReplay &mock_journal_replay,
                     EventEntry &&event_entry, Context *on_ready,
                     Context *on_safe) {
@@ -367,6 +373,7 @@ TEST_F(TestMockJournalReplay, AioDiscard) {
 
   MockJournalReplay mock_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -400,6 +407,7 @@ TEST_F(TestMockJournalReplay, AioWrite) {
 
   MockJournalReplay mock_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -464,6 +472,7 @@ TEST_F(TestMockJournalReplay, AioWriteSame) {
 
   MockJournalReplay mock_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -500,6 +509,7 @@ TEST_F(TestMockJournalReplay, AioCompareAndWrite) {
   MockJournalReplay mock_compare_and_write_journal_replay(mock_image_ctx);
   MockJournalReplay mock_mis_compare_and_write_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -560,6 +570,7 @@ TEST_F(TestMockJournalReplay, IOError) {
 
   MockJournalReplay mock_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -593,6 +604,7 @@ TEST_F(TestMockJournalReplay, SoftFlushIO) {
 
   MockJournalReplay mock_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -637,6 +649,7 @@ TEST_F(TestMockJournalReplay, PauseIO) {
 
   MockJournalReplay mock_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -672,6 +685,8 @@ TEST_F(TestMockJournalReplay, PauseIO) {
 }
 
 TEST_F(TestMockJournalReplay, Flush) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
@@ -683,6 +698,7 @@ TEST_F(TestMockJournalReplay, Flush) {
 
   MockJournalReplay mock_journal_replay(mock_image_ctx);
   MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, true);
   expect_op_work_queue(mock_image_ctx);
 
   InSequence seq;
@@ -785,6 +801,8 @@ TEST_F(TestMockJournalReplay, BlockedOpFinishError) {
 }
 
 TEST_F(TestMockJournalReplay, MissingOpFinishEvent) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
@@ -844,6 +862,8 @@ TEST_F(TestMockJournalReplay, MissingOpFinishEvent) {
 }
 
 TEST_F(TestMockJournalReplay, MissingOpFinishEventCancelOps) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
@@ -896,6 +916,8 @@ TEST_F(TestMockJournalReplay, MissingOpFinishEventCancelOps) {
 }
 
 TEST_F(TestMockJournalReplay, UnknownOpFinishEvent) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
@@ -2008,5 +2030,37 @@ TEST_F(TestMockJournalReplay, LockLostBeforeExecuteOp) {
   ASSERT_EQ(-ECANCELED, on_finish_safe.wait());
 }
 
+TEST_F(TestMockJournalReplay, WritebackCacheDisabled) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockReplayImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  expect_accept_ops(mock_exclusive_lock, true);
+
+  MockJournalReplay mock_journal_replay(mock_image_ctx);
+  MockIoImageRequest mock_io_image_request;
+  expect_writeback_cache_enabled(mock_image_ctx, false);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  io::AioCompletion *aio_comp;
+  C_SaferCond on_ready;
+  C_SaferCond on_safe;
+  expect_aio_discard(mock_io_image_request, &aio_comp, 123, 456, false);
+  when_process(mock_journal_replay,
+               EventEntry{AioDiscardEvent(123, 456, false)},
+               &on_ready, &on_safe);
+
+  when_complete(mock_image_ctx, aio_comp, 0);
+  ASSERT_EQ(0, on_ready.wait());
+  ASSERT_EQ(0, on_safe.wait());
+  ASSERT_EQ(0, when_shut_down(mock_journal_replay, false));
+}
+
 } // namespace journal
 } // namespace librbd
index 223ea537d5556ca4bb41bcd2bc85598f96465a6d..4bb723dc766e4f0c70258235183c32c15914adfd 100644 (file)
@@ -218,6 +218,8 @@ struct MockImageCtx {
   MOCK_CONST_METHOD0(get_stripe_count, uint64_t());
   MOCK_CONST_METHOD0(get_stripe_period, uint64_t());
 
+  MOCK_CONST_METHOD0(is_writeback_cache_enabled, bool());
+
   ImageCtx *image_ctx;
   CephContext *cct;
   PerfCounters *perfcounter;
index c5345d662d1a379d0d0d22dbf4f9aa68e8cd04af..8b84ff4b6d2f0dba2038372087ea4520404c8fec 100644 (file)
@@ -113,9 +113,10 @@ public:
    * authorizer, false otherwise.
    */
   bool ms_verify_authorizer(Connection *con, int peer_type,
-                                   int protocol, bufferlist& authorizer,
-                                   bufferlist& authorizer_reply,
-                                   bool& isvalid, CryptoKey& session_key) override {
+                           int protocol, bufferlist& authorizer,
+                           bufferlist& authorizer_reply,
+                           bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
     /* always succeed */
     isvalid = true;
     return true;
index 3b59108071fb0b1405467b0801f1bed2fa2b8009..495fa3a7521669c354cfd1d01b74cea73fe8526d 100644 (file)
@@ -115,7 +115,8 @@ public:
   virtual bool ms_verify_authorizer(Connection *con, int peer_type,
                                    int protocol, bufferlist& authorizer,
                                    bufferlist& authorizer_reply,
-                                   bool& isvalid, CryptoKey& session_key) {
+                                   bool& isvalid, CryptoKey& session_key,
+                                   std::unique_ptr<AuthAuthorizerChallenge> *challenge) {
     /* always succeed */
     isvalid = true;
     return true;
index 5774c593974d27d46feb2c19ad358bbf40202d7c..b57a9c02f9da1d3b71abeaa2bf2da44a3ec81b2c 100644 (file)
@@ -58,7 +58,8 @@ class MessengerClient {
     bool ms_handle_refused(Connection *con) override { return false; }
     bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
                               bufferlist& authorizer, bufferlist& authorizer_reply,
-                              bool& isvalid, CryptoKey& session_key) override {
+                              bool& isvalid, CryptoKey& session_key,
+                             std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
       isvalid = true;
       return true;
     }
index 79e36721aa843f3e9659b13555a6c6cf789ac00c..d9bf20727b20df1aab0d0080cdadde3b6b6850d0 100644 (file)
@@ -100,7 +100,8 @@ class ServerDispatcher : public Dispatcher {
   }
   bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
                             bufferlist& authorizer, bufferlist& authorizer_reply,
-                            bool& isvalid, CryptoKey& session_key) override {
+                            bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
     isvalid = true;
     return true;
   }
index 3a9397be9a689bb88500a7b6d56bfaef540ca5ff..6c56600787e16137530ae0c34462983ff6618709 100644 (file)
@@ -203,7 +203,8 @@ class FakeDispatcher : public Dispatcher {
 
   bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
                             bufferlist& authorizer, bufferlist& authorizer_reply,
-                            bool& isvalid, CryptoKey& session_key) override {
+                            bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
     isvalid = true;
     return true;
   }
@@ -893,7 +894,8 @@ class SyntheticDispatcher : public Dispatcher {
 
   bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
                             bufferlist& authorizer, bufferlist& authorizer_reply,
-                            bool& isvalid, CryptoKey& session_key) override {
+                            bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
     isvalid = true;
     return true;
   }
@@ -1436,7 +1438,8 @@ class MarkdownDispatcher : public Dispatcher {
   }
   bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
                             bufferlist& authorizer, bufferlist& authorizer_reply,
-                            bool& isvalid, CryptoKey& session_key) override {
+                            bool& isvalid, CryptoKey& session_key,
+                           std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
     isvalid = true;
     return true;
   }
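Every test Dispatcher in this series picks up the extra
ms_verify_authorizer() parameter: a std::unique_ptr<AuthAuthorizerChallenge>*
through which an implementation may issue a cephx challenge. The test
doubles ignore it and accept everything; a minimal override under that
assumption:

    bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
                              bufferlist& authorizer, bufferlist& authorizer_reply,
                              bool& isvalid, CryptoKey& session_key,
                              std::unique_ptr<AuthAuthorizerChallenge> *challenge) override {
      isvalid = true;   // tests skip real verification
      return true;
    }
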
index 772715eb4ccbe2eab4a4ca68aa68906889df72b1..cbf695cead8330346a32b93e0ae4732045751443 100644 (file)
@@ -81,6 +81,7 @@ bool DeterministicOpSequence::run_one_op(int op, rngen_t& gen)
     break;
 
   default:
+    cout << "bad op " << op << std::endl;
     assert(0 == "bad op");
   }
   return ok;
@@ -164,9 +165,9 @@ bool DeterministicOpSequence::do_touch(rngen_t& gen)
   }
   hobject_t *obj = entry->touch_obj(obj_id);
 
-  dout(0) << "do_touch " << entry->m_coll.to_str() << "/" << obj->oid.name << dendl;
+  dout(0) << "do_touch " << entry->m_coll << "/" << obj << dendl;
 
-  _do_touch(entry->m_coll, *obj);
+  _do_touch(entry, *obj);
   return true;
 }
 
@@ -185,9 +186,9 @@ bool DeterministicOpSequence::do_remove(rngen_t& gen)
   hobject_t *obj = entry->touch_obj(obj_id);
   ceph_assert(obj);
 
-  dout(0) << "do_remove " << entry->m_coll.to_str() << "/" << obj->oid.name << dendl;
+  dout(0) << "do_remove " << entry->m_coll << "/" << obj << dendl;
 
-  _do_remove(entry->m_coll, *obj);
+  _do_remove(entry, *obj);
   hobject_t *rmobj = entry->remove_obj(obj_id);
   ceph_assert(rmobj);
   delete rmobj;
@@ -245,7 +246,7 @@ bool DeterministicOpSequence::do_set_attrs(rngen_t& gen)
   gen_attrs(gen, &out);
 
   dout(0) << "do_set_attrs " << out.size() << " entries" << dendl;
-  _do_set_attrs(entry->m_coll, *obj, out);
+  _do_set_attrs(entry, *obj, out);
   return true;
 }
 
@@ -269,65 +270,70 @@ bool DeterministicOpSequence::do_write(rngen_t& gen)
   bufferlist bl;
   _gen_random(gen, size, bl);
 
-  dout(0) << "do_write " << entry->m_coll.to_str() << "/" << obj->oid.name
+  dout(0) << "do_write " << entry->m_coll << "/" << obj
          << " 0~" << size << dendl;
 
-  _do_write(entry->m_coll, *obj, 0, bl.length(), bl);
+  _do_write(entry, *obj, 0, bl.length(), bl);
   return true;
 }
 
-bool DeterministicOpSequence::_prepare_clone(rngen_t& gen,
-                                            coll_t& coll_ret, hobject_t& orig_obj_ret, hobject_t& new_obj_ret)
+bool DeterministicOpSequence::_prepare_clone(
+  rngen_t& gen,
+  coll_entry_t **entry_ret,
+  int *orig_obj_id,
+  hobject_t *orig_obj_ret,
+  int *new_obj_id,
+  hobject_t *new_obj_ret)
 {
   int coll_id = _gen_coll_id(gen);
 
   coll_entry_t *entry = get_coll_at(coll_id);
   ceph_assert(entry != NULL);
 
-  if (entry->m_objects.size() >= 2) {
-    dout(0) << "_prepare_clone coll " << entry->m_coll.to_str()
+  if (entry->m_objects.size() < 2) {
+    dout(0) << "_prepare_clone coll " << entry->m_coll
            << " doesn't have 2 or more objects" << dendl;
     return false;
   }
 
-  int orig_obj_id = entry->get_random_obj_id(gen);
-  hobject_t *orig_obj = entry->touch_obj(orig_obj_id);
+  *orig_obj_id = entry->get_random_obj_id(gen);
+  hobject_t *orig_obj = entry->touch_obj(*orig_obj_id);
   ceph_assert(orig_obj);
 
-  int id;
   do {
-    id = entry->get_random_obj_id(gen);
-  } while (id == orig_obj_id);
-  hobject_t *new_obj = entry->touch_obj(id);
+    *new_obj_id = entry->get_random_obj_id(gen);
+  } while (*new_obj_id == *orig_obj_id);
+  hobject_t *new_obj = entry->touch_obj(*new_obj_id);
   ceph_assert(new_obj);
 
-  coll_ret = entry->m_coll;
-  orig_obj_ret = *orig_obj;
-  new_obj_ret = *new_obj;
-
+  *entry_ret = entry;
+  *orig_obj_ret = *orig_obj;
+  *new_obj_ret = *new_obj;
   return true;
 }
 
 bool DeterministicOpSequence::do_clone(rngen_t& gen)
 {
-  coll_t coll;
+  coll_entry_t *entry;
+  int orig_id, new_id;
   hobject_t orig_obj, new_obj;
-  if (!_prepare_clone(gen, coll, orig_obj, new_obj)) {
+  if (!_prepare_clone(gen, &entry, &orig_id, &orig_obj, &new_id, &new_obj)) {
     return false;
   }
 
-  dout(0) << "do_clone " << coll.to_str() << "/" << orig_obj.oid.name
-      << " => " << coll.to_str() << "/" << new_obj.oid.name << dendl;
+  dout(0) << "do_clone " << entry->m_coll << "/" << orig_obj
+      << " => " << entry->m_coll << "/" << new_obj << dendl;
 
-  _do_clone(coll, orig_obj, new_obj);
+  _do_clone(entry, orig_obj, new_obj);
   return true;
 }
 
 bool DeterministicOpSequence::do_clone_range(rngen_t& gen)
 {
-  coll_t coll;
+  coll_entry_t *entry;
+  int orig_id, new_id;
   hobject_t orig_obj, new_obj;
-  if (!_prepare_clone(gen, coll, orig_obj, new_obj)) {
+  if (!_prepare_clone(gen, &entry, &orig_id, &orig_obj, &new_id, &new_obj)) {
     return false;
   }
 
@@ -346,173 +352,116 @@ bool DeterministicOpSequence::do_clone_range(rngen_t& gen)
   boost::uniform_int<> clone_len(1, bl.length());
   size = (size_t) clone_len(gen);
 
-  dout(0) << "do_clone_range " << coll.to_str() << "/" << orig_obj.oid.name
+  dout(0) << "do_clone_range " << entry->m_coll << "/" << orig_obj
       << " (0~" << size << ")"
-      << " => " << coll.to_str() << "/" << new_obj.oid.name
+      << " => " << entry->m_coll << "/" << new_obj
       << " (0)" << dendl;
-  _do_write_and_clone_range(coll, orig_obj, new_obj, 0, size, 0, bl);
+  _do_write_and_clone_range(entry, orig_obj, new_obj, 0, size, 0, bl);
   return true;
 }
 
-bool DeterministicOpSequence::_prepare_colls(rngen_t& gen,
-                                            coll_entry_t* &orig_coll, coll_entry_t* &new_coll)
-{
-  ceph_assert(m_collections_ids.size() > 1);
-  int orig_coll_id = _gen_coll_id(gen);
-  int new_coll_id;
-  do {
-    new_coll_id = _gen_coll_id(gen);
-  } while (new_coll_id == orig_coll_id);
-
-  dout(0) << "_prepare_colls from coll id " << orig_coll_id
-      << " to coll id " << new_coll_id << dendl;
-
-  orig_coll = get_coll_at(orig_coll_id);
-  ceph_assert(orig_coll != NULL);
-  new_coll = get_coll_at(new_coll_id);
-  ceph_assert(new_coll != NULL);
-
-  if (!orig_coll->m_objects.size()) {
-    dout(0) << "_prepare_colls coll " << orig_coll->m_coll.to_str()
-        << " has no objects to use" << dendl;
-    return false;
-  }
-
-  return true;
-}
-
-
 bool DeterministicOpSequence::do_coll_move(rngen_t& gen)
 {
-  coll_entry_t *orig_coll = NULL, *new_coll = NULL;
-  if (!_prepare_colls(gen, orig_coll, new_coll))
-    return false;
-
-  ceph_assert(orig_coll && new_coll);
-
-  boost::uniform_int<> obj_rng(0, orig_coll->m_objects.size()-1);
-  int obj_pos = obj_rng(gen);
-  int obj_key = -1;
-  hobject_t *obj = orig_coll->get_obj_at(obj_pos, &obj_key);
-  if (!obj) {
-    dout(0) << "do_coll_move coll " << orig_coll->m_coll.to_str()
-        << " has no object as pos #" << obj_pos << " (key " << obj_key << ")"
-        << dendl;
-    return false;
-  }
-  if (new_coll->check_for_obj(obj_key)) {
-    dout(0) << "do_coll_move coll " << orig_coll->m_coll.to_str()
-        << " already has object as pos #" << obj_pos << " (key " << obj_key << ")"
-        << dendl;
+  coll_entry_t *entry;
+  int orig_id, new_id;
+  hobject_t orig_obj, new_obj;
+  if (!_prepare_clone(gen, &entry, &orig_id, &orig_obj, &new_id, &new_obj)) {
     return false;
   }
-  dout(0) << "do_coll_move " << orig_coll->m_coll.to_str() << "/" << obj->oid.name
-        << " => " << new_coll->m_coll.to_str() << "/" << obj->oid.name << dendl;
-  new_coll->touch_obj(obj_key);
 
-  orig_coll->remove_obj(obj_key);
+  dout(0) << "do_coll_move " << entry->m_coll << "/" << orig_obj
+        << " => " << entry->m_coll << "/" << new_obj << dendl;
+  entry->remove_obj(orig_id);
 
-  _do_coll_move(orig_coll->m_coll, new_coll->m_coll, *obj);
+  _do_coll_move(entry, orig_obj, new_obj);
 
   return true;
 }
 
 bool DeterministicOpSequence::do_coll_create(rngen_t& gen)
 {
-  boost::uniform_int<> pg_num_range(0, 512);
-  int pg_num = pg_num_range(gen);
-
-  // Assume there is 7 OSDs in total, the PGs are evenly distributed across those OSDs
-  int pgs = pg_num / 7;
-
-  boost::uniform_int<> num_objs_range(1, 1024);
-  int num_objs = num_objs_range(gen);
-
-  int pool_id = get_next_pool_id();
-  std::set<int> pg_created;
-  for (int i = 0; i < pgs; i++) {
-    boost::uniform_int<> pg_range(0, pg_num - 1);
-    int pg_id = pg_range(gen);
-    if (pg_created.count(pg_id) > 0)
-      continue;
-    _do_coll_create(coll_t(spg_t(pg_t(pg_id,pool_id),shard_id_t::NO_SHARD)),
-                   (uint32_t) pg_num, (uint64_t) num_objs);
-    pg_created.insert(pg_id);
-  }
+  int i = m_collections.size();
+  coll_entry_t *entry = coll_create(i);
+  m_collections.insert(make_pair(i, entry));
+  m_collections_ids.push_back(i);
+
+  _do_coll_create(entry, 10, 10);
+  
   return true;
 }
 
-void DeterministicOpSequence::_do_coll_create(coll_t cid, uint32_t pg_num, uint64_t num_objs)
+void DeterministicOpSequence::_do_coll_create(coll_entry_t *entry, uint32_t pg_num, uint64_t num_objs)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.create_collection(cid, 32);
+  t.create_collection(entry->m_coll, 32);
   bufferlist hint;
-  ::encode(pg_num, hint);
-  ::encode(num_objs, hint);
-  t.collection_hint(cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
-  dout(0) << "Give collection: " << cid << " a hint, pg_num is: " << pg_num << ", num_objs is: "
-    << num_objs << dendl;
+  encode(pg_num, hint);
+  encode(num_objs, hint);
+  t.collection_hint(entry->m_coll, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
+  dout(0) << "Give collection: " << entry->m_coll
+         << " a hint, pg_num is: " << pg_num << ", num_objs is: "
+         << num_objs << dendl;
 
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_touch(coll_t coll, hobject_t& obj)
+void DeterministicOpSequence::_do_touch(coll_entry_t *entry, hobject_t& obj)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.touch(coll, ghobject_t(obj));
+  t.touch(entry->m_coll, ghobject_t(obj));
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_remove(coll_t coll, hobject_t& obj)
+void DeterministicOpSequence::_do_remove(coll_entry_t *entry, hobject_t& obj)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.remove(coll, ghobject_t(obj));
+  t.remove(entry->m_coll, ghobject_t(obj));
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_set_attrs(coll_t coll,
+void DeterministicOpSequence::_do_set_attrs(coll_entry_t *entry,
                                            hobject_t &obj,
                                            const map<string, bufferlist> &attrs)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.omap_setkeys(coll, ghobject_t(obj), attrs);
+  t.omap_setkeys(entry->m_coll, ghobject_t(obj), attrs);
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_write(coll_t coll, hobject_t& obj,
+void DeterministicOpSequence::_do_write(coll_entry_t *entry, hobject_t& obj,
                                        uint64_t off, uint64_t len, const bufferlist& data)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.write(coll, ghobject_t(obj), off, len, data);
+  t.write(entry->m_coll, ghobject_t(obj), off, len, data);
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_clone(coll_t coll, hobject_t& orig_obj,
+void DeterministicOpSequence::_do_clone(coll_entry_t *entry, hobject_t& orig_obj,
                                        hobject_t& new_obj)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.clone(coll, ghobject_t(orig_obj), ghobject_t(new_obj));
+  t.clone(entry->m_coll, ghobject_t(orig_obj), ghobject_t(new_obj));
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_clone_range(coll_t coll,
+void DeterministicOpSequence::_do_clone_range(coll_entry_t *entry,
                                              hobject_t& orig_obj, hobject_t& new_obj, uint64_t srcoff,
                                              uint64_t srclen, uint64_t dstoff)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj),
+  t.clone_range(entry->m_coll, ghobject_t(orig_obj), ghobject_t(new_obj),
                srcoff, srclen, dstoff);
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_write_and_clone_range(coll_t coll,
+void DeterministicOpSequence::_do_write_and_clone_range(coll_entry_t *entry,
                                                         hobject_t& orig_obj,
                                                         hobject_t& new_obj,
                                                         uint64_t srcoff,
@@ -522,19 +471,21 @@ void DeterministicOpSequence::_do_write_and_clone_range(coll_t coll,
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.write(coll, ghobject_t(orig_obj), srcoff, bl.length(), bl);
-  t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj),
+  t.write(entry->m_coll, ghobject_t(orig_obj), srcoff, bl.length(), bl);
+  t.clone_range(entry->m_coll, ghobject_t(orig_obj), ghobject_t(new_obj),
                srcoff, srclen, dstoff);
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
-void DeterministicOpSequence::_do_coll_move(coll_t orig_coll, coll_t new_coll,
-                                           hobject_t& obj)
+void DeterministicOpSequence::_do_coll_move(coll_entry_t *entry,
+                                           hobject_t& orig_obj,
+                                           hobject_t& new_obj)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.remove(new_coll, ghobject_t(obj));
-  t.collection_move_rename(orig_coll, ghobject_t(obj), new_coll, ghobject_t(obj));
+  t.remove(entry->m_coll, ghobject_t(new_obj));
+  t.collection_move_rename(entry->m_coll, ghobject_t(orig_obj),
+                          entry->m_coll, ghobject_t(new_obj));
   m_store->apply_transaction(&m_osr, std::move(t));
 }
 
index b3707a202814d143a791cdae5c765cab25c7002a..92373e139290e5af176b83bd5e955d30b2f1e49b 100644 (file)
@@ -39,9 +39,9 @@ class DeterministicOpSequence : public TestObjectStoreState {
     DSOP_CLONE = 2,
     DSOP_CLONE_RANGE = 3,
     DSOP_OBJ_REMOVE = 4,
-    DSOP_COLL_MOVE = 6,
-    DSOP_SET_ATTRS = 7,
-    DSOP_COLL_CREATE = 8,
+    DSOP_COLL_MOVE = 5,
+    DSOP_SET_ATTRS = 6,
+    DSOP_COLL_CREATE = 7,
 
     DSOP_FIRST = DSOP_TOUCH,
     DSOP_LAST = DSOP_COLL_CREATE,
@@ -67,31 +67,34 @@ class DeterministicOpSequence : public TestObjectStoreState {
   bool do_set_attrs(rngen_t& gen);
   bool do_coll_create(rngen_t& gen);
 
-  virtual void _do_touch(coll_t coll, hobject_t& obj);
-  virtual void _do_remove(coll_t coll, hobject_t& obj);
-  virtual void _do_write(coll_t coll, hobject_t& obj, uint64_t off,
+  virtual void _do_touch(coll_entry_t *entry, hobject_t& obj);
+  virtual void _do_remove(coll_entry_t *entry, hobject_t& obj);
+  virtual void _do_write(coll_entry_t *entry, hobject_t& obj, uint64_t off,
       uint64_t len, const bufferlist& data);
-  virtual void _do_set_attrs(coll_t coll,
+  virtual void _do_set_attrs(coll_entry_t *entry,
                             hobject_t &obj,
                             const map<string, bufferlist> &attrs);
-  virtual void _do_clone(coll_t coll, hobject_t& orig_obj, hobject_t& new_obj);
-  virtual void _do_clone_range(coll_t coll, hobject_t& orig_obj,
+  virtual void _do_clone(coll_entry_t *entry, hobject_t& orig_obj, hobject_t& new_obj);
+  virtual void _do_clone_range(coll_entry_t *entry, hobject_t& orig_obj,
       hobject_t& new_obj, uint64_t srcoff, uint64_t srclen, uint64_t dstoff);
-  virtual void _do_write_and_clone_range(coll_t coll, hobject_t& orig_obj,
+  virtual void _do_write_and_clone_range(coll_entry_t *entry, hobject_t& orig_obj,
       hobject_t& new_obj, uint64_t srcoff, uint64_t srclen,
       uint64_t dstoff, bufferlist& bl);
-  virtual void _do_coll_move(coll_t orig_coll, coll_t new_coll, hobject_t& obj);
-  virtual void _do_coll_create(coll_t cid, uint32_t pg_num, uint64_t num_objs);
+  virtual void _do_coll_move(coll_entry_t *entry, hobject_t& orig_obj, hobject_t& new_obj);
+  virtual void _do_coll_create(coll_entry_t *entry, uint32_t pg_num, uint64_t num_objs);
 
   int _gen_coll_id(rngen_t& gen);
   int _gen_obj_id(rngen_t& gen);
   void _print_status(int seq, int op);
 
  private:
-  bool _prepare_clone(rngen_t& gen, coll_t& coll_ret,
-      hobject_t& orig_obj_ret, hobject_t& new_obj_ret);
-  bool _prepare_colls(rngen_t& gen,
-      coll_entry_t* &orig_coll, coll_entry_t* &new_coll);
+  bool _prepare_clone(
+    rngen_t& gen,
+    coll_entry_t **entry_ret,
+    int *orig_obj_id,
+    hobject_t *orig_obj_ret,
+    int *new_obj_id,
+    hobject_t *new_obj_ret);
 };
 
 
index 5134522017abad421efac494097c9abd50752ae9..3bdcec8f6b697e6d2ec7f517576fe4f60b5e8b78 100644 (file)
@@ -51,14 +51,14 @@ bool FileStoreDiff::diff_attrs(std::map<std::string,bufferptr>& b,
   std::map<std::string, bufferptr>::iterator a_it = a.begin();
   for (; b_it != b.end(); ++b_it, ++a_it) {
     if (b_it->first != a_it->first) {
-      dout(0) << "diff_attrs name mismatch (verify: " << b_it->first
-          << ", store: " << a_it->first << ")" << dendl;
+      cout << "diff_attrs name mismatch (verify: " << b_it->first
+          << ", store: " << a_it->first << ")" << std::endl;
       ret = true;
       continue;
     }
 
     if (!b_it->second.cmp(a_it->second)) {
-      dout(0) << "diff_attrs contents mismatch on attr " << b_it->first << dendl;
+      cout << "diff_attrs contents mismatch on attr " << b_it->first << std::endl;
       ret = true;
       continue;
     }
@@ -73,15 +73,21 @@ static bool diff_omap(std::map<std::string,bufferlist>& b,
   std::map<std::string, bufferlist>::iterator b_it = b.begin();
   std::map<std::string, bufferlist>::iterator a_it = a.begin();
   for (; b_it != b.end(); ++b_it, ++a_it) {
+    if (a_it == a.end()) {
+      cout << __func__ << " a reached end before b, a missing " << b_it->first
+          << std::endl;
+      ret = true;
+      break;
+    }
     if (b_it->first != a_it->first) {
-      dout(0) << "diff_attrs name mismatch (verify: " << b_it->first
-          << ", store: " << a_it->first << ")" << dendl;
+      cout << "diff_attrs name mismatch (verify: " << b_it->first
+          << ", store: " << a_it->first << ")" << std::endl;
       ret = true;
       continue;
     }
 
     if (!(b_it->second == a_it->second)) {
-      dout(0) << "diff_attrs contents mismatch on attr " << b_it->first << dendl;
+      cout << "diff_attrs contents mismatch on attr " << b_it->first << std::endl;
       ret = true;
       continue;
     }
@@ -94,32 +100,32 @@ bool FileStoreDiff::diff_objects_stat(struct stat& a, struct stat& b)
   bool ret = false;
 
   if (a.st_uid != b.st_uid) {
-    dout(0) << "diff_objects_stat uid mismatch (A: "
-        << a.st_uid << " != B: " << b.st_uid << ")" << dendl;
+    cout << "diff_objects_stat uid mismatch (A: "
+        << a.st_uid << " != B: " << b.st_uid << ")" << std::endl;
     ret = true;
   }
 
   if (a.st_gid != b.st_gid) {
-    dout(0) << "diff_objects_stat gid mismatch (A: "
-        << a.st_gid << " != B: " << b.st_gid << ")" << dendl;
+    cout << "diff_objects_stat gid mismatch (A: "
+        << a.st_gid << " != B: " << b.st_gid << ")" << std::endl;
     ret = true;
   }
 
   if (a.st_mode != b.st_mode) {
-    dout(0) << "diff_objects_stat mode mismatch (A: "
-        << a.st_mode << " != B: " << b.st_mode << ")" << dendl;
+    cout << "diff_objects_stat mode mismatch (A: "
+        << a.st_mode << " != B: " << b.st_mode << ")" << std::endl;
     ret = true;
   }
 
   if (a.st_nlink != b.st_nlink) {
-    dout(0) << "diff_objects_stat nlink mismatch (A: "
-        << a.st_nlink << " != B: " << b.st_nlink << ")" << dendl;
+    cout << "diff_objects_stat nlink mismatch (A: "
+        << a.st_nlink << " != B: " << b.st_nlink << ")" << std::endl;
     ret = true;
   }
 
   if (a.st_size != b.st_size) {
-    dout(0) << "diff_objects_stat size mismatch (A: "
-        << a.st_size << " != B: " << b.st_size << ")" << dendl;
+    cout << "diff_objects_stat size mismatch (A: "
+        << a.st_size << " != B: " << b.st_size << ")" << std::endl;
     ret = true;
   }
   return ret;
@@ -127,8 +133,6 @@ bool FileStoreDiff::diff_objects_stat(struct stat& a, struct stat& b)
 
 bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t coll)
 {
-  dout(2) << __func__ << " coll "  << coll << dendl;
-
   bool ret = false;
 
   int err;
@@ -136,21 +140,21 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
   err = b_store->collection_list(coll, ghobject_t(), ghobject_t::get_max(),
                                 INT_MAX, &b_objects, NULL);
   if (err < 0) {
-    dout(0) << "diff_objects list on verify coll " << coll.to_str()
-           << " returns " << err << dendl;
+    cout << "diff_objects list on verify coll " << coll.to_str()
+           << " returns " << err << std::endl;
     return true;
   }
   err = a_store->collection_list(coll, ghobject_t(), ghobject_t::get_max(),
                                 INT_MAX, &a_objects, NULL);
   if (err < 0) {
-    dout(0) << "diff_objects list on store coll " << coll.to_str()
-              << " returns " << err << dendl;
+    cout << "diff_objects list on store coll " << coll.to_str()
+              << " returns " << err << std::endl;
     return true;
   }
 
   if (b_objects.size() != a_objects.size()) {
-    dout(0) << "diff_objects num objs mismatch (A: " << a_objects.size()
-        << ", B: " << b_objects.size() << ")" << dendl;
+    cout << "diff_objects num objs mismatch (A: " << a_objects.size()
+        << ", B: " << b_objects.size() << ")" << std::endl;
     ret = true;
   }
 
@@ -159,9 +163,9 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
   for (; b_it != b_objects.end(); ++b_it, ++a_it) {
     ghobject_t b_obj = *b_it, a_obj = *a_it;
     if (b_obj.hobj.oid.name != a_obj.hobj.oid.name) {
-      dout(0) << "diff_objects name mismatch on A object "
+      cout << "diff_objects name mismatch on A object "
           << coll << "/" << a_obj << " and B object "
-          << coll << "/" << b_obj << dendl;
+          << coll << "/" << b_obj << std::endl;
       ret = true;
       continue;
     }
@@ -169,20 +173,20 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
     struct stat b_stat, a_stat;
     err = b_store->stat(coll, b_obj, &b_stat);
     if (err < 0) {
-      dout(0) << "diff_objects error stating B object "
-             << coll.to_str() << "/" << b_obj.hobj.oid.name << dendl;
+      cout << "diff_objects error stating B object "
+             << coll.to_str() << "/" << b_obj.hobj.oid.name << std::endl;
       ret = true;
     }
     err = a_store->stat(coll, a_obj, &a_stat);
     if (err < 0) {
-      dout(0) << "diff_objects error stating A object "
-          << coll << "/" << a_obj << dendl;
+      cout << "diff_objects error stating A object "
+          << coll << "/" << a_obj << std::endl;
       ret = true;
     }
 
     if (diff_objects_stat(a_stat, b_stat)) {
-      dout(0) << "diff_objects stat mismatch on "
-          << coll << "/" << b_obj << dendl;
+      cout << "diff_objects stat mismatch on "
+          << coll << "/" << b_obj << std::endl;
       ret = true;
     }
 
@@ -191,29 +195,29 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
     a_store->read(coll, a_obj, 0, a_stat.st_size, a_obj_bl);
 
     if (!a_obj_bl.contents_equal(b_obj_bl)) {
-      dout(0) << "diff_objects content mismatch on "
-          << coll << "/" << b_obj << dendl;
+      cout << "diff_objects content mismatch on "
+          << coll << "/" << b_obj << std::endl;
       ret = true;
     }
 
     std::map<std::string, bufferptr> a_obj_attrs_map, b_obj_attrs_map;
     err = a_store->getattrs(coll, a_obj, a_obj_attrs_map);
     if (err < 0) {
-      dout(0) << "diff_objects getattrs on A object " << coll << "/" << a_obj
-              << " returns " << err << dendl;
+      cout << "diff_objects getattrs on A object " << coll << "/" << a_obj
+              << " returns " << err << std::endl;
       ret = true;
     }
     err = b_store->getattrs(coll, b_obj, b_obj_attrs_map);
     if (err < 0) {
-      dout(0) << "diff_objects getattrs on B object " << coll << "/" << b_obj
-              << "returns " << err << dendl;
+      cout << "diff_objects getattrs on B object " << coll << "/" << b_obj
+              << " returns " << err << std::endl;
       ret = true;
     }
 
     if (diff_attrs(b_obj_attrs_map, a_obj_attrs_map)) {
-      dout(0) << "diff_objects attrs mismatch on A object "
+      cout << "diff_objects attrs mismatch on A object "
           << coll << "/" << a_obj << " and B object "
-          << coll << "/" << b_obj << dendl;
+          << coll << "/" << b_obj << std::endl;
       ret = true;
     }
 
@@ -221,32 +225,34 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
     std::set<std::string> a_omap_keys, b_omap_keys;
     err = a_store->omap_get_keys(coll, a_obj, &a_omap_keys);
     if (err < 0) {
-      dout(0) << "diff_objects getomap on A object " << coll << "/" << a_obj
-              << " returns " << err << dendl;
+      cout << "diff_objects getomap on A object " << coll << "/" << a_obj
+              << " returns " << err << std::endl;
       ret = true;
     }
     err = a_store->omap_get_values(coll, a_obj, a_omap_keys, &a_obj_omap);
     if (err < 0) {
-      dout(0) << "diff_objects getomap on A object " << coll << "/" << a_obj
-              << " returns " << err << dendl;
+      cout << "diff_objects getomap on A object " << coll << "/" << a_obj
+              << " returns " << err << std::endl;
       ret = true;
     }
     err = b_store->omap_get_keys(coll, b_obj, &b_omap_keys);
     if (err < 0) {
-      dout(0) << "diff_objects getomap on A object " << coll << "/" << b_obj
-              << " returns " << err << dendl;
+      cout << "diff_objects getomap on A object " << coll << "/" << b_obj
+              << " returns " << err << std::endl;
       ret = true;
     }
     err = b_store->omap_get_values(coll, b_obj, b_omap_keys, &b_obj_omap);
     if (err < 0) {
-      dout(0) << "diff_objects getomap on A object " << coll << "/" << b_obj
-              << " returns " << err << dendl;
+      cout << "diff_objects getomap on A object " << coll << "/" << b_obj
+              << " returns " << err << std::endl;
       ret = true;
     }
     if (diff_omap(a_obj_omap, b_obj_omap)) {
-      dout(0) << "diff_objects omap mismatch on A object "
+      cout << "diff_objects omap mismatch on A object "
              << coll << "/" << a_obj << " and B object "
-             << coll << "/" << b_obj << dendl;
+             << coll << "/" << b_obj << std::endl;
+      cout << "a: " << a_obj_omap << std::endl;
+      cout << "b: " << b_obj_omap << std::endl;
       ret = true;
     }
   }
@@ -266,7 +272,7 @@ bool FileStoreDiff::diff()
   for (; it != b_coll_list.end(); ++it) {
     coll_t b_coll = *it;
     if (!a_store->collection_exists(b_coll)) {
-      dout(0) << "diff B coll " << b_coll.to_str() << " DNE on A" << dendl;
+      cout << "diff B coll " << b_coll.to_str() << " DNE on A" << std::endl;
       ret = true;
       continue;
     }
@@ -283,7 +289,7 @@ bool FileStoreDiff::diff()
   }
   for (std::vector<coll_t>::iterator it = a_coll_list.begin();
        it != a_coll_list.end(); ++it) {
-    dout(0) << "diff A coll " << *it << " DNE on B" << dendl;
+    cout << "diff A coll " << *it << " DNE on B" << std::endl;
     ret = true;
   }
 
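The FileStoreDiff conversion above replaces the dout logging macros with direct cout writes, so mismatch reports always reach the tool's stdout regardless of logging configuration. Every comparison follows the same shape: walk two sorted containers in lockstep and report every divergence rather than stopping at the first. A minimal standalone sketch of that pattern (plain std::map stand-ins, not the FileStoreDiff API):

    #include <iostream>
    #include <map>
    #include <string>

    // Compare two sorted key/value maps in lockstep; print every
    // mismatch and return true if any difference was found.
    static bool diff_maps(const std::map<std::string, std::string>& a,
                          const std::map<std::string, std::string>& b)
    {
      bool ret = false;
      auto a_it = a.begin();
      for (auto b_it = b.begin(); b_it != b.end(); ++b_it, ++a_it) {
        if (a_it == a.end()) {
          std::cout << "a reached end before b; a is missing "
                    << b_it->first << std::endl;
          return true;
        }
        if (a_it->first != b_it->first) {
          std::cout << "name mismatch (verify: " << b_it->first
                    << ", store: " << a_it->first << ")" << std::endl;
          ret = true;
          continue;
        }
        if (a_it->second != b_it->second) {
          std::cout << "contents mismatch on " << b_it->first << std::endl;
          ret = true;
        }
      }
      return ret;
    }

    int main() {
      std::map<std::string, std::string> a{{"k1", "v1"}, {"k2", "v2"}};
      std::map<std::string, std::string> b{{"k1", "v1"}, {"k2", "V2"}};
      return diff_maps(a, b) ? 1 : 0;   // exits 1: k2 differs
    }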
index 7d1ac8c082135a402a3480d2ea75792623ada58c..4eaf76b5c30c2f417a1efad155dd7ef1a17e8358 100644 (file)
@@ -100,7 +100,7 @@ public:
  public:
   explicit TestObjectStoreState(ObjectStore *store) :
     m_next_coll_nr(0), m_num_objs_per_coll(10), m_num_objects(0),
-    m_max_in_flight(0), m_finished_lock("Finished Lock"), m_next_pool(1) {
+    m_max_in_flight(0), m_finished_lock("Finished Lock"), m_next_pool(2) {
     m_store.reset(store);
   }
   ~TestObjectStoreState() { 
index d5bb671138c54405b1b295e4ba6b589617ef2d64..819523172bbc33c24a7b670d6a7958c5fee1d6be 100755 (executable)
@@ -52,6 +52,8 @@ usage() {
   echo
 }
 
+echo "$0" "$@"
+
 die_on_missing_arg() {
   if [[ "$2" == "" ]]; then
     echo "$1: missing required parameter"
@@ -248,12 +250,12 @@ do
     $tmp_name_a $tmp_name_a/journal \
     --test-seed $seed --osd-journal-size 100 \
     --filestore-kill-at $killat $tmp_opts_a \
-    --log-file $tmp_name_a.fail --debug-filestore 20 || true
+    --log-file $tmp_name_a.fail --debug-filestore 20 --no-log-to-stderr || true
 
   stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \
     $tmp_name_a $tmp_name_a/journal \
     --log-file $tmp_name_a.recover \
-    --debug-filestore 20 --debug-journal 20`
+    --debug-filestore 20 --debug-journal 20 --no-log-to-stderr`
 
   if [[ "`expr $stop_at - $stop_at 2>/dev/null`" != "0" ]]; then
     echo "error: get-last-op returned '$stop_at'"
@@ -266,10 +268,11 @@ do
   $v ceph_test_filestore_idempotent_sequence run-sequence-to \
     $stop_at $tmp_name_b $tmp_name_b/journal \
     --test-seed $seed --osd-journal-size 100 \
-    --log-file $tmp_name_b.clean --debug-filestore 20 $tmp_opts_b
+    --log-file $tmp_name_b.clean --debug-filestore 20 --no-log-to-stderr \
+    $tmp_opts_b
 
   if $v ceph_test_filestore_idempotent_sequence diff \
-    $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal ; then
+    $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal --no-log-to-stderr --log-file $tmp_name_a.diff.log --debug-filestore 20 ; then
       echo OK
   else
     echo "FAIL"
index bc4e3464c4b62aaa3a895be98436de81f063d73e..7af2e59ceaf572df0750d42f273642a0f0e2ab0e 100755 (executable)
@@ -1,5 +1,6 @@
 #!/bin/sh
 
+set -x
 set -e
 
 seed=$1
@@ -11,7 +12,7 @@ mydir=`dirname $0`
 
 for f in `seq $from $to`
 do
-    if ! $mydir/run_seed_to.sh $seed $f; then
+    if ! $mydir/run_seed_to.sh -o 10 -e $seed $f; then
        if [ -d "$dir" ]; then
            echo copying evidence to $dir
            cp -a . $dir
@@ -20,4 +21,4 @@ do
        fi
        exit 1
     fi
-done
\ No newline at end of file
+done
index ca9705cbf73c59ae2ba5afbc3535dc0645d89426..e2606ced80baba5b5e1e57ba74947b8202e12389 100644 (file)
@@ -926,6 +926,43 @@ class TestObject(object):
         eq(self.object.read(3), b'bar')
         eq(self.object.read(3), b'baz')
 
+class TestIoCtxSelfManagedSnaps(object):
+    def setUp(self):
+        self.rados = Rados(conffile='')
+        self.rados.connect()
+        self.rados.create_pool('test_pool')
+        assert self.rados.pool_exists('test_pool')
+        self.ioctx = self.rados.open_ioctx('test_pool')
+
+    def tearDown(self):
+        cmd = {"prefix":"osd unset", "key":"noup"}
+        self.rados.mon_command(json.dumps(cmd), b'')
+        self.ioctx.close()
+        self.rados.delete_pool('test_pool')
+        self.rados.shutdown()
+
+    def test(self):
+        # cannot mix-and-match pool and self-managed snapshot mode
+        self.ioctx.set_self_managed_snap_write([])
+        self.ioctx.write('abc', b'abc')
+        snap_id_1 = self.ioctx.create_self_managed_snap()
+        self.ioctx.set_self_managed_snap_write([snap_id_1])
+
+        self.ioctx.write('abc', b'def')
+        snap_id_2 = self.ioctx.create_self_managed_snap()
+        self.ioctx.set_self_managed_snap_write([snap_id_1, snap_id_2])
+
+        self.ioctx.write('abc', b'ghi')
+
+        self.ioctx.rollback_self_managed_snap('abc', snap_id_1)
+        eq(self.ioctx.read('abc'), b'abc')
+
+        self.ioctx.rollback_self_managed_snap('abc', snap_id_2)
+        eq(self.ioctx.read('abc'), b'def')
+
+        self.ioctx.remove_self_managed_snap(snap_id_1)
+        self.ioctx.remove_self_managed_snap(snap_id_2)
+
 class TestCommand(object):
 
     def setUp(self):
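The new TestIoCtxSelfManagedSnaps case exercises the self-managed snapshot bindings: the client, not the pool, owns the snapshot context, and every write carries it explicitly, so pool-mode and self-managed-mode snapshots cannot be mixed on one ioctx. The same flow is available from the librados C++ API; a hedged sketch (method names as in librados.hpp; assumes a reachable cluster with default config paths and an existing pool named test_pool, error handling trimmed to asserts):

    #include <rados/librados.hpp>
    #include <cassert>
    #include <vector>

    int main() {
      librados::Rados cluster;
      cluster.init(nullptr);             // default client id
      cluster.conf_read_file(nullptr);   // default ceph.conf search path
      assert(cluster.connect() == 0);

      librados::IoCtx ioctx;
      assert(cluster.ioctx_create("test_pool", ioctx) == 0);

      librados::bufferlist bl;
      bl.append("abc");
      ioctx.write_full("obj", bl);

      // Take a self-managed snapshot, then write with that snapshot
      // context; the snap vector is conventionally ordered newest-first.
      uint64_t snap1 = 0;
      ioctx.selfmanaged_snap_create(&snap1);
      std::vector<uint64_t> snaps{snap1};
      ioctx.selfmanaged_snap_set_write_ctx(snap1, snaps);

      librados::bufferlist bl2;
      bl2.append("def");
      ioctx.write_full("obj", bl2);

      // Roll back to the snapshot and verify the original contents.
      ioctx.selfmanaged_snap_rollback("obj", snap1);
      librados::bufferlist out;
      ioctx.read("obj", out, 3, 0);      // "abc" again
      assert(out.to_str() == "abc");

      ioctx.selfmanaged_snap_remove(snap1);
      ioctx.close();
      cluster.shutdown();
      return 0;
    }

As in the Python test, the rollback works because the OSD preserves a clone of the object for each snapshot recorded in the write context, so earlier states remain addressable by snap id until the snapshot is removed.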
index 3d4c16a820d55e43d74d3952032976a87a394b4a..cde2df07bdbabfaf4b7ea761ffab79c0e0f05f40 100644 (file)
@@ -523,6 +523,63 @@ TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteSyncingState) {
   ASSERT_EQ(-EREMOTEIO, ctx.wait());
 }
 
+TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteNotTagOwner) {
+  create_local_image();
+
+  InSequence seq;
+
+  // lookup remote image tag class
+  cls::journal::Client client;
+  librbd::journal::ClientData client_data{
+    librbd::journal::ImageClientMeta{123}};
+  encode(client_data, client.data);
+  ::journal::MockJournaler mock_journaler;
+  expect_journaler_get_client(mock_journaler,
+                              librbd::Journal<>::IMAGE_CLIENT_ID,
+                              client, 0);
+
+  // open the remote image
+  librbd::MockJournal mock_journal;
+  librbd::MockTestImageCtx mock_remote_image_ctx(*m_remote_image_ctx);
+  MockOpenImageRequest mock_open_image_request;
+  expect_open_image(mock_open_image_request, m_remote_io_ctx,
+                    mock_remote_image_ctx.id, mock_remote_image_ctx, 0);
+
+  // test if remote image is primary
+  MockIsPrimaryRequest mock_is_primary_request;
+  expect_is_primary(mock_is_primary_request, false, 0);
+
+  // open the local image
+  librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
+  mock_local_image_ctx.journal = &mock_journal;
+  MockOpenLocalImageRequest mock_open_local_image_request;
+  expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
+                          mock_local_image_ctx.id, &mock_local_image_ctx, 0);
+  expect_is_resync_requested(mock_journal, false, 0);
+
+  expect_journal_get_tag_tid(mock_journal, 345);
+  expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::LOCAL_MIRROR_UUID,
+                                             librbd::Journal<>::ORPHAN_MIRROR_UUID,
+                                             true, 344, 0});
+
+  MockCloseImageRequest mock_close_image_request;
+  expect_close_image(mock_close_image_request, mock_local_image_ctx, 0);
+  expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
+
+  C_SaferCond ctx;
+  MockInstanceWatcher mock_instance_watcher;
+  cls::journal::ClientState client_state = cls::journal::CLIENT_STATE_CONNECTED;
+  librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
+    mock_local_image_ctx.id};
+  mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+  MockBootstrapRequest *request = create_request(
+    &mock_instance_watcher, mock_journaler, mock_local_image_ctx.id,
+    mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+    "remote mirror uuid", &client_state, &mirror_peer_client_meta, &ctx);
+  request->send();
+  ASSERT_EQ(-EREMOTEIO, ctx.wait());
+}
+
 TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
   create_local_image();
 
@@ -547,7 +604,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
 
   // test if remote image is primary
   MockIsPrimaryRequest mock_is_primary_request;
-  expect_is_primary(mock_is_primary_request, true, 0);
+  expect_is_primary(mock_is_primary_request, false, 0);
 
   // open the local image
   librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
@@ -557,6 +614,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
                           mock_local_image_ctx.id, &mock_local_image_ctx, 0);
   expect_is_resync_requested(mock_journal, false, 0);
 
+  expect_journal_get_tag_tid(mock_journal, 345);
+  expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
+
   // remote demotion / promotion event
   Tags tags = {
     {2, 123, encode_tag_data({librbd::Journal<>::LOCAL_MIRROR_UUID,
@@ -573,8 +633,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
                               true, 4, 369})}
   };
   expect_journaler_get_tags(mock_journaler, 123, tags, 0);
-  expect_journal_get_tag_tid(mock_journal, 345);
-  expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
 
   MockCloseImageRequest mock_close_image_request;
   expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
@@ -627,6 +685,10 @@ TEST_F(TestMockImageReplayerBootstrapRequest, MultipleRemoteDemotePromotes) {
                           mock_local_image_ctx.id, &mock_local_image_ctx, 0);
   expect_is_resync_requested(mock_journal, false, 0);
 
+  expect_journal_get_tag_tid(mock_journal, 345);
+  expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::ORPHAN_MIRROR_UUID,
+                                             "remote mirror uuid", true, 4, 1});
+
   // remote demotion / promotion event
   Tags tags = {
     {2, 123, encode_tag_data({librbd::Journal<>::LOCAL_MIRROR_UUID,
@@ -652,9 +714,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, MultipleRemoteDemotePromotes) {
                               true, 7, 1})}
   };
   expect_journaler_get_tags(mock_journaler, 123, tags, 0);
-  expect_journal_get_tag_tid(mock_journal, 345);
-  expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::ORPHAN_MIRROR_UUID,
-                                             "remote mirror uuid", true, 4, 1});
 
   MockCloseImageRequest mock_close_image_request;
   expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
@@ -707,6 +766,12 @@ TEST_F(TestMockImageReplayerBootstrapRequest, LocalDemoteRemotePromote) {
                           mock_local_image_ctx.id, &mock_local_image_ctx, 0);
   expect_is_resync_requested(mock_journal, false, 0);
 
+  expect_journal_get_tag_tid(mock_journal, 346);
+  expect_journal_get_tag_data(mock_journal,
+                              {librbd::Journal<>::ORPHAN_MIRROR_UUID,
+                               librbd::Journal<>::LOCAL_MIRROR_UUID,
+                               true, 345, 1});
+
   // remote demotion / promotion event
   Tags tags = {
     {2, 123, encode_tag_data({"local mirror uuid", "local mirror uuid",
@@ -718,11 +783,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, LocalDemoteRemotePromote) {
                               true, 3, 1})}
   };
   expect_journaler_get_tags(mock_journaler, 123, tags, 0);
-  expect_journal_get_tag_tid(mock_journal, 346);
-  expect_journal_get_tag_data(mock_journal,
-                              {librbd::Journal<>::ORPHAN_MIRROR_UUID,
-                               librbd::Journal<>::LOCAL_MIRROR_UUID,
-                               true, 345, 1});
 
   MockCloseImageRequest mock_close_image_request;
   expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
@@ -775,6 +835,11 @@ TEST_F(TestMockImageReplayerBootstrapRequest, SplitBrainForcePromote) {
                           mock_local_image_ctx.id, &mock_local_image_ctx, 0);
   expect_is_resync_requested(mock_journal, false, 0);
 
+  expect_journal_get_tag_tid(mock_journal, 345);
+  expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::LOCAL_MIRROR_UUID,
+                                             librbd::Journal<>::ORPHAN_MIRROR_UUID,
+                                             true, 344, 0});
+
   // remote demotion / promotion event
   Tags tags = {
     {2, 123, encode_tag_data({librbd::Journal<>::LOCAL_MIRROR_UUID,
@@ -785,10 +850,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, SplitBrainForcePromote) {
                               true, 2, 1})}
   };
   expect_journaler_get_tags(mock_journaler, 123, tags, 0);
-  expect_journal_get_tag_tid(mock_journal, 345);
-  expect_journal_get_tag_data(mock_journal, {librbd::Journal<>::LOCAL_MIRROR_UUID,
-                                             librbd::Journal<>::ORPHAN_MIRROR_UUID,
-                                             true, 344, 0});
 
   MockCloseImageRequest mock_close_image_request;
   expect_close_image(mock_close_image_request, mock_local_image_ctx, 0);
@@ -845,6 +906,8 @@ TEST_F(TestMockImageReplayerBootstrapRequest, ResyncRequested) {
   // resync is requested
   expect_is_resync_requested(mock_journal, true, 0);
 
+  expect_journal_get_tag_tid(mock_journal, 345);
+  expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
 
   MockCloseImageRequest mock_close_image_request;
   expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
@@ -914,6 +977,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, PrimaryRemote) {
                           mock_local_image_ctx.id, &mock_local_image_ctx, 0);
   expect_is_resync_requested(mock_journal, false, 0);
 
+  expect_journal_get_tag_tid(mock_journal, 345);
+  expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
+
   // sync the remote image to the local image
   MockImageSync mock_image_sync;
   expect_image_sync(mock_image_sync, 0);
@@ -997,6 +1063,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, PrimaryRemoteLocalDeleted) {
                           mock_local_image_ctx.id, &mock_local_image_ctx, 0);
   expect_is_resync_requested(mock_journal, false, 0);
 
+  expect_journal_get_tag_tid(mock_journal, 345);
+  expect_journal_get_tag_data(mock_journal, {"remote mirror uuid"});
+
   // sync the remote image to the local image
   MockImageSync mock_image_sync;
   expect_image_sync(mock_image_sync, 0);
index 279debead9bf0c39b8a626b2652c1d85cf42013c..b19f1cb4fb70d2909e771507a4867b1a29aa99e2 100644 (file)
@@ -87,6 +87,19 @@ public:
                 }));
   }
 
+  void expect_dir_get_name(librados::IoCtx &io_ctx,
+                           const std::string &image_name, int r) {
+    bufferlist bl;
+    encode(image_name, bl);
+
+    EXPECT_CALL(get_mock_io_ctx(io_ctx),
+                exec(RBD_DIRECTORY, _, StrEq("rbd"), StrEq("dir_get_name"), _, _, _))
+      .WillOnce(DoAll(WithArg<5>(Invoke([bl](bufferlist *out_bl) {
+                                          *out_bl = bl;
+                                        })),
+                      Return(r)));
+  }
+
   void expect_mirror_image_get(librados::IoCtx &io_ctx,
                                cls::rbd::MirrorImageState state,
                                const std::string &global_id, int r) {
@@ -122,6 +135,7 @@ TEST_F(TestMockImageReplayerPrepareLocalImageRequest, Success) {
   MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
   expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
                              0);
+  expect_dir_get_name(m_local_io_ctx, "local image name", 0);
   expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
                           "global image id", 0);
 
@@ -129,11 +143,13 @@ TEST_F(TestMockImageReplayerPrepareLocalImageRequest, Success) {
   expect_get_tag_owner(mock_journal, "local image id", "remote mirror uuid", 0);
 
   std::string local_image_id;
+  std::string local_image_name;
   std::string tag_owner;
   C_SaferCond ctx;
   auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
                                                   "global image id",
                                                   &local_image_id,
+                                                  &local_image_name,
                                                   &tag_owner,
                                                   m_threads->work_queue,
                                                   &ctx);
@@ -141,6 +157,7 @@ TEST_F(TestMockImageReplayerPrepareLocalImageRequest, Success) {
 
   ASSERT_EQ(0, ctx.wait());
   ASSERT_EQ(std::string("local image id"), local_image_id);
+  ASSERT_EQ(std::string("local image name"), local_image_name);
   ASSERT_EQ(std::string("remote mirror uuid"), tag_owner);
 }
 
@@ -150,11 +167,13 @@ TEST_F(TestMockImageReplayerPrepareLocalImageRequest, MirrorImageIdError) {
   expect_get_mirror_image_id(mock_get_mirror_image_id_request, "", -EINVAL);
 
   std::string local_image_id;
+  std::string local_image_name;
   std::string tag_owner;
   C_SaferCond ctx;
   auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
                                                   "global image id",
                                                   &local_image_id,
+                                                  &local_image_name,
                                                   &tag_owner,
                                                   m_threads->work_queue,
                                                   &ctx);
@@ -163,20 +182,46 @@ TEST_F(TestMockImageReplayerPrepareLocalImageRequest, MirrorImageIdError) {
   ASSERT_EQ(-EINVAL, ctx.wait());
 }
 
+TEST_F(TestMockImageReplayerPrepareLocalImageRequest, DirGetNameError) {
+  InSequence seq;
+  MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
+  expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
+                             0);
+  expect_dir_get_name(m_local_io_ctx, "", -ENOENT);
+
+  std::string local_image_id;
+  std::string local_image_name;
+  std::string tag_owner;
+  C_SaferCond ctx;
+  auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
+                                                  "global image id",
+                                                  &local_image_id,
+                                                  &local_image_name,
+                                                  &tag_owner,
+                                                  m_threads->work_queue,
+                                                  &ctx);
+  req->send();
+
+  ASSERT_EQ(-ENOENT, ctx.wait());
+}
+
 TEST_F(TestMockImageReplayerPrepareLocalImageRequest, MirrorImageError) {
   InSequence seq;
   MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
   expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
                              0);
+  expect_dir_get_name(m_local_io_ctx, "local image name", 0);
   expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLED,
                           "", -EINVAL);
 
   std::string local_image_id;
+  std::string local_image_name;
   std::string tag_owner;
   C_SaferCond ctx;
   auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
                                                   "global image id",
                                                   &local_image_id,
+                                                  &local_image_name,
                                                   &tag_owner,
                                                   m_threads->work_queue,
                                                   &ctx);
@@ -190,6 +235,7 @@ TEST_F(TestMockImageReplayerPrepareLocalImageRequest, TagOwnerError) {
   MockGetMirrorImageIdRequest mock_get_mirror_image_id_request;
   expect_get_mirror_image_id(mock_get_mirror_image_id_request, "local image id",
                              0);
+  expect_dir_get_name(m_local_io_ctx, "local image name", 0);
   expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
                           "global image id", 0);
 
@@ -198,11 +244,13 @@ TEST_F(TestMockImageReplayerPrepareLocalImageRequest, TagOwnerError) {
                        -ENOENT);
 
   std::string local_image_id;
+  std::string local_image_name;
   std::string tag_owner;
   C_SaferCond ctx;
   auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
                                                   "global image id",
                                                   &local_image_id,
+                                                  &local_image_name,
                                                   &tag_owner,
                                                   m_threads->work_queue,
                                                   &ctx);
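The expect_dir_get_name() helper added above is the stubbing idiom these tests use throughout: intercept the mocked call, write a canned pre-encoded reply into the out-parameter, and return an injected status code, so the success path and the error path (see DirGetNameError) share one helper. A condensed gmock sketch of the same shape against a hypothetical Fetcher interface (not the rbd-mirror mocks):

    #include <gmock/gmock.h>
    #include <gtest/gtest.h>
    #include <string>

    using ::testing::_;
    using ::testing::DoAll;
    using ::testing::Invoke;
    using ::testing::Return;
    using ::testing::WithArg;

    // Minimal interface with an out-parameter, standing in for the
    // mocked librados::IoCtx::exec() and its out bufferlist.
    struct Fetcher {
      virtual ~Fetcher() = default;
      virtual int fetch(const std::string& key, std::string* out) = 0;
    };

    struct MockFetcher : Fetcher {
      MOCK_METHOD(int, fetch, (const std::string& key, std::string* out),
                  (override));
    };

    // Expect one fetch(): fill the out-parameter with a canned reply,
    // then return the injected status code -- success and error cases
    // share the helper, exactly like expect_dir_get_name() above.
    void expect_fetch(MockFetcher& mock, const std::string& reply, int r) {
      EXPECT_CALL(mock, fetch(_, _))
          .WillOnce(DoAll(WithArg<1>(Invoke([reply](std::string* out) {
                            *out = reply;
                          })),
                          Return(r)));
    }

    TEST(FetcherTest, FillsOutParam) {
      MockFetcher mock;
      expect_fetch(mock, "value", 0);
      std::string out;
      EXPECT_EQ(0, mock.fetch("key", &out));
      EXPECT_EQ("value", out);
    }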
index d8e159f5f3d4e540321f31c8f5d3229434102758..9baebf58a4b045614bad89b3144224a4fa956de9 100644 (file)
@@ -83,6 +83,9 @@ public:
   TestImageReplayer()
     : m_local_cluster(new librados::Rados()), m_watch_handle(0)
   {
+    EXPECT_EQ(0, g_ceph_context->_conf->set_val("rbd_mirror_journal_commit_age",
+                                                "0.1"));
+
     EXPECT_EQ("", connect_cluster_pp(*m_local_cluster.get()));
     EXPECT_EQ(0, m_local_cluster->conf_set("rbd_cache", "false"));
     EXPECT_EQ(0, m_local_cluster->conf_set("rbd_mirror_journal_poll_age", "1"));
@@ -142,6 +145,8 @@ public:
 
     EXPECT_EQ(0, m_remote_cluster.pool_delete(m_remote_pool_name.c_str()));
     EXPECT_EQ(0, m_local_cluster->pool_delete(m_local_pool_name.c_str()));
+    EXPECT_EQ(0, g_ceph_context->_conf->set_val("rbd_mirror_journal_commit_age",
+                                                "5"));
   }
 
   template <typename ImageReplayerT = rbd::mirror::ImageReplayer<> >
@@ -301,10 +306,6 @@ public:
     cls::journal::ObjectPosition mirror_position;
 
     for (int i = 0; i < 100; i++) {
-      printf("m_replayer->flush()\n");
-      C_SaferCond cond;
-      m_replayer->flush(&cond);
-      ASSERT_EQ(0, cond.wait());
       get_commit_positions(&master_position, &mirror_position);
       if (master_position == mirror_position) {
        break;
index e63c7338a39df6526597b9d486e0c70da050d382..818813bf758eaf1ac2750ca7d57fc2c767f2a320 100644 (file)
@@ -113,17 +113,20 @@ template<>
 struct PrepareLocalImageRequest<librbd::MockTestImageCtx> {
   static PrepareLocalImageRequest* s_instance;
   std::string *local_image_id = nullptr;
+  std::string *local_image_name = nullptr;
   std::string *tag_owner = nullptr;
   Context *on_finish = nullptr;
 
   static PrepareLocalImageRequest* create(librados::IoCtx &,
                                           const std::string &global_image_id,
                                           std::string *local_image_id,
+                                          std::string *local_image_name,
                                           std::string *tag_owner,
                                           MockContextWQ *work_queue,
                                           Context *on_finish) {
     assert(s_instance != nullptr);
     s_instance->local_image_id = local_image_id;
+    s_instance->local_image_name = local_image_name;
     s_instance->tag_owner = tag_owner;
     s_instance->on_finish = on_finish;
     return s_instance;
@@ -418,12 +421,14 @@ public:
 
   void expect_send(MockPrepareLocalImageRequest &mock_request,
                    const std::string &local_image_id,
+                   const std::string &local_image_name,
                    const std::string &tag_owner,
                    int r) {
     EXPECT_CALL(mock_request, send())
-      .WillOnce(Invoke([&mock_request, local_image_id, tag_owner, r]() {
+      .WillOnce(Invoke([&mock_request, local_image_id, local_image_name, tag_owner, r]() {
           if (r == 0) {
             *mock_request.local_image_id = local_image_id;
+            *mock_request.local_image_name = local_image_name;
             *mock_request.tag_owner = tag_owner;
           }
           mock_request.on_finish->complete(r);
@@ -617,7 +622,7 @@ TEST_F(TestMockImageReplayer, StartStop) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -682,7 +687,7 @@ TEST_F(TestMockImageReplayer, LocalImagePrimary) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "", 0);
+              mock_local_image_ctx.name, "", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               "remote image id", 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -715,7 +720,7 @@ TEST_F(TestMockImageReplayer, LocalImageDNE) {
 
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
-  expect_send(mock_prepare_local_image_request, "", "", -ENOENT);
+  expect_send(mock_prepare_local_image_request, "", "", "", -ENOENT);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -748,7 +753,7 @@ TEST_F(TestMockImageReplayer, PrepareLocalImageError) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", -EINVAL);
+              mock_local_image_ctx.name, "remote mirror uuid", -EINVAL);
 
   create_image_replayer(mock_threads, mock_image_deleter);
 
@@ -775,7 +780,7 @@ TEST_F(TestMockImageReplayer, GetRemoteImageIdDNE) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               "", -ENOENT);
   expect_schedule_image_delete(mock_image_deleter, "global image id", false);
@@ -805,7 +810,7 @@ TEST_F(TestMockImageReplayer, GetRemoteImageIdNonLinkedDNE) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "some other mirror uuid", 0);
+              mock_local_image_ctx.name, "some other mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               "", -ENOENT);
 
@@ -834,7 +839,7 @@ TEST_F(TestMockImageReplayer, GetRemoteImageIdError) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, -EINVAL);
 
@@ -866,7 +871,7 @@ TEST_F(TestMockImageReplayer, BootstrapError) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -909,7 +914,7 @@ TEST_F(TestMockImageReplayer, StartExternalReplayError) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -967,7 +972,7 @@ TEST_F(TestMockImageReplayer, StopError) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -1039,7 +1044,7 @@ TEST_F(TestMockImageReplayer, Replay) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -1148,7 +1153,7 @@ TEST_F(TestMockImageReplayer, DecodeError) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
@@ -1250,7 +1255,7 @@ TEST_F(TestMockImageReplayer, DelayedReplay) {
   InSequence seq;
   expect_wait_for_scheduled_deletion(mock_image_deleter, "global image id", 0);
   expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
-              "remote mirror uuid", 0);
+              mock_local_image_ctx.name, "remote mirror uuid", 0);
   expect_send(mock_prepare_remote_image_request, "remote mirror uuid",
               m_remote_image_ctx->id, 0);
   EXPECT_CALL(mock_remote_journaler, construct());
index 794d0ced540b3cc4f454dda682903310ef3fc03c..07b2d7e3a06652b473dea1463741d0e041bb73cb 100644 (file)
@@ -251,7 +251,7 @@ def bucket_sync_status(target_zone, source_zone, bucket_name):
     if target_zone == source_zone:
         return None
 
-    cmd = ['bucket', 'sync', 'status'] + target_zone.zone_args()
+    cmd = ['bucket', 'sync', 'markers'] + target_zone.zone_args()
     cmd += ['--source-zone', source_zone.name]
     cmd += ['--bucket', bucket_name]
     while True:
@@ -262,7 +262,7 @@ def bucket_sync_status(target_zone, source_zone, bucket_name):
         assert(retcode == 2) # ENOENT
 
     bucket_sync_status_json = bucket_sync_status_json.decode('utf-8')
-    log.debug('current bucket sync status=%s', bucket_sync_status_json)
+    log.debug('current bucket sync markers=%s', bucket_sync_status_json)
     sync_status = json.loads(bucket_sync_status_json)
 
     markers={}
@@ -386,7 +386,7 @@ def zone_bucket_checkpoint(target_zone, source_zone, bucket_name):
 
         time.sleep(config.checkpoint_delay)
 
-    assert False, 'finished bucket checkpoint for target_zone=%s source_zone=%s bucket=%s' % \
+    assert False, 'failed bucket checkpoint for target_zone=%s source_zone=%s bucket=%s' % \
                   (target_zone.name, source_zone.name, bucket_name)
 
 def zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name):
@@ -395,7 +395,8 @@ def zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name):
             if source_conn.zone == target_conn.zone:
                 continue
             zone_bucket_checkpoint(target_conn.zone, source_conn.zone, bucket_name)
-            target_conn.check_bucket_eq(source_conn, bucket_name)
+    for source_conn, target_conn in combinations(zonegroup_conns.zones, 2):
+        target_conn.check_bucket_eq(source_conn, bucket_name)
 
 def set_master_zone(zone):
     zone.modify(zone.cluster, ['--master'])
@@ -672,12 +673,8 @@ def test_versioned_object_incremental_sync():
             log.debug('version3 id=%s', v.version_id)
             k.bucket.delete_key(obj, version_id=v.version_id)
 
-    for source_conn, bucket in zone_bucket:
-        for target_conn in zonegroup_conns.zones:
-            if source_conn.zone == target_conn.zone:
-                continue
-            zone_bucket_checkpoint(target_conn.zone, source_conn.zone, bucket.name)
-            check_bucket_eq(source_conn, target_conn, bucket)
+    for _, bucket in zone_bucket:
+        zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name)
 
 def test_bucket_versioning():
     buckets, zone_bucket = create_bucket_per_zone_in_realm()
index 50d428a86be90e6d8208b8175340b9098d0e0bbf..738ce1b78efc8e6300f4db6735ca59cd7081e29c 100644 (file)
@@ -71,7 +71,7 @@ using rgw::IAM::s3GetReplicationConfiguration;
 using rgw::IAM::s3ListAllMyBuckets;
 using rgw::IAM::s3ListBucket;
 using rgw::IAM::s3ListBucket;
-using rgw::IAM::s3ListBucketMultiPartUploads;
+using rgw::IAM::s3ListBucketMultipartUploads;
 using rgw::IAM::s3ListBucketVersions;
 using rgw::IAM::s3ListMultipartUploadParts;
 using rgw::IAM::s3None;
@@ -315,7 +315,7 @@ TEST_F(PolicyTest, Parse3) {
   EXPECT_EQ(p->statements[2].action, (s3ListMultipartUploadParts |
                                      s3ListBucket | s3ListBucketVersions |
                                      s3ListAllMyBuckets |
-                                     s3ListBucketMultiPartUploads |
+                                     s3ListBucketMultipartUploads |
                                      s3GetObject | s3GetObjectVersion |
                                      s3GetObjectAcl | s3GetObjectVersionAcl |
                                      s3GetObjectTorrent |
@@ -370,7 +370,7 @@ TEST_F(PolicyTest, Eval3) {
 
   auto s3allow = (s3ListMultipartUploadParts | s3ListBucket |
                  s3ListBucketVersions | s3ListAllMyBuckets |
-                 s3ListBucketMultiPartUploads | s3GetObject |
+                 s3ListBucketMultipartUploads | s3GetObject |
                  s3GetObjectVersion | s3GetObjectAcl | s3GetObjectVersionAcl |
                  s3GetObjectTorrent | s3GetObjectVersionTorrent |
                  s3GetAccelerateConfiguration | s3GetBucketAcl |
index 58465c2ffc79ed3458afe2d90a5cf1f58ddee8cf..7741db44352b3a1e7a56f98085940b939d2a679f 100644 (file)
@@ -170,12 +170,25 @@ int Resetter::_write_reset_event(Journaler *journaler)
 
   bufferlist bl;
   le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
-  
+
   cout << "writing EResetJournal entry" << std::endl;
-  C_SaferCond cond;
   journaler->append_entry(bl);
-  journaler->flush(&cond);
 
-  return cond.wait();
+  int ret;
+  {
+    C_SaferCond cond;
+    journaler->flush(&cond);
+    ret = cond.wait();
+    if (ret < 0)
+      return ret;
+  }
+  {
+    // wait until all journal prezero ops are done
+    C_SaferCond cond;
+    journaler->wait_for_prezero(&cond);
+    cond.wait();
+  }
+
+  return ret;
 }
 
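The reset path now blocks twice: once for the journal flush and once for outstanding prezero operations, since returning while prezero writes are still in flight could race with the journal's next user. C_SaferCond, used for both waits, is essentially a one-shot completion bound to a condition variable; a minimal standalone approximation (not Ceph's actual class):

    #include <condition_variable>
    #include <mutex>
    #include <thread>

    // One-shot completion a caller can block on until complete(r) fires,
    // approximating the C_SaferCond used twice in the reset path above.
    class SaferCond {
     public:
      void complete(int r) {
        std::lock_guard<std::mutex> l(lock_);
        rval_ = r;
        done_ = true;
        cond_.notify_all();
      }
      int wait() {
        std::unique_lock<std::mutex> l(lock_);
        cond_.wait(l, [this] { return done_; });
        return rval_;
      }
     private:
      std::mutex lock_;
      std::condition_variable cond_;
      bool done_ = false;
      int rval_ = 0;
    };

    int main() {
      SaferCond cond;
      std::thread async_op([&] { cond.complete(0); });  // simulated flush
      int r = cond.wait();                              // blocks until done
      async_op.join();
      return r;
    }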
index d60a7b3e2cb67e2589cc14d611b72b30cdd58efb..5b61109b1cb6e214cf0ff14ed46680d9739c61cc 100644 (file)
@@ -46,10 +46,13 @@ static const std::string RBD_DIFF_BANNER_V2 ("rbd diff v2\n");
 #define RBD_DIFF_ZERO          'z'
 #define RBD_DIFF_END           'e'
 
+#define RBD_SNAP_PROTECTION_STATUS     'p'
+
 #define RBD_EXPORT_IMAGE_ORDER         'O'
 #define RBD_EXPORT_IMAGE_FEATURES      'T'
 #define RBD_EXPORT_IMAGE_STRIPE_UNIT   'U'
 #define RBD_EXPORT_IMAGE_STRIPE_COUNT  'C'
+#define RBD_EXPORT_IMAGE_META          'M'
 #define RBD_EXPORT_IMAGE_END           'E'
 
 enum SnapshotPresence {
index bfda6980598a6190c8ec07caf88fde771b26c0ce..9aae6f0ebcead7973d831d427e5722c38a0763a5 100644 (file)
@@ -156,12 +156,25 @@ int do_export_diff_fd(librbd::Image& image, const char *fromsnapname,
       ::encode(tag, bl);
       std::string to(endsnapname);
       if (export_format == 2) {
-       len = to.length() + 4;
-       ::encode(len, bl);
+        len = to.length() + 4;
+        ::encode(len, bl);
       }
       ::encode(to, bl);
     }
 
+    if (endsnapname && export_format == 2) {
+      tag = RBD_SNAP_PROTECTION_STATUS;
+      encode(tag, bl);
+      bool is_protected = false;
+      r = image.snap_is_protected(endsnapname, &is_protected);
+      if (r < 0) {
+        return r;
+      }
+      len = 8;
+      encode(len, bl);
+      encode(is_protected, bl);
+    }
+
     tag = RBD_DIFF_IMAGE_SIZE;
     ::encode(tag, bl);
     uint64_t endsize = info.size;
@@ -371,6 +384,8 @@ private:
   int m_fd;
 };
 
+const uint32_t MAX_KEYS = 64;
+
 static int do_export_v2(librbd::Image& image, librbd::image_info_t &info, int fd,
                        uint64_t period, int max_concurrent_ops, utils::ProgressContext &pc)
 {
@@ -414,6 +429,45 @@ static int do_export_v2(librbd::Image& image, librbd::image_info_t &info, int fd
   ::encode(length, bl);
   ::encode(stripe_count, bl);
 
+  // retrieve image metadata
+  std::map<std::string, string> imagemetas;
+  std::string last_key;
+  bool more_results = true;
+  while (more_results) {
+    std::map<std::string, bufferlist> pairs;
+    r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+    if (r < 0) {
+      std::cerr << "failed to retrieve metadata of image : " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+
+    if (!pairs.empty()) {
+      last_key = pairs.rbegin()->first;
+
+      for (const auto& kv : pairs) {
+        std::string key = kv.first;
+        std::string val(kv.second.c_str(), kv.second.length());
+        imagemetas[key] = val;
+      }
+    }
+    more_results = (pairs.size() == MAX_KEYS);
+  }
+
+  // encode image-meta keys and values
+  for (std::map<std::string, string>::iterator it = imagemetas.begin();
+       it != imagemetas.end(); ++it) {
+    string key = it->first;
+    string value = it->second;
+
+    tag = RBD_EXPORT_IMAGE_META;
+    length = key.length() + value.length() + 4 * 2;
+    ::encode(tag, bl);
+    ::encode(length, bl);
+    ::encode(key, bl);
+    ::encode(value, bl);
+  }
+
   // encode end tag
   tag = RBD_EXPORT_IMAGE_END;
   ::encode(tag, bl);
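The v2 export header is a simple tag/length/value stream: a one-byte tag, a u64 payload length, then the payload, terminated by RBD_EXPORT_IMAGE_END. The image-meta records added above follow the same framing, and each string is itself length-prefixed with 4 bytes, which is where key.length() + value.length() + 4 * 2 comes from. A hedged sketch of that framing using plain STL types instead of Ceph's bufferlist encoders:

    #include <cstdint>
    #include <string>
    #include <vector>

    // Append a little-endian fixed-width value, standing in for
    // ::encode() on integral types (a simplification of Ceph's encoders).
    template <typename T>
    static void put_le(std::vector<uint8_t>& out, T v) {
      for (size_t i = 0; i < sizeof(T); ++i)
        out.push_back(static_cast<uint8_t>(v >> (8 * i)));
    }

    // Length-prefixed string: 4-byte length, then the bytes -- the
    // wire shape ::encode(std::string) produces.
    static void put_string(std::vector<uint8_t>& out, const std::string& s) {
      put_le<uint32_t>(out, static_cast<uint32_t>(s.size()));
      out.insert(out.end(), s.begin(), s.end());
    }

    // One image-meta record: tag 'M' (RBD_EXPORT_IMAGE_META), a u64
    // payload length, then the two strings -- matching the
    // key.length() + value.length() + 4 * 2 arithmetic above.
    static void put_meta(std::vector<uint8_t>& out,
                         const std::string& key, const std::string& value) {
      out.push_back('M');
      put_le<uint64_t>(out, key.size() + value.size() + 4 * 2);
      put_string(out, key);
      put_string(out, value);
    }

    int main() {
      std::vector<uint8_t> bl;
      put_meta(bl, "conf_rbd_cache", "false");
      bl.push_back('E');   // RBD_EXPORT_IMAGE_END terminator
      return bl.empty();
    }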
index 3c717855270574547b92b5b300aa2b0d37590cac..b7ab85d7825d3e33f93b38f1e0143541c442990f 100644 (file)
@@ -139,12 +139,14 @@ static int do_image_snap_from(ImportDiffContext *idiffctx)
   string from;
   r = utils::read_string(idiffctx->fd, 4096, &from);   // 4k limit to make sure we don't get a garbage string
   if (r < 0) {
+    std::cerr << "rbd: failed to decode start snap name" << std::endl;
     return r;
   }
 
   bool exists;
   r = idiffctx->image->snap_exists2(from.c_str(), &exists);
   if (r < 0) {
+    std::cerr << "rbd: failed to query start snap state" << std::endl;
     return r;
   }
 
@@ -164,17 +166,20 @@ static int do_image_snap_to(ImportDiffContext *idiffctx, std::string *tosnap)
   string to;
   r = utils::read_string(idiffctx->fd, 4096, &to);   // 4k limit to make sure we don't get a garbage string
   if (r < 0) {
+    std::cerr << "rbd: failed to decode end snap name" << std::endl;
     return r;
   }
 
   bool exists;
   r = idiffctx->image->snap_exists2(to.c_str(), &exists);
   if (r < 0) {
+    std::cerr << "rbd: failed to query end snap state" << std::endl;
     return r;
   }
 
   if (exists) {
-    std::cerr << "end snapshot '" << to << "' already exists, aborting" << std::endl;
+    std::cerr << "end snapshot '" << to << "' already exists, aborting"
+              << std::endl;
     return -EEXIST;
   }
 
@@ -184,6 +189,23 @@ static int do_image_snap_to(ImportDiffContext *idiffctx, std::string *tosnap)
   return 0;
 }
 
+static int get_snap_protection_status(ImportDiffContext *idiffctx,
+                                      bool *is_protected)
+{
+  int r;
+  char buf[sizeof(__u8)];
+  r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+  if (r < 0) {
+    std::cerr << "rbd: failed to decode snap protection status" << std::endl;
+    return r;
+  }
+
+  *is_protected = (buf[0] != 0);
+  idiffctx->update_progress();
+
+  return 0;
+}
+
 static int do_image_resize(ImportDiffContext *idiffctx)
 {
   int r;
@@ -191,6 +213,7 @@ static int do_image_resize(ImportDiffContext *idiffctx)
   uint64_t end_size;
   r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
   if (r < 0) {
+    std::cerr << "rbd: failed to decode image size" << std::endl;
     return r;
   }
 
@@ -216,6 +239,7 @@ static int do_image_io(ImportDiffContext *idiffctx, bool discard, size_t sparse_
   char buf[16];
   r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
   if (r < 0) {
+    std::cerr << "rbd: failed to decode IO length" << std::endl;
     return r;
   }
 
@@ -231,6 +255,7 @@ static int do_image_io(ImportDiffContext *idiffctx, bool discard, size_t sparse_
     bufferptr bp = buffer::create(buffer_length);
     r = safe_read_exact(idiffctx->fd, bp.c_str(), buffer_length);
     if (r < 0) {
+      std::cerr << "rbd: failed to decode write data" << std::endl;
       return r;
     }
 
@@ -272,14 +297,16 @@ static int validate_banner(int fd, std::string banner)
 {
   int r;
   char buf[banner.size() + 1];
+  memset(buf, 0, sizeof(buf));
   r = safe_read_exact(fd, buf, banner.size());
   if (r < 0) {
+    std::cerr << "rbd: failed to decode diff banner" << std::endl;
     return r;
   }
 
   buf[banner.size()] = '\0';
   if (strcmp(buf, banner.c_str())) {
-    std::cerr << "invalid banner '" << buf << "', expected '" << banner << "'" << std::endl;
+    std::cerr << "rbd: invalid or unexpected diff banner" << std::endl;
     return -EINVAL;
   }
 
@@ -296,8 +323,10 @@ static int skip_tag(int fd, uint64_t length)
     uint64_t len = min<uint64_t>(length, sizeof(buf));
     while (len > 0) {
       r = safe_read_exact(fd, buf, len);
-      if (r < 0)
+      if (r < 0) {
+        std::cerr << "rbd: failed to decode skipped tag data" << std::endl;
         return r;
+      }
       length -= len;
       len = min<uint64_t>(length, sizeof(buf));
     }
@@ -319,6 +348,7 @@ static int read_tag(int fd, __u8 end_tag, int format, __u8 *tag, uint64_t *readl
 
   r = safe_read_exact(fd, &read_tag, sizeof(read_tag));
   if (r < 0) {
+    std::cerr << "rbd: failed to decode tag" << std::endl;
     return r;
   }
 
@@ -327,6 +357,7 @@ static int read_tag(int fd, __u8 end_tag, int format, __u8 *tag, uint64_t *readl
     char buf[sizeof(uint64_t)];
     r = safe_read_exact(fd, buf, sizeof(buf));
     if (r < 0) {
+      std::cerr << "rbd: failed to decode tag length" << std::endl;
       return r;
     }
 
@@ -350,6 +381,7 @@ int do_import_diff_fd(librados::Rados &rados, librbd::Image &image, int fd,
     struct stat stat_buf;
     r = ::fstat(fd, &stat_buf);
     if (r < 0) {
+      std::cerr << "rbd: failed to stat specified diff file" << std::endl;
       return r;
     }
     size = (uint64_t)stat_buf.st_size;
@@ -371,6 +403,7 @@ int do_import_diff_fd(librados::Rados &rados, librbd::Image &image, int fd,
 
   // begin image import
   std::string tosnap;
+  bool is_protected = false;
   ImportDiffContext idiffctx(&image, fd, size, no_progress);
   while (r == 0) {
     __u8 tag;
@@ -385,6 +418,8 @@ int do_import_diff_fd(librados::Rados &rados, librbd::Image &image, int fd,
       r = do_image_snap_from(&idiffctx);
     } else if (tag == RBD_DIFF_TO_SNAP) {
       r = do_image_snap_to(&idiffctx, &tosnap);
+    } else if (tag == RBD_SNAP_PROTECTION_STATUS) {
+      r = get_snap_protection_status(&idiffctx, &is_protected);
     } else if (tag == RBD_DIFF_IMAGE_SIZE) {
       r = do_image_resize(&idiffctx);
     } else if (tag == RBD_DIFF_WRITE || tag == RBD_DIFF_ZERO) {
@@ -399,7 +434,10 @@ int do_import_diff_fd(librados::Rados &rados, librbd::Image &image, int fd,
   int temp_r = idiffctx.throttle.wait_for_ret();
   r = (r < 0) ? r : temp_r; // preserve original error
   if (r == 0 && tosnap.length()) {
-    idiffctx.image->snap_create(tosnap.c_str());
+    r = idiffctx.image->snap_create(tosnap.c_str());
+    if (r == 0 && is_protected) {
+      r = idiffctx.image->snap_protect(tosnap.c_str());
+    }
   }
 
   idiffctx.finish(r);
@@ -475,6 +513,9 @@ int execute_diff(const po::variables_map &vm) {
 
   r = do_import_diff(rados, image, path.c_str(),
                     vm[at::NO_PROGRESS].as<bool>(), sparse_size);
+  if (r == -EDOM) {
+    r = -EBADMSG;
+  }
   if (r < 0) {
     cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl;
     return r;
@@ -537,6 +578,7 @@ static int decode_and_set_image_option(int fd, uint64_t imageopt, librbd::ImageO
 
   r = safe_read_exact(fd, buf, sizeof(buf));
   if (r < 0) {
+    std::cerr << "rbd: failed to decode image option" << std::endl;
     return r;
   }
 
@@ -555,7 +597,50 @@ static int decode_and_set_image_option(int fd, uint64_t imageopt, librbd::ImageO
   return 0;
 }
 
-static int do_import_header(int fd, int import_format, uint64_t &size, librbd::ImageOptions& opts)
+static int do_import_metadata(int import_format, librbd::Image& image,
+                              const std::map<std::string, std::string> &imagemetas)
+{
+  int r = 0;
+
+  // v1 images carry no metadata to import
+  if (import_format == 1) {
+    return 0;
+  }
+
+  for (std::map<std::string, std::string>::const_iterator it = imagemetas.begin();
+       it != imagemetas.end(); ++it) {
+    r = image.metadata_set(it->first, it->second);
+    if (r < 0)
+      return r;
+  }
+
+  return 0;
+}
+
+static int decode_imagemeta(int fd, uint64_t length, std::map<std::string, std::string>* imagemetas)
+{
+  int r;
+  string key;
+  string value;
+
+  r = utils::read_string(fd, length, &key);
+  if (r < 0) {
+    std::cerr << "rbd: failed to decode metadata key" << std::endl;
+    return r;
+  }
+
+  r = utils::read_string(fd, length, &value);
+  if (r < 0) {
+    std::cerr << "rbd: failed to decode metadata value" << std::endl;
+    return r;
+  }
+
+  (*imagemetas)[key] = value;
+  return 0;
+}
+
+static int do_import_header(int fd, int import_format, uint64_t &size, librbd::ImageOptions& opts,
+                            std::map<std::string, std::string>* imagemetas)
 {
   // There is no header in v1 image.
   if (import_format == 1) {
@@ -594,6 +679,8 @@ static int do_import_header(int fd, int import_format, uint64_t &size, librbd::I
       r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_UNIT, opts);
     } else if (tag == RBD_EXPORT_IMAGE_STRIPE_COUNT) {
       r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_COUNT, opts);
+    } else if (tag == RBD_EXPORT_IMAGE_META) {
+      r = decode_imagemeta(fd, length, imagemetas);
     } else {
       std::cerr << "rbd: invalid tag in image properties zone: " << tag << "Skip it."
                 << std::endl;
@@ -617,6 +704,7 @@ static int do_import_v2(librados::Rados &rados, int fd, librbd::Image &image,
   char buf[sizeof(uint64_t)];
   r = safe_read_exact(fd, buf, sizeof(buf));
   if (r < 0) {
+    std::cerr << "rbd: failed to decode diff count" << std::endl;
     return r;
   }
   bufferlist bl;
@@ -743,6 +831,7 @@ static int do_import(librados::Rados &rados, librbd::RBD &rbd,
   int fd, r;
   struct stat stat_buf;
   utils::ProgressContext pc("Importing image", no_progress);
+  std::map<std::string, std::string> imagemetas;
 
   assert(imgname);
 
@@ -796,7 +885,7 @@ static int do_import(librados::Rados &rados, librbd::RBD &rbd,
 #endif
   }
 
-  r = do_import_header(fd, import_format, size, opts);
+  r = do_import_header(fd, import_format, size, opts, &imagemetas);
   if (r < 0) {
     std::cerr << "rbd: import header failed." << std::endl;
     goto done;
@@ -814,6 +903,12 @@ static int do_import(librados::Rados &rados, librbd::RBD &rbd,
     goto err;
   }
 
+  r = do_import_metadata(import_format, image, imagemetas);
+  if (r < 0) {
+    std::cerr << "rbd: failed to import image-meta" << std::endl;
+    goto err;
+  }
+
   if (import_format == 1) {
     r = do_import_v1(fd, image, size, imgblklen, pc, sparse_size);
   } else {
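On the import side the same records are consumed tag-first: read one tag byte, read the u64 length (v2 streams only), then dispatch to a per-tag decoder, with skip_tag() draining anything unrecognized so older tools tolerate newer streams. A decoder sketch matching the encoding example after the export hunks above (standalone types, not the rbd CLI internals):

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    // Cursor over a byte buffer; returns false on underrun rather
    // than the -errno convention the rbd tool uses.
    struct Reader {
      const std::vector<uint8_t>& buf;
      size_t pos = 0;

      bool read(void* out, size_t n) {
        if (n > buf.size() - pos)
          return false;
        std::memcpy(out, buf.data() + pos, n);
        pos += n;
        return true;
      }
      template <typename T>
      bool get_le(T* v) {
        uint8_t tmp[sizeof(T)];
        if (!read(tmp, sizeof(T)))
          return false;
        *v = 0;
        for (size_t i = 0; i < sizeof(T); ++i)
          *v |= static_cast<T>(tmp[i]) << (8 * i);
        return true;
      }
      bool get_string(std::string* s) {
        uint32_t len = 0;
        if (!get_le(&len) || len > buf.size() - pos)
          return false;
        s->assign(reinterpret_cast<const char*>(buf.data() + pos), len);
        pos += len;
        return true;
      }
    };

    // Decode one record: tag byte, u64 length, then either a per-tag
    // parser or a skip of 'length' bytes (the skip_tag() behavior).
    static bool decode_record(Reader& r) {
      uint8_t tag = 0;
      uint64_t length = 0;
      if (!r.read(&tag, 1) || !r.get_le(&length))
        return false;
      if (tag == 'M') {                    // image-meta key/value pair
        std::string key, value;
        return r.get_string(&key) && r.get_string(&value);
      }
      if (length > r.buf.size() - r.pos)   // unknown tag: skip payload
        return false;
      r.pos += static_cast<size_t>(length);
      return true;
    }

    int main() {
      std::vector<uint8_t> bl = {'M', 10, 0, 0, 0, 0, 0, 0, 0,  // tag + length
                                 2, 0, 0, 0, 'h', 'i',          // key "hi"
                                 0, 0, 0, 0};                   // empty value
      Reader r{bl};
      return decode_record(r) ? 0 : 1;
    }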
index fd3bf7a90f136aaaed1f052e99d23fb47641908b..28678172b1727f48d42bb62253e8056e93c8f569 100644 (file)
@@ -149,13 +149,7 @@ public:
   }
 
   bool call(Formatter *f, stringstream *ss) override {
-    C_SaferCond cond;
-    this->replayer->flush(&cond);
-    int r = cond.wait();
-    if (r < 0) {
-      *ss << "flush: " << cpp_strerror(r);
-      return false;
-    }
+    this->replayer->flush();
     return true;
   }
 };
@@ -270,7 +264,7 @@ ImageReplayer<I>::ImageReplayer(Threads<I> *threads,
   m_local(local),
   m_local_mirror_uuid(local_mirror_uuid),
   m_local_pool_id(local_pool_id),
-  m_global_image_id(global_image_id),
+  m_global_image_id(global_image_id), m_local_image_name(global_image_id),
   m_lock("rbd::mirror::ImageReplayer " + stringify(local_pool_id) + " " +
         global_image_id),
   m_progress_cxt(this),
@@ -426,7 +420,7 @@ void ImageReplayer<I>::prepare_local_image() {
   Context *ctx = create_context_callback<
     ImageReplayer, &ImageReplayer<I>::handle_prepare_local_image>(this);
   auto req = PrepareLocalImageRequest<I>::create(
-    m_local_ioctx, m_global_image_id, &m_local_image_id,
+    m_local_ioctx, m_global_image_id, &m_local_image_id, &m_local_image_name,
     &m_local_image_tag_owner, m_threads->work_queue, ctx);
   req->send();
 }
@@ -440,6 +434,8 @@ void ImageReplayer<I>::handle_prepare_local_image(int r) {
   } else if (r < 0) {
     on_start_fail(r, "error preparing local image for replay");
     return;
+  } else {
+    reregister_admin_socket_hook();
   }
 
   // local image doesn't exist or is non-primary
@@ -582,8 +578,6 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
     return;
   }
 
-  on_name_changed();
-
   update_mirror_image_status(false, boost::none);
   init_remote_journaler();
 }
@@ -1088,16 +1082,30 @@ void ImageReplayer<I>::handle_get_remote_tag(int r) {
 
 template <typename I>
 void ImageReplayer<I>::allocate_local_tag() {
-  dout(20) << dendl;
+  dout(15) << dendl;
 
   std::string mirror_uuid = m_replay_tag_data.mirror_uuid;
-  if (mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID ||
-      mirror_uuid == m_local_mirror_uuid) {
+  if (mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
     mirror_uuid = m_remote_image.mirror_uuid;
+  } else if (mirror_uuid == m_local_mirror_uuid) {
+    mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
   } else if (mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
-    dout(5) << "encountered image demotion: stopping" << dendl;
-    Mutex::Locker locker(m_lock);
-    m_stop_requested = true;
+    // handle possible edge condition where daemon can failover and
+    // the local image has already been promoted/demoted
+    auto local_tag_data = m_local_journal->get_tag_data();
+    if (local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+        (local_tag_data.predecessor.commit_valid &&
+         local_tag_data.predecessor.mirror_uuid ==
+           librbd::Journal<>::LOCAL_MIRROR_UUID)) {
+      dout(15) << "skipping stale demotion event" << dendl;
+      handle_process_entry_safe(m_replay_entry, 0);
+      handle_replay_ready();
+      return;
+    } else {
+      dout(5) << "encountered image demotion: stopping" << dendl;
+      Mutex::Locker locker(m_lock);
+      m_stop_requested = true;
+    }
   }
 
   librbd::journal::TagPredecessor predecessor(m_replay_tag_data.predecessor);
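
[note] The reworked branch translates the remote tag owner into the uuid the local journal should record, rather than unconditionally adopting the remote mirror uuid. A hedged sketch of that translation, using an empty string for librbd's LOCAL_MIRROR_UUID sentinel (an assumption mirroring librbd's definition; orphan/demotion handling stays with the caller as above):

    #include <string>

    std::string local_tag_owner(const std::string &remote_owner,
                                const std::string &local_mirror_uuid,
                                const std::string &remote_mirror_uuid) {
      const std::string kLocal;  // LOCAL_MIRROR_UUID sentinel
      if (remote_owner == kLocal) {
        return remote_mirror_uuid;  // remote authored the event: it owns the tag
      }
      if (remote_owner == local_mirror_uuid) {
        return kLocal;              // event attributed to us: locally owned
      }
      return remote_owner;          // orphan (demotion) handled separately
    }
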
@@ -1107,10 +1115,9 @@ void ImageReplayer<I>::allocate_local_tag() {
     predecessor.mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
   }
 
-  dout(20) << "mirror_uuid=" << mirror_uuid << ", "
-           << "predecessor_mirror_uuid=" << predecessor.mirror_uuid << ", "
-           << "replay_tag_tid=" << m_replay_tag_tid << ", "
-           << "replay_tag_data=" << m_replay_tag_data << dendl;
+  dout(15) << "mirror_uuid=" << mirror_uuid << ", "
+           << "predecessor=" << predecessor << ", "
+           << "replay_tag_tid=" << m_replay_tag_tid << dendl;
   Context *ctx = create_context_callback<
     ImageReplayer, &ImageReplayer<I>::handle_allocate_local_tag>(this);
   m_local_journal->allocate_tag(mirror_uuid, predecessor, ctx);
@@ -1118,7 +1125,8 @@ void ImageReplayer<I>::allocate_local_tag() {
 
 template <typename I>
 void ImageReplayer<I>::handle_allocate_local_tag(int r) {
-  dout(20) << "r=" << r << dendl;
+  dout(15) << "r=" << r << ", "
+           << "tag_tid=" << m_local_journal->get_tag_tid() << dendl;
 
   if (r < 0) {
     derr << "failed to allocate journal tag: " << cpp_strerror(r) << dendl;
@@ -1224,7 +1232,18 @@ void ImageReplayer<I>::handle_process_entry_ready(int r) {
   dout(20) << dendl;
   assert(r == 0);
 
-  on_name_changed();
+  bool update_status = false;
+  {
+    RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock);
+    if (m_local_image_name != m_local_image_ctx->name) {
+      m_local_image_name = m_local_image_ctx->name;
+      update_status = true;
+    }
+  }
+
+  if (update_status) {
+    reschedule_update_status_task(0);
+  }
 
   // attempt to process the next event
   handle_replay_ready();
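
[note] Instead of the removed on_name_changed() hook, the replayer now compares its cached image name against the live image context under the snap lock and schedules a status update on mismatch. A minimal sketch of the same pattern with std::shared_mutex (the real code uses Ceph's RWLock):

    #include <shared_mutex>
    #include <string>

    struct ImageState {
      mutable std::shared_mutex lock;
      std::string name;
    };

    bool refresh_cached_name(const ImageState &image, std::string *cached) {
      std::shared_lock<std::shared_mutex> locker(image.lock);
      if (*cached == image.name) {
        return false;          // no rename observed
      }
      *cached = image.name;    // remember the new name
      return true;             // caller should reschedule a status update
    }
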
@@ -1284,6 +1303,8 @@ bool ImageReplayer<I>::start_mirror_image_status_update(bool force,
 
 template <typename I>
 void ImageReplayer<I>::finish_mirror_image_status_update() {
+  reregister_admin_socket_hook();
+
   Context *on_finish = nullptr;
   {
     Mutex::Locker locker(m_lock);
@@ -1768,7 +1789,7 @@ void ImageReplayer<I>::register_admin_socket_hook() {
       return;
     }
 
-    dout(20) << "registered asok hook: " << m_name << dendl;
+    dout(15) << "registered asok hook: " << m_name << dendl;
     asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name,
                                                     this);
     int r = asok_hook->register_commands();
@@ -1783,7 +1804,7 @@ void ImageReplayer<I>::register_admin_socket_hook() {
 
 template <typename I>
 void ImageReplayer<I>::unregister_admin_socket_hook() {
-  dout(20) << dendl;
+  dout(15) << dendl;
 
   AdminSocketHook *asok_hook = nullptr;
   {
@@ -1794,11 +1815,10 @@ void ImageReplayer<I>::unregister_admin_socket_hook() {
 }
 
 template <typename I>
-void ImageReplayer<I>::on_name_changed() {
+void ImageReplayer<I>::reregister_admin_socket_hook() {
   {
     Mutex::Locker locker(m_lock);
-    std::string name = m_local_ioctx.get_pool_name() + "/" +
-      m_local_image_ctx->name;
+    auto name = m_local_ioctx.get_pool_name() + "/" + m_local_image_name;
     if (m_name == name) {
       return;
     }
index 55e44a6d01bd1979904d9c23f79ef5f3b92ee598..d47cccd137a87558a58d94f802adbd2ca669f9c0 100644 (file)
@@ -287,6 +287,7 @@ private:
   int64_t m_local_pool_id;
   std::string m_local_image_id;
   std::string m_global_image_id;
+  std::string m_local_image_name;
   std::string m_name;
 
   mutable Mutex m_lock;
@@ -427,8 +428,7 @@ private:
 
   void register_admin_socket_hook();
   void unregister_admin_socket_hook();
-
-  void on_name_changed();
+  void reregister_admin_socket_hook();
 };
 
 } // namespace mirror
index 94df5a8aac62247b17f53c5cc674a73fc57d02ab..7361cfbb4c06ecff85269a69ed678504f9802491 100644 (file)
@@ -27,6 +27,7 @@ namespace rbd {
 namespace mirror {
 
 using namespace image_sync;
+using librbd::util::create_async_context_callback;
 using librbd::util::create_context_callback;
 using librbd::util::unique_lock_name;
 
@@ -85,15 +86,30 @@ void ImageSync<I>::send_notify_sync_request() {
 
   dout(20) << dendl;
 
-  Context *ctx = create_context_callback<
-    ImageSync<I>, &ImageSync<I>::handle_notify_sync_request>(this);
+  m_lock.Lock();
+  if (m_canceled) {
+    m_lock.Unlock();
+    BaseRequest::finish(-ECANCELED);
+    return;
+  }
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      ImageSync<I>, &ImageSync<I>::handle_notify_sync_request>(this));
   m_instance_watcher->notify_sync_request(m_local_image_ctx->id, ctx);
+  m_lock.Unlock();
 }
 
 template <typename I>
 void ImageSync<I>::handle_notify_sync_request(int r) {
   dout(20) << ": r=" << r << dendl;
 
+  m_lock.Lock();
+  if (r == 0 && m_canceled) {
+    r = -ECANCELED;
+  }
+  m_lock.Unlock();
+
   if (r < 0) {
     BaseRequest::finish(r);
     return;
index 7f94976ea1c60d8ce7cad511505ab95a316f25af..f26e4da0df597abaa1457ca4b363e0bb9d3b4c62 100644 (file)
@@ -537,9 +537,15 @@ void InstanceWatcher<I>::notify_sync_start(const std::string &instance_id,
 
 template <typename I>
 void InstanceWatcher<I>::notify_sync_complete(const std::string &sync_id) {
-  dout(20) << "sync_id=" << sync_id << dendl;
-
   Mutex::Locker locker(m_lock);
+  notify_sync_complete(m_lock, sync_id);
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_sync_complete(const Mutex&,
+                                              const std::string &sync_id) {
+  dout(10) << "sync_id=" << sync_id << dendl;
+  assert(m_lock.is_locked());
 
   auto it = m_inflight_sync_reqs.find(sync_id);
   assert(it != m_inflight_sync_reqs.end());
@@ -559,7 +565,6 @@ void InstanceWatcher<I>::handle_notify_sync_request(C_SyncRequest *sync_ctx,
   Context *on_start = nullptr;
   {
     Mutex::Locker locker(m_lock);
-
     assert(sync_ctx->req != nullptr);
     assert(sync_ctx->on_start != nullptr);
 
@@ -569,13 +574,13 @@ void InstanceWatcher<I>::handle_notify_sync_request(C_SyncRequest *sync_ctx,
 
     std::swap(sync_ctx->on_start, on_start);
     sync_ctx->req = nullptr;
+
+    if (r == -ECANCELED) {
+      notify_sync_complete(m_lock, sync_ctx->sync_id);
+    }
   }
 
   on_start->complete(r == -ECANCELED ? r : 0);
-
-  if (r == -ECANCELED) {
-    notify_sync_complete(sync_ctx->sync_id);
-  }
 }
 
 template <typename I>
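
[note] The new private notify_sync_complete(const Mutex&, ...) overload is the pass-the-held-lock idiom: a caller proves it already holds m_lock by handing it over, which lets the -ECANCELED path above complete the sync while the lock is still held. A generic sketch of the idiom (std::mutex has no is_locked(), so the parameter itself carries the proof):

    #include <mutex>

    class Watcher {
      std::mutex m_lock;

      // Must be called with m_lock held; the parameter documents and
      // enforces that requirement at the call site.
      void complete_locked(const std::mutex &) {
        // ... mutate state protected by m_lock ...
      }

    public:
      void complete() {
        std::lock_guard<std::mutex> locker(m_lock);
        complete_locked(m_lock);
      }
    };
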
index be90f10e213736dd635bfbd5e51d64c60760a033..0afb25e531bc9452a4fcf9e82747f700bb04996c 100644 (file)
@@ -210,6 +210,7 @@ private:
   bool unsuspend_notify_request(C_NotifyInstanceRequest *req);
   void unsuspend_notify_requests();
 
+  void notify_sync_complete(const Mutex& lock, const std::string &sync_id);
   void handle_notify_sync_request(C_SyncRequest *sync_ctx, int r);
   void handle_notify_sync_complete(C_SyncRequest *sync_ctx, int r);
 
index 817d3434c3983a384b138e8091fa41fdb9f6c82f..c766e8609038189444610a1d3ebf68893700b92e 100644 (file)
@@ -360,17 +360,18 @@ void PoolReplayer::shut_down() {
   }
   if (m_leader_watcher) {
     m_leader_watcher->shut_down();
-    m_leader_watcher.reset();
   }
   if (m_instance_watcher) {
     m_instance_watcher->shut_down();
-    m_instance_watcher.reset();
   }
   if (m_instance_replayer) {
     m_instance_replayer->shut_down();
-    m_instance_replayer.reset();
   }
 
+  m_leader_watcher.reset();
+  m_instance_watcher.reset();
+  m_instance_replayer.reset();
+
   assert(!m_local_pool_watcher);
   assert(!m_remote_pool_watcher);
   m_local_rados.reset();
@@ -508,6 +509,8 @@ void PoolReplayer::run()
       m_cond.WaitInterval(m_lock, utime_t(1, 0));
     }
   }
+
+  m_instance_replayer->stop();
 }
 
 void PoolReplayer::print_status(Formatter *f, stringstream *ss)
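
[note] shut_down() now quiesces all three components before releasing any of them, so a late cross-component callback during shutdown cannot dereference an already-destroyed neighbor. The shape of that two-phase teardown (names illustrative):

    #include <memory>

    struct Component { void shut_down() {} };

    struct Replayer {
      std::unique_ptr<Component> a, b, c;

      void shut_down() {
        // phase 1: quiesce everything (components may still call each other)
        if (a) a->shut_down();
        if (b) b->shut_down();
        if (c) c->shut_down();
        // phase 2: release only once nothing can call back
        a.reset();
        b.reset();
        c.reset();
      }
    };
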
index 2c24ef1f7d5e07e27831088f44dc3147de1a7940..cd2d9b4ab2709fdf5e5f177f79103f7f1b5f0d5c 100644 (file)
@@ -194,7 +194,12 @@ template <typename I>
 void BootstrapRequest<I>::handle_is_primary(int r) {
   dout(20) << ": r=" << r << dendl;
 
-  if (r < 0) {
+  if (r == -ENOENT) {
+    dout(5) << ": remote image is not mirrored" << dendl;
+    m_ret_val = -EREMOTEIO;
+    close_remote_image();
+    return;
+  } else if (r < 0) {
     derr << ": error querying remote image primary status: " << cpp_strerror(r)
          << dendl;
     m_ret_val = r;
@@ -203,11 +208,22 @@ void BootstrapRequest<I>::handle_is_primary(int r) {
   }
 
   if (!m_primary) {
-    dout(5) << ": remote image is not primary -- skipping image replay"
-            << dendl;
-    m_ret_val = -EREMOTEIO;
-    update_client_state();
-    return;
+    if (m_local_image_id.empty()) {
+      // no local image and remote isn't primary -- don't sync it
+      dout(5) << ": remote image is not primary -- not syncing"
+              << dendl;
+      m_ret_val = -EREMOTEIO;
+      close_remote_image();
+      return;
+    } else if (m_client_meta->state !=
+                 librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
+      // ensure we attempt to re-sync to remote if it's re-promoted
+      dout(5) << ": remote image is not primary -- sync interrupted"
+              << dendl;
+      m_ret_val = -EREMOTEIO;
+      update_client_state();
+      return;
+    }
   }
 
   if (!m_client_meta->image_id.empty()) {
@@ -217,6 +233,7 @@ void BootstrapRequest<I>::handle_is_primary(int r) {
   }
 
   if (m_local_image_id.empty()) {
+    // prepare to create local image
     update_client_image();
     return;
   }
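
[note] The non-primary remote case now splits three ways: with no local image the replay is skipped outright; a client not yet in the REPLAYING state is flagged so a later re-promotion triggers a resync; and an already-replaying client falls through to replay the demotion. A sketch of that decision, with the librbd::journal peer state reduced to a boolean:

    enum class Action { SkipNoSync, InterruptSync, ReplayDemotion };

    Action decide_non_primary(bool have_local_image, bool client_replaying) {
      if (!have_local_image) {
        return Action::SkipNoSync;     // nothing to replay onto -> -EREMOTEIO
      }
      if (!client_replaying) {
        return Action::InterruptSync;  // flag sync so a re-promote re-syncs
      }
      return Action::ReplayDemotion;   // fall through, replay the demotion
    }
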
@@ -226,12 +243,6 @@ void BootstrapRequest<I>::handle_is_primary(int r) {
 
 template <typename I>
 void BootstrapRequest<I>::update_client_state() {
-  if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
-    // state already set for replaying upon failover
-    close_remote_image();
-    return;
-  }
-
   dout(20) << dendl;
   update_progress("UPDATE_CLIENT_STATE");
 
@@ -300,8 +311,10 @@ void BootstrapRequest<I>::handle_open_local_image(int r) {
 
   I *local_image_ctx = (*m_local_image_ctx);
   {
-    RWLock::RLocker snap_locker(local_image_ctx->snap_lock);
+    local_image_ctx->snap_lock.get_read();
     if (local_image_ctx->journal == nullptr) {
+      local_image_ctx->snap_lock.put_read();
+
       derr << ": local image does not support journaling" << dendl;
       m_ret_val = -EINVAL;
       close_local_image();
@@ -310,11 +323,30 @@ void BootstrapRequest<I>::handle_open_local_image(int r) {
 
     r = (*m_local_image_ctx)->journal->is_resync_requested(m_do_resync);
     if (r < 0) {
+      local_image_ctx->snap_lock.put_read();
+
       derr << ": failed to check if a resync was requested" << dendl;
       m_ret_val = r;
       close_local_image();
       return;
     }
+
+    m_local_tag_tid = local_image_ctx->journal->get_tag_tid();
+    m_local_tag_data = local_image_ctx->journal->get_tag_data();
+    dout(10) << ": local tag=" << m_local_tag_tid << ", "
+             << "local tag data=" << m_local_tag_data << dendl;
+    local_image_ctx->snap_lock.put_read();
+  }
+
+  if (m_local_tag_data.mirror_uuid != m_remote_mirror_uuid && !m_primary) {
+    // if the local mirror is not linked to the (now) non-primary image,
+    // stop the replay. Otherwise, we ignore that the remote is non-primary
+    // so that we can replay the demotion
+    dout(5) << ": remote image is not primary -- skipping image replay"
+            << dendl;
+    m_ret_val = -EREMOTEIO;
+    close_local_image();
+    return;
   }
 
   if (*m_do_resync) {
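
[note] The RAII RLocker is replaced by explicit get_read()/put_read() because every early-exit path must release snap_lock before invoking a continuation such as close_local_image(), and Ceph's RLocker offers no early unlock. The same shape with std::shared_mutex:

    #include <cerrno>
    #include <shared_mutex>

    int read_journal_state(std::shared_mutex &snap_lock, bool journal_present) {
      snap_lock.lock_shared();
      if (!journal_present) {
        snap_lock.unlock_shared();  // drop before the error continuation,
        return -EINVAL;             // which may take other locks
      }
      // ... copy tag tid / tag data while the lock is still held ...
      snap_lock.unlock_shared();
      return 0;
    }
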
@@ -366,8 +398,8 @@ void BootstrapRequest<I>::register_client() {
 
   update_progress("REGISTER_CLIENT");
 
-  librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
-    m_local_image_id};
+  assert(m_local_image_id.empty());
+  librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta;
   mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
 
   librbd::journal::ClientData client_data{mirror_peer_client_meta};
@@ -393,7 +425,7 @@ void BootstrapRequest<I>::handle_register_client(int r) {
   }
 
   *m_client_state = cls::journal::CLIENT_STATE_CONNECTED;
-  *m_client_meta = librbd::journal::MirrorPeerClientMeta(m_local_image_id);
+  *m_client_meta = librbd::journal::MirrorPeerClientMeta();
   m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
 
   is_primary();
@@ -513,24 +545,6 @@ void BootstrapRequest<I>::handle_get_remote_tags(int r) {
   // At this point, the local image existed, was non-primary, and was replaying;
   // and the remote image is primary.  Attempt to link the local image's most
   // recent tag to the remote image's tag chain.
-  uint64_t local_tag_tid;
-  librbd::journal::TagData local_tag_data;
-  I *local_image_ctx = (*m_local_image_ctx);
-  {
-    RWLock::RLocker snap_locker(local_image_ctx->snap_lock);
-    if (local_image_ctx->journal == nullptr) {
-      derr << ": local image does not support journaling" << dendl;
-      m_ret_val = -EINVAL;
-      close_local_image();
-      return;
-    }
-
-    local_tag_tid = local_image_ctx->journal->get_tag_tid();
-    local_tag_data = local_image_ctx->journal->get_tag_data();
-    dout(20) << ": local tag " << local_tag_tid << ": "
-             << local_tag_data << dendl;
-  }
-
   bool remote_tag_data_valid = false;
   librbd::journal::TagData remote_tag_data;
   boost::optional<uint64_t> remote_orphan_tag_tid =
@@ -539,10 +553,10 @@ void BootstrapRequest<I>::handle_get_remote_tags(int r) {
 
   // decode the remote tags
   for (auto &remote_tag : m_remote_tags) {
-    if (local_tag_data.predecessor.commit_valid &&
-        local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
-        local_tag_data.predecessor.tag_tid > remote_tag.tid) {
-      dout(20) << ": skipping processed predecessor remote tag "
+    if (m_local_tag_data.predecessor.commit_valid &&
+        m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
+        m_local_tag_data.predecessor.tag_tid > remote_tag.tid) {
+      dout(15) << ": skipping processed predecessor remote tag "
                << remote_tag.tid << dendl;
       continue;
     }
@@ -562,7 +576,7 @@ void BootstrapRequest<I>::handle_get_remote_tags(int r) {
     dout(10) << ": decoded remote tag " << remote_tag.tid << ": "
              << remote_tag_data << dendl;
 
-    if (!local_tag_data.predecessor.commit_valid) {
+    if (!m_local_tag_data.predecessor.commit_valid) {
       // newly synced local image (no predecessor) replays from the first tag
       if (remote_tag_data.mirror_uuid != librbd::Journal<>::LOCAL_MIRROR_UUID) {
         dout(20) << ": skipping non-primary remote tag" << dendl;
@@ -573,17 +587,17 @@ void BootstrapRequest<I>::handle_get_remote_tags(int r) {
       break;
     }
 
-    if (local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+    if (m_local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
       // demotion last available local epoch
 
-      if (remote_tag_data.mirror_uuid == local_tag_data.mirror_uuid &&
+      if (remote_tag_data.mirror_uuid == m_local_tag_data.mirror_uuid &&
           remote_tag_data.predecessor.commit_valid &&
           remote_tag_data.predecessor.tag_tid ==
-            local_tag_data.predecessor.tag_tid) {
+            m_local_tag_data.predecessor.tag_tid) {
         // demotion matches remote epoch
 
         if (remote_tag_data.predecessor.mirror_uuid == m_local_mirror_uuid &&
-            local_tag_data.predecessor.mirror_uuid ==
+            m_local_tag_data.predecessor.mirror_uuid ==
               librbd::Journal<>::LOCAL_MIRROR_UUID) {
           // local demoted and remote has matching event
           dout(20) << ": found matching local demotion tag" << dendl;
@@ -591,7 +605,7 @@ void BootstrapRequest<I>::handle_get_remote_tags(int r) {
           continue;
         }
 
-        if (local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
+        if (m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
             remote_tag_data.predecessor.mirror_uuid ==
               librbd::Journal<>::LOCAL_MIRROR_UUID) {
           // remote demoted and local has matching event
@@ -617,8 +631,8 @@ void BootstrapRequest<I>::handle_get_remote_tags(int r) {
   }
 
   if (remote_tag_data_valid &&
-      local_tag_data.mirror_uuid == m_remote_mirror_uuid) {
-    dout(20) << ": local image is in clean replay state" << dendl;
+      m_local_tag_data.mirror_uuid == m_remote_mirror_uuid) {
+    dout(10) << ": local image is in clean replay state" << dendl;
   } else if (reconnect_orphan) {
     dout(20) << ": remote image was demoted/promoted" << dendl;
   } else {
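
[note] The demotion/promotion matching above pairs an orphan epoch on one side with the corresponding predecessor on the other. A hedged predicate capturing the "both sides recorded the same demotion" case (field names simplified from librbd::journal::TagData/TagPredecessor):

    #include <cstdint>
    #include <string>

    struct TagDataSketch {
      std::string mirror_uuid;
      struct {
        bool commit_valid = false;
        std::string mirror_uuid;
        uint64_t tag_tid = 0;
      } predecessor;
    };

    // True when both journals recorded the same demotion epoch: each side
    // is orphaned and their predecessors agree on the tag id.
    bool demotions_match(const TagDataSketch &local, const TagDataSketch &remote,
                         const std::string &orphan_uuid) {
      return local.mirror_uuid == orphan_uuid &&
             remote.mirror_uuid == orphan_uuid &&
             local.predecessor.commit_valid &&
             remote.predecessor.commit_valid &&
             local.predecessor.tag_tid == remote.predecessor.tag_tid;
    }
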
index 8a46e7038d6d043b53146ccd3dcfd73f36a524c1..a28789b9f8fb8c5a6b2e0837747f78ff32c39cf7 100644 (file)
@@ -8,6 +8,7 @@
 #include "include/rados/librados.hpp"
 #include "common/Mutex.h"
 #include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
 #include "librbd/journal/TypeTraits.h"
 #include "tools/rbd_mirror/BaseRequest.h"
 #include "tools/rbd_mirror/types.h"
@@ -174,6 +175,9 @@ private:
   int m_ret_val = 0;
   ImageSync<ImageCtxT> *m_image_sync = nullptr;
 
+  uint64_t m_local_tag_tid = 0;
+  librbd::journal::TagData m_local_tag_data;
+
   bufferlist m_out_bl;
 
   void get_remote_tag_class();
index f2faa9f20f21f16112a8eefa029396d788b2199c..4c768b965594fd6d53c6db56843ab1f06426c1d7 100644 (file)
@@ -63,7 +63,7 @@ void IsPrimaryRequest<I>::handle_get_mirror_state(int r) {
         return;
       } else if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLING) {
         dout(5) << ": image mirroring is being disabled" << dendl;
-        *m_primary = false;
+        r = -ENOENT;
       } else {
         derr << ": image mirroring is disabled" << dendl;
         r = -EINVAL;
@@ -72,6 +72,8 @@ void IsPrimaryRequest<I>::handle_get_mirror_state(int r) {
       derr << ": failed to decode image mirror state: " << cpp_strerror(r)
            << dendl;
     }
+  } else if (r == -ENOENT) {
+    dout(5) << ": image is not mirrored" << dendl;
   } else {
     derr << ": failed to retrieve image mirror state: " << cpp_strerror(r)
          << dendl;
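
[note] Mapping the "mirroring is being disabled" state to -ENOENT means callers such as the bootstrap and open-local-image paths above handle "not mirrored" through a single error branch instead of a separate boolean. The mapping in isolation:

    #include <cerrno>

    int map_mirror_query_result(int r, bool disabling) {
      if (r == 0 && disabling) {
        return -ENOENT;  // treat a half-disabled image as not mirrored
      }
      return r;
    }
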
index a54216a8328cab410591fa983bb9111babb7dd57..9a2d6683c9d5eea9259d75aeee93e92c402cd9da 100644 (file)
@@ -152,7 +152,11 @@ template <typename I>
 void OpenLocalImageRequest<I>::handle_is_primary(int r) {
   dout(20) << ": r=" << r << dendl;
 
-  if (r < 0) {
+  if (r == -ENOENT) {
+    dout(5) << ": local image is not mirrored" << dendl;
+    send_close_image(r);
+    return;
+  } else if (r < 0) {
     derr << ": error querying local image primary status: " << cpp_strerror(r)
          << dendl;
     send_close_image(r);
index 4009dc10d4af6f86ea126ec2cdbf2d00ba606419..18b548011f2ed685b9fee92d88b0c12838a0c008 100644 (file)
@@ -54,6 +54,42 @@ void PrepareLocalImageRequest<I>::handle_get_local_image_id(int r) {
     return;
   }
 
+  get_local_image_name();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_local_image_name() {
+  dout(20) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::dir_get_name_start(&op, *m_local_image_id);
+
+  m_out_bl.clear();
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    PrepareLocalImageRequest<I>,
+    &PrepareLocalImageRequest<I>::handle_get_local_image_name>(this);
+  int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op, &m_out_bl);
+  assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_local_image_name(int r) {
+  dout(20) << "r=" << r << dendl;
+
+  if (r == 0) {
+    bufferlist::iterator it = m_out_bl.begin();
+    r = librbd::cls_client::dir_get_name_finish(&it, m_local_image_name);
+  }
+
+  if (r < 0) {
+    if (r != -ENOENT) {
+      derr << "failed to retrieve image name: " << cpp_strerror(r) << dendl;
+    }
+    finish(r);
+    return;
+  }
+
   get_mirror_state();
 }
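
[note] The new name lookup follows the standard librados async cls-call shape: stage the call into an ObjectReadOperation, submit it with an AioCompletion, and decode the result in the completion handler. A hedged sketch of that transport (dir_get_name_start/finish are Ceph-internal cls_client helpers, so only the submission is shown):

    #include <cassert>
    #include <string>
    #include <rados/librados.hpp>

    void async_cls_read(librados::IoCtx &io_ctx, const std::string &oid,
                        librados::ObjectReadOperation *op,
                        librados::bufferlist *out_bl,
                        librados::callback_t on_complete, void *arg) {
      librados::AioCompletion *comp =
          librados::Rados::aio_create_completion(arg, on_complete, nullptr);
      int r = io_ctx.aio_operate(oid, comp, op, out_bl);
      assert(r == 0);   // aio_operate only queues; real errors arrive async
      comp->release();  // librados holds its own ref until the callback runs
    }
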
 
index 913bfd1c242c23207b4b873b826086e9f37a694e..33ba24f4e7bfc92815d89ccf1ac0a828132617b5 100644 (file)
@@ -23,22 +23,25 @@ public:
   static PrepareLocalImageRequest *create(librados::IoCtx &io_ctx,
                                           const std::string &global_image_id,
                                           std::string *local_image_id,
+                                          std::string *local_image_name,
                                           std::string *tag_owner,
                                           ContextWQ *work_queue,
                                           Context *on_finish) {
     return new PrepareLocalImageRequest(io_ctx, global_image_id, local_image_id,
-                                        tag_owner, work_queue, on_finish);
+                                        local_image_name, tag_owner, work_queue,
+                                        on_finish);
   }
 
   PrepareLocalImageRequest(librados::IoCtx &io_ctx,
                            const std::string &global_image_id,
                            std::string *local_image_id,
+                           std::string *local_image_name,
                            std::string *tag_owner,
                            ContextWQ *work_queue,
                            Context *on_finish)
     : m_io_ctx(io_ctx), m_global_image_id(global_image_id),
-      m_local_image_id(local_image_id), m_tag_owner(tag_owner),
-      m_work_queue(work_queue), m_on_finish(on_finish) {
+      m_local_image_id(local_image_id), m_local_image_name(local_image_name),
+      m_tag_owner(tag_owner), m_work_queue(work_queue), m_on_finish(on_finish) {
   }
 
   void send();
@@ -53,6 +56,9 @@ private:
    * GET_LOCAL_IMAGE_ID
    *    |
    *    v
+   * GET_LOCAL_IMAGE_NAME
+   *    |
+   *    v
    * GET_MIRROR_STATE
    *    |
    *    v
@@ -64,6 +70,7 @@ private:
   librados::IoCtx &m_io_ctx;
   std::string m_global_image_id;
   std::string *m_local_image_id;
+  std::string *m_local_image_name;
   std::string *m_tag_owner;
   ContextWQ *m_work_queue;
   Context *m_on_finish;
@@ -73,6 +80,9 @@ private:
   void get_local_image_id();
   void handle_get_local_image_id(int r);
 
+  void get_local_image_name();
+  void handle_get_local_image_name(int r);
+
   void get_mirror_state();
   void handle_get_mirror_state(int r);
 
index 098d9925ca29c6806385bd4dc301199b0680ad80..3d76ece42a8a98d0a4afb7bff3e024f924b9b089 100644 (file)
@@ -595,14 +595,13 @@ static int do_map(int argc, const char *argv[], Config *cfg)
       cerr << err << std::endl;
       return r;
     }
-
     if (forker.is_parent()) {
-      global_init_postfork_start(g_ceph_context);
       if (forker.parent_wait(err) != 0) {
         return -ENXIO;
       }
       return 0;
     }
+    global_init_postfork_start(g_ceph_context);
   }
 
   common_init_finish(g_ceph_context);
@@ -761,9 +760,8 @@ static int do_map(int argc, const char *argv[], Config *cfg)
     cout << cfg->devpath << std::endl;
 
     if (g_conf->daemonize) {
-      forker.daemonize();
-      global_init_postfork_start(g_ceph_context);
       global_init_postfork_finish(g_ceph_context);
+      forker.daemonize();
     }
 
     {
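
[note] Both rbd-nbd hunks reorder the post-fork hooks so that global_init_postfork_start() runs only in the child, and the postfork work completes before the final daemonize() detach. The underlying rule, sketched with a bare fork() (Ceph's Preforker additionally gives the parent a readiness pipe to wait on):

    #include <unistd.h>

    int fork_sketch() {
      pid_t pid = fork();
      if (pid < 0) {
        return -1;  // fork failed
      }
      if (pid > 0) {
        // parent: in Ceph's Preforker it first waits for the child's
        // readiness signal; it never runs daemon-side re-initialization.
        _exit(0);
      }
      // child only: detach, re-open logs, etc. -- the postfork work.
      setsid();
      return 0;
    }
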