git.proxmox.com Git - ceph.git/commitdiff
update sources to 12.2.2
author Fabian Grünbichler <f.gruenbichler@proxmox.com>
Tue, 28 Nov 2017 08:02:46 +0000 (09:02 +0100)
committer Fabian Grünbichler <f.gruenbichler@proxmox.com>
Mon, 4 Dec 2017 09:20:44 +0000 (10:20 +0100)
590 files changed:
ceph/CMakeLists.txt
ceph/COPYING
ceph/PendingReleaseNotes
ceph/admin/doc-requirements.txt
ceph/alpine/APKBUILD
ceph/ceph.spec
ceph/ceph.spec.in
ceph/debian/ceph-osd.install
ceph/debian/ceph-osd.postinst
ceph/debian/changelog
ceph/debian/rules
ceph/doc/ceph-volume/index.rst
ceph/doc/ceph-volume/lvm/activate.rst
ceph/doc/ceph-volume/lvm/create.rst [new file with mode: 0644]
ceph/doc/ceph-volume/lvm/index.rst
ceph/doc/ceph-volume/lvm/list.rst [new file with mode: 0644]
ceph/doc/ceph-volume/lvm/prepare.rst
ceph/doc/ceph-volume/lvm/systemd.rst
ceph/doc/ceph-volume/lvm/zap.rst [new file with mode: 0644]
ceph/doc/ceph-volume/simple/activate.rst [new file with mode: 0644]
ceph/doc/ceph-volume/simple/index.rst [new file with mode: 0644]
ceph/doc/ceph-volume/simple/scan.rst [new file with mode: 0644]
ceph/doc/ceph-volume/simple/systemd.rst [new file with mode: 0644]
ceph/doc/ceph-volume/systemd.rst [new file with mode: 0644]
ceph/doc/cephfs/mds-config-ref.rst
ceph/doc/conf.py
ceph/doc/man/8/CMakeLists.txt
ceph/doc/man/8/ceph-bluestore-tool.rst [new file with mode: 0644]
ceph/doc/mgr/administrator.rst
ceph/doc/mgr/dashboard.rst
ceph/doc/mgr/index.rst
ceph/doc/mgr/influx.rst [new file with mode: 0644]
ceph/doc/mgr/localpool.rst [new file with mode: 0644]
ceph/doc/mgr/plugins.rst
ceph/doc/mgr/prometheus.rst
ceph/doc/rados/configuration/pool-pg-config-ref.rst
ceph/doc/rados/operations/health-checks.rst
ceph/doc/scripts/gen_state_diagram.py
ceph/etc/default/ceph
ceph/etc/sysconfig/ceph
ceph/qa/cephfs/clusters/3-mds.yaml
ceph/qa/cephfs/clusters/9-mds.yaml
ceph/qa/cephfs/objectstore-ec/bluestore-comp-ec-root.yaml [new file with mode: 0644]
ceph/qa/cephfs/objectstore-ec/bluestore-comp.yaml [new file with mode: 0644]
ceph/qa/cephfs/objectstore-ec/bluestore-ec-root.yaml [new file with mode: 0644]
ceph/qa/cephfs/objectstore-ec/bluestore.yaml [new file with mode: 0644]
ceph/qa/cephfs/objectstore-ec/filestore-xfs.yaml [new file with mode: 0644]
ceph/qa/distros/all/centos_7.4.yaml [new file with mode: 0644]
ceph/qa/distros/supported/centos_latest.yaml
ceph/qa/releases/luminous-with-mgr.yaml
ceph/qa/releases/luminous.yaml
ceph/qa/standalone/mon/osd-pool-create.sh
ceph/qa/standalone/special/ceph_objectstore_tool.py [new file with mode: 0755]
ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_7.3.yaml [deleted file]
ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_latest.yaml [new symlink]
ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_16.04.yaml [deleted file]
ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_latest.yaml [new symlink]
ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml [new file with mode: 0644]
ceph/qa/suites/ceph-ansible/smoke/basic/2-config/ceph_ansible.yaml [deleted file]
ceph/qa/suites/ceph-ansible/smoke/basic/3-config/bluestore_with_dmcrypt.yaml [new file with mode: 0644]
ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_off.yaml [new file with mode: 0644]
ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_on.yaml [new file with mode: 0644]
ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/ceph-admin-commands.yaml [deleted file]
ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/cls.yaml [deleted file]
ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/rbd_import_export.yaml [deleted file]
ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/ceph-admin-commands.yaml [new file with mode: 0644]
ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rbd_import_export.yaml [new file with mode: 0644]
ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml [new file with mode: 0644]
ceph/qa/suites/fs/32bits/objectstore [deleted symlink]
ceph/qa/suites/fs/32bits/objectstore-ec [new symlink]
ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml
ceph/qa/suites/fs/basic_functional/objectstore/bluestore-ec-root.yaml [new symlink]
ceph/qa/suites/fs/basic_workload/objectstore [deleted symlink]
ceph/qa/suites/fs/basic_workload/objectstore-ec [new symlink]
ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml
ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml
ceph/qa/suites/fs/multiclient/objectstore [deleted symlink]
ceph/qa/suites/fs/multiclient/objectstore-ec [new symlink]
ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml
ceph/qa/suites/fs/multifs/objectstore [deleted symlink]
ceph/qa/suites/fs/multifs/objectstore-ec [new symlink]
ceph/qa/suites/fs/permission/objectstore [deleted symlink]
ceph/qa/suites/fs/permission/objectstore-ec [new symlink]
ceph/qa/suites/fs/snaps/objectstore [deleted symlink]
ceph/qa/suites/fs/snaps/objectstore-ec [new symlink]
ceph/qa/suites/fs/thrash/objectstore [deleted symlink]
ceph/qa/suites/fs/thrash/objectstore-ec [new symlink]
ceph/qa/suites/fs/traceless/objectstore [deleted symlink]
ceph/qa/suites/fs/traceless/objectstore-ec [new symlink]
ceph/qa/suites/fs/verify/objectstore [deleted symlink]
ceph/qa/suites/fs/verify/objectstore-ec [new symlink]
ceph/qa/suites/kcephfs/cephfs/objectstore [deleted symlink]
ceph/qa/suites/kcephfs/cephfs/objectstore-ec [new symlink]
ceph/qa/suites/kcephfs/mixed-clients/objectstore [deleted symlink]
ceph/qa/suites/kcephfs/mixed-clients/objectstore-ec [new symlink]
ceph/qa/suites/kcephfs/recovery/clusters/4-remote-clients.yaml
ceph/qa/suites/kcephfs/recovery/objectstore [deleted symlink]
ceph/qa/suites/kcephfs/recovery/objectstore-ec [new symlink]
ceph/qa/suites/kcephfs/thrash/objectstore [deleted symlink]
ceph/qa/suites/kcephfs/thrash/objectstore-ec [new symlink]
ceph/qa/suites/multimds/basic/objectstore [deleted symlink]
ceph/qa/suites/multimds/basic/objectstore-ec [new symlink]
ceph/qa/suites/multimds/thrash/objectstore [deleted symlink]
ceph/qa/suites/multimds/thrash/objectstore-ec [new symlink]
ceph/qa/suites/multimds/verify/objectstore [deleted symlink]
ceph/qa/suites/multimds/verify/objectstore-ec [new symlink]
ceph/qa/suites/rados/basic/d-require-luminous [deleted symlink]
ceph/qa/suites/rados/basic/d-require-luminous/at-end.yaml [new file with mode: 0644]
ceph/qa/suites/rados/basic/d-require-luminous/at-mkfs.yaml [new file with mode: 0644]
ceph/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
ceph/qa/suites/rados/mgr/tasks/dashboard.yaml [new file with mode: 0644]
ceph/qa/suites/rados/mgr/tasks/module_selftest.yaml [new file with mode: 0644]
ceph/qa/suites/rados/mgr/tasks/workunits.yaml [new file with mode: 0644]
ceph/qa/suites/rados/monthrash/d-require-luminous
ceph/qa/suites/rados/rest/mgr-restful.yaml
ceph/qa/suites/rados/rest/rest_test.yaml [new file with mode: 0644]
ceph/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml
ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml [new file with mode: 0644]
ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml [new file with mode: 0644]
ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml [new file with mode: 0644]
ceph/qa/suites/rados/singleton/all/mon-seesaw.yaml
ceph/qa/suites/rados/singleton/all/recovery-preemption.yaml [new file with mode: 0644]
ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-crush-compat.yaml [new file with mode: 0644]
ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-upmap.yaml [new file with mode: 0644]
ceph/qa/suites/rados/verify/d-require-luminous
ceph/qa/suites/rbd/basic/tasks/rbd_cls_tests.yaml
ceph/qa/suites/rest/basic/tasks/rest_test.yaml [deleted file]
ceph/qa/suites/rgw/hadoop-s3a/s3a-hadoop.yaml
ceph/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml
ceph/qa/suites/rgw/thrash/workload/rgw_s3tests.yaml
ceph/qa/suites/rgw/verify/tasks/rgw_s3tests.yaml
ceph/qa/suites/upgrade/jewel-x/ceph-deploy/% [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/centos_latest.yaml [new symlink]
ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/ubuntu_latest.yaml [new symlink]
ceph/qa/suites/upgrade/jewel-x/ceph-deploy/jewel-luminous.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/0-cluster/start.yaml
ceph/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml
ceph/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml
ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml [changed from symlink to file mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/+ [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/blogbench.yaml [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados-snaps-few-objects.yaml [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_loadgenmix.yaml [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_mon_thrash.yaml [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_cls.yaml [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_import_export.yaml [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rgw_swift.yaml [deleted file]
ceph/qa/suites/upgrade/jewel-x/parallel/5-workload.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/6-luminous-with-mgr.yaml [new symlink]
ceph/qa/suites/upgrade/jewel-x/parallel/6.5-crush-compat.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/blogbench.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados-snaps-few-objects.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_loadgenmix.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_mon_thrash.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_cls.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_import_export.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rgw_swift.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/8-jewel-workload.yaml [new symlink]
ceph/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml
ceph/qa/suites/upgrade/jewel-x/stress-split/6.5-crush-compat.yaml [new symlink]
ceph/qa/suites/upgrade/kraken-x/ceph-deploy/kraken-luminous.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml
ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml [deleted symlink]
ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/+ [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/blogbench.yaml [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados-snaps-few-objects.yaml [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_loadgenmix.yaml [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_mon_thrash.yaml [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_cls.yaml [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_import_export.yaml [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rgw_swift.yaml [deleted file]
ceph/qa/suites/upgrade/kraken-x/parallel/5-workload.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/6-luminous-with-mgr.yaml [new symlink]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/blogbench.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados-snaps-few-objects.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_loadgenmix.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_mon_thrash.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_cls.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_import_export.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rgw_swift.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml [deleted file]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_no_upgrated.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_upgrated.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/1-ceph-install [new symlink]
ceph/qa/tasks/ceph.py
ceph/qa/tasks/ceph_deploy.py
ceph/qa/tasks/ceph_manager.py
ceph/qa/tasks/ceph_objectstore_tool.py
ceph/qa/tasks/cephfs/filesystem.py
ceph/qa/tasks/cephfs/test_client_limits.py
ceph/qa/tasks/cephfs/test_volume_client.py
ceph/qa/tasks/divergent_priors2.py
ceph/qa/tasks/mgr/mgr_test_case.py
ceph/qa/tasks/mgr/test_dashboard.py [new file with mode: 0644]
ceph/qa/tasks/mgr/test_module_selftest.py [new file with mode: 0644]
ceph/qa/tasks/osd_max_pg_per_osd.py [new file with mode: 0644]
ceph/qa/tasks/reg11184.py
ceph/qa/tasks/s3a_hadoop.py
ceph/qa/tasks/thrashosds.py
ceph/qa/tasks/util/rados.py
ceph/qa/workunits/ceph-disk/ceph-disk-test.py
ceph/qa/workunits/ceph-disk/ceph-disk.sh
ceph/qa/workunits/cephtool/test.sh
ceph/qa/workunits/cls/test_cls_journal.sh [new file with mode: 0755]
ceph/qa/workunits/mgr/test_localpool.sh [new file with mode: 0755]
ceph/qa/workunits/rados/test_rados_tool.sh
ceph/qa/workunits/rbd/rbd_mirror.sh
ceph/qa/workunits/rbd/rbd_mirror_helpers.sh
ceph/selinux/ceph.te
ceph/src/.git_version
ceph/src/90-ceph-osd.conf [new file with mode: 0644]
ceph/src/CMakeLists.txt
ceph/src/arch/arm.c
ceph/src/ceph-disk/ceph_disk/main.py [changed mode: 0755->0644]
ceph/src/ceph-disk/tox.ini
ceph/src/ceph-volume/ceph_volume/api/__init__.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/api/lvm.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/decorators.py
ceph/src/ceph-volume/ceph_volume/devices/__init__.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/api.py [deleted file]
ceph/src/ceph-volume/ceph_volume/devices/lvm/common.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/create.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/devices/lvm/main.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/prepare.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/trigger.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/devices/simple/__init__.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/devices/simple/main.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/devices/simple/trigger.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/main.py
ceph/src/ceph-volume/ceph_volume/process.py
ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py
ceph/src/ceph-volume/ceph_volume/tests/api/test_lvm.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/conftest.py
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_api.py [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_prepare.py
ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_trigger.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/Vagrantfile [deleted symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/group_vars/all [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/hosts [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/vagrant_variables.yml [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/setup.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/setup.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/setup_partitions.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/setup.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/setup.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd0.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd1.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/test.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd0.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd1.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd0.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd1.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/test.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/Vagrantfile [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd0.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd1.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/hosts [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/vagrant_variables.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/tox.ini [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/Vagrantfile [deleted symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/group_vars/all [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/hosts [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/vagrant_variables.yml [deleted file]
ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py
ceph/src/ceph-volume/ceph_volume/tests/util/test_system.py
ceph/src/ceph-volume/ceph_volume/util/arg_validators.py
ceph/src/ceph-volume/ceph_volume/util/disk.py
ceph/src/ceph-volume/ceph_volume/util/prepare.py
ceph/src/ceph-volume/ceph_volume/util/system.py
ceph/src/ceph.in
ceph/src/ceph_mgr.cc
ceph/src/ceph_mon.cc
ceph/src/ceph_osd.cc
ceph/src/client/Client.cc
ceph/src/client/Client.h
ceph/src/cls/journal/cls_journal.cc
ceph/src/cls/rbd/cls_rbd.cc
ceph/src/cls/rgw/cls_rgw.cc
ceph/src/cls/user/cls_user.cc
ceph/src/cls/user/cls_user_types.h
ceph/src/common/AsyncReserver.h
ceph/src/common/LogClient.cc
ceph/src/common/Timer.cc
ceph/src/common/Timer.h
ceph/src/common/bit_vector.hpp
ceph/src/common/buffer.cc
ceph/src/common/ceph_context.cc
ceph/src/common/common_init.cc
ceph/src/common/config.cc
ceph/src/common/legacy_config_opts.h
ceph/src/common/options.cc
ceph/src/common/perf_counters.cc
ceph/src/common/perf_counters.h
ceph/src/common/pick_address.cc
ceph/src/common/pick_address.h
ceph/src/common/subsys.h
ceph/src/crush/CrushTreeDumper.h
ceph/src/crush/CrushWrapper.cc
ceph/src/crush/CrushWrapper.h
ceph/src/include/buffer.h
ceph/src/include/rados/rgw_file.h
ceph/src/include/sock_compat.h
ceph/src/journal/JournalMetadata.cc
ceph/src/journal/ObjectPlayer.cc
ceph/src/journal/ObjectPlayer.h
ceph/src/journal/ObjectRecorder.cc
ceph/src/journal/ObjectRecorder.h
ceph/src/kv/KeyValueDB.h
ceph/src/kv/LevelDBStore.h
ceph/src/kv/RocksDBStore.h
ceph/src/librbd/ObjectMap.cc
ceph/src/librbd/ObjectMap.h
ceph/src/librbd/api/Mirror.cc
ceph/src/librbd/io/ObjectRequest.cc
ceph/src/librbd/io/ObjectRequest.h
ceph/src/librbd/object_map/UpdateRequest.cc
ceph/src/librbd/object_map/UpdateRequest.h
ceph/src/librbd/operation/SnapshotCreateRequest.cc
ceph/src/librbd/operation/SnapshotRemoveRequest.cc
ceph/src/librbd/operation/TrimRequest.cc
ceph/src/librbd/operation/TrimRequest.h
ceph/src/mds/Beacon.cc
ceph/src/mds/Beacon.h
ceph/src/mds/CInode.h
ceph/src/mds/FSMap.cc
ceph/src/mds/FSMap.h
ceph/src/mds/MDCache.cc
ceph/src/mds/MDSDaemon.cc
ceph/src/mds/MDSDaemon.h
ceph/src/mds/MDSMap.cc
ceph/src/mds/MDSMap.h
ceph/src/mds/MDSRank.cc
ceph/src/mds/PurgeQueue.cc
ceph/src/mds/PurgeQueue.h
ceph/src/mds/Server.cc
ceph/src/messages/MMgrBeacon.h
ceph/src/messages/MMgrConfigure.h
ceph/src/messages/MMgrReport.h
ceph/src/messages/MOSDMap.h
ceph/src/messages/MOSDPGTemp.h
ceph/src/mgr/ActivePyModule.cc [new file with mode: 0644]
ceph/src/mgr/ActivePyModule.h [new file with mode: 0644]
ceph/src/mgr/ActivePyModules.cc [new file with mode: 0644]
ceph/src/mgr/ActivePyModules.h [new file with mode: 0644]
ceph/src/mgr/BaseMgrModule.cc [new file with mode: 0644]
ceph/src/mgr/BaseMgrModule.h [new file with mode: 0644]
ceph/src/mgr/BaseMgrStandbyModule.cc [new file with mode: 0644]
ceph/src/mgr/BaseMgrStandbyModule.h [new file with mode: 0644]
ceph/src/mgr/DaemonServer.cc
ceph/src/mgr/DaemonServer.h
ceph/src/mgr/DaemonState.cc
ceph/src/mgr/DaemonState.h
ceph/src/mgr/Gil.cc [new file with mode: 0644]
ceph/src/mgr/Gil.h
ceph/src/mgr/Mgr.cc
ceph/src/mgr/Mgr.h
ceph/src/mgr/MgrClient.cc
ceph/src/mgr/MgrClient.h
ceph/src/mgr/MgrCommands.h
ceph/src/mgr/MgrPyModule.cc [deleted file]
ceph/src/mgr/MgrPyModule.h [deleted file]
ceph/src/mgr/MgrSession.h
ceph/src/mgr/MgrStandby.cc
ceph/src/mgr/MgrStandby.h
ceph/src/mgr/PyModuleRegistry.cc [new file with mode: 0644]
ceph/src/mgr/PyModuleRegistry.h [new file with mode: 0644]
ceph/src/mgr/PyModuleRunner.cc [new file with mode: 0644]
ceph/src/mgr/PyModuleRunner.h [new file with mode: 0644]
ceph/src/mgr/PyModules.cc [deleted file]
ceph/src/mgr/PyModules.h [deleted file]
ceph/src/mgr/PyOSDMap.cc [new file with mode: 0644]
ceph/src/mgr/PyOSDMap.h [new file with mode: 0644]
ceph/src/mgr/PyState.cc [deleted file]
ceph/src/mgr/PyState.h [deleted file]
ceph/src/mgr/StandbyPyModules.cc [new file with mode: 0644]
ceph/src/mgr/StandbyPyModules.h [new file with mode: 0644]
ceph/src/mon/AuthMonitor.cc
ceph/src/mon/Elector.cc
ceph/src/mon/LogMonitor.cc
ceph/src/mon/MDSMonitor.cc
ceph/src/mon/MDSMonitor.h
ceph/src/mon/MgrMap.h
ceph/src/mon/MgrMonitor.cc
ceph/src/mon/MgrMonitor.h
ceph/src/mon/MonCommands.h
ceph/src/mon/MonMap.cc
ceph/src/mon/Monitor.cc
ceph/src/mon/MonitorDBStore.h
ceph/src/mon/OSDMonitor.cc
ceph/src/mon/OSDMonitor.h
ceph/src/mon/PGMap.cc
ceph/src/mon/Paxos.cc
ceph/src/mon/PaxosService.cc
ceph/src/msg/Messenger.h
ceph/src/msg/async/AsyncConnection.h
ceph/src/msg/async/PosixStack.cc
ceph/src/msg/async/net_handler.cc
ceph/src/msg/simple/Pipe.cc
ceph/src/msg/simple/Pipe.h
ceph/src/os/ObjectMap.h
ceph/src/os/ObjectStore.h
ceph/src/os/bluestore/BitmapFreelistManager.cc
ceph/src/os/bluestore/BitmapFreelistManager.h
ceph/src/os/bluestore/BlueFS.cc
ceph/src/os/bluestore/BlueFS.h
ceph/src/os/bluestore/BlueStore.cc
ceph/src/os/bluestore/BlueStore.h
ceph/src/os/bluestore/FreelistManager.h
ceph/src/os/bluestore/KernelDevice.cc
ceph/src/os/bluestore/bluestore_tool.cc
ceph/src/os/bluestore/bluestore_types.cc
ceph/src/os/bluestore/bluestore_types.h
ceph/src/os/filestore/DBObjectMap.cc
ceph/src/os/filestore/DBObjectMap.h
ceph/src/os/filestore/FileJournal.cc
ceph/src/os/filestore/FileStore.cc
ceph/src/osd/OSD.cc
ceph/src/osd/OSD.h
ceph/src/osd/OSDMap.cc
ceph/src/osd/OSDMap.h
ceph/src/osd/PG.cc
ceph/src/osd/PG.h
ceph/src/osd/PrimaryLogPG.cc
ceph/src/osd/PrimaryLogPG.h
ceph/src/osd/ReplicatedBackend.h
ceph/src/osd/Watch.cc
ceph/src/osd/osd_types.cc
ceph/src/osd/osd_types.h
ceph/src/osdc/ObjectCacher.cc
ceph/src/osdc/ObjectCacher.h
ceph/src/pybind/ceph_volume_client.py
ceph/src/pybind/mgr/balancer/__init__.py [new file with mode: 0644]
ceph/src/pybind/mgr/balancer/module.py [new file with mode: 0644]
ceph/src/pybind/mgr/dashboard/base.html
ceph/src/pybind/mgr/dashboard/clients.html
ceph/src/pybind/mgr/dashboard/filesystem.html
ceph/src/pybind/mgr/dashboard/health.html
ceph/src/pybind/mgr/dashboard/module.py
ceph/src/pybind/mgr/dashboard/osd_perf.html
ceph/src/pybind/mgr/dashboard/osds.html
ceph/src/pybind/mgr/dashboard/rbd_iscsi.html
ceph/src/pybind/mgr/dashboard/rbd_ls.py
ceph/src/pybind/mgr/dashboard/rbd_mirroring.html
ceph/src/pybind/mgr/dashboard/rbd_pool.html
ceph/src/pybind/mgr/dashboard/servers.html
ceph/src/pybind/mgr/dashboard/standby.html [new file with mode: 0644]
ceph/src/pybind/mgr/influx/__init__.py [new file with mode: 0644]
ceph/src/pybind/mgr/influx/module.py [new file with mode: 0644]
ceph/src/pybind/mgr/localpool/__init__.py [new file with mode: 0644]
ceph/src/pybind/mgr/localpool/module.py [new file with mode: 0644]
ceph/src/pybind/mgr/mgr_module.py
ceph/src/pybind/mgr/prometheus/module.py
ceph/src/pybind/mgr/restful/module.py
ceph/src/pybind/mgr/selftest/__init__.py [new file with mode: 0644]
ceph/src/pybind/mgr/selftest/module.py [new file with mode: 0644]
ceph/src/pybind/mgr/status/module.py
ceph/src/pybind/mgr/zabbix/module.py
ceph/src/rbdmap
ceph/src/rgw/rgw_admin.cc
ceph/src/rgw/rgw_auth_s3.h
ceph/src/rgw/rgw_basic_types.h
ceph/src/rgw/rgw_bucket.cc
ceph/src/rgw/rgw_bucket.h
ceph/src/rgw/rgw_common.cc
ceph/src/rgw/rgw_common.h
ceph/src/rgw/rgw_crypt.cc
ceph/src/rgw/rgw_data_sync.cc
ceph/src/rgw/rgw_file.cc
ceph/src/rgw/rgw_file.h
ceph/src/rgw/rgw_iam_policy.cc
ceph/src/rgw/rgw_iam_policy.h
ceph/src/rgw/rgw_json_enc.cc
ceph/src/rgw/rgw_keystone.h
ceph/src/rgw/rgw_lc.cc
ceph/src/rgw/rgw_op.cc
ceph/src/rgw/rgw_op.h
ceph/src/rgw/rgw_quota.cc
ceph/src/rgw/rgw_rados.cc
ceph/src/rgw/rgw_rados.h
ceph/src/rgw/rgw_reshard.cc
ceph/src/rgw/rgw_rest.h
ceph/src/rgw/rgw_rest_swift.cc
ceph/src/rgw/rgw_rest_swift.h
ceph/src/rgw/rgw_rest_user.cc
ceph/src/rgw/rgw_swift_auth.cc
ceph/src/rgw/rgw_swift_auth.h
ceph/src/rgw/rgw_torrent.cc
ceph/src/rgw/rgw_torrent.h
ceph/src/rgw/rgw_user.cc
ceph/src/rgw/rgw_user.h
ceph/src/test/CMakeLists.txt
ceph/src/test/ceph_objectstore_tool.py [deleted file]
ceph/src/test/cli/crushtool/build.t
ceph/src/test/cli/osdmaptool/help.t
ceph/src/test/cli/osdmaptool/missing-argument.t
ceph/src/test/cli/osdmaptool/upmap-out.t [new file with mode: 0644]
ceph/src/test/cls_journal/CMakeLists.txt [new file with mode: 0644]
ceph/src/test/cls_journal/test_cls_journal.cc
ceph/src/test/common/test_bit_vector.cc
ceph/src/test/daemon_config.cc
ceph/src/test/librados/misc.cc
ceph/src/test/librbd/CMakeLists.txt
ceph/src/test/librbd/mock/MockImageCtx.h
ceph/src/test/librbd/mock/MockObjectMap.h
ceph/src/test/librbd/object_map/test_mock_UpdateRequest.cc
ceph/src/test/librbd/operation/test_mock_SnapshotCreateRequest.cc
ceph/src/test/librbd/operation/test_mock_SnapshotRemoveRequest.cc
ceph/src/test/librbd/operation/test_mock_TrimRequest.cc [new file with mode: 0644]
ceph/src/test/librgw_file.cc
ceph/src/test/librgw_file_aw.cc
ceph/src/test/librgw_file_cd.cc
ceph/src/test/librgw_file_gp.cc
ceph/src/test/librgw_file_marker.cc [new file with mode: 0644]
ceph/src/test/librgw_file_nfsns.cc
ceph/src/test/mon/PGMap.cc
ceph/src/test/perf_counters.cc
ceph/src/test/perf_local.cc
ceph/src/test/rbd_mirror/image_replayer/test_mock_BootstrapRequest.cc
ceph/src/test/rbd_mirror/mock/MockSafeTimer.h
ceph/src/test/rbd_mirror/test_mock_ImageReplayer.cc
ceph/src/test/rbd_mirror/test_mock_InstanceReplayer.cc
ceph/src/test/rbd_mirror/test_mock_PoolWatcher.cc
ceph/src/test/rgw/rgw_multi/tests.py
ceph/src/test/test_ipaddr.cc
ceph/src/tools/CMakeLists.txt
ceph/src/tools/ceph_kvstore_tool.cc
ceph/src/tools/ceph_monstore_tool.cc
ceph/src/tools/ceph_objectstore_tool.cc
ceph/src/tools/ceph_osdomap_tool.cc
ceph/src/tools/crushtool.cc
ceph/src/tools/monmaptool.cc
ceph/src/tools/osdmaptool.cc
ceph/src/tools/rbd/action/MirrorImage.cc
ceph/src/tools/rbd/action/MirrorPool.cc
ceph/src/tools/rbd_mirror/ImageReplayer.cc
ceph/src/tools/rbd_mirror/ImageReplayer.h
ceph/src/tools/rbd_mirror/PoolReplayer.cc
ceph/src/tools/rbd_mirror/PoolReplayer.h
ceph/src/tools/rbd_mirror/PoolWatcher.cc
ceph/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
ceph/src/tools/rbd_mirror/image_sync/ImageCopyRequest.cc
ceph/src/vstart.sh
ceph/systemd/ceph-rbd-mirror@.service

index c358e3e97482aecbf4cee7f2474aef43c1349911..0d362d84994a3c14fa940462c2369e4818a13f79 100644 (file)
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 2.8.11)
 
 project(ceph)
-set(VERSION 12.2.1)
+set(VERSION 12.2.2)
 
 if(POLICY CMP0046)
   # Tweak policies (this one disables "missing" dependency warning)
index b7371e4f9c0592b9cf9161bd6b6bfaf648ce75fc..a87427936a5eca14d7895aca5afa099fc1242061 100644 (file)
@@ -145,8 +145,3 @@ Files: src/include/timegm.h
   Copyright (C) Copyright Howard Hinnant
   Copyright (C) Copyright 2010-2011 Vicente J. Botet Escriba
   License: Boost Software License, Version 1.0
-
-Files: src/msg/async/AsyncConnection.cc, src/msg/simple/Pipe.cc (sigpipe suppression)
-  Copyright (C) 2010 Tomash Brechko.  All rights reserved.
-  License: GPL3
-
index 9ca48cdabe968843468d1573adedd42518dd854e..b46d1dce10090862177cb1e654105b437d6e3cab 100644 (file)
     limit (5% by default). Limits by inode count are still supported using
     mds_cache_size. Setting mds_cache_size to 0 (the default) disables the
     inode limit.
+
+* The maximum number of PGs per OSD before the monitor issues a
+  warning has been reduced from 300 to 200 PGs.  200 is still twice
+  the generally recommended target of 100 PGs per OSD.  This limit can
+  be adjusted via the ``mon_max_pg_per_osd`` option on the
+  monitors.  The older ``mon_pg_warn_max_per_osd`` option has been removed.
+
+* Creating pools or adjusting pg_num will now fail if the change would
+  make the number of PGs per OSD exceed the configured
+  ``mon_max_pg_per_osd`` limit.  The option can be adjusted if it
+  is really necessary to create a pool with more PGs.
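
To illustrate the new limit described in this release note (a sketch; the value 300 is an example, not a recommendation), the option can be set in ``ceph.conf`` or injected into running monitors::

    # ceph.conf, [global] section
    mon_max_pg_per_osd = 300

    # or at runtime, without restarting the monitors
    ceph tell mon.* injectargs '--mon_max_pg_per_osd 300'
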
index aba92c28bef9dca9b9b9b17996dfff3ed94a568e..dc14113035d249acb30de1d11bc15ae8b4c289dd 100644 (file)
@@ -1,3 +1,3 @@
-Sphinx == 1.1.3
--e git+https://github.com/ceph/sphinx-ditaa.git#egg=sphinx-ditaa
+Sphinx == 1.6.3
+-e git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa
 -e git+https://github.com/michaeljones/breathe#egg=breathe
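
These pins feed the Sphinx documentation build; a minimal sketch of installing them, assuming a checkout with the ``admin/doc-requirements.txt`` path from the file list above::

    virtualenv venv
    . venv/bin/activate
    pip install -r admin/doc-requirements.txt
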
index 5fd0a9a24c369f870237a6e065fff965e1410a4b..0a0d4a4723bef7761624faa5d39677b7e3954203 100644 (file)
@@ -1,7 +1,7 @@
 # Contributor: John Coyle <dx9err@gmail.com>
 # Maintainer: John Coyle <dx9err@gmail.com>
 pkgname=ceph
-pkgver=12.2.1
+pkgver=12.2.2
 pkgrel=0
 pkgdesc="Ceph is a distributed object store and file system"
 pkgusers="ceph"
@@ -63,7 +63,7 @@ makedepends="
        xmlstarlet
        yasm
 "
-source="ceph-12.2.1.tar.bz2"
+source="ceph-12.2.2.tar.bz2"
 subpackages="
        $pkgname-base
        $pkgname-common
@@ -116,7 +116,7 @@ _sysconfdir=/etc
 _udevrulesdir=/etc/udev/rules.d
 _python_sitelib=/usr/lib/python2.7/site-packages
 
-builddir=$srcdir/ceph-12.2.1
+builddir=$srcdir/ceph-12.2.2
 
 build() {
        export CEPH_BUILD_VIRTUALENV=$builddir
index 00d09ee21fcbc09de287f174eac63ace50ba8848..a4e5f8c94db789e1fb6aee7a612bba657ec485c0 100644 (file)
 # main package definition
 #################################################################################
 Name:          ceph
-Version:       12.2.1
+Version:       12.2.2
 Release:       0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch:         2
 %endif
 
-# define %_epoch_prefix macro which will expand to the empty string if %epoch is undefined
+# define _epoch_prefix macro which will expand to the empty string if epoch is
+# undefined
 %global _epoch_prefix %{?epoch:%{epoch}:}
 
 Summary:       User space components of the Ceph file system
@@ -76,7 +77,7 @@ License:      LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and
 Group:         System/Filesystems
 %endif
 URL:           http://ceph.com/
-Source0:       http://ceph.com/download/ceph-12.2.1.tar.bz2
+Source0:       http://ceph.com/download/ceph-12.2.2.tar.bz2
 %if 0%{?suse_version}
 %if 0%{?is_opensuse}
 ExclusiveArch:  x86_64 aarch64 ppc64 ppc64le
@@ -109,6 +110,7 @@ BuildRequires:      python-werkzeug
 %if 0%{?suse_version}
 BuildRequires: python-CherryPy
 BuildRequires: python-Werkzeug
+BuildRequires: python-numpy-devel
 %endif
 BuildRequires: python-pecan
 BuildRequires: socat
@@ -773,7 +775,7 @@ python-rbd, python-rgw or python-cephfs instead.
 # common
 #################################################################################
 %prep
-%autosetup -p1 -n ceph-12.2.1
+%autosetup -p1 -n ceph-12.2.2
 
 %build
 %if 0%{with cephfs_java}
@@ -883,6 +885,7 @@ mkdir -p %{buildroot}%{_sbindir}
 install -m 0644 -D src/logrotate.conf %{buildroot}%{_sysconfdir}/logrotate.d/ceph
 chmod 0644 %{buildroot}%{_docdir}/ceph/sample.ceph.conf
 install -m 0644 -D COPYING %{buildroot}%{_docdir}/ceph/COPYING
+install -m 0644 -D src/90-ceph-osd.conf %{buildroot}%{_sysctldir}/90-ceph-osd.conf
 
 # firewall templates and /sbin/mount.ceph symlink
 %if 0%{?suse_version}
@@ -1412,12 +1415,14 @@ fi
 %{_udevrulesdir}/95-ceph-osd.rules
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-osd.8*
+%{_mandir}/man8/ceph-bluestore-tool.8*
 %if 0%{?rhel} && ! 0%{?centos}
 %attr(0755,-,-) %{_sysconfdir}/cron.hourly/subman
 %endif
 %{_unitdir}/ceph-osd@.service
 %{_unitdir}/ceph-osd.target
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%config(noreplace) %{_sysctldir}/90-ceph-osd.conf
 
 %post osd
 %if 0%{?suse_version}
@@ -1431,6 +1436,11 @@ fi
 if [ $1 -eq 1 ] ; then
 /usr/bin/systemctl start ceph-osd.target >/dev/null 2>&1 || :
 fi
+%if 0%{?sysctl_apply}
+    %sysctl_apply 90-ceph-osd.conf
+%else
+    /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
+%endif
 
 %preun osd
 %if 0%{?suse_version}
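
The contents of ``src/90-ceph-osd.conf`` are not shown in this commitdiff; upstream it is a small sysctl drop-in that raises ``kernel.pid_max`` so that hosts running many OSDs (and their threads) do not exhaust process IDs. An illustrative sketch of the drop-in and of applying it by hand::

    # /etc/sysctl.d/90-ceph-osd.conf (illustrative contents)
    kernel.pid_max = 4194304

    # apply all sysctl drop-ins without rebooting
    sysctl --system
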
index b45c9feecfbcf0d2cfaf7678df6e1ee5c3744da8..cc0830e60ff51d60e6771385c2006cad9093525a 100644 (file)
@@ -67,7 +67,8 @@ Release:      @RPM_RELEASE@%{?dist}
 Epoch:         2
 %endif
 
-# define %_epoch_prefix macro which will expand to the empty string if %epoch is undefined
+# define _epoch_prefix macro which will expand to the empty string if epoch is
+# undefined
 %global _epoch_prefix %{?epoch:%{epoch}:}
 
 Summary:       User space components of the Ceph file system
@@ -109,6 +110,7 @@ BuildRequires:      python-werkzeug
 %if 0%{?suse_version}
 BuildRequires: python-CherryPy
 BuildRequires: python-Werkzeug
+BuildRequires: python-numpy-devel
 %endif
 BuildRequires: python-pecan
 BuildRequires: socat
@@ -883,6 +885,7 @@ mkdir -p %{buildroot}%{_sbindir}
 install -m 0644 -D src/logrotate.conf %{buildroot}%{_sysconfdir}/logrotate.d/ceph
 chmod 0644 %{buildroot}%{_docdir}/ceph/sample.ceph.conf
 install -m 0644 -D COPYING %{buildroot}%{_docdir}/ceph/COPYING
+install -m 0644 -D src/90-ceph-osd.conf %{buildroot}%{_sysctldir}/90-ceph-osd.conf
 
 # firewall templates and /sbin/mount.ceph symlink
 %if 0%{?suse_version}
@@ -1412,12 +1415,14 @@ fi
 %{_udevrulesdir}/95-ceph-osd.rules
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-osd.8*
+%{_mandir}/man8/ceph-bluestore-tool.8*
 %if 0%{?rhel} && ! 0%{?centos}
 %attr(0755,-,-) %{_sysconfdir}/cron.hourly/subman
 %endif
 %{_unitdir}/ceph-osd@.service
 %{_unitdir}/ceph-osd.target
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%config(noreplace) %{_sysctldir}/90-ceph-osd.conf
 
 %post osd
 %if 0%{?suse_version}
@@ -1431,6 +1436,11 @@ fi
 if [ $1 -eq 1 ] ; then
 /usr/bin/systemctl start ceph-osd.target >/dev/null 2>&1 || :
 fi
+%if 0%{?sysctl_apply}
+    %sysctl_apply 90-ceph-osd.conf
+%else
+    /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
+%endif
 
 %preun osd
 %if 0%{?suse_version}
index 262082cfdfe961499719d0b6f72db2bcb772084d..87cd5011c1ec44ee52556c3cc1f6db888516ae76 100644 (file)
@@ -19,3 +19,5 @@ usr/share/man/man8/ceph-disk.8
 usr/share/man/man8/ceph-volume.8
 usr/share/man/man8/ceph-volume-systemd.8
 usr/share/man/man8/ceph-osd.8
+usr/share/man/man8/ceph-bluestore-tool.8
+etc/sysctl.d/30-ceph-osd.conf
index b642dfe346455526d82b6c5ad97b989a97ff7fcb..5e44548fe826177d9c78c589f0611207d334d7e5 100644 (file)
@@ -23,6 +23,7 @@ set -e
 
 case "$1" in
     configure)
+       [ -x /etc/init.d/procps ] && invoke-rc.d procps restart || :
        [ -x /sbin/start ] && start ceph-osd-all || :
     ;;
     abort-upgrade|abort-remove|abort-deconfigure)
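
On Debian, restarting ``procps`` re-applies the drop-ins under ``/etc/sysctl.d``; the effect can be verified with (assuming the drop-in raises ``kernel.pid_max`` as sketched earlier)::

    sysctl kernel.pid_max
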
index f0524391762ca1b1b71a00e0671327106d7e1ec2..7597cb680be2e768858d66defa490b0dd693c2a3 100644 (file)
@@ -1,3 +1,9 @@
+ceph (12.2.2-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com>  Thu, 30 Nov 2017 14:59:26 +0000
+
 ceph (12.2.1-1) stable; urgency=medium
 
   * New upstream release
index 92bc0b5877660b60adfd5aac156d35af382b6a0f..857888f84e764632b5e3eee1f4d37a41ccdd3f97 100755 (executable)
@@ -50,6 +50,7 @@ override_dh_auto_install:
        install -D -m 644 udev/95-ceph-osd.rules $(DESTDIR)/lib/udev/rules.d/95-ceph-osd.rules
        install -D -m 644 udev/60-ceph-by-parttypeuuid.rules $(DESTDIR)/lib/udev/rules.d/60-ceph-by-parttypeuuid.rules
        install -D -m 644 src/etc-rbdmap $(DESTDIR)/etc/ceph/rbdmap
+       install -D -m 644 src/90-ceph-osd.conf $(DESTDIR)/etc/sysctl.d/30-ceph-osd.conf
 
 # doc/changelog is a directory, which confuses dh_installchangelogs
 override_dh_installchangelogs:
index d34e8029480fd58f8033b72f9321d5538f6a2eb3..5cf4778bbe104dd83c76d7c5ea326704912b5ca2 100644 (file)
@@ -3,19 +3,46 @@
 ceph-volume
 ===========
 Deploy OSDs with different device technologies like lvm or physical disks using
-pluggable tools (:doc:`lvm/index` itself is treated like a plugin). It tries to
-follow the workflow of ``ceph-disk`` for deploying OSDs, with a predictable,
-and robust way of preparing, activating, and starting OSDs.
+pluggable tools (:doc:`lvm/index` itself is treated like a plugin), while trying to
+follow a predictable and robust way of preparing, activating, and starting OSDs.
 
 :ref:`Overview <ceph-volume-overview>` |
 :ref:`Plugin Guide <ceph-volume-plugins>` |
 
 
 **Command Line Subcommands**
-Although currently there is support for ``lvm``, the plan is to support other
-technologies, including plain disks.
+There is currently support for ``lvm``, and for plain disks (with GPT partitions)
+that may have been deployed with ``ceph-disk``.
 
 * :ref:`ceph-volume-lvm`
+* :ref:`ceph-volume-simple`
+
+
+Migrating
+---------
+Starting with Ceph version 12.2.2, ``ceph-disk`` is deprecated. Deprecation
+warnings that link back to this page will be shown. It is strongly suggested
+that users start using ``ceph-volume``.
+
+New deployments
+^^^^^^^^^^^^^^^
+For new deployments, :ref:`ceph-volume-lvm` is recommended. It can use any
+logical volume as input for data OSDs, or it can set up a minimal/naive logical
+volume from a device.
+
+Existing OSDs
+^^^^^^^^^^^^^
+If the cluster has OSDs that were provisioned with ``ceph-disk``, then
+``ceph-volume`` can take over the management of these with
+:ref:`ceph-volume-simple`. A scan is done on the data device or OSD directory,
+and ``ceph-disk`` is fully disabled.
+
+Encrypted OSDs
+^^^^^^^^^^^^^^
+``ceph-volume`` does not currently support encrypted OSDs (although support is
+coming soon). In this case, it is OK to continue to use ``ceph-disk`` until
+``ceph-volume`` fully supports encryption. This page will be updated when that
+happens.
 
 .. toctree::
    :hidden:
@@ -23,8 +50,15 @@ technologies, including plain disks.
    :caption: Contents:
 
    intro
+   systemd
    lvm/index
    lvm/activate
    lvm/prepare
    lvm/scan
    lvm/systemd
+   lvm/list
+   lvm/zap
+   simple/index
+   simple/activate
+   simple/scan
+   simple/systemd
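
A sketch of the takeover flow described in the "Existing OSDs" section above, for one ``ceph-disk`` OSD (the OSD directory, id, and fsid below are placeholders; the real values come from the scan output)::

    # persist the OSD metadata found on a running ceph-disk OSD
    ceph-volume simple scan /var/lib/ceph/osd/ceph-0

    # hand the OSD over to ceph-volume; ceph-disk is disabled for it
    ceph-volume simple activate 0 <osd_fsid>
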
index b9f30d69f3da11f0d88ef1e2e5ba4bbd1a109896..956a62a627df2aac3f1aa9b6f251d9506ba6122c 100644 (file)
@@ -17,7 +17,7 @@ New OSDs
 To activate newly prepared OSDs both the :term:`OSD id` and :term:`OSD uuid`
 need to be supplied. For example::
 
-    ceph-volume activate --filestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8
+    ceph-volume lvm activate --bluestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8
 
 .. note:: The UUID is stored in the ``osd_fsid`` file in the OSD path, which is
           generated when :ref:`ceph-volume-lvm-prepare` is used.
@@ -46,7 +46,7 @@ For example::
 Would start the discovery process for the OSD with an id of ``0`` and a UUID of
 ``8715BEB4-15C5-49DE-BA6F-401086EC7B41``.
 
-.. note:: for more details on the systemd workflow see :ref:`ceph-volume-systemd`
+.. note:: for more details on the systemd workflow see :ref:`ceph-volume-lvm-systemd`
 
 The systemd unit will look for the matching OSD device, and by looking at its
 :term:`LVM tags` will proceed to:
@@ -58,6 +58,9 @@ The systemd unit will look for the matching OSD device, and by looking at its
 
 # start the ``ceph-osd@0`` systemd unit
 
+.. note:: The system infers the objectstore type (filestore or bluestore) by
+          inspecting the LVM tags applied to the OSD devices.
+
 Existing OSDs
 -------------
 For existing OSDs that have been deployed with different tooling, the only way
@@ -66,7 +69,18 @@ See :ref:`ceph-volume-lvm-existing-osds` for details on how to proceed.
 
 Summary
 -------
-To recap the ``activate`` process:
+To recap the ``activate`` process for :term:`bluestore`:
+
+#. require both :term:`OSD id` and :term:`OSD uuid`
+#. enable the system unit with matching id and uuid
+#. create the ``tmpfs`` mount at the OSD directory in
+   ``/var/lib/ceph/osd/$cluster-$id/``
+#. recreate all the files needed with ``ceph-bluestore-tool prime-osd-dir`` by
+   pointing it to the OSD ``block`` device
+#. the systemd unit will ensure all devices are ready and linked
+#. the matching ``ceph-osd`` systemd unit will get started
+
+And for :term:`filestore`:
 
 #. require both :term:`OSD id` and :term:`OSD uuid`
 #. enable the system unit with matching id and uuid
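
The "enable the system unit with matching id and uuid" step corresponds to a systemd instance name that encodes both values; a sketch reusing the id and uuid from the example above (unit naming per the systemd workflow document referenced in this commit)::

    # enable asynchronous discovery/activation of OSD 0 at boot
    systemctl enable ceph-volume@lvm-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41

    # the unit eventually starts the regular OSD service
    systemctl start ceph-osd@0
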
diff --git a/ceph/doc/ceph-volume/lvm/create.rst b/ceph/doc/ceph-volume/lvm/create.rst
new file mode 100644 (file)
index 0000000..c90d1f6
--- /dev/null
@@ -0,0 +1,24 @@
+.. _ceph-volume-lvm-create:
+
+``create``
+===========
+This subcommand wraps the two-step process to provision a new OSD (calling
+``prepare`` first and then ``activate``) into a single
+one. The reason to prefer ``prepare`` and then ``activate`` is to gradually
+introduce new OSDs into a cluster while avoiding large amounts of data being
+rebalanced.
+
+The single-call process unifies exactly what :ref:`ceph-volume-lvm-prepare` and
+:ref:`ceph-volume-lvm-activate` do, with the convenience of doing it all at
+once.
+
+There is nothing different about the process except that the OSD will become
+``up`` and ``in`` immediately after completion.
+
+The backing objectstore can be specified with:
+
+* :ref:`--filestore <ceph-volume-lvm-prepare_filestore>`
+* :ref:`--bluestore <ceph-volume-lvm-prepare_bluestore>`
+
+All command line flags and options are the same as ``ceph-volume lvm prepare``.
+Please refer to :ref:`ceph-volume-lvm-prepare` for details.
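
A minimal invocation of this single-call workflow, mirroring the ``prepare`` examples elsewhere in this commit (the ``vg/lv`` value is a placeholder for an existing volume group and logical volume)::

    # prepare + activate in one step; the OSD becomes up and in immediately
    ceph-volume lvm create --bluestore --data vg/lv
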
index 5c1ef01177ea14791fdd9c89a0210677c282249d..9a2191fb519359107b3464589cf0bbd4146d6b93 100644 (file)
@@ -11,6 +11,10 @@ Implements the functionality needed to deploy OSDs from the ``lvm`` subcommand:
 
 * :ref:`ceph-volume-lvm-activate`
 
+* :ref:`ceph-volume-lvm-create`
+
+* :ref:`ceph-volume-lvm-list`
+
 .. not yet implemented
 .. * :ref:`ceph-volume-lvm-scan`
 
@@ -20,5 +24,5 @@ There are other aspects of the ``lvm`` subcommand that are internal and not
 exposed to the user, these sections explain how these pieces work together,
 clarifying the workflows of the tool.
 
-:ref:`Systemd Units <ceph-volume-systemd>` |
+:ref:`Systemd Units <ceph-volume-lvm-systemd>` |
 :ref:`lvm <ceph-volume-lvm-api>`
diff --git a/ceph/doc/ceph-volume/lvm/list.rst b/ceph/doc/ceph-volume/lvm/list.rst
new file mode 100644 (file)
index 0000000..19e0600
--- /dev/null
@@ -0,0 +1,173 @@
+.. _ceph-volume-lvm-list:
+
+``list``
+========
+This subcommand will list any devices (logical and physical) that may be
+associated with a Ceph cluster, as long as they contain enough metadata to
+allow for that discovery.
+
+Output is grouped by the OSD ID associated with the devices, and unlike
+``ceph-disk`` it does not provide any information for devices that aren't
+associated with Ceph.
+
+Command line options:
+
+* ``--format`` Allows a ``json`` or ``pretty`` value. Defaults to ``pretty``
+  which will group the device information in a human-readable format.
+
+Full Reporting
+--------------
+When no positional arguments are used, a full reporting will be presented. This
+means that all devices and logical volumes found in the system will be
+displayed.
+
+Full ``pretty`` reporting for two OSDs, one with an lv as a journal and another
+one with a physical device as a journal, may look similar to::
+
+    # ceph-volume lvm list
+
+
+    ====== osd.1 =======
+
+      [journal]    /dev/journals/journal1
+
+          journal uuid              C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs
+          osd id                    1
+          cluster fsid              ce454d91-d748-4751-a318-ff7f7aa18ffd
+          type                      journal
+          osd fsid                  661b24f8-e062-482b-8110-826ffe7f13fa
+          data uuid                 SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
+          journal device            /dev/journals/journal1
+          data device               /dev/test_group/data-lv2
+
+      [data]    /dev/test_group/data-lv2
+
+          journal uuid              C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs
+          osd id                    1
+          cluster fsid              ce454d91-d748-4751-a318-ff7f7aa18ffd
+          type                      data
+          osd fsid                  661b24f8-e062-482b-8110-826ffe7f13fa
+          data uuid                 SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
+          journal device            /dev/journals/journal1
+          data device               /dev/test_group/data-lv2
+
+    ====== osd.0 =======
+
+      [data]    /dev/test_group/data-lv1
+
+          journal uuid              cd72bd28-002a-48da-bdf6-d5b993e84f3f
+          osd id                    0
+          cluster fsid              ce454d91-d748-4751-a318-ff7f7aa18ffd
+          type                      data
+          osd fsid                  943949f0-ce37-47ca-a33c-3413d46ee9ec
+          data uuid                 TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00
+          journal device            /dev/sdd1
+          data device               /dev/test_group/data-lv1
+
+      [journal]    /dev/sdd1
+
+          PARTUUID                  cd72bd28-002a-48da-bdf6-d5b993e84f3f
+
+.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
+          as a ``ceph.osd_id`` tag. For more information on lvm tag conventions
+          see :ref:`ceph-volume-lvm-tag-api`
+
+Single Reporting
+----------------
+Single reporting can consume both devices and logical volumes as input
+(positional parameters). For logical volumes, it is required to use the group
+name as well as the logical volume name.
+
+For example the ``data-lv2`` logical volume, in the ``test_group`` volume group
+can be listed in the following way::
+
+    # ceph-volume lvm list test_group/data-lv2
+
+
+    ====== osd.1 =======
+
+      [data]    /dev/test_group/data-lv2
+
+          journal uuid              C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs
+          osd id                    1
+          cluster fsid              ce454d91-d748-4751-a318-ff7f7aa18ffd
+          type                      data
+          osd fsid                  661b24f8-e062-482b-8110-826ffe7f13fa
+          data uuid                 SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
+          journal device            /dev/journals/journal1
+          data device               /dev/test_group/data-lv2
+
+
+.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
+          as a ``ceph.osd_id`` tag. For more information on lvm tag conventions
+          see :ref:`ceph-volume-lvm-tag-api`
+
+
+For plain disks, the full path to the device is required. For example, for
+a device like ``/dev/sdd1`` it can look like::
+
+
+    # ceph-volume lvm list /dev/sdd1
+
+
+    ====== osd.0 =======
+
+      [journal]    /dev/sdd1
+
+          PARTUUID                  cd72bd28-002a-48da-bdf6-d5b993e84f3f
+
+
+
+``json`` output
+---------------
+All output using ``--format=json`` will show everything the system has stored
+as metadata for the devices, including tags.
+
+No changes for readability are done with ``json`` reporting, and all
+information is presented as-is. Full output as well as single devices can be
+listed.
+
+For brevity, this is how a single logical volume would look with ``json``
+output (note how tags aren't modified)::
+
+    # ceph-volume lvm list --format=json test_group/data-lv1
+    {
+        "0": [
+            {
+                "lv_name": "data-lv1",
+                "lv_path": "/dev/test_group/data-lv1",
+                "lv_tags": "ceph.cluster_fsid=ce454d91-d748-4751-a318-ff7f7aa18ffd,ceph.data_device=/dev/test_group/data-lv1,ceph.data_uuid=TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00,ceph.journal_device=/dev/sdd1,ceph.journal_uuid=cd72bd28-002a-48da-bdf6-d5b993e84f3f,ceph.osd_fsid=943949f0-ce37-47ca-a33c-3413d46ee9ec,ceph.osd_id=0,ceph.type=data",
+                "lv_uuid": "TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00",
+                "name": "data-lv1",
+                "path": "/dev/test_group/data-lv1",
+                "tags": {
+                    "ceph.cluster_fsid": "ce454d91-d748-4751-a318-ff7f7aa18ffd",
+                    "ceph.data_device": "/dev/test_group/data-lv1",
+                    "ceph.data_uuid": "TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00",
+                    "ceph.journal_device": "/dev/sdd1",
+                    "ceph.journal_uuid": "cd72bd28-002a-48da-bdf6-d5b993e84f3f",
+                    "ceph.osd_fsid": "943949f0-ce37-47ca-a33c-3413d46ee9ec",
+                    "ceph.osd_id": "0",
+                    "ceph.type": "data"
+                },
+                "type": "data",
+                "vg_name": "test_group"
+            }
+        ]
+    }
+
+
+Synchronized information
+------------------------
+Before any type of listing, the lvm API is queried to ensure that physical devices
+that may be in use haven't changed names. It is possible that non-persistent
+devices like ``/dev/sda1`` could change to ``/dev/sdb1``.
+
+The detection is possible because the ``PARTUUID`` is stored as part of the
+metadata in the logical volume for the data lv. Even in the case of a journal
+that is a physical device, this information is still stored on the data logical
+volume associated with it.
+
+If the name is no longer the same (as reported by ``blkid`` when using the
+``PARTUUID``), the tag will get updated and the report will use the newly
+refreshed information.
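
Because the ``json`` output is presented as-is, it is convenient for scripting; a sketch using ``jq`` (the use of ``jq`` is an assumption of this example, not part of the commit)::

    # print the data device path for OSD 0 from the JSON report
    ceph-volume lvm list --format=json | jq -r '."0"[] | select(.type == "data") | .path'
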
index add0f185d864642e28b7f8e107ca5ea84ef89a48..27ebb55d7e6283c0549c9c2af5f41edee28e8221 100644 (file)
@@ -2,10 +2,11 @@
 
 ``prepare``
 ===========
-This subcommand allows a :term:`filestore` setup (:term:`bluestore` support is
-planned) and currently consumes only logical volumes for both the data and
-journal. It will not create or modify the logical volumes except for adding
-extra metadata.
+This subcommand allows a :term:`filestore` or :term:`bluestore` setup. It is
+recommended to pre-provision a logical volume before using it with
+``ceph-volume lvm``.
+
+Logical volumes are not altered except for adding extra metadata.
 
 .. note:: This is part of a two-step process to deploy an OSD. If looking for
           a single-call way, please see :ref:`ceph-volume-lvm-create`
@@ -23,28 +24,46 @@ the back end can be specified with:
 
 
 * :ref:`--filestore <ceph-volume-lvm-prepare_filestore>`
-* ``--bluestore``
-
-.. when available, this will need to be updated to:
-.. * :ref:`--bluestore <ceph-volume-prepare_bluestore>`
+* :ref:`--bluestore <ceph-volume-lvm-prepare_bluestore>`
 
 .. _ceph-volume-lvm-prepare_filestore:
 
 ``filestore``
 -------------
-This is the default OSD backend and allows preparation of logical volumes for
-a :term:`filestore` OSD.
+This is the OSD backend that allows preparation of logical volumes for
+a :term:`filestore` objectstore OSD.
 
-The process is *very* strict, it requires two logical volumes that are ready to
-be used. No special preparation is needed for these volumes other than
-following the minimum size requirements for data and journal.
+It can use a logical volume for the OSD data and a partitioned physical device
+or logical volume for the journal.  No special preparation is needed for these
+volumes other than following the minimum size requirements for data and
+journal.
 
 The API call looks like::
 
     ceph-volume prepare --filestore --data data --journal journal
 
-The journal *must* be a logical volume, just like the data volume, and that
-argument is always required even if both live under the same group.
+A raw device or partition can also be used for ``--data``; it will be
+converted to a logical volume. This is not ideal in all situations, since
+``ceph-volume`` will simply create a unique volume group and a logical volume
+from that device, with no way to customize how they are created.
+
+When using logical volumes for ``--data``, the value *must* be a volume group
+name and a logical volume name separated by a ``/``. Since logical volume names
+are not enforced for uniqueness, this prevents using the wrong volume. The
+``--journal`` can be either a logical volume *or* a partition.
+
+When using a partition, it *must* contain a ``PARTUUID`` discoverable by
+``blkid``, so that it can later be identified correctly regardless of the
+device name (or path).
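+
+A quick way to confirm that a partition exposes a ``PARTUUID`` is to query
+``blkid`` directly (values are illustrative)::
+
+    # blkid -s PARTUUID /dev/sdc1
+    /dev/sdc1: PARTUUID="cd72bd28-002a-48da-bdf6-d5b993e84f3f"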
+
+When using a partition, this is how it would look for ``/dev/sdc1``::
+
+    ceph-volume prepare --filestore --data volume_group/lv_name --journal /dev/sdc1
+
+For a logical volume, just like for ``--data``, a volume group and logical
+volume name are required::
+
+    ceph-volume prepare --filestore --data volume_group/lv_name --journal volume_group/journal_lv
 
 A generated uuid is used to ask the cluster for a new OSD. These two pieces are
 crucial for identifying an OSD and will later be used throughout the
@@ -108,32 +127,109 @@ later be started (for detailed metadata description see :ref:`ceph-volume-lvm-ta
 
 ``bluestore``
 -------------
-This subcommand is planned but not currently implemented.
+The :term:`bluestore` objectstore is the default for new OSDs. It offers a bit
+more flexibility for devices. Bluestore supports the following configurations:
+
+* A block device, a block.wal, and a block.db device
+* A block device and a block.wal device
+* A block device and a block.db device
+* A single block device
+
+It can accept a whole device (or partition), or a logical volume for ``block``.
+If a physical device is provided, it will be turned into a logical volume.
+This allows a simpler approach to using LVM, at the cost of flexibility:
+there are no options or configurations to change how the LV is created.
+
+The ``block`` is specified with the ``--data`` flag, and in its simplest use
+case it looks like::
+
+    ceph-volume lvm prepare --bluestore --data vg/lv
+
+A raw device can be specified in the same way::
+
+    ceph-volume lvm prepare --bluestore --data /path/to/device
+
+
+If a ``block.db`` or a ``block.wal`` is needed (they are optional for
+bluestore) they can be specified with ``--block.db`` and ``--block.wal``
+accordingly. These can be a physical device (they **must** be a partition) or
+a logical volume.
+
+Partitions used for ``block.db`` or ``block.wal`` aren't made into logical
+volumes because they can be used as-is; logical volumes are also allowed.
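+
+For example, a sketch that combines a logical volume for ``block`` with
+partitions for both optional devices (paths are illustrative)::
+
+    ceph-volume lvm prepare --bluestore --data vg/lv --block.wal /dev/sdb1 --block.db /dev/sdc1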
+
+While creating the OSD directory, the process will use a ``tmpfs`` mount to
+place all the files needed for the OSD. These files are initially created by
+``ceph-osd --mkfs`` and are fully ephemeral.
+
+A symlink is always created for the ``block`` device, and optionally for
+``block.db`` and ``block.wal``. For a cluster with a default name, and an OSD
+id of 0, the directory could look like::
+
+    # ls -l /var/lib/ceph/osd/ceph-0
+    lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block -> /dev/ceph-be2b6fbd-bcf2-4c51-b35d-a35a162a02f0/osd-block-25cf0a05-2bc6-44ef-9137-79d65bd7ad62
+    lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block.db -> /dev/sda1
+    lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block.wal -> /dev/ceph/osd-wal-0
+    -rw-------. 1 ceph ceph 37 Oct 20 13:05 ceph_fsid
+    -rw-------. 1 ceph ceph 37 Oct 20 13:05 fsid
+    -rw-------. 1 ceph ceph 55 Oct 20 13:05 keyring
+    -rw-------. 1 ceph ceph  6 Oct 20 13:05 ready
+    -rw-------. 1 ceph ceph 10 Oct 20 13:05 type
+    -rw-------. 1 ceph ceph  2 Oct 20 13:05 whoami
+
+In the above case, a device was used for ``block``, so ``ceph-volume`` created
+a volume group and a logical volume using the following convention (a sketch
+of the resulting names follows the list):
+
+* volume group name: ``ceph-{cluster fsid}``, or ``ceph-{random uuid}`` if
+  that volume group already exists
+
+* logical volume name: ``osd-block-{osd_fsid}``
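+
+For example, listing the resulting names with plain LVM tools might look
+something like this (identifiers are illustrative)::
+
+    # lvs -o vg_name,lv_name
+      VG                                        LV
+      ceph-ce454d91-d748-4751-a318-ff7f7aa18ffd osd-block-943949f0-ce37-47ca-a33c-3413d46ee9ec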
 
 
 Storing metadata
 ----------------
-The following tags will get applied as part of the prepartion process
-regardless of the type of volume (journal or data) and also regardless of the
-OSD backend:
+The following tags will get applied as part of the preparation process
+regardless of the type of volume (journal or data) or OSD objectstore:
 
 * ``cluster_fsid``
-* ``data_device``
-* ``journal_device``
 * ``encrypted``
 * ``osd_fsid``
 * ``osd_id``
-* ``block``
-* ``db``
-* ``wal``
-* ``lockbox_device``
+
+For :term:`filestore` these tags will be added:
+
+* ``journal_device``
+* ``journal_uuid``
+
+For :term:`bluestore` these tags will be added:
+
+* ``block_device``
+* ``block_uuid``
+* ``db_device``
+* ``db_uuid``
+* ``wal_device``
+* ``wal_uuid``
 
 .. note:: For the complete lvm tag conventions see :ref:`ceph-volume-lvm-tag-api`
 
 
 Summary
 -------
-To recap the ``prepare`` process:
+To recap the ``prepare`` process for :term:`bluestore`:
+
+#. Accept a logical volume for block or a raw device (that will get converted
+   to an lv)
+#. Accept partitions or logical volumes for ``block.wal`` or ``block.db``
+#. Generate a UUID for the OSD
+#. Ask the monitor for an OSD ID, reusing the generated UUID (see the sketch
+   after this list)
+#. Create the OSD data directory on a tmpfs mount
+#. Symlink ``block``, ``block.wal``, and ``block.db`` if defined
+#. Fetch the monmap for activation
+#. Populate the data directory via ``ceph-osd``
+#. Assign all the Ceph metadata to the logical volumes using lvm tags
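+
+Under the hood, the OSD ID step is roughly equivalent to asking the monitor
+directly (a sketch; the UUID is illustrative)::
+
+    ceph osd new 943949f0-ce37-47ca-a33c-3413d46ee9ec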
+
+
+And the ``prepare`` process for :term:`filestore`:
 
 #. Accept only logical volumes for data and journal (both required)
 #. Generate a UUID for the OSD
index 7162e0433526f1e7074358fdf0a6541d5ae967af..30260de7e8826ae939f66094bd53952a8a6da44e 100644 (file)
@@ -1,31 +1,7 @@
-.. _ceph-volume-systemd:
+.. _ceph-volume-lvm-systemd:
 
 systemd
 =======
-As part of the :ref:`ceph-volume-lvm-activate` process, a few systemd units will get enabled
-that will use the OSD id and uuid as part of their name. These units will be
-run when the system boots, and will proceed to activate their corresponding
-volumes.
-
-The API for activation requires both the :term:`OSD id` and :term:`OSD uuid`,
-which get persisted by systemd. Internally, the activation process enables the
-systemd unit using the following convention::
-
-    ceph-volume@<type>-<extra metadata>
-
-Where ``type`` is the sub-command used to parse the extra metadata, and ``extra
-metadata`` is any additional information needed by the sub-command to be able
-to activate the OSD. For example an OSD with an ID of 0, for the ``lvm``
-sub-command would look like::
-
-    systemctl enable ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6
-
-
-Process
--------
-The systemd unit is a :term:`systemd oneshot` service, meant to start at boot after the
-local filesystem is ready to be used.
-
 Upon startup, it will identify the logical volume using :term:`LVM tags`,
 finding a matching ID and later ensuring it is the right one with
 the :term:`OSD uuid`.
@@ -41,6 +17,12 @@ be mounted at::
 
     /var/lib/ceph/osd/ceph-0
 
+
 Once that process is complete, a call will be made to start the OSD::
 
     systemctl start ceph-osd@0
+
+The systemd portion of this process is handled by the ``ceph-volume lvm
+trigger`` sub-command, which is only in charge of parsing metadata coming from
+systemd at startup, and then dispatching to ``ceph-volume lvm activate``,
+which proceeds with activation.
diff --git a/ceph/doc/ceph-volume/lvm/zap.rst b/ceph/doc/ceph-volume/lvm/zap.rst
new file mode 100644 (file)
index 0000000..8d42a90
--- /dev/null
@@ -0,0 +1,19 @@
+.. _ceph-volume-lvm-zap:
+
+``zap``
+=======
+
+This subcommand is used to zap lvs or partitions that have been used
+by ceph OSDs so that they may be reused. If given a path to a logical
+volume it must be in the format of vg/lv. Any filesystems present
+on the given lv or partition will be removed and all data will be purged.
+
+.. note:: The lv or partition will be kept intact.
+
+Zapping a logical volume::
+
+      ceph-volume lvm zap {vg name/lv name}
+
+Zapping a partition::
+
+      ceph-volume lvm zap /dev/sdc1
diff --git a/ceph/doc/ceph-volume/simple/activate.rst b/ceph/doc/ceph-volume/simple/activate.rst
new file mode 100644 (file)
index 0000000..edbb1e3
--- /dev/null
@@ -0,0 +1,80 @@
+.. _ceph-volume-simple-activate:
+
+``activate``
+============
+Once :ref:`ceph-volume-simple-scan` has been completed, and all the metadata
+captured for an OSD has been persisted to ``/etc/ceph/osd/{id}-{uuid}.json``,
+the OSD is ready to be "activated".
+
+This activation process **disables** all ``ceph-disk`` systemd units by masking
+them, to prevent the UDEV/ceph-disk interaction that will attempt to start them
+up at boot time.
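+
+Masking points a unit at ``/dev/null`` so that systemd can never start it.
+Done by hand, it would look something like this (the unit name is
+illustrative)::
+
+    systemctl mask ceph-disk@dev-sdc1.service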
+
+The disabling of ``ceph-disk`` units is done only when calling ``ceph-volume
+simple activate`` directly, but it is avoided when the call comes from systemd
+while the system is booting up.
+
+The activation process requires using both the :term:`OSD id` and
+:term:`OSD uuid`. To activate parsed OSDs::
+
+    ceph-volume simple activate 0 6cc43680-4f6e-4feb-92ff-9c7ba204120e
+
+The above command will assume that a JSON configuration will be found in::
+
+    /etc/ceph/osd/0-6cc43680-4f6e-4feb-92ff-9c7ba204120e.json
+
+Alternatively, using a path to a JSON file directly is also possible::
+
+    ceph-volume simple activate --file /etc/ceph/osd/0-6cc43680-4f6e-4feb-92ff-9c7ba204120e.json
+
+requiring uuids
+^^^^^^^^^^^^^^^
+The :term:`OSD uuid` is required as an extra step to ensure that the
+right OSD is being activated. It is entirely possible that a previous OSD with
+the same id exists, in which case the wrong one could end up being activated.
+
+
+Discovery
+---------
+With OSDs previously scanned by ``ceph-volume``, a *discovery* process is
+performed using ``blkid`` and ``lvm``. There is currently support only for
+devices with GPT partitions and LVM logical volumes.
+
+The GPT partitions will have a ``PARTUUID`` that can be queried by calling out
+to ``blkid``, and the logical volumes will have a ``lv_uuid`` that can be
+queried against ``lvs`` (the LVM tool to list logical volumes).
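+
+A sketch of the equivalent manual queries (device and volume names are
+illustrative)::
+
+    blkid -s PARTUUID /dev/sda1
+    lvs -o lv_name,lv_uuid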
+
+This discovery process ensures that devices can be correctly detected even if
+they are repurposed into another system or if their name changes (as in the
+case of non-persistent names like ``/dev/sda1``).
+
+The JSON configuration file used to map what devices go to what OSD will then
+coordinate the mounting and symlinking as part of activation.
+
+To ensure that the symlinks are always correct, if they exist in the OSD
+directory, the symlinks will be re-done.
+
+A systemd unit will capture the :term:`OSD id` and :term:`OSD uuid` and
+persist them. Internally, the activation will enable the unit like::
+
+    systemctl enable ceph-volume@simple-$id-$uuid
+
+For example::
+
+    systemctl enable ceph-volume@simple-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
+
+Would start the discovery process for the OSD with an id of ``0`` and a UUID of
+``8715BEB4-15C5-49DE-BA6F-401086EC7B41``.
+
+
+The systemd process will call out to activate passing the information needed to
+identify the OSD and its devices, and it will proceed to:
+
+#. mount the device in the corresponding location (by convention this is
+   ``/var/lib/ceph/osd/<cluster name>-<osd id>/``)
+
+#. ensure that all required devices are ready for that OSD and properly
+   linked, regardless of objectstore used (filestore or bluestore). The
+   symbolic link will **always** be re-done to ensure that the correct
+   device is linked.
+
+#. start the ``ceph-osd@0`` systemd unit
diff --git a/ceph/doc/ceph-volume/simple/index.rst b/ceph/doc/ceph-volume/simple/index.rst
new file mode 100644 (file)
index 0000000..6f2534a
--- /dev/null
@@ -0,0 +1,19 @@
+.. _ceph-volume-simple:
+
+``simple``
+==========
+Implements the functionality needed to manage OSDs from the ``simple`` subcommand:
+``ceph-volume simple``
+
+**Command Line Subcommands**
+
+* :ref:`ceph-volume-simple-scan`
+
+* :ref:`ceph-volume-simple-activate`
+
+* :ref:`ceph-volume-simple-systemd`
+
+
+By *taking over* management, it disables all ``ceph-disk`` systemd units used
+to trigger devices at startup, relying on basic (customizable) JSON
+configuration and systemd for starting up OSDs.
diff --git a/ceph/doc/ceph-volume/simple/scan.rst b/ceph/doc/ceph-volume/simple/scan.rst
new file mode 100644 (file)
index 0000000..afeddab
--- /dev/null
@@ -0,0 +1,158 @@
+.. _ceph-volume-simple-scan:
+
+``scan``
+========
+Scanning captures any important details from an already-deployed OSD
+so that ``ceph-volume`` can manage it without the need for any other startup
+workflows or tools (like ``udev`` or ``ceph-disk``).
+
+The command can inspect a running OSD by examining the directory where the
+OSD data is stored, or by consuming the data partition.
+
+Once an OSD is scanned, its metadata will (by default) be persisted as JSON in
+a file in ``/etc/ceph/osd``. This ``JSON`` file will use the naming convention
+``{OSD ID}-{OSD FSID}.json``. For an OSD with an id of 1 and an FSID of
+``86ebd829-1405-43d3-8fd6-4cbc9b6ecf96``, the absolute path of the file would
+be::
+
+    /etc/ceph/osd/1-86ebd829-1405-43d3-8fd6-4cbc9b6ecf96.json
+
+The ``scan`` subcommand will refuse to write to this file if it already exists.
+If overwriting the contents is needed, the ``--force`` flag must be used::
+
+    ceph-volume simple scan --force {path}
+
+If there is no need to persist the ``JSON`` metadata, there is support to send
+the contents to ``stdout`` (no file will be written)::
+
+    ceph-volume simple scan --stdout {path}
+
+
+.. _ceph-volume-simple-scan-directory:
+
+Directory scan
+--------------
+The directory scan will capture the contents of interesting OSD files. There
+are a few files that must exist in order to have a successful scan:
+
+* ``ceph_fsid``
+* ``fsid``
+* ``keyring``
+* ``ready``
+* ``type``
+* ``whoami``
+
+In the case of any other file, as long as it is not a binary or a directory, it
+will also get captured and persisted as part of the JSON object.
+
+The convention for the keys in the JSON object is that any file name will be
+a key, and its contents will be its value. If the contents are a single line
+(as in the case of ``whoami``) they are trimmed, and the newline is dropped.
+For example, with an OSD with an id of 1, this is how the JSON entry would
+look::
+
+    "whoami": "1",
+
+For files that may have more than one line, the contents are left as-is, for
+example, a ``keyring`` could look like this::
+
+    "keyring": "[osd.1]\n\tkey = AQBBJ/dZp57NIBAAtnuQS9WOS0hnLVe0rZnE6Q==\n",
+
+For a directory like ``/var/lib/ceph/osd/ceph-1``, the command could look
+like::
+
+    ceph-volume simple scan /var/lib/ceph/osd/ceph-1
+
+
+.. note:: There is no support for encrypted OSDs
+
+
+.. _ceph-volume-simple-scan-device:
+
+Device scan
+-----------
+When an OSD directory is not available (OSD is not running, or device is not
+mounted) the ``scan`` command is able to introspect the device to capture
+required data. Just like :ref:`ceph-volume-simple-scan-directory`, it would
+still require a few files present. This means that the device to be scanned
+**must be** the data partition of the OSD.
+
+As long as the data partition of the OSD is being passed in as an argument, the
+sub-command can scan its contents.
+
+In the case where the device is already mounted, the tool can detect this
+scenario and capture file contents from that directory.
+
+If the device is not mounted, a temporary directory will be created, and the
+device will be mounted temporarily just for scanning the contents. Once
+contents are scanned, the device will be unmounted.
+
+For a device like ``/dev/sda1`` which **must** be a data partition, the command
+could look like::
+
+    ceph-volume simple scan /dev/sda1
+
+
+.. note:: There is no support for encrypted OSDs
+
+
+.. _ceph-volume-simple-scan-json:
+
+``JSON`` contents
+-----------------
+The contents of the JSON object are very simple. The scan will not only
+persist information from the special OSD files and their contents, but will
+also validate paths and device UUIDs. Unlike ``ceph-disk``, which stored them
+in ``{device type}_uuid`` files, the tool will persist them as part of the
+device type key.
+
+For example, a ``block.db`` device would look something like::
+
+    "block.db": {
+        "path": "/dev/disk/by-partuuid/6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+        "uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e"
+    },
+
+But it will also persist the ``ceph-disk`` special file generated, like so::
+
+    "block.db_uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+
+This duplication is in place because the tool is trying to ensure the
+following:
+
+#. Support OSDs that may not have ceph-disk special files
+#. Check the most up-to-date information on the device, by querying against
+   LVM and ``blkid``
+#. Support both logical volumes and GPT devices
+
+This is sample ``JSON`` metadata from an OSD that is using ``bluestore``::
+
+    {
+        "active": "ok",
+        "block": {
+            "path": "/dev/disk/by-partuuid/40fd0a64-caa5-43a3-9717-1836ac661a12",
+            "uuid": "40fd0a64-caa5-43a3-9717-1836ac661a12"
+        },
+        "block.db": {
+            "path": "/dev/disk/by-partuuid/6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+            "uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e"
+        },
+        "block.db_uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+        "block_uuid": "40fd0a64-caa5-43a3-9717-1836ac661a12",
+        "bluefs": "1",
+        "ceph_fsid": "c92fc9eb-0610-4363-aafc-81ddf70aaf1b",
+        "cluster_name": "ceph",
+        "data": {
+            "path": "/dev/sdr1",
+            "uuid": "86ebd829-1405-43d3-8fd6-4cbc9b6ecf96"
+        },
+        "fsid": "86ebd829-1405-43d3-8fd6-4cbc9b6ecf96",
+        "keyring": "[osd.3]\n\tkey = AQBBJ/dZp57NIBAAtnuQS9WOS0hnLVe0rZnE6Q==\n",
+        "kv_backend": "rocksdb",
+        "magic": "ceph osd volume v026",
+        "mkfs_done": "yes",
+        "ready": "ready",
+        "systemd": "",
+        "type": "bluestore",
+        "whoami": "3"
+    }
diff --git a/ceph/doc/ceph-volume/simple/systemd.rst b/ceph/doc/ceph-volume/simple/systemd.rst
new file mode 100644 (file)
index 0000000..aa5bebf
--- /dev/null
@@ -0,0 +1,28 @@
+.. _ceph-volume-simple-systemd:
+
+systemd
+=======
+Upon startup, the unit will identify the OSD by loading the JSON file in
+``/etc/ceph/osd/{id}-{uuid}.json`` corresponding to the instance name of the
+systemd unit.
+
+After identifying the correct volume it will then proceed to mount it by using
+the OSD destination conventions, that is::
+
+    /var/lib/ceph/osd/{cluster name}-{osd id}
+
+For our example OSD with an id of ``0``, that means the identified device will
+be mounted at::
+
+    /var/lib/ceph/osd/ceph-0
+
+
+Once that process is complete, a call will be made to start the OSD::
+
+    systemctl start ceph-osd@0
+
+The systemd portion of this process is handled by the ``ceph-volume simple
+trigger`` sub-command, which is only in charge of parsing metadata coming from
+systemd at startup, and then dispatching to ``ceph-volume simple activate``,
+which proceeds with activation.
diff --git a/ceph/doc/ceph-volume/systemd.rst b/ceph/doc/ceph-volume/systemd.rst
new file mode 100644 (file)
index 0000000..6cbc112
--- /dev/null
@@ -0,0 +1,49 @@
+.. _ceph-volume-systemd:
+
+systemd
+=======
+As part of the activation process (either with :ref:`ceph-volume-lvm-activate`
+or :ref:`ceph-volume-simple-activate`), systemd units will get enabled that
+will use the OSD id and uuid as part of their name. These units will be run
+when the system boots, and will proceed to activate their corresponding
+volumes via their sub-command implementation.
+
+The API for activation is a bit loose: it only requires two parts, the
+subcommand to use and any extra metadata, separated by a dash. This
+convention makes the units look like::
+
+    ceph-volume@{command}-{extra metadata}
+
+The *extra metadata* can be anything the subcommand implementing the
+processing might need. In the case of :ref:`ceph-volume-lvm` and
+:ref:`ceph-volume-simple`, both consume the :term:`OSD id` and :term:`OSD uuid`,
+but this is not a hard requirement; it is just how the sub-commands are
+implemented.
+
+Both the command and the extra metadata get persisted by systemd as part of
+the *"instance name"* of the unit.  For example, an OSD with an ID of 0, using
+the ``lvm`` sub-command, would look like::
+
+    systemctl enable ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6
+
+The enabled unit is a :term:`systemd oneshot` service, meant to start at boot
+after the local filesystem is ready to be used.
+
+
+Failure and Retries
+-------------------
+It is common to have failures when a system is first coming online. Devices
+are sometimes not fully available, and this unpredictable behavior may cause
+an OSD to not be ready to be used.
+
+There are two configurable environment variables used to set the retry
+behavior:
+
+* ``CEPH_VOLUME_SYSTEMD_TRIES``: Defaults to 30
+* ``CEPH_VOLUME_SYSTEMD_INTERVAL``: Defaults to 5
+
+The *"tries"* is a number that sets the maximum amount of times the unit will
+attempt to activate an OSD before giving up.
+
+The *"interval"* is a value in seconds that determines the waiting time before
+initiating another try at activating the OSD.
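+
+For example, the defaults could be overridden with a systemd drop-in for the
+templated unit (a sketch; the path and values are illustrative)::
+
+    # /etc/systemd/system/ceph-volume@.service.d/retry.conf
+    [Service]
+    Environment=CEPH_VOLUME_SYSTEMD_TRIES=60
+    Environment=CEPH_VOLUME_SYSTEMD_INTERVAL=10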
index 4f7bea3ef8585411bf291e3300d40326592c1b30..2fd47ae334f8ae153fc0e0ad8a505afa985b98d5 100644 (file)
               
 :Type:  Boolean
 :Default:  ``false``
+
+
+``mds min caps per client``
+
+:Description: Set the minimum number of capabilities a client may hold.
+:Type: Integer
+:Default: ``100``
+
+
+``mds max ratio caps per client``
+
+:Description: Set the maximum ratio of current caps that may be recalled during MDS cache pressure.
+:Type: Float
+:Default: ``0.8``
index 49b6ecde29e2445bead056ded89597bfdcfd5040..a1968bb4c1476943e46446599994e20bf919f92f 100644 (file)
@@ -41,9 +41,10 @@ extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.graphviz',
     'sphinx.ext.todo',
-    'sphinx_ditaa',
+    'sphinxcontrib.ditaa',
     'breathe',
     ]
+ditaa = 'ditaa'
 todo_include_todos = True
 
 top_level = os.path.dirname(
index fd6bbae58eab128486cf539836efba4c3b444905..f819dede2543c96b6ac65c3a0a6c67610c4569c4 100644 (file)
@@ -26,7 +26,8 @@ set(osd_srcs
   ceph-volume.rst
   ceph-volume-systemd.rst
   ceph-osd.rst
-  osdmaptool.rst)
+  osdmaptool.rst
+  ceph-bluestore-tool.rst)
 
 set(mon_srcs
   ceph-mon.rst
diff --git a/ceph/doc/man/8/ceph-bluestore-tool.rst b/ceph/doc/man/8/ceph-bluestore-tool.rst
new file mode 100644 (file)
index 0000000..7a7b0ea
--- /dev/null
@@ -0,0 +1,123 @@
+:orphan:
+
+======================================================
+ ceph-bluestore-tool -- bluestore administrative tool
+======================================================
+
+.. program:: ceph-bluestore-tool
+
+Synopsis
+========
+
+| **ceph-bluestore-tool** *command*
+  [ --dev *device* ... ]
+  [ --path *osd path* ]
+  [ --out-dir *dir* ]
+  [ --log-file | -l *filename* ]
+  [ --deep ]
+| **ceph-bluestore-tool** fsck|repair --path *osd path* [ --deep ]
+| **ceph-bluestore-tool** show-label --dev *device* ...
+| **ceph-bluestore-tool** prime-osd-dir --dev *device* --path *osd path*
+| **ceph-bluestore-tool** bluefs-export --path *osd path* --out-dir *dir*
+| **ceph-bluestore-tool** bluefs-bdev-sizes --path *osd path*
+| **ceph-bluestore-tool** bluefs-bdev-expand --path *osd path*
+
+
+Description
+===========
+
+**ceph-bluestore-tool** is a utility to perform low-level administrative
+operations on a BlueStore instance.
+
+Commands
+========
+
+.. option:: help
+
+   show help
+
+.. option:: fsck
+
+   run consistency check on BlueStore metadata.  If *--deep* is specified, also read all object data and verify checksums.
+
+.. option:: repair
+
+   Run a consistency check *and* repair any errors we can.
+
+.. option:: bluefs-export
+
+   Export the contents of BlueFS (i.e., rocksdb files) to an output directory.
+
+.. option:: bluefs-bdev-sizes --path *osd path*
+
+   Print the device sizes, as understood by BlueFS, to stdout.
+
+.. option:: bluefs-bdev-expand --path *osd path*
+
+   Instruct BlueFS to check the size of its block devices and, if they have expanded, make use of the additional space.
+
+.. option:: show-label --dev *device* [...]
+
+   Show device label(s).
+
+Options
+=======
+
+.. option:: --dev *device*
+
+   Add *device* to the list of devices to consider
+
+.. option:: --path *osd path*
+
+   Specify an osd path.  In most cases, the device list is inferred from the symlinks present in *osd path*.  This is usually simpler than explicitly specifying the device(s) with --dev.
+
+.. option:: --out-dir *dir*
+
+   Output directory for bluefs-export
+
+.. option:: -l, --log-file *log file*
+
+   file to log to
+
+.. option:: --log-level *num*
+
+   debug log level.  Default is 30 (extremely verbose), 20 is very
+   verbose, 10 is verbose, and 1 is not very verbose.
+
+.. option:: --deep
+
+   deep scrub/repair (read and validate object data, not just metadata)
+
+Device labels
+=============
+
+Every BlueStore block device has a single block label at the beginning of the
+device.  You can dump the contents of the label with::
+
+  ceph-bluestore-tool show-label --dev *device*
+
+The main device will have a lot of metadata, including information
+that used to be stored in small files in the OSD data directory.  The
+auxiliary devices (db and wal) will only have the minimum required
+fields (OSD UUID, size, device type, birth time).
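+
+For example, the label output might look something like this (a sketch; the
+exact fields and values shown are illustrative)::
+
+  ceph-bluestore-tool show-label --dev /dev/sda1
+  {
+      "/dev/sda1": {
+          "osd_uuid": "943949f0-ce37-47ca-a33c-3413d46ee9ec",
+          "size": 100000000000,
+          "btime": "2017-10-20 13:05:02.123456",
+          "description": "main"
+      }
+  }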
+
+OSD directory priming
+=====================
+
+You can generate the content for an OSD data directory that can start up a
+BlueStore OSD with the *prime-osd-dir* command::
+
+  ceph-bluestore-tool prime-osd-dir --dev *main device* --path /var/lib/ceph/osd/ceph-*id*
+
+
+Availability
+============
+
+**ceph-bluestore-tool** is part of Ceph, a massively scalable,
+open-source, distributed storage system. Please refer to the Ceph
+documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph-osd <ceph-osd>`\(8)
index 1e8d189da3f198d995d7efdfb6fecca35ef5ff91..11daf3e42a046b1bb7da4c393fbbea55bb7fab33 100644 (file)
@@ -53,6 +53,62 @@ by a standby.
 If you want to pre-empt failover, you can explicitly mark a ceph-mgr
 daemon as failed using ``ceph mgr fail <mgr name>``.
 
+Using modules
+-------------
+
+Use the command ``ceph mgr module ls`` to see which modules are
+available, and which are currently enabled.  Enable or disable modules
+using the commands ``ceph mgr module enable <module>`` and
+``ceph mgr module disable <module>`` respectively.
+
+If a module is *enabled* then the active ceph-mgr daemon will load
+and execute it.  In the case of modules that provide a service,
+such as an HTTP server, the module may publish its address when it
+is loaded.  To see the addresses of such modules, use the command 
+``ceph mgr services``.
+
+Some modules may also implement a special standby mode which runs on
+standby ceph-mgr daemons as well as the active daemon.  This enables
+modules that provide services to redirect their clients to the active
+daemon, if the client tries to connect to a standby.
+
+Consult the documentation pages for individual manager modules for more
+information about what functionality each module provides.
+
+Here is an example of enabling the ``dashboard`` module:
+
+::
+
+       $ ceph mgr module ls
+       {
+               "enabled_modules": [
+                       "restful",
+                       "status"
+               ],
+               "disabled_modules": [
+                       "dashboard"
+               ]
+       }
+
+       $ ceph mgr module enable dashboard
+       $ ceph mgr module ls
+       {
+               "enabled_modules": [
+                       "restful",
+                       "status",
+                       "dashboard"
+               ],
+               "disabled_modules": [
+               ]
+       }
+
+       $ ceph mgr services
+       {
+               "dashboard": "http://myserver.com:7789/",
+               "restful": "https://myserver.com:8789/"
+       }
+
+
 Calling module commands
 -----------------------
 
index 8908497003dae367fe2244e879135e1a3998d214..4c2116b133ec7cba825fec311c8cd06ae94e1c6e 100644 (file)
@@ -39,6 +39,13 @@ If the port is not configured, the web app will bind to port ``7000``.
 If the address it not configured, the web app will bind to ``::``,
 which corresponds to all available IPv4 and IPv6 addresses.
 
+You can configure a prefix for all URLs::
+
+  ceph config-key set mgr/dashboard/url_prefix $PREFIX
+
+so you can access the dashboard at ``http://$IP:$PORT/$PREFIX/``.
+
+
 Load balancer
 -------------
 
@@ -48,4 +55,5 @@ manager is active (e.g., ``ceph mgr dump``).  In order to make the
 dashboard available via a consistent URL regardless of which manager
 daemon is currently active, you may want to set up a load balancer
 front-end to direct traffic to whichever manager endpoint is
-available.
+available. If you use a reverse http proxy that forwards a subpath to
+the dashboard, you need to configure ``url_prefix`` (see above).
index 29a221661040829e2991c332cbd379312b019236..53844ba24ed2c2e1ae265f7315f7734fe4af9c22 100644 (file)
@@ -26,9 +26,11 @@ sensible.
     :maxdepth: 1
 
     Installation and Configuration <administrator>
+    Writing plugins <plugins>
     Dashboard plugin <dashboard>
+    Local pool plugin <localpool>
     RESTful plugin <restful>
     Zabbix plugin <zabbix>
     Prometheus plugin <prometheus>
-    Writing plugins <plugins>
+    Influx plugin <influx>
 
diff --git a/ceph/doc/mgr/influx.rst b/ceph/doc/mgr/influx.rst
new file mode 100644 (file)
index 0000000..37aa5cd
--- /dev/null
@@ -0,0 +1,162 @@
+=============
+Influx Plugin 
+=============
+
+The influx plugin continuously collects and sends time series data to an
+InfluxDB database.
+
+The influx plugin was introduced in the 13.x *Mimic* release.
+
+--------
+Enabling 
+--------
+
+To enable the module, use the following command:
+
+::
+
+    ceph mgr module enable influx
+
+If you wish to subsequently disable the module, you can use the equivalent
+*disable* command:
+
+::
+
+    ceph mgr module disable influx
+
+-------------
+Configuration 
+-------------
+
+For the influx module to send statistics to an InfluxDB server, it
+is necessary to configure the server's address and some authentication
+credentials.
+
+Set configuration values using the following command:
+
+::
+
+    ceph config-key set mgr/influx/<key> <value>
+
+
+The most important settings are ``hostname``, ``username`` and ``password``.  
+For example, a typical configuration might look like this:
+
+::
+
+    ceph config-key set mgr/influx/hostname influx.mydomain.com
+    ceph config-key set mgr/influx/username admin123
+    ceph config-key set mgr/influx/password p4ssw0rd
+    
+Additional optional configuration settings are:
+
+:interval: Time between reports to InfluxDB.  Default 5 seconds.
+:database: InfluxDB database name.  Default "ceph".  You will need to create this database and grant write privileges to the configured username, or the username must have admin privileges so it can create the database.
+:port: InfluxDB server port.  Default 8086
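+
+For example, to lower the reporting interval (a sketch using the same
+``config-key`` convention as above)::
+
+    ceph config-key set mgr/influx/interval 30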
+    
+
+---------
+Debugging 
+---------
+
+By default, a few debugging statements as well as error statements are printed to the log files. Users can add more if necessary.
+To make use of the debugging option in the module:
+
+- Add this to the ceph.conf file.::
+
+    [mgr]
+        debug_mgr = 20  
+
+- Use this command ``ceph tell mgr.<mymonitor> influx self-test``.
+- Check the log files. Users may find it easier to filter the log files using *mgr[influx]*.
+
+--------------------
+Interesting counters
+--------------------
+
+The following tables describe a subset of the values output by
+this module.
+
+^^^^^
+Pools
+^^^^^
+
++---------------+-----------------------------------------------------+
+|Counter        | Description                                         |
++===============+=====================================================+
+|bytes_used     | Bytes used in the pool not including copies         |
++---------------+-----------------------------------------------------+
+|max_avail      | Max available number of bytes in the pool           |
++---------------+-----------------------------------------------------+
+|objects        | Number of objects in the pool                       |
++---------------+-----------------------------------------------------+
+|wr_bytes       | Number of bytes written in the pool                 |
++---------------+-----------------------------------------------------+
+|dirty          | Number of bytes dirty in the pool                   |
++---------------+-----------------------------------------------------+
+|rd_bytes       | Number of bytes read in the pool                    |
++---------------+-----------------------------------------------------+
+|raw_bytes_used | Bytes used in pool including copies made            |
++---------------+-----------------------------------------------------+
+
+^^^^
+OSDs
+^^^^
+
++------------+------------------------------------+
+|Counter     | Description                        |
++============+====================================+
+|op_w        | Client write operations            |
++------------+------------------------------------+
+|op_in_bytes | Client operations total write size |
++------------+------------------------------------+
+|op_r        | Client read operations             |
++------------+------------------------------------+
+|op_out_bytes| Client operations total read size  |
++------------+------------------------------------+
+
+
++------------------------+--------------------------------------------------------------------------+
+|Counter                 | Description                                                              |
++========================+==========================================================================+
+|op_wip                  | Replication operations currently being processed (primary)               |
++------------------------+--------------------------------------------------------------------------+
+|op_latency              | Latency of client operations (including queue time)                      |
++------------------------+--------------------------------------------------------------------------+
+|op_process_latency      | Latency of client operations (excluding queue time)                      |           
++------------------------+--------------------------------------------------------------------------+
+|op_prepare_latency      | Latency of client operations (excluding queue time and wait for finished)|
++------------------------+--------------------------------------------------------------------------+
+|op_r_latency            | Latency of read operation (including queue time)                         |
++------------------------+--------------------------------------------------------------------------+
+|op_r_process_latency    | Latency of read operation (excluding queue time)                         |
++------------------------+--------------------------------------------------------------------------+
+|op_w_in_bytes           | Client data written                                                      |
++------------------------+--------------------------------------------------------------------------+
+|op_w_latency            | Latency of write operation (including queue time)                        |
++------------------------+--------------------------------------------------------------------------+
+|op_w_process_latency    | Latency of write operation (excluding queue time)                        |
++------------------------+--------------------------------------------------------------------------+
+|op_w_prepare_latency    | Latency of write operations (excluding queue time and wait for finished) |
++------------------------+--------------------------------------------------------------------------+
+|op_rw                   | Client read-modify-write operations                                      |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_in_bytes          | Client read-modify-write operations write in                             |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_out_bytes         | Client read-modify-write operations read out                             |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_latency           | Latency of read-modify-write operation (including queue time)            |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_process_latency   | Latency of read-modify-write operation (excluding queue time)            |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_prepare_latency   | Latency of read-modify-write operations (excluding queue time            |
+|                        | and wait for finished)                                                   |
++------------------------+--------------------------------------------------------------------------+
+|op_before_queue_op_lat  | Latency of IO before calling queue (before really queue into ShardedOpWq)|
++------------------------+--------------------------------------------------------------------------+
+|op_before_dequeue_op_lat| Latency of IO before calling dequeue_op(already dequeued and get PG lock)|
++------------------------+--------------------------------------------------------------------------+
+
+Latency counters are measured in microseconds unless otherwise specified in the description.
+
diff --git a/ceph/doc/mgr/localpool.rst b/ceph/doc/mgr/localpool.rst
new file mode 100644 (file)
index 0000000..5779b7c
--- /dev/null
@@ -0,0 +1,35 @@
+Local pool plugin
+=================
+
+The *localpool* plugin can automatically create RADOS pools that are
+localized to a subset of the overall cluster.  For example, by default, it will
+create a pool for each distinct rack in the cluster.  This can be useful for some
+deployments that want to distribute some data locally as well as globally
+across the cluster.
+
+Enabling
+--------
+
+The *localpool* module is enabled with::
+
+  ceph mgr module enable localpool
+
+Configuring
+-----------
+
+The *localpool* module understands the following options:
+
+* **subtree** (default: `rack`): which CRUSH subtree type the module
+  should create a pool for.
+* **failure_domain** (default: `host`): what failure domain we should
+  separate data replicas across.
+* **pg_num** (default: `128`): number of PGs to create for each pool
+* **num_rep** (default: `3`): number of replicas for each pool.
+  (Currently, pools are always replicated.)
+* **min_size** (default: none): value to set min_size to (unchanged from Ceph's default if this option is not set)
+* **prefix** (default: `by-$subtreetype-`): prefix for the pool name.
+
+These options are set via the config-key interface.  For example, to
+change the replication level to 2x with only 64 PGs, ::
+
+  ceph config-key set mgr/localpool/num_rep 2
+  ceph config-key set mgr/localpool/pg_num 64
index b5a13cc3eada1b9ef4964b880a2c1cae706ad229..a75c14c849463c5213f2afdec1b7d2e1d3d3cbad 100644 (file)
@@ -157,6 +157,31 @@ a command completes, the ``notify()`` callback on the MgrModule
 instance is triggered, with notify_type set to "command", and
 notify_id set to the tag of the command.
 
+Implementing standby mode
+-------------------------
+
+For some modules, it is useful to run on standby manager daemons as well
+as on the active daemon.  For example, an HTTP server can usefully
+serve HTTP redirect responses from the standby managers so that
+the user can point their browser at any of the manager daemons without
+having to worry about which one is active.
+
+Standby manager daemons look for a class called ``StandbyModule``
+in each module.  If the class is not found then the module is not
+used at all on standby daemons.  If the class is found, then
+its ``serve`` method is called.  Implementations of ``StandbyModule``
+must inherit from ``mgr_module.MgrStandbyModule``.
+
+The interface of ``MgrStandbyModule`` is much more restricted compared to
+``MgrModule`` -- none of the Ceph cluster state is available to
+the module.  ``serve`` and ``shutdown`` methods are used in the same
+way as a normal module class.  The ``get_active_uri`` method enables
+the standby module to discover the address of its active peer in
+order to make redirects.  See the ``MgrStandbyModule`` definition
+in the Ceph source code for the full list of methods.
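+
+A minimal sketch of the expected shape (a hypothetical module; the method
+bodies are illustrative, not taken from any real plugin)::
+
+    from mgr_module import MgrStandbyModule
+
+    class StandbyModule(MgrStandbyModule):
+        def serve(self):
+            # look up the active daemon's address so clients can be
+            # redirected to it
+            active_uri = self.get_active_uri()
+            self.log.info("active URI: %s", active_uri)
+
+        def shutdown(self):
+            # called when the standby daemon stops
+            pass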
+
+For an example of how to use this interface, look at the source code
+of the ``dashboard`` module.
 
 Logging
 -------
index fc84afee4b12f1f5decf411756eb6a4a31c00aca..5bae6a9845f821beb7c0e3661c96cf9ad5f194fd 100644 (file)
@@ -1,3 +1,4 @@
+=================
 Prometheus plugin
 =================
 
@@ -12,8 +13,8 @@ The HTTP path and query parameters are ignored; all extant counters
 for all reporting entities are returned in text exposition format.
 (See the Prometheus `documentation <https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details>`_.)
 
-Enabling
---------
+Enabling prometheus output
+==========================
 
 The *prometheus* module is enabled with::
 
@@ -28,19 +29,187 @@ configurable with ``ceph config-key set``, with keys
 ``mgr/prometheus/server_addr`` and ``mgr/prometheus/server_port``.
 This port is registered with Prometheus's `registry <https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
 
+Statistic names and labels
+==========================
+
+The names of the stats are exactly as Ceph names them, with
+illegal characters ``.``, ``-`` and ``::`` translated to ``_``, 
+and ``ceph_`` prefixed to all names.
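+
+For example, an internal counter like ``bluestore.kv_flush_lat`` would be
+exposed as ``ceph_bluestore_kv_flush_lat``.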
+
+
+All *daemon* statistics have a ``ceph_daemon`` label such as "osd.123"
+that identifies the type and ID of the daemon they come from.  Some
+statistics can come from different types of daemon, so when querying
+e.g. an OSD's RocksDB stats, you would probably want to filter
+on ``ceph_daemon`` values starting with "osd" to avoid mixing in the
+monitor RocksDB stats.
+
+
+The *cluster* statistics (i.e. those global to the Ceph cluster)
+have labels appropriate to what they report on.  For example, 
+metrics relating to pools have a ``pool_id`` label.
+
+Pool and OSD metadata series
+----------------------------
+
+Special series are output to enable displaying and querying on
+certain metadata fields.
+
+Pools have a ``ceph_pool_metadata`` field like this:
+
+::
+
+    ceph_pool_metadata{pool_id="2",name="cephfs_metadata_a"} 0.0
+
+OSDs have a ``ceph_osd_metadata`` field like this:
+
+::
+
+    ceph_osd_metadata{cluster_addr="172.21.9.34:6802/19096",device_class="ssd",id="0",public_addr="172.21.9.34:6801/19096",weight="1.0"} 0.0
+
+
+Correlating drive statistics with node_exporter
+-----------------------------------------------
+
+The prometheus output from Ceph is designed to be used in conjunction
+with the generic host monitoring from the Prometheus node_exporter.
+
+To enable correlation of Ceph OSD statistics with node_exporter's 
+drive statistics, special series are output like this:
+
+::
+
+    ceph_disk_occupation{ceph_daemon="osd.0",device="sdd",instance="myhost",job="ceph"}
+
+To use this to get disk statistics by OSD ID, use the ``and on`` syntax
+in your prometheus query like this:
+
+::
+
+    rate(node_disk_bytes_written[30s]) and on (device,instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+
+See the prometheus documentation for more information about constructing
+queries.
+
+Note that for this mechanism to work, Ceph and node_exporter must agree
+about the values of the ``instance`` label.  See the following section
+for guidance about how to set up Prometheus in a way that sets
+``instance`` properly.
+
+Configuring Prometheus server
+=============================
+
+See the prometheus documentation for full details of how to add
+scrape endpoints: the notes
+in this section are tips on how to configure Prometheus to capture
+the Ceph statistics in the most usefully-labelled form.
+
+This configuration is necessary because Ceph reports metrics
+from many hosts and services via a single endpoint, and because some
+metrics relate to no physical host at all (such as pool statistics).
+
+honor_labels
+------------
+
+To enable Ceph to output properly-labelled data relating to any host,
+use the ``honor_labels`` setting when adding the ceph-mgr endpoints
+to your prometheus configuration.
+
+Without this setting, any ``instance`` labels that Ceph outputs, such
+as those in ``ceph_disk_occupation`` series, will be overridden
+by Prometheus.
+
+Ceph instance label
+-------------------
+
+By default, Prometheus applies an ``instance`` label that includes
+the hostname and port of the endpoint that the series came from.  Because
+Ceph clusters have multiple manager daemons, this results in an ``instance``
+label that changes spuriously when the active manager daemon changes.
+
+Set a custom ``instance`` label in your Prometheus target configuration: 
+you might wish to set it to the hostname of your first monitor, or something
+completely arbitrary like "ceph_cluster".
+
+node_exporter instance labels
+-----------------------------
+
+Set your ``instance`` labels to match what appears in Ceph's OSD metadata
+in the ``hostname`` field.  This is generally the short hostname of the node.
+
+This is only necessary if you want to correlate Ceph stats with host stats,
+but you may find it useful to do it in all cases in case you want to do
+the correlation in the future.
+
+Example configuration
+---------------------
+
+This example shows a single node configuration running ceph-mgr and
+node_exporter on a server called ``senta04``.
+
+This is just an example: there are other ways to configure prometheus
+scrape targets and label rewrite rules.
+
+prometheus.yml
+~~~~~~~~~~~~~~
+
+::
+
+    global:
+      scrape_interval:     15s
+      evaluation_interval: 15s
+
+    scrape_configs:
+      - job_name: 'node'
+        file_sd_configs:
+          - files:
+            - node_targets.yml
+      - job_name: 'ceph'
+        honor_labels: true
+        file_sd_configs:
+          - files:
+            - ceph_targets.yml
+
+
+ceph_targets.yml
+~~~~~~~~~~~~~~~~
+
+
+::
+
+    [
+        {
+            "targets": [ "senta04.mydomain.com:9283" ],
+            "labels": {
+                "instance": "ceph_cluster"
+            }
+        }
+    ]
+
+
+node_targets.yml
+~~~~~~~~~~~~~~~~
+
+::
+
+    [
+        {
+            "targets": [ "senta04.mydomain.com:9100" ],
+            "labels": {
+                "instance": "senta04"
+            }
+        }
+    ]
+
+
 Notes
------
+=====
 
 Counters and gauges are exported; currently histograms and long-running 
 averages are not.  It's possible that Ceph's 2-D histograms could be 
 reduced to two separate 1-D histograms, and that long-running averages
 could be exported as Prometheus' Summary type.
 
-The names of the stats are exactly as Ceph names them, with
-illegal characters ``.`` and ``-`` translated to ``_``.  There is one
-label applied, ``daemon``, and its value is the daemon.id for the
-daemon in question (e.g. ``{daemon=mon.hosta}`` or ``{daemon=osd.11}``).
-
 Timestamps, as with many Prometheus exporters, are established by
 the server's scrape time (Prometheus expects that it is polling the
 actual counter process synchronously).  It is possible to supply a
index dd416edfa3826a4f04458427980dbd1579b5bcc6..89a3707ccd5837d97dc56b88d90834c93d06cc38 100644 (file)
@@ -255,6 +255,15 @@ Ceph configuration file.
 :Type: 32-bit Integer
 :Default: ``45``
 
+``osd max pg per osd hard ratio``
+
+:Description: The ratio of the number of PGs per OSD allowed by the cluster
+              before the OSD refuses to create new PGs. An OSD stops creating
+              new PGs if the number of PGs it serves exceeds
+              ``osd max pg per osd hard ratio`` \* ``mon max pg per osd``.
+
+:Type: Float
+:Default: ``2``
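+
+For example, if ``mon max pg per osd`` is 200 and the hard ratio is the
+default ``2``, an OSD would refuse to create new PGs once it serves more
+than 400.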
 
 .. _pool: ../../operations/pools
 .. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
index 6164355798680374f3889d8b079cb1ceaaac7758..c1e22004aef749b90d6ae1087c90091676e98fc3 100644 (file)
@@ -336,17 +336,20 @@ TOO_MANY_PGS
 ____________
 
 The number of PGs in use in the cluster is above the configurable
-threshold of ``mon_pg_warn_max_per_osd`` PGs per OSD.  This can lead
+threshold of ``mon_max_pg_per_osd`` PGs per OSD.  If this threshold is
+exceeded, the cluster will not allow new pools to be created, pool `pg_num` to
+be increased, or pool replication to be increased (any of which would lead to
+more PGs in the cluster).  A large number of PGs can lead
 to higher memory utilization for OSD daemons, slower peering after
 cluster state changes (like OSD restarts, additions, or removals), and
 higher load on the Manager and Monitor daemons.
 
-The ``pg_num`` value for existing pools cannot currently be reduced.
-However, the ``pgp_num`` value can, which effectively collocates some
-PGs on the same sets of OSDs, mitigating some of the negative impacts
-described above.  The ``pgp_num`` value can be adjusted with::
+The simplest way to mitigate the problem is to increase the number of
+OSDs in the cluster by adding more hardware.  Note that the OSD count
+used for the purposes of this health check is the number of "in" OSDs,
+so marking "out" OSDs "in" (if there are any) can also help::
 
-  ceph osd pool set <pool> pgp_num <value>
+  ceph osd in <osd id(s)>
 
 Please refer to
 :doc:`placement-groups#Choosing-the-number-of-Placement-Groups` for
@@ -368,7 +371,6 @@ triggering the data migration, with::
 
   ceph osd pool set <pool> pgp_num <pg-num-value>
 
-
 MANY_OBJECTS_PER_PG
 ___________________
 
index f8414368c1c348b51e9dcde9fe6caea3a7d7fa26..fccde26295c4cf6ff93b22d930843c76418b64b0 100755 (executable)
@@ -82,14 +82,22 @@ class StateMachineRenderer(object):
             )
 
     def read_input(self, input_lines):
+        previous_line = None
         for line in input_lines:
             self.get_state(line)
             self.get_event(line)
-            self.get_context(line)
-
-    def get_context(self, line):
-        match = re.search(r"(\w+::)*::(?P<tag>\w+)::\w+\(const (?P<event>\w+)",
-                          line)
+            # pass two lines at a time to get the context so that regexes can
+            # match on split signatures
+            self.get_context(line, previous_line)
+            previous_line = line
+
+    def get_context(self, line, previous_line):
+        match = re.search(r"(\w+::)*::(?P<tag>\w+)::\w+\(const (?P<event>\w+)", line)
+        if match is None and previous_line is not None:
+            # it is possible that we need to match on the previous line as well, so join
+            # them to make them one line and try and get this matching
+            joined_line = ' '.join([previous_line, line])
+            match = re.search(r"(\w+::)*::(?P<tag>\w+)::\w+\(\s*const (?P<event>\w+)", joined_line)
         if match is not None:
             self.context.append((match.group('tag'), self.context_depth, match.group('event')))
         if '{' in line:
@@ -105,7 +113,7 @@ class StateMachineRenderer(object):
                 r"boost::statechart::state_machine<\s*(\w*),\s*(\w*)\s*>",
                 line)
             if tokens is None:
-                raise "Error: malformed state_machine line: " + line
+                raise Exception("Error: malformed state_machine line: " + line)
             self.machines[tokens.group(1)] = tokens.group(2)
             self.context.append((tokens.group(1), self.context_depth, ""))
             return
@@ -114,7 +122,7 @@ class StateMachineRenderer(object):
                 r"boost::statechart::state<\s*(\w*),\s*(\w*)\s*,?\s*(\w*)\s*>",
                 line)
             if tokens is None:
-                raise "Error: malformed state line: " + line
+                raise Exception("Error: malformed state line: " + line)
             self.states[tokens.group(1)] = tokens.group(2)
             if tokens.group(2) not in self.state_contents.keys():
                 self.state_contents[tokens.group(2)] = []
@@ -131,14 +139,14 @@ class StateMachineRenderer(object):
                 if i.group(1) not in self.edges.keys():
                     self.edges[i.group(1)] = []
                 if len(self.context) is 0:
-                    raise "no context at line: " + line
+                    raise Exception("no context at line: " + line)
                 self.edges[i.group(1)].append((self.context[-1][0], i.group(2)))
         i = re.search("return\s+transit<\s*(\w*)\s*>()", line)
         if i is not None:
             if len(self.context) is 0:
-                raise "no context at line: " + line
+                raise Exception("no context at line: " + line)
             if self.context[-1][2] is "":
-                raise "no event in context at line: " + line
+                raise Exception("no event in context at line: " + line)
             if self.context[-1][2] not in self.edges.keys():
                 self.edges[self.context[-1][2]] = []
             self.edges[self.context[-1][2]].append((self.context[-1][0], i.group(1)))
index f2722073b5a7d75bf852ebbbd41c3f5e6f86edc8..4542838f4bc12143e2f9653e968d178ef7289c25 100644 (file)
@@ -5,11 +5,3 @@
 
 # Increase tcmalloc cache size
 TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728
-
-## use jemalloc instead of tcmalloc
-#
-# jemalloc is generally faster for small IO workloads and when
-# ceph-osd is backed by SSDs.  However, memory usage is usually
-# higher by 200-300mb.
-#
-#LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.1
index 61e941ded746b297a6fceba14d541e7af9305e22..c7f4bc45a0768d00171a4311d0d2bcde68bf31e3 100644 (file)
@@ -6,14 +6,6 @@
 # Increase tcmalloc cache size
 TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728
 
-## use jemalloc instead of tcmalloc
-#
-# jemalloc is generally faster for small IO workloads and when
-# ceph-osd is backed by SSDs.  However, memory usage is usually
-# higher by 200-300mb.
-#
-#LD_PRELOAD=/usr/lib64/libjemalloc.so.1
-
 ## automatically restart systemd units on upgrade
 #
 # By default, it is left to the administrator to restart
index ff35ed1a65b4d47d44a670f504010b968d88f84a..05c6142f8cb61f50bee4ed9105e93f467f8fa8ef 100644 (file)
@@ -1,4 +1,4 @@
 roles:
-- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2]
-- [mon.b, mgr.x, mds.b, mds.c, osd.3, osd.4, osd.5]
+- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mgr.x, mds.b, mds.c, osd.4, osd.5, osd.6, osd.7]
 - [client.0]
index c1228b3a1c7f6814946afb81e4a30f39accbc0c6..a6342dc06720f3627052852fde79c1edcd085c2a 100644 (file)
@@ -1,4 +1,4 @@
 roles:
-- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2]
-- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5]
+- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.4, osd.5, osd.6, osd.7]
 - [client.0]
diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore-comp-ec-root.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore-comp-ec-root.yaml
new file mode 100644 (file)
index 0000000..9bc487c
--- /dev/null
@@ -0,0 +1,28 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    cephfs_ec_profile:
+      - m=2
+      - k=2
+      - crush-failure-domain=osd
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore compression mode: aggressive
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100 GB OSD so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+
+# this doesn't work with failures because the log writes are not atomic across the two backends
+#        bluestore bluefs env mirror: true
+
diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore-comp.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore-comp.yaml
new file mode 100644 (file)
index 0000000..b408032
--- /dev/null
@@ -0,0 +1,23 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore compression mode: aggressive
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100 GB OSD so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+
+# this doesn't work with failures because the log writes are not atomic across the two backends
+#        bluestore bluefs env mirror: true
diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore-ec-root.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore-ec-root.yaml
new file mode 100644 (file)
index 0000000..726ad3d
--- /dev/null
@@ -0,0 +1,42 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    cephfs_ec_profile:
+      - m=2
+      - k=2
+      - crush-failure-domain=osd
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100 GB OSD so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+# this doesn't work with failures because the log writes are not atomic across the two backends
+#        bluestore bluefs env mirror: true
+  ceph-deploy:
+    fs: xfs
+    bluestore: yes
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100 GB OSD so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+
diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore.yaml
new file mode 100644 (file)
index 0000000..19dfeb0
--- /dev/null
@@ -0,0 +1,38 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100 GB OSD so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+# this doesn't work with failures because the log writes are not atomic across the two backends
+#        bluestore bluefs env mirror: true
+  ceph-deploy:
+    fs: xfs
+    bluestore: yes
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100 GB OSD so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+
diff --git a/ceph/qa/cephfs/objectstore-ec/filestore-xfs.yaml b/ceph/qa/cephfs/objectstore-ec/filestore-xfs.yaml
new file mode 100644 (file)
index 0000000..f7aa0dd
--- /dev/null
@@ -0,0 +1,15 @@
+overrides:
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: filestore
+        osd sloppy crc: true
+  ceph-deploy:
+    fs: xfs
+    filestore: True
+    conf:
+      osd:
+        osd objectstore: filestore
+        osd sloppy crc: true
+
diff --git a/ceph/qa/distros/all/centos_7.4.yaml b/ceph/qa/distros/all/centos_7.4.yaml
new file mode 100644 (file)
index 0000000..d06bc38
--- /dev/null
@@ -0,0 +1,2 @@
+os_type: centos
+os_version: "7.4"
index 34d81be44770201dba0eabec8127c5ff18d2b29d..4cc59dad1d4edce8b5ea0fc1b77f245188891f57 120000 (symlink)
@@ -1 +1 @@
-../all/centos_7.3.yaml
\ No newline at end of file
+../all/centos_7.4.yaml
\ No newline at end of file
index ea3130768705eda898274a890089816d7865e5bc..391a5e1816d19564fdcb081f067216f751f11923 100644 (file)
@@ -2,10 +2,11 @@ tasks:
 - exec:
     osd.0:
       - ceph osd require-osd-release luminous
-      - ceph osd set-require-min-compat-client luminous
 - ceph.healthy:
 overrides:
   ceph:
     conf:
       mon:
         mon warn on osd down out interval zero: false
+    log-whitelist:
+      - ruleset-
index 9ed76715a9603e0332567d96226ebbedc19756fc..5bd666ca004ca324ae156f1651708695f6842a6f 100644 (file)
@@ -19,3 +19,4 @@ overrides:
         mon warn on osd down out interval zero: false
     log-whitelist:
       - no active mgr
+      - ruleset-
index 99a6064c0aee36ed4f77f98742a61a3f3f9f8277..693165d895e27742ab7f0e019d21afc57c331d12 100755 (executable)
@@ -200,7 +200,7 @@ function TEST_utf8_cli() {
     # the fix for http://tracker.ceph.com/issues/7387.  If it turns out
     # to not be OK (when is the default encoding *not* UTF-8?), maybe
     # the character '黄' can be replaced with the escape $'\xe9\xbb\x84'
-    ceph osd pool create 黄 1024 || return 1
+    ceph osd pool create 黄 16 || return 1
     ceph osd lspools 2>&1 | \
         grep "黄" || return 1
     ceph -f json-pretty osd dump | \
diff --git a/ceph/qa/standalone/special/ceph_objectstore_tool.py b/ceph/qa/standalone/special/ceph_objectstore_tool.py
new file mode 100755 (executable)
index 0000000..7c52101
--- /dev/null
@@ -0,0 +1,2024 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+from subprocess import call
+try:
+    from subprocess import check_output
+except ImportError:
+    def check_output(*popenargs, **kwargs):
+        import subprocess
+        # backported from python 2.7 stdlib
+        process = subprocess.Popen(
+            stdout=subprocess.PIPE, *popenargs, **kwargs)
+        output, unused_err = process.communicate()
+        retcode = process.poll()
+        if retcode:
+            cmd = kwargs.get("args")
+            if cmd is None:
+                cmd = popenargs[0]
+            error = subprocess.CalledProcessError(retcode, cmd)
+            error.output = output
+            raise error
+        return output
+
+import filecmp
+import os
+import subprocess
+import math
+import time
+import sys
+import re
+import logging
+import json
+import tempfile
+import platform
+
+try:
+    from subprocess import DEVNULL
+except ImportError:
+    DEVNULL = open(os.devnull, "wb")
+
+logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
+
+
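+# on python 3, wrap subprocess.check_output so that callers always get str,
+# not bytes, back from it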
+if sys.version_info[0] >= 3:
+    def decode(s):
+        return s.decode('utf-8')
+
+    def check_output(*args, **kwargs):
+        return decode(subprocess.check_output(*args, **kwargs))
+else:
+    def decode(s):
+        return s
+
+
+def wait_for_health():
+    print("Wait for health_ok...", end="")
+    tries = 0
+    while call("{path}/ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null".format(path=CEPH_BIN), shell=True) == 0:
+        tries += 1
+        if tries == 150:
+            raise Exception("Time exceeded to go to health")
+        time.sleep(1)
+    print("DONE")
+
+
+def get_pool_id(name, nullfd):
+    cmd = "{path}/ceph osd pool stats {pool}".format(pool=name, path=CEPH_BIN).split()
+    # pool {pool} id # .... grab the 4th field
+    return check_output(cmd, stderr=nullfd).split()[3]
+
+
+# return a list of unique PGs given an OSD subdirectory
+def get_osd_pgs(SUBDIR, ID):
+    PGS = []
+    if ID:
+        endhead = re.compile("{id}.*_head$".format(id=ID))
+    DIR = os.path.join(SUBDIR, "current")
+    PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID is None or endhead.match(f))]
+    PGS = [re.sub("_head", "", p) for p in PGS if "_head" in p]
+    return PGS
+
+
+# return a sorted list of unique PGs given a directory
+def get_pgs(DIR, ID):
+    OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
+    PGS = []
+    for d in OSDS:
+        SUBDIR = os.path.join(DIR, d)
+        PGS += get_osd_pgs(SUBDIR, ID)
+    return sorted(set(PGS))
+
+
+# return a sorted list of PGs (a subset of ALLPGS) that contain objects with the given prefix
+def get_objs(ALLPGS, prefix, DIR, ID):
+    OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
+    PGS = []
+    for d in OSDS:
+        DIRL2 = os.path.join(DIR, d)
+        SUBDIR = os.path.join(DIRL2, "current")
+        for p in ALLPGS:
+            PGDIR = p + "_head"
+            if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
+                continue
+            FINALDIR = os.path.join(SUBDIR, PGDIR)
+            # See if there are any objects there
+            if any(f for f in [val for _, _, fl in os.walk(FINALDIR) for val in fl] if f.startswith(prefix)):
+                PGS += [p]
+    return sorted(set(PGS))
+
+
+# return a sorted list of OSDs which have data from a given PG
+def get_osds(PG, DIR):
+    ALLOSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
+    OSDS = []
+    for d in ALLOSDS:
+        DIRL2 = os.path.join(DIR, d)
+        SUBDIR = os.path.join(DIRL2, "current")
+        PGDIR = PG + "_head"
+        if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
+            continue
+        OSDS += [d]
+    return sorted(OSDS)
+
+
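+# return the non-empty lines of filename, then remove the file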
+def get_lines(filename):
+    # iterate to EOF instead of stopping at the first empty line
+    lines = []
+    with open(filename, "r") as tmpfd:
+        for line in tmpfd:
+            line = line.rstrip('\n')
+            if line:
+                lines += [line]
+    os.unlink(filename)
+    return lines
+
+
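+# print the contents of filename when the effective log level allows it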
+def cat_file(level, filename):
+    if level < logging.getLogger().getEffectiveLevel():
+        return
+    print("File: " + filename)
+    with open(filename, "r") as f:
+        # print to EOF instead of stopping at the first empty line
+        for line in f:
+            print(line.rstrip('\n'))
+    print("<EOF>")
+
+
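+# start a local vstart.sh cluster; new=True wipes and recreates it,
+# new=False restarts the existing one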
+def vstart(new, opt=""):
+    print("vstarting....", end="")
+    NEW = "-n" if new else "-N"
+    call("MON=1 OSD=4 MDS=0 MGR=1 CEPH_PORT=7400 {path}/src/vstart.sh --filestore --short -l {new} -d {opt} > /dev/null 2>&1".format(new=NEW, opt=opt, path=CEPH_ROOT), shell=True)
+    print("DONE")
+
+
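+# run cmd, which is expected to fail, and check that errmsg shows up on
+# stderr; returns 0 on the expected failure (or when a tty test has to be
+# skipped) and 1 otherwise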
+def test_failure(cmd, errmsg, tty=False):
+    if tty:
+        try:
+            ttyfd = open("/dev/tty", "r+b")
+        except Exception as e:
+            logging.info(str(e))
+            logging.info("SKIP " + cmd)
+            return 0
+    TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
+    tmpfd = open(TMPFILE, "wb")
+
+    logging.debug(cmd)
+    if tty:
+        ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
+        ttyfd.close()
+    else:
+        ret = call(cmd, shell=True, stderr=tmpfd)
+    tmpfd.close()
+    if ret == 0:
+        logging.error(cmd)
+        logging.error("Should have failed, but got exit 0")
+        return 1
+    lines = get_lines(TMPFILE)
+    matched = [l for l in lines if errmsg in l]
+    if any(matched):
+        logging.info("Correctly failed with message \"" + matched[0] + "\"")
+        return 0
+    else:
+        logging.error("Command: " + cmd )
+        logging.error("Bad messages to stderr \"" + str(lines) + "\"")
+        logging.error("Expected \"" + errmsg + "\"")
+        return 1
+
+
+def get_nspace(num):
+    if num == 0:
+        return ""
+    return "ns{num}".format(num=num)
+
+
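+# read every head object under DATADIR back from POOL with rados and compare
+# the data, xattrs, omap header and omap values against what db records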
+def verify(DATADIR, POOL, NAME_PREFIX, db):
+    TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
+    ERRORS = 0
+    for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
+        nsfile = rawnsfile.split("__")[0]
+        clone = rawnsfile.split("__")[1]
+        nspace = nsfile.split("-")[0]
+        file = nsfile.split("-")[1]
+        # Skip clones
+        if clone != "head":
+            continue
+        path = os.path.join(DATADIR, rawnsfile)
+        try:
+            os.unlink(TMPFILE)
+        except OSError:
+            pass
+        cmd = "{path}/rados -p {pool} -N '{nspace}' get {file} {out}".format(pool=POOL, file=file, out=TMPFILE, nspace=nspace, path=CEPH_BIN)
+        logging.debug(cmd)
+        call(cmd, shell=True, stdout=DEVNULL, stderr=DEVNULL)
+        cmd = "diff -q {src} {result}".format(src=path, result=TMPFILE)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True)
+        if ret != 0:
+            logging.error("{file} data not imported properly".format(file=file))
+            ERRORS += 1
+        try:
+            os.unlink(TMPFILE)
+        except OSError:
+            pass
+        for key, val in db[nspace][file]["xattr"].items():
+            cmd = "{path}/rados -p {pool} -N '{nspace}' getxattr {name} {key}".format(pool=POOL, name=file, key=key, nspace=nspace, path=CEPH_BIN)
+            logging.debug(cmd)
+            getval = check_output(cmd, shell=True, stderr=DEVNULL)
+            logging.debug("getxattr {key} {val}".format(key=key, val=getval))
+            if getval != val:
+                logging.error("getxattr of key {key} returned wrong val: {get} instead of {orig}".format(key=key, get=getval, orig=val))
+                ERRORS += 1
+                continue
+        hdr = db[nspace][file].get("omapheader", "")
+        cmd = "{path}/rados -p {pool} -N '{nspace}' getomapheader {name} {file}".format(pool=POOL, name=file, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stderr=DEVNULL)
+        if ret != 0:
+            logging.error("rados getomapheader returned {ret}".format(ret=ret))
+            ERRORS += 1
+        else:
+            getlines = get_lines(TMPFILE)
+            assert(len(getlines) == 0 or len(getlines) == 1)
+            if len(getlines) == 0:
+                gethdr = ""
+            else:
+                gethdr = getlines[0]
+            logging.debug("header: {hdr}".format(hdr=gethdr))
+            if gethdr != hdr:
+                logging.error("getomapheader returned wrong val: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+                ERRORS += 1
+        for key, val in db[nspace][file]["omap"].items():
+            cmd = "{path}/rados -p {pool} -N '{nspace}' getomapval {name} {key} {file}".format(pool=POOL, name=file, key=key, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=DEVNULL)
+            if ret != 0:
+                logging.error("getomapval returned {ret}".format(ret=ret))
+                ERRORS += 1
+                continue
+            getlines = get_lines(TMPFILE)
+            if len(getlines) != 1:
+                logging.error("Bad data from getomapval {lines}".format(lines=getlines))
+                ERRORS += 1
+                continue
+            getval = getlines[0]
+            logging.debug("getomapval {key} {val}".format(key=key, val=getval))
+            if getval != val:
+                logging.error("getomapval returned wrong val: {get} instead of {orig}".format(get=getval, orig=val))
+                ERRORS += 1
+        try:
+            os.unlink(TMPFILE)
+        except OSError:
+            pass
+    return ERRORS
+
+
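+# sanity-check the structure of a dump-journal JSON document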
+def check_journal(jsondict):
+    errors = 0
+    if 'header' not in jsondict:
+        logging.error("Key 'header' not in dump-journal")
+        errors += 1
+    elif 'max_size' not in jsondict['header']:
+        logging.error("Key 'max_size' not in dump-journal header")
+        errors += 1
+    else:
+        print("\tJournal max_size = {size}".format(size=jsondict['header']['max_size']))
+    if 'entries' not in jsondict:
+        logging.error("Key 'entries' not in dump-journal output")
+        errors += 1
+    elif len(jsondict['entries']) == 0:
+        logging.info("No entries in journal found")
+    else:
+        errors += check_journal_entries(jsondict['entries'])
+    return errors
+
+
+def check_journal_entries(entries):
+    errors = 0
+    for enum in range(len(entries)):
+        if 'offset' not in entries[enum]:
+            logging.error("No 'offset' key in entry {e}".format(e=enum))
+            errors += 1
+        if 'seq' not in entries[enum]:
+            logging.error("No 'seq' key in entry {e}".format(e=enum))
+            errors += 1
+        if 'transactions' not in entries[enum]:
+            logging.error("No 'transactions' key in entry {e}".format(e=enum))
+            errors += 1
+        elif len(entries[enum]['transactions']) == 0:
+            logging.error("No transactions found in entry {e}".format(e=enum))
+            errors += 1
+        else:
+            errors += check_entry_transactions(entries[enum], enum)
+    return errors
+
+
+def check_entry_transactions(entry, enum):
+    errors = 0
+    for tnum in range(len(entry['transactions'])):
+        if 'trans_num' not in entry['transactions'][tnum]:
+            logging.error("Key 'trans_num' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+            errors += 1
+        elif entry['transactions'][tnum]['trans_num'] != tnum:
+            ft = entry['transactions'][tnum]['trans_num']
+            logging.error("Bad trans_num ({ft}) entry {e} trans {t}".format(ft=ft, e=enum, t=tnum))
+            errors += 1
+        if 'ops' not in entry['transactions'][tnum]:
+            logging.error("Key 'ops' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+            errors += 1
+        else:
+            errors += check_transaction_ops(entry['transactions'][tnum]['ops'], enum, tnum)
+    return errors
+
+
+def check_transaction_ops(ops, enum, tnum):
+    if len(ops) == 0:
+        logging.warning("No ops found in entry {e} trans {t}".format(e=enum, t=tnum))
+    errors = 0
+    for onum in range(len(ops)):
+        if 'op_num' not in ops[onum]:
+            logging.error("Key 'op_num' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+            errors += 1
+        elif ops[onum]['op_num'] != onum:
+            fo = ops[onum]['op_num']
+            logging.error("Bad op_num ({fo}) from entry {e} trans {t} op {o}".format(fo=fo, e=enum, t=tnum, o=onum))
+            errors += 1
+        if 'op_name' not in ops[onum]:
+            logging.error("Key 'op_name' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+            errors += 1
+    return errors
+
+
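+# run --op dump-journal on each osd and validate the resulting JSON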
+def test_dump_journal(CFSD_PREFIX, osds):
+    ERRORS = 0
+    pid = os.getpid()
+    TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
+
+    for osd in osds:
+        # Test --op dump-journal by loading json
+        cmd = (CFSD_PREFIX + "--op dump-journal --format json").format(osd=osd)
+        logging.debug(cmd)
+        tmpfd = open(TMPFILE, "wb")
+        ret = call(cmd, shell=True, stdout=tmpfd)
+        if ret != 0:
+            logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+            ERRORS += 1
+            continue
+        tmpfd.close()
+        tmpfd = open(TMPFILE, "r")
+        jsondict = json.load(tmpfd)
+        tmpfd.close()
+        os.unlink(TMPFILE)
+
+        journal_errors = check_journal(jsondict)
+        if journal_errors != 0:
+            logging.error(jsondict)
+        ERRORS += journal_errors
+
+    return ERRORS
+
+CEPH_BUILD_DIR = os.environ.get('CEPH_BUILD_DIR')
+CEPH_BIN = os.environ.get('CEPH_BIN')
+CEPH_ROOT = os.environ.get('CEPH_ROOT')
+
+if not CEPH_BUILD_DIR:
+    CEPH_BUILD_DIR = os.getcwd()
+    os.putenv('CEPH_BUILD_DIR', CEPH_BUILD_DIR)
+    CEPH_BIN = os.path.join(CEPH_BUILD_DIR, 'bin')
+    os.putenv('CEPH_BIN', CEPH_BIN)
+    CEPH_ROOT = os.path.dirname(CEPH_BUILD_DIR)
+    os.putenv('CEPH_ROOT', CEPH_ROOT)
+    CEPH_LIB = os.path.join(CEPH_BUILD_DIR, 'lib')
+    os.putenv('CEPH_LIB', CEPH_LIB)
+
+try:
+    os.mkdir("td")
+except OSError:
+    pass # ok if this is already there
+CEPH_DIR = os.path.join(CEPH_BUILD_DIR, os.path.join("td", "cot_dir"))
+CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf')
+
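+# stop all daemons of the local test cluster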
+def kill_daemons():
+    call("{path}/init-ceph -c {conf} stop > /dev/null 2>&1".format(conf=CEPH_CONF, path=CEPH_BIN), shell=True)
+
+
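+# compare the on-disk replicas found under OSDDIR against the original files
+# in DATADIR; returns (error count, number of replicas checked)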
+def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
+    repcount = 0
+    ERRORS = 0
+    for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
+        nsfile = rawnsfile.split("__")[0]
+        clone = rawnsfile.split("__")[1]
+        nspace = nsfile.split("-")[0]
+        file = nsfile.split("-")[1] + "__" + clone
+        # Skip clones
+        if clone != "head":
+            continue
+        path = os.path.join(DATADIR, rawnsfile)
+        tmpfd = open(TMPFILE, "wb")
+        cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stdout=tmpfd)
+        if ret:
+            logging.critical("INTERNAL ERROR")
+            return 1
+        tmpfd.close()
+        obj_locs = get_lines(TMPFILE)
+        if len(obj_locs) == 0:
+            logging.error("Can't find imported object {name}".format(name=file))
+            ERRORS += 1
+        for obj_loc in obj_locs:
+            # For btrfs skip snap_* dirs
+            if re.search("/snap_[0-9]*/", obj_loc) is not None:
+                continue
+            repcount += 1
+            cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True)
+            if ret != 0:
+                logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
+                ERRORS += 1
+    return ERRORS, repcount
+
+
+def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+    # change the weight of each osd in osd_ids in the newest osdmap of the given osd store
+    osdmap_file = tempfile.NamedTemporaryFile(delete=True)
+    cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+                                                                        osdmap_file=osdmap_file.name)
+    output = check_output(cmd, shell=True)
+    epoch = int(re.findall('#(\d+)', output)[0])
+
+    new_crush_file = tempfile.NamedTemporaryFile(delete=True)
+    old_crush_file = tempfile.NamedTemporaryFile(delete=True)
+    ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+                                                                          crush_file=old_crush_file.name, path=CEPH_BIN),
+               stdout=DEVNULL,
+               stderr=DEVNULL,
+               shell=True)
+    assert(ret == 0)
+
+    for osd_id in osd_ids:
+        cmd = "{path}/crushtool -i {crush_file} --reweight-item osd.{osd} {weight} -o {new_crush_file}".format(osd=osd_id,
+                                                                                                          crush_file=old_crush_file.name,
+                                                                                                          weight=weight,
+                                                                                                          new_crush_file=new_crush_file.name, path=CEPH_BIN)
+        ret = call(cmd, stdout=DEVNULL, shell=True)
+        assert(ret == 0)
+        old_crush_file, new_crush_file = new_crush_file, old_crush_file
+
+    # swap them back, since we don't need to prepare for another round
+    old_crush_file, new_crush_file = new_crush_file, old_crush_file
+    old_crush_file.close()
+
+    ret = call("{path}/osdmaptool --import-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+                                                                               crush_file=new_crush_file.name, path=CEPH_BIN),
+               stdout=DEVNULL,
+               stderr=DEVNULL,
+               shell=True)
+    assert(ret == 0)
+
+    # Minimum test of --dry-run by using it, but not checking anything
+    cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force --dry-run"
+    cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+    ret = call(cmd, stdout=DEVNULL, shell=True)
+    assert(ret == 0)
+
+    # osdmaptool increases the epoch of the changed osdmap, so we need to force the tool
+    # to use a different epoch than the one in the osdmap
+    cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force"
+    cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+    ret = call(cmd, stdout=DEVNULL, shell=True)
+
+    return ret == 0
+
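+# return the crush weights of osd_ids from the newest osdmap stored under
+# osd_path, or None if the osdmap cannot be read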
+def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path):
+    osdmap_file = tempfile.NamedTemporaryFile(delete=True)
+    cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+                                                                        osdmap_file=osdmap_file.name)
+    ret = call(cmd, stdout=DEVNULL, shell=True)
+    if ret != 0:
+        return None
+    # we have to read the weights from the crush map; the weights could also be
+    # queried with osdmaptool, but keep in mind that they are different things:
+    #    item weights in the crush map versus the weight associated with each osd in the osdmap
+    crush_file = tempfile.NamedTemporaryFile(delete=True)
+    ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+                                                                               crush_file=crush_file.name, path=CEPH_BIN),
+               stdout=DEVNULL,
+               shell=True)
+    assert(ret == 0)
+    output = check_output("{path}/crushtool --tree -i {crush_file} | tail -n {num_osd}".format(crush_file=crush_file.name,
+                                                                                          num_osd=len(osd_ids), path=CEPH_BIN),
+                          stderr=DEVNULL,
+                          shell=True)
+    weights = []
+    for line in output.strip().split('\n'):
+        print(line)
+        linev = re.split('\s+', line)
+        if linev[0] == '':
+            linev.pop(0)
+        print('linev %s' % linev)
+        weights.append(float(linev[2]))
+
+    return weights
+
+
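+# round-trip test: rewrite the crush weights with set-osdmap, then read them
+# back with get-osdmap and verify the change took effect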
+def test_get_set_osdmap(CFSD_PREFIX, osd_ids, osd_paths):
+    print("Testing get-osdmap and set-osdmap")
+    errors = 0
+    kill_daemons()
+    weight = 1 / math.e           # just some magic number in [0, 1]
+    changed = []
+    for osd_path in osd_paths:
+        if set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+            changed.append(osd_path)
+        else:
+            logging.warning("Failed to change the weights: {0}".format(osd_path))
+    # it is an error if none of the stores was changed
+    if not changed:
+        errors += 1
+
+    for osd_path in changed:
+        weights = get_osd_weights(CFSD_PREFIX, osd_ids, osd_path)
+        if not weights:
+            errors += 1
+            continue
+        if any(abs(w - weight) > 1e-5 for w in weights):
+            logging.warning("Weight is not changed: {0} != {1}".format(weights, weight))
+            errors += 1
+    return errors
+
+def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path):
+    # incrementals are not used unless we need to build an MOSDMap to update
+    # OSD's peers, so an obvious way to test it is simply to overwrite an epoch
+    # with a different copy, and read it back to see if it matches.
+    kill_daemons()
+    file_e2 = tempfile.NamedTemporaryFile(delete=True)
+    cmd = (CFSD_PREFIX + "--op get-inc-osdmap --file {file}").format(osd=osd_path,
+                                                                     file=file_e2.name)
+    output = check_output(cmd, shell=True)
+    epoch = int(re.findall('#(\d+)', output)[0])
+    # backup e1 incremental before overwriting it
+    epoch -= 1
+    file_e1_backup = tempfile.NamedTemporaryFile(delete=True)
+    cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+    if ret: return 1
+    # overwrite e1 with e2
+    cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True)
+    if ret: return 1
+    # Use dry-run to set back to e1 which shouldn't happen
+    cmd = CFSD_PREFIX + "--op set-inc-osdmap --dry-run --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+    if ret: return 1
+    # read from e1
+    file_e1_read = tempfile.NamedTemporaryFile(delete=True)
+    cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_read.name), shell=True)
+    if ret: return 1
+    errors = 0
+    try:
+        if not filecmp.cmp(file_e2.name, file_e1_read.name, shallow=False):
+            logging.error("{{get,set}}-inc-osdmap mismatch {0} != {1}".format(file_e2.name, file_e1_read.name))
+            errors += 1
+    finally:
+        # revert the change with file_e1_backup
+        cmd = CFSD_PREFIX + "--op set-inc-osdmap --epoch {epoch} --file {file}"
+        ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+        if ret:
+            logging.error("Failed to revert the changed inc-osdmap")
+            errors += 1
+
+    return errors
+
+
+def test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS):
+    # Test removeall
+    TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
+    nullfd = open(os.devnull, "w")
+    errors = 0
+    print("Test removeall")
+    kill_daemons()
+    for nspace in db.keys():
+        for basename in db[nspace].keys():
+            JSON = db[nspace][basename]['json']
+            for pg in OBJREPPGS:
+                OSDS = get_osds(pg, OSDDIR)
+                for osd in OSDS:
+                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+                    if not fnames:
+                        continue
+
+                    if int(basename.split(REP_NAME)[1]) <= int(NUM_CLONED_REP_OBJECTS):
+                        cmd = (CFSD_PREFIX + "'{json}' remove").format(osd=osd, json=JSON)
+                        errors += test_failure(cmd, "Snapshots are present, use removeall to delete everything")
+
+                    cmd = (CFSD_PREFIX + " --force --dry-run '{json}' remove").format(osd=osd, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+                    if ret != 0:
+                        logging.error("remove with --force failed for {json}".format(json=JSON))
+                        errors += 1
+
+                    cmd = (CFSD_PREFIX + " --dry-run '{json}' removeall").format(osd=osd, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+                    if ret != 0:
+                        logging.error("removeall failed for {json}".format(json=JSON))
+                        errors += 1
+
+                    cmd = (CFSD_PREFIX + " '{json}' removeall").format(osd=osd, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+                    if ret != 0:
+                        logging.error("removeall failed for {json}".format(json=JSON))
+                        errors += 1
+
+                    tmpfd = open(TMPFILE, "w")
+                    cmd = (CFSD_PREFIX + "--op list --pgid {pg} --namespace {ns} {name}").format(osd=osd, pg=pg, ns=nspace, name=basename)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=tmpfd)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+                        errors += 1
+                    tmpfd.close()
+                    lines = get_lines(TMPFILE)
+                    if len(lines) != 0:
+                        logging.error("Removeall didn't remove all objects {ns}/{name} : {lines}".format(ns=nspace, name=basename, lines=lines))
+                        errors += 1
+    vstart(new=False)
+    wait_for_health()
+    cmd = "{path}/rados -p {pool} rmsnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    if ret != 0:
+        logging.error("rados rmsnap failed")
+        errors += 1
+    time.sleep(2)
+    wait_for_health()
+    return errors
+
+
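+# drive the whole ceph-objectstore-tool test sequence against a local
+# vstart cluster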
+def main(argv):
+    if sys.version_info[0] < 3:
+        sys.stdout = stdout = os.fdopen(sys.stdout.fileno(), 'wb', 0)
+    else:
+        stdout = sys.stdout.buffer
+    if len(argv) > 1 and argv[1] == "debug":
+        nullfd = stdout
+    else:
+        nullfd = DEVNULL
+
+    call("rm -fr {dir}; mkdir -p {dir}".format(dir=CEPH_DIR), shell=True)
+    os.chdir(CEPH_DIR)
+    os.environ["CEPH_DIR"] = CEPH_DIR
+    OSDDIR = "dev"
+    REP_POOL = "rep_pool"
+    REP_NAME = "REPobject"
+    EC_POOL = "ec_pool"
+    EC_NAME = "ECobject"
+    if len(argv) > 0 and argv[0] == 'large':
+        PG_COUNT = 12
+        NUM_REP_OBJECTS = 800
+        NUM_CLONED_REP_OBJECTS = 100
+        NUM_EC_OBJECTS = 12
+        NUM_NSPACES = 4
+        # Larger data sets for first object per namespace
+        DATALINECOUNT = 50000
+        # Number of objects to do xattr/omap testing on
+        ATTR_OBJS = 10
+    else:
+        PG_COUNT = 4
+        NUM_REP_OBJECTS = 2
+        NUM_CLONED_REP_OBJECTS = 2
+        NUM_EC_OBJECTS = 2
+        NUM_NSPACES = 2
+        # Larger data sets for first object per namespace
+        DATALINECOUNT = 10
+        # Number of objects to do xattr/omap testing on
+        ATTR_OBJS = 2
+    ERRORS = 0
+    pid = os.getpid()
+    TESTDIR = "/tmp/test.{pid}".format(pid=pid)
+    DATADIR = "/tmp/data.{pid}".format(pid=pid)
+    CFSD_PREFIX = CEPH_BIN + "/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} "
+    PROFNAME = "testecprofile"
+
+    os.environ['CEPH_CONF'] = CEPH_CONF
+    vstart(new=True)
+    wait_for_health()
+
+    cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=REP_POOL, pg=PG_COUNT, path=CEPH_BIN)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    time.sleep(2)
+    REPID = get_pool_id(REP_POOL, nullfd)
+
+    print("Created Replicated pool #{repid}".format(repid=REPID))
+
+    cmd = "{path}/ceph osd erasure-code-profile set {prof} crush-failure-domain=osd".format(prof=PROFNAME, path=CEPH_BIN)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    cmd = "{path}/ceph osd erasure-code-profile get {prof}".format(prof=PROFNAME, path=CEPH_BIN)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    cmd = "{path}/ceph osd pool create {pool} {pg} {pg} erasure {prof}".format(pool=EC_POOL, prof=PROFNAME, pg=PG_COUNT, path=CEPH_BIN)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    ECID = get_pool_id(EC_POOL, nullfd)
+
+    print("Created Erasure coded pool #{ecid}".format(ecid=ECID))
+
+    print("Creating {objs} objects in replicated pool".format(objs=(NUM_REP_OBJECTS*NUM_NSPACES)))
+    cmd = "mkdir -p {datadir}".format(datadir=DATADIR)
+    logging.debug(cmd)
+    call(cmd, shell=True)
+
+    db = {}
+
+    objects = range(1, NUM_REP_OBJECTS + 1)
+    nspaces = range(NUM_NSPACES)
+    for n in nspaces:
+        nspace = get_nspace(n)
+
+        db[nspace] = {}
+
+        for i in objects:
+            NAME = REP_NAME + "{num}".format(num=i)
+            LNAME = nspace + "-" + NAME
+            DDNAME = os.path.join(DATADIR, LNAME)
+            DDNAME += "__head"
+
+            cmd = "rm -f " + DDNAME
+            logging.debug(cmd)
+            call(cmd, shell=True)
+
+            if i == 1:
+                dataline = range(DATALINECOUNT)
+            else:
+                dataline = range(1)
+            fd = open(DDNAME, "w")
+            data = "This is the replicated data for " + LNAME + "\n"
+            for _ in dataline:
+                fd.write(data)
+            fd.close()
+
+            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=nullfd)
+            if ret != 0:
+                logging.critical("Rados put command failed with {ret}".format(ret=ret))
+                return 1
+
+            db[nspace][NAME] = {}
+
+            if i < ATTR_OBJS + 1:
+                keys = range(i)
+            else:
+                keys = range(0)
+            db[nspace][NAME]["xattr"] = {}
+            for k in keys:
+                if k == 0:
+                    continue
+                mykey = "key{i}-{k}".format(i=i, k=k)
+                myval = "val{i}-{k}".format(i=i, k=k)
+                cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True)
+                if ret != 0:
+                    logging.error("setxattr failed with {ret}".format(ret=ret))
+                    ERRORS += 1
+                db[nspace][NAME]["xattr"][mykey] = myval
+
+            # Create omap header in all objects but REPobject1
+            if i < ATTR_OBJS + 1 and i != 1:
+                myhdr = "hdr{i}".format(i=i)
+                cmd = "{path}/rados -p {pool} -N '{nspace}' setomapheader {name} {hdr}".format(pool=REP_POOL, name=NAME, hdr=myhdr, nspace=nspace, path=CEPH_BIN)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True)
+                if ret != 0:
+                    logging.critical("setomapheader failed with {ret}".format(ret=ret))
+                    ERRORS += 1
+                db[nspace][NAME]["omapheader"] = myhdr
+
+            db[nspace][NAME]["omap"] = {}
+            for k in keys:
+                if k == 0:
+                    continue
+                mykey = "okey{i}-{k}".format(i=i, k=k)
+                myval = "oval{i}-{k}".format(i=i, k=k)
+                cmd = "{path}/rados -p {pool} -N '{nspace}' setomapval {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True)
+                if ret != 0:
+                    logging.critical("setomapval failed with {ret}".format(ret=ret))
+                db[nspace][NAME]["omap"][mykey] = myval
+
+    # Create some clones
+    cmd = "{path}/rados -p {pool} mksnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
+    logging.debug(cmd)
+    call(cmd, shell=True)
+
+    objects = range(1, NUM_CLONED_REP_OBJECTS + 1)
+    nspaces = range(NUM_NSPACES)
+    for n in nspaces:
+        nspace = get_nspace(n)
+
+        for i in objects:
+            NAME = REP_NAME + "{num}".format(num=i)
+            LNAME = nspace + "-" + NAME
+            DDNAME = os.path.join(DATADIR, LNAME)
+            # First clone
+            CLONENAME = DDNAME + "__1"
+            DDNAME += "__head"
+
+            cmd = "mv -f " + DDNAME + " " + CLONENAME
+            logging.debug(cmd)
+            call(cmd, shell=True)
+
+            if i == 1:
+                dataline = range(DATALINECOUNT)
+            else:
+                dataline = range(1)
+            fd = open(DDNAME, "w")
+            data = "This is the replicated data after a snapshot for " + LNAME + "\n"
+            for _ in dataline:
+                fd.write(data)
+            fd.close()
+
+            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=nullfd)
+            if ret != 0:
+                logging.critical("Rados put command failed with {ret}".format(ret=ret))
+                return 1
+
+    print("Creating {objs} objects in erasure coded pool".format(objs=(NUM_EC_OBJECTS*NUM_NSPACES)))
+
+    objects = range(1, NUM_EC_OBJECTS + 1)
+    nspaces = range(NUM_NSPACES)
+    for n in nspaces:
+        nspace = get_nspace(n)
+
+        for i in objects:
+            NAME = EC_NAME + "{num}".format(num=i)
+            LNAME = nspace + "-" + NAME
+            DDNAME = os.path.join(DATADIR, LNAME)
+            DDNAME += "__head"
+
+            cmd = "rm -f " + DDNAME
+            logging.debug(cmd)
+            call(cmd, shell=True)
+
+            if i == 1:
+                dataline = range(DATALINECOUNT)
+            else:
+                dataline = range(1)
+            fd = open(DDNAME, "w")
+            data = "This is the erasure coded data for " + LNAME + "\n"
+            for _ in dataline:
+                fd.write(data)
+            fd.close()
+
+            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=EC_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=nullfd)
+            if ret != 0:
+                logging.critical("Erasure coded pool creation failed with {ret}".format(ret=ret))
+                return 1
+
+            db[nspace][NAME] = {}
+
+            db[nspace][NAME]["xattr"] = {}
+            if i < ATTR_OBJS + 1:
+                keys = range(i)
+            else:
+                keys = range(0)
+            for k in keys:
+                if k == 0:
+                    continue
+                mykey = "key{i}-{k}".format(i=i, k=k)
+                myval = "val{i}-{k}".format(i=i, k=k)
+                cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=EC_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True)
+                if ret != 0:
+                    logging.error("setxattr failed with {ret}".format(ret=ret))
+                    ERRORS += 1
+                db[nspace][NAME]["xattr"][mykey] = myval
+
+            # Omap isn't supported in EC pools
+            db[nspace][NAME]["omap"] = {}
+
+    logging.debug(db)
+
+    kill_daemons()
+
+    if ERRORS:
+        logging.critical("Unable to set up test")
+        return 1
+
+    ALLREPPGS = get_pgs(OSDDIR, REPID)
+    logging.debug(ALLREPPGS)
+    ALLECPGS = get_pgs(OSDDIR, ECID)
+    logging.debug(ALLECPGS)
+
+    OBJREPPGS = get_objs(ALLREPPGS, REP_NAME, OSDDIR, REPID)
+    logging.debug(OBJREPPGS)
+    OBJECPGS = get_objs(ALLECPGS, EC_NAME, OSDDIR, ECID)
+    logging.debug(OBJECPGS)
+
+    ONEPG = ALLREPPGS[0]
+    logging.debug(ONEPG)
+    osds = get_osds(ONEPG, OSDDIR)
+    ONEOSD = osds[0]
+    logging.debug(ONEOSD)
+
+    print("Test invalid parameters")
+    # On export can't use stdout to a terminal
+    cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
+
+    # On export can't use stdout to a terminal
+    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
+
+    # Prep a valid ec export file for import failure tests
+    ONEECPG = ALLECPGS[0]
+    osds = get_osds(ONEECPG, OSDDIR)
+    ONEECOSD = osds[0]
+    OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
+    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=ONEECPG, file=OTHERFILE)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
+    # On import can't specify a different shard
+    BADPG = ONEECPG.split('s')[0] + "s10"
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=BADPG, file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Can't specify a different shard, must be")
+
+    os.unlink(OTHERFILE)
+
+    # Prep a valid export file for import failure tests
+    OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
+    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
+    # On import can't specify a PG with a non-existent pool
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg="10.0", file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Can't specify a different pgid pool, must be")
+
+    # On import can't specify shard for a replicated export
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg}s0 --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Can't specify a sharded pgid with a non-sharded export")
+
+    # On import can't specify a PG with a bad seed
+    TMPPG="{pool}.80".format(pool=REPID)
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg=TMPPG, file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Illegal pgid, the seed is larger than current pg_num")
+
+    os.unlink(OTHERFILE)
+    cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE)
+    ERRORS += test_failure(cmd, "file: {FOO}: No such file or directory".format(FOO=OTHERFILE))
+
+    cmd = "{path}/ceph-objectstore-tool --data-path BAD_DATA_PATH --op list".format(osd=ONEOSD, path=CEPH_BIN)
+    ERRORS += test_failure(cmd, "data-path: BAD_DATA_PATH: No such file or directory")
+
+    cmd = "{path}/ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal".format(path=CEPH_BIN)
+    ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory")
+
+    cmd = (CFSD_PREFIX + "--journal-path BAD_JOURNAL_PATH --op list").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: No such file or directory")
+
+    cmd = (CFSD_PREFIX + "--journal-path /bin --op list").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "journal-path: /bin: (21) Is a directory")
+
+    # On import can't use stdin from a terminal
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
+
+    # On import can't use stdin from a terminal
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
+
+    # Specify a bad --type
+    os.mkdir(OSDDIR + "/fakeosd")
+    cmd = ("{path}/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} --type foobar --op list --pgid {pg}").format(osd="fakeosd", pg=ONEPG, path=CEPH_BIN)
+    ERRORS += test_failure(cmd, "Unable to create store of type foobar")
+
+    # Don't specify a data-path
+    cmd = "{path}/ceph-objectstore-tool --type memstore --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG, path=CEPH_BIN)
+    ERRORS += test_failure(cmd, "Must provide --data-path")
+
+    cmd = (CFSD_PREFIX + "--op remove --pgid 2.0").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Please use export-remove or you must use --force option")
+
+    cmd = (CFSD_PREFIX + "--force --op remove").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Must provide pgid")
+
+    # Don't specify an --op or object command
+    cmd = CFSD_PREFIX.format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Must provide --op or object command...")
+
+    # Specify a bad --op command
+    cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)")
+
+    # Provide just the object param not a command
+    cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Invalid syntax, missing command")
+
+    # Provide an object name that doesn't exist
+    cmd = (CFSD_PREFIX + "NON_OBJECT get-bytes").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "No object id 'NON_OBJECT' found")
+
+    # Provide an invalid object command
+    cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Unknown object command 'notacommand'")
+
+    cmd = (CFSD_PREFIX + "foo list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "No object id 'foo' found or invalid JSON specified")
+
+    cmd = (CFSD_PREFIX + "'{{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}}' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Without --pgid the object '{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}' must be a JSON array")
+
+    cmd = (CFSD_PREFIX + "'[]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[]' must be a JSON array with 2 elements")
+
+    cmd = (CFSD_PREFIX + "'[\"1.0\"]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[\"1.0\"]' must be a JSON array with 2 elements")
+
+    cmd = (CFSD_PREFIX + "'[\"1.0\", 5, 8, 9]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[\"1.0\", 5, 8, 9]' must be a JSON array with 2 elements")
+
+    cmd = (CFSD_PREFIX + "'[1, 2]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[1, 2]' must be a JSON array with the first element a string")
+
+    cmd = (CFSD_PREFIX + "'[\"1.3\",{{\"snapid\":\"not an int\"}}]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Decode object JSON error: value type is 2 not 4")
+
+    TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
+    ALLPGS = OBJREPPGS + OBJECPGS
+    OSDS = get_osds(ALLPGS[0], OSDDIR)
+    osd = OSDS[0]
+
+    print("Test all --op dump-journal")
+    ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
+    ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
+    # Test --op list and generate json for all objects
+    print("Test --op list variants")
+
+    # retrieve all objects from all PGs
+    tmpfd = open(TMPFILE, "wb")
+    cmd = (CFSD_PREFIX + "--op list --format json").format(osd=osd)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=tmpfd)
+    if ret != 0:
+        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+        ERRORS += 1
+    tmpfd.close()
+    lines = get_lines(TMPFILE)
+    JSONOBJ = sorted(set(lines))
+    (pgid, coll, jsondict) = json.loads(JSONOBJ[0])[0]
+
+    # retrieve all objects in a given PG
+    tmpfd = open(OTHERFILE, "ab")
+    cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=tmpfd)
+    if ret != 0:
+        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+        ERRORS += 1
+    tmpfd.close()
+    lines = get_lines(OTHERFILE)
+    JSONOBJ = sorted(set(lines))
+    (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
+
+    if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
+        logging.error("the first line of --op list is different "
+                      "from the first line of --op list --pgid {pg}".format(pg=pgid))
+        ERRORS += 1
+
+    # retrieve all objects with a given name in a given PG
+    tmpfd = open(OTHERFILE, "wb")
+    cmd = (CFSD_PREFIX + "--op list --pgid {pg} {object} --format json").format(osd=osd, pg=pgid, object=jsondict['oid'])
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=tmpfd)
+    if ret != 0:
+        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+        ERRORS += 1
+    tmpfd.close()
+    lines = get_lines(OTHERFILE)
+    JSONOBJ = sorted(set(lines))
+    (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
+
+    if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
+        logging.error("the first line of --op list is different "
+                      "from the first line of --op list --pgid {pg} {object}".format(pg=pgid, object=jsondict['oid']))
+        ERRORS += 1
+
+    print("Test --op list by generating json for all objects using default format")
+    for pg in ALLPGS:
+        OSDS = get_osds(pg, OSDDIR)
+        for osd in OSDS:
+            tmpfd = open(TMPFILE, "ab")
+            cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stdout=tmpfd)
+            if ret != 0:
+                logging.error("Bad exit status {ret} from --op list request".format(ret=ret))
+                ERRORS += 1
+
+    tmpfd.close()
+    lines = get_lines(TMPFILE)
+    JSONOBJ = sorted(set(lines))
+    for JSON in JSONOBJ:
+        (pgid, jsondict) = json.loads(JSON)
+        # Skip clones for now
+        if jsondict['snapid'] != -2:
+            continue
+        db[jsondict['namespace']][jsondict['oid']]['json'] = json.dumps((pgid, jsondict))
+        # print db[jsondict['namespace']][jsondict['oid']]['json']
+        if jsondict['oid'].find(EC_NAME) == 0 and 'shard_id' not in jsondict:
+            logging.error("Malformed JSON {json}".format(json=JSON))
+            ERRORS += 1
+
+    # Test get-bytes
+    print("Test get-bytes and set-bytes")
+    for nspace in db.keys():
+        for basename in db[nspace].keys():
+            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+            JSON = db[nspace][basename]['json']
+            GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
+            TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid)
+            SETNAME = "/tmp/setbytes.{pid}".format(pid=pid)
+            BADNAME = "/tmp/badbytes.{pid}".format(pid=pid)
+            for pg in OBJREPPGS:
+                OSDS = get_osds(pg, OSDDIR)
+                for osd in OSDS:
+                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+                    if not fnames:
+                        continue
+                    try:
+                        os.unlink(GETNAME)
+                    except OSError:
+                        pass
+                    cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-bytes {fname}").format(osd=osd, pg=pg, json=JSON, fname=GETNAME)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret}".format(ret=ret))
+                        ERRORS += 1
+                        continue
+                    cmd = "diff -q {file} {getfile}".format(file=file, getfile=GETNAME)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Data from get-bytes differ")
+                        logging.debug("Got:")
+                        cat_file(logging.DEBUG, GETNAME)
+                        logging.debug("Expected:")
+                        cat_file(logging.DEBUG, file)
+                        ERRORS += 1
+                    fd = open(SETNAME, "w")
+                    data = "put-bytes going into {file}\n".format(file=file)
+                    fd.write(data)
+                    fd.close()
+                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=SETNAME)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-bytes".format(ret=ret))
+                        ERRORS += 1
+                    fd = open(TESTNAME, "wb")
+                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=fd)
+                    fd.close()
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
+                        ERRORS += 1
+                    cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Data after set-bytes differ")
+                        logging.debug("Got:")
+                        cat_file(logging.DEBUG, TESTNAME)
+                        logging.debug("Expected:")
+                        cat_file(logging.DEBUG, SETNAME)
+                        ERRORS += 1
+
+                    # Use set-bytes with --dry-run and make sure contents haven't changed
+                    fd = open(BADNAME, "w")
+                    data = "Bad data for --dry-run in {file}\n".format(file=file)
+                    fd.write(data)
+                    fd.close()
+                    cmd = (CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=BADNAME)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-bytes --dry-run".format(ret=ret))
+                        ERRORS += 1
+                    fd = open(TESTNAME, "wb")
+                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=fd)
+                    fd.close()
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
+                        ERRORS += 1
+                    cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Data after set-bytes --dry-run changed!")
+                        logging.debug("Got:")
+                        cat_file(logging.DEBUG, TESTNAME)
+                        logging.debug("Expected:")
+                        cat_file(logging.DEBUG, SETNAME)
+                        ERRORS += 1
+
+                    fd = open(file, "rb")
+                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdin=fd)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-bytes to restore object".format(ret=ret))
+                        ERRORS += 1
+                    fd.close()
+
+    try:
+        os.unlink(GETNAME)
+    except OSError:
+        pass
+    try:
+        os.unlink(TESTNAME)
+    except OSError:
+        pass
+    try:
+        os.unlink(SETNAME)
+    except OSError:
+        pass
+    try:
+        os.unlink(BADNAME)
+    except OSError:
+        pass
+
+    # Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap
+    print("Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap")
+    for nspace in db.keys():
+        for basename in db[nspace].keys():
+            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+            JSON = db[nspace][basename]['json']
+            for pg in OBJREPPGS:
+                OSDS = get_osds(pg, OSDDIR)
+                for osd in OSDS:
+                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+                    if not fnames:
+                        continue
+                    for key, val in db[nspace][basename]["xattr"].items():
+                        attrkey = "_" + key
+                        cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        getval = check_output(cmd, shell=True)
+                        if getval != val:
+                            logging.error("get-attr of key {key} returned wrong val: {get} instead of {orig}".format(key=attrkey, get=getval, orig=val))
+                            ERRORS += 1
+                            continue
+                        # set-attr to bogus value "foobar"
+                        cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Test set-attr with dry-run
+                        cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check the set-attr
+                        cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        try:
+                            getval = check_output(cmd, shell=True)
+                        except subprocess.CalledProcessError as e:
+                            logging.error("Bad exit status {ret} from get-attr".format(ret=e.returncode))
+                            ERRORS += 1
+                            continue
+                        if getval != "foobar":
+                            logging.error("Check of set-attr failed because we got {val}".format(val=getval))
+                            ERRORS += 1
+                            continue
+                        # Test rm-attr
+                        cmd = (CFSD_PREFIX + "'{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check rm-attr with dry-run
+                        cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        cmd = (CFSD_PREFIX + "'{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+                        if ret == 0:
+                            logging.error("For rm-attr expect get-attr to fail, but it succeeded")
+                            ERRORS += 1
+                        # Put back value
+                        cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey, val=val)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+
+                    hdr = db[nspace][basename].get("omapheader", "")
+                    cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, json=JSON)
+                    logging.debug(cmd)
+                    gethdr = check_output(cmd, shell=True)
+                    if gethdr != hdr:
+                        logging.error("get-omaphdr was wrong: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+                        ERRORS += 1
+                        continue
+                    # set-omaphdr to bogus value "foobar"
+                    cmd = ("echo -n foobar | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+                        ERRORS += 1
+                        continue
+                    # Check the set-omaphdr
+                    cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    try:
+                        gethdr = check_output(cmd, shell=True)
+                    except subprocess.CalledProcessError as e:
+                        logging.error("Bad exit status {ret} from get-omaphdr".format(ret=e.returncode))
+                        ERRORS += 1
+                        continue
+                    if gethdr != "foobar":
+                        logging.error("Check of set-omaphdr failed because we got {val}".format(val=gethdr))
+                        ERRORS += 1
+                        continue
+                    # Test dry-run with set-omaphdr
+                    cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+                        ERRORS += 1
+                        continue
+                    # Put back value
+                    cmd = ("echo -n {val} | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON, val=hdr)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+                        ERRORS += 1
+                        continue
+
+                    for omapkey, val in db[nspace][basename]["omap"].items():
+                        cmd = (CFSD_PREFIX + " '{json}' get-omap {key}").format(osd=osd, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        getval = check_output(cmd, shell=True)
+                        if getval != val:
+                            logging.error("get-omap of key {key} returned wrong val: {get} instead of {orig}".format(key=omapkey, get=getval, orig=val))
+                            ERRORS += 1
+                            continue
+                        # set-omap to bogus value "foobar"
+                        cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check set-omap with dry-run
+                        cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check the set-omap
+                        cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        try:
+                            getval = check_output(cmd, shell=True)
+                        except subprocess.CalledProcessError as e:
+                            logging.error("Bad exit status {ret} from get-omap".format(ret=e.returncode))
+                            ERRORS += 1
+                            continue
+                        if getval != "foobar":
+                            logging.error("Check of set-omap failed because we got {val}".format(val=getval))
+                            ERRORS += 1
+                            continue
+                        # Test rm-omap
+                        cmd = (CFSD_PREFIX + "'{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+                            ERRORS += 1
+                        # Check rm-omap with dry-run
+                        cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+                            ERRORS += 1
+                        cmd = (CFSD_PREFIX + "'{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+                        if ret == 0:
+                            logging.error("For rm-omap expect get-omap to fail, but it succeeded")
+                            ERRORS += 1
+                        # Put back value
+                        cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey, val=val)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+                            ERRORS += 1
+                            continue
+
+    # Test dump
+    print("Test dump")
+    for nspace in db.keys():
+        for basename in db[nspace].keys():
+            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+            JSON = db[nspace][basename]['json']
+            GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
+            for pg in OBJREPPGS:
+                OSDS = get_osds(pg, OSDDIR)
+                for osd in OSDS:
+                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+                    if not fnames:
+                        continue
+                    if int(basename.split(REP_NAME)[1]) > int(NUM_CLONED_REP_OBJECTS):
+                        continue
+                    cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Invalid dump for {json}".format(json=JSON))
+                        ERRORS += 1
+
+    print("Test list-attrs get-attr")
+    ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid)
+    VALFILE = r"/tmp/val.{pid}".format(pid=pid)
+    for nspace in db.keys():
+        for basename in db[nspace].keys():
+            file = os.path.join(DATADIR, nspace + "-" + basename)
+            JSON = db[nspace][basename]['json']
+            jsondict = json.loads(JSON)
+
+            if 'shard_id' in jsondict:
+                logging.debug("ECobject " + JSON)
+                found = 0
+                for pg in OBJECPGS:
+                    OSDS = get_osds(pg, OSDDIR)
+                    # Fix shard_id since we only have one json instance for each object
+                    jsondict['shard_id'] = int(pg.split('s')[1])
+                    JSON = json.dumps(jsondict)
+                    for osd in OSDS:
+                        cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr hinfo_key").format(osd=osd, pg=pg, json=JSON)
+                        logging.debug("TRY: " + cmd)
+                        try:
+                            out = check_output(cmd, shell=True, stderr=subprocess.STDOUT)
+                            logging.debug("FOUND: {json} in {osd} has value '{val}'".format(osd=osd, json=JSON, val=out))
+                            found += 1
+                        except subprocess.CalledProcessError as e:
+                            if "No such file or directory" not in e.output and "No data available" not in e.output:
+                                raise
+                # Assuming k=2 m=1 for the default ec pool
+                if found != 3:
+                    logging.error("{json} hinfo_key found {found} times instead of 3".format(json=JSON, found=found))
+                    ERRORS += 1
+
+            for pg in ALLPGS:
+                # Make sure rep obj with rep pg or ec obj with ec pg
+                if ('shard_id' in jsondict) != (pg.find('s') > 0):
+                    continue
+                if 'shard_id' in jsondict:
+                    # Fix shard_id since we only have one json instance for each object
+                    jsondict['shard_id'] = int(pg.split('s')[1])
+                    JSON = json.dumps(jsondict)
+                OSDS = get_osds(pg, OSDDIR)
+                for osd in OSDS:
+                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+                    if not fnames:
+                        continue
+                    afd = open(ATTRFILE, "wb")
+                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' list-attrs").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=afd)
+                    afd.close()
+                    if ret != 0:
+                        logging.error("list-attrs failed with {ret}".format(ret=ret))
+                        ERRORS += 1
+                        continue
+                    keys = get_lines(ATTRFILE)
+                    values = dict(db[nspace][basename]["xattr"])
+                    for key in keys:
+                        if key == "_" or key == "snapset" or key == "hinfo_key":
+                            continue
+                        key = key.strip("_")
+                        if key not in values:
+                            logging.error("Unexpected key {key} present".format(key=key))
+                            ERRORS += 1
+                            continue
+                        exp = values.pop(key)
+                        vfd = open(VALFILE, "wb")
+                        cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key="_" + key)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=vfd)
+                        vfd.close()
+                        if ret != 0:
+                            logging.error("get-attr failed with {ret}".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        lines = get_lines(VALFILE)
+                        val = lines[0]
+                        if exp != val:
+                            logging.error("For key {key} got value {got} instead of {expected}".format(key=key, got=val, expected=exp))
+                            ERRORS += 1
+                    if len(values) != 0:
+                        logging.error("Not all keys found, remaining keys:")
+                        print(values)
+
+    print("Test --op meta-list")
+    tmpfd = open(TMPFILE, "wb")
+    cmd = (CFSD_PREFIX + "--op meta-list").format(osd=ONEOSD)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=tmpfd)
+    if ret != 0:
+        logging.error("Bad exit status {ret} from --op meta-list request".format(ret=ret))
+        ERRORS += 1
+
+    print("Test get-bytes on meta")
+    tmpfd.close()
+    lines = get_lines(TMPFILE)
+    JSONOBJ = sorted(set(lines))
+    for JSON in JSONOBJ:
+        (pgid, jsondict) = json.loads(JSON)
+        if pgid != "meta":
+            logging.error("pgid incorrect for --op meta-list {pgid}".format(pgid=pgid))
+            ERRORS += 1
+        if jsondict['namespace'] != "":
+            logging.error("namespace non null --op meta-list {ns}".format(ns=jsondict['namespace']))
+            ERRORS += 1
+        logging.info(JSON)
+        try:
+            os.unlink(GETNAME)
+        except OSError:
+            pass
+        cmd = (CFSD_PREFIX + "'{json}' get-bytes {fname}").format(osd=ONEOSD, json=JSON, fname=GETNAME)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True)
+        if ret != 0:
+            logging.error("Bad exit status {ret}".format(ret=ret))
+            ERRORS += 1
+
+    try:
+        os.unlink(GETNAME)
+    except OSError:
+        pass
+    try:
+        os.unlink(TESTNAME)
+    except OSError:
+        pass
+
+    print("Test pg info")
+    for pg in ALLREPPGS + ALLECPGS:
+        for osd in get_osds(pg, OSDDIR):
+            cmd = (CFSD_PREFIX + "--op info --pgid {pg} | grep '\"pgid\": \"{pg}\"'").format(osd=osd, pg=pg)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stdout=nullfd)
+            if ret != 0:
+                logging.error("Getting info failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+                ERRORS += 1
+
+    print("Test pg logging")
+    if len(ALLREPPGS + ALLECPGS) == len(OBJREPPGS + OBJECPGS):
+        logging.warning("All PGs have objects, so no log without modify entries")
+    for pg in ALLREPPGS + ALLECPGS:
+        for osd in get_osds(pg, OSDDIR):
+            tmpfd = open(TMPFILE, "wb")
+            cmd = (CFSD_PREFIX + "--op log --pgid {pg}").format(osd=osd, pg=pg)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stdout=tmpfd)
+            if ret != 0:
+                logging.error("Getting log failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+                ERRORS += 1
+            HASOBJ = pg in OBJREPPGS + OBJECPGS
+            MODOBJ = False
+            for line in get_lines(TMPFILE):
+                if line.find("modify") != -1:
+                    MODOBJ = True
+                    break
+            if HASOBJ != MODOBJ:
+                logging.error("Bad log for pg {pg} from {osd}".format(pg=pg, osd=osd))
+                MSG = "" if HASOBJ else "NOT "
+                print("Log should {msg}have a modify entry".format(msg=MSG))
+                ERRORS += 1
+
+    try:
+        os.unlink(TMPFILE)
+    except OSError:
+        pass
+
+    print("Test list-pgs")
+    for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+
+        CHECK_PGS = get_osd_pgs(os.path.join(OSDDIR, osd), None)
+        CHECK_PGS = sorted(CHECK_PGS)
+
+        cmd = (CFSD_PREFIX + "--op list-pgs").format(osd=osd)
+        logging.debug(cmd)
+        TEST_PGS = check_output(cmd, shell=True).split("\n")
+        TEST_PGS = sorted(TEST_PGS)[1:]  # Skip extra blank line
+
+        if TEST_PGS != CHECK_PGS:
+            logging.error("list-pgs got wrong result for osd.{osd}".format(osd=osd))
+            logging.error("Expected {pgs}".format(pgs=CHECK_PGS))
+            logging.error("Got {pgs}".format(pgs=TEST_PGS))
+            ERRORS += 1
+
+    EXP_ERRORS = 0
+    print("Test pg export --dry-run")
+    pg = ALLREPPGS[0]
+    osd = get_osds(pg, OSDDIR)[0]
+    fname = "/tmp/fname.{pid}".format(pid=pid)
+    cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    if ret != 0:
+        logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+        EXP_ERRORS += 1
+    elif os.path.exists(fname):
+        logging.error("Exporting --dry-run created file")
+        EXP_ERRORS += 1
+
+    cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    if ret != 0:
+        logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+        EXP_ERRORS += 1
+    else:
+        outdata = get_lines(fname)
+        if len(outdata) > 0:
+            logging.error("Exporting --dry-run to stdout not empty")
+            logging.error("Data: " + outdata)
+            EXP_ERRORS += 1
+
+    os.mkdir(TESTDIR)
+    for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+        os.mkdir(os.path.join(TESTDIR, osd))
+    print("Test pg export")
+    for pg in ALLREPPGS + ALLECPGS:
+        for osd in get_osds(pg, OSDDIR):
+            mydir = os.path.join(TESTDIR, osd)
+            fname = os.path.join(mydir, pg)
+            if pg == ALLREPPGS[0]:
+                cmd = (CFSD_PREFIX + "--op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
+            elif pg == ALLREPPGS[1]:
+                cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file - > {file}").format(osd=osd, pg=pg, file=fname)
+            else:
+                cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+            if ret != 0:
+                logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+                EXP_ERRORS += 1
+
+    ERRORS += EXP_ERRORS
+
+    print("Test pg removal")
+    RM_ERRORS = 0
+    for pg in ALLREPPGS + ALLECPGS:
+        for osd in get_osds(pg, OSDDIR):
+            # This should do nothing
+            cmd = (CFSD_PREFIX + "--op remove --pgid {pg} --dry-run").format(pg=pg, osd=osd)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stdout=nullfd)
+            if ret != 0:
+                logging.error("Removing --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+                RM_ERRORS += 1
+            cmd = (CFSD_PREFIX + "--force --op remove --pgid {pg}").format(pg=pg, osd=osd)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stdout=nullfd)
+            if ret != 0:
+                logging.error("Removing failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+                RM_ERRORS += 1
+
+    ERRORS += RM_ERRORS
+
+    IMP_ERRORS = 0
+    if EXP_ERRORS == 0 and RM_ERRORS == 0:
+        print("Test pg import")
+        for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+            dir = os.path.join(TESTDIR, osd)
+            PGS = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
+            for pg in PGS:
+                file = os.path.join(dir, pg)
+                # This should do nothing
+                cmd = (CFSD_PREFIX + "--op import --file {file} --dry-run").format(osd=osd, file=file)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+                    IMP_ERRORS += 1
+                if pg == PGS[0]:
+                    cmd = ("cat {file} |".format(file=file) + CFSD_PREFIX + "--op import").format(osd=osd)
+                elif pg == PGS[1]:
+                    cmd = (CFSD_PREFIX + "--op import --file - --pgid {pg} < {file}").format(osd=osd, file=file, pg=pg)
+                else:
+                    cmd = (CFSD_PREFIX + "--op import --file {file}").format(osd=osd, file=file)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+                    IMP_ERRORS += 1
+    else:
+        logging.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
+
+    ERRORS += IMP_ERRORS
+
+    if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
+        print("Verify replicated import data")
+        data_errors, _ = check_data(DATADIR, TMPFILE, OSDDIR, REP_NAME)
+        ERRORS += data_errors
+    else:
+        logging.warning("SKIPPING CHECKING IMPORT DATA DUE TO PREVIOUS FAILURES")
+
+    print("Test all --op dump-journal again")
+    ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
+    ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
+    vstart(new=False)
+    wait_for_health()
+
+    if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
+        print("Verify erasure coded import data")
+        ERRORS += verify(DATADIR, EC_POOL, EC_NAME, db)
+        # Check replicated data/xattr/omap using rados
+        print("Verify replicated import data using rados")
+        ERRORS += verify(DATADIR, REP_POOL, REP_NAME, db)
+
+    if EXP_ERRORS == 0:
+        NEWPOOL = "rados-import-pool"
+        cmd = "{path}/rados mkpool {pool}".format(pool=NEWPOOL, path=CEPH_BIN)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
+        print("Test rados import")
+        first = True
+        for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+            dir = os.path.join(TESTDIR, osd)
+            for pg in [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]:
+                if pg.find("{id}.".format(id=REPID)) != 0:
+                    continue
+                file = os.path.join(dir, pg)
+                if first:
+                    first = False
+                    # This should do nothing
+                    cmd = "{path}/rados import -p {pool} --dry-run {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd)
+                    if ret != 0:
+                        logging.error("Rados import --dry-run failed from {file} with {ret}".format(file=file, ret=ret))
+                        ERRORS += 1
+                    cmd = "{path}/rados -p {pool} ls".format(pool=NEWPOOL, path=CEPH_BIN)
+                    logging.debug(cmd)
+                    data = check_output(cmd, shell=True)
+                    if data:
+                        logging.error("'{data}'".format(data=data))
+                        logging.error("Found objects after dry-run")
+                        ERRORS += 1
+                cmd = "{path}/rados import -p {pool} {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Rados import failed from {file} with {ret}".format(file=file, ret=ret))
+                    ERRORS += 1
+                cmd = "{path}/rados import -p {pool} --no-overwrite {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Rados import --no-overwrite failed from {file} with {ret}".format(file=file, ret=ret))
+                    ERRORS += 1
+
+        ERRORS += verify(DATADIR, NEWPOOL, REP_NAME, db)
+    else:
+        logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES")
+
+    # Clear directories of previous portion
+    call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
+    call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
+    os.mkdir(TESTDIR)
+    os.mkdir(DATADIR)
+
+    # Cause SPLIT_POOL to split and test import with object/log filtering
+    print("Testing import all objects after a split")
+    SPLIT_POOL = "split_pool"
+    PG_COUNT = 1
+    SPLIT_OBJ_COUNT = 5
+    SPLIT_NSPACE_COUNT = 2
+    SPLIT_NAME = "split"
+    cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=SPLIT_POOL, pg=PG_COUNT, path=CEPH_BIN)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    SPLITID = get_pool_id(SPLIT_POOL, nullfd)
+    pool_size = int(check_output("{path}/ceph osd pool get {pool} size".format(pool=SPLIT_POOL, path=CEPH_BIN), shell=True, stderr=nullfd).split(" ")[1])
+    EXP_ERRORS = 0
+    RM_ERRORS = 0
+    IMP_ERRORS = 0
+
+    objects = range(1, SPLIT_OBJ_COUNT + 1)
+    nspaces = range(SPLIT_NSPACE_COUNT)
+    for n in nspaces:
+        nspace = get_nspace(n)
+
+        for i in objects:
+            NAME = SPLIT_NAME + "{num}".format(num=i)
+            LNAME = nspace + "-" + NAME
+            DDNAME = os.path.join(DATADIR, LNAME)
+            DDNAME += "__head"
+
+            cmd = "rm -f " + DDNAME
+            logging.debug(cmd)
+            call(cmd, shell=True)
+
+            if i == 1:
+                dataline = range(DATALINECOUNT)
+            else:
+                dataline = range(1)
+            fd = open(DDNAME, "w")
+            data = "This is the split data for " + LNAME + "\n"
+            for _ in dataline:
+                fd.write(data)
+            fd.close()
+
+            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=SPLIT_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=nullfd)
+            if ret != 0:
+                logging.critical("Rados put command failed with {ret}".format(ret=ret))
+                return 1
+
+    wait_for_health()
+    kill_daemons()
+
+    for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+        os.mkdir(os.path.join(TESTDIR, osd))
+
+    pg = "{pool}.0".format(pool=SPLITID)
+    EXPORT_PG = pg
+
+    export_osds = get_osds(pg, OSDDIR)
+    for osd in export_osds:
+        mydir = os.path.join(TESTDIR, osd)
+        fname = os.path.join(mydir, pg)
+        cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+        if ret != 0:
+            logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+            EXP_ERRORS += 1
+
+    ERRORS += EXP_ERRORS
+
+    if EXP_ERRORS == 0:
+        vstart(new=False)
+        wait_for_health()
+
+        cmd = "{path}/ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL, path=CEPH_BIN)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+        time.sleep(5)
+        wait_for_health()
+
+        kill_daemons()
+
+        # Now 2 PGs, poolid.0 and poolid.1
+        for seed in range(2):
+            pg = "{pool}.{seed}".format(pool=SPLITID, seed=seed)
+
+            which = 0
+            for osd in get_osds(pg, OSDDIR):
+                cmd = (CFSD_PREFIX + "--force --op remove --pgid {pg}").format(pg=pg, osd=osd)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+
+                # The export files are all copies of EXPORT_PG as it existed
+                # before the split, one per OSD that held it.  Cycle through
+                # them with 'which' so that every export copy gets imported.
+                mydir = os.path.join(TESTDIR, export_osds[which])
+                fname = os.path.join(mydir, EXPORT_PG)
+                which += 1
+                cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+                    IMP_ERRORS += 1
+
+        ERRORS += IMP_ERRORS
+
+        # Start up again to make sure imports didn't corrupt anything
+        if IMP_ERRORS == 0:
+            print("Verify split import data")
+            data_errors, count = check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME)
+            ERRORS += data_errors
+            if count != (SPLIT_OBJ_COUNT * SPLIT_NSPACE_COUNT * pool_size):
+                logging.error("Incorrect number of replicas seen {count}".format(count=count))
+                ERRORS += 1
+            vstart(new=False)
+            wait_for_health()
+
+    call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
+    call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
+
+    ERRORS += test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS)
+
+    # vstart() starts 4 OSDs
+    ERRORS += test_get_set_osdmap(CFSD_PREFIX, list(range(4)), ALLOSDS)
+    ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0])
+
+    kill_daemons()
+    CORES = [f for f in os.listdir(CEPH_DIR) if f.startswith("core.")]
+    if CORES:
+        CORE_DIR = os.path.join("/tmp", "cores.{pid}".format(pid=os.getpid()))
+        os.mkdir(CORE_DIR)
+        call("/bin/mv {ceph_dir}/core.* {core_dir}".format(ceph_dir=CEPH_DIR, core_dir=CORE_DIR), shell=True)
+        logging.error("Failure due to cores found")
+        logging.error("See {core_dir} for cores".format(core_dir=CORE_DIR))
+        ERRORS += len(CORES)
+
+    if ERRORS == 0:
+        print("TEST PASSED")
+        return 0
+    else:
+        print("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
+        return 1
+
+
+def remove_btrfs_subvolumes(path):
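+    """Delete any btrfs subvolumes under path (teardown runs this before removing CEPH_DIR); no-op on FreeBSD."""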
+    if platform.system() == "FreeBSD":
+        return
+    result = subprocess.Popen("stat -f -c '%%T' %s" % path, shell=True, stdout=subprocess.PIPE)
+    for line in result.stdout:
+        filesystem = decode(line).rstrip('\n')
+    if filesystem == "btrfs":
+        result = subprocess.Popen("sudo btrfs subvolume list %s" % path, shell=True, stdout=subprocess.PIPE)
+        for line in result.stdout:
+            subvolume = decode(line).split()[8]
+            # extracting the relative volume name
+            m = re.search(".*(%s.*)" % path, subvolume)
+            if m:
+                found = m.group(1)
+                call("sudo btrfs subvolume delete %s" % found, shell=True)
+
+
+if __name__ == "__main__":
+    status = 1
+    try:
+        status = main(sys.argv[1:])
+    finally:
+        kill_daemons()
+        os.chdir(CEPH_BUILD_DIR)
+        remove_btrfs_subvolumes(CEPH_DIR)
+        call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True)
+    sys.exit(status)
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_7.3.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_7.3.yaml
deleted file mode 100644 (file)
index 9dfcc7f..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-os_type: centos
-os_version: "7.3"
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_latest.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_latest.yaml
new file mode 120000 (symlink)
index 0000000..b5973b9
--- /dev/null
@@ -0,0 +1 @@
+../../../../../distros/supported/centos_latest.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_16.04.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_16.04.yaml
deleted file mode 100644 (file)
index a459fdd..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-os_type: ubuntu
-os_version: "16.04"
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_latest.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_latest.yaml
new file mode 120000 (symlink)
index 0000000..cc5b15b
--- /dev/null
@@ -0,0 +1 @@
+../../../../../distros/supported/ubuntu_latest.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml
new file mode 100644 (file)
index 0000000..36d0a07
--- /dev/null
@@ -0,0 +1,32 @@
+meta:
+- desc: "Build the ceph cluster using ceph-ansible"
+
+overrides:
+   ceph_ansible:
+     vars:
+        ceph_conf_overrides:
+          global:
+            osd default pool size: 2
+            mon pg warn min per osd: 2
+            osd pool default pg num: 64
+            osd pool default pgp num: 64
+            mon_max_pg_per_osd: 1024
+        ceph_test: true
+        ceph_stable_release: luminous
+        osd_scenario: collocated
+        journal_size: 1024
+        osd_auto_discovery: false
+        ceph_origin: repository
+        ceph_repository: dev
+        ceph_mgr_modules:
+          - status
+          - restful
+        cephfs_pools:
+          - name: "cephfs_data"
+            pgs: "64"
+          - name: "cephfs_metadata"
+            pgs: "64"
+tasks:
+- ssh-keys:
+- ceph_ansible:
+- install.ship_utilities:
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/2-config/ceph_ansible.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/2-config/ceph_ansible.yaml
deleted file mode 100644 (file)
index 5750d52..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-meta:
-- desc: "Build the ceph cluster using ceph-ansible"
-
-overrides:
-   ceph_ansible:
-     vars:
-        ceph_conf_overrides:
-          global:
-            osd default pool size: 2
-            mon pg warn min per osd: 2
-        ceph_dev: true
-        ceph_dev_key: https://download.ceph.com/keys/autobuild.asc
-        ceph_origin: upstream
-        ceph_test: true
-        journal_collocation: true
-        journal_size: 1024
-        osd_auto_discovery: false
-
-tasks:
-- ssh-keys:
-- ceph_ansible:
-- install.ship_utilities:
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/bluestore_with_dmcrypt.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/bluestore_with_dmcrypt.yaml
new file mode 100644 (file)
index 0000000..604e757
--- /dev/null
@@ -0,0 +1,8 @@
+meta:
+- desc: "use bluestore + dmcrypt option"
+
+overrides:
+   ceph_ansible:
+     vars:
+        osd_objectstore: bluestore
+        dmcrypt: True
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_off.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_off.yaml
new file mode 100644 (file)
index 0000000..4bbd1c7
--- /dev/null
@@ -0,0 +1,7 @@
+meta:
+- desc: "without dmcrypt"
+
+overrides:
+   ceph_ansible:
+     vars:
+        dmcrypt: False
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_on.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_on.yaml
new file mode 100644 (file)
index 0000000..12d63d3
--- /dev/null
@@ -0,0 +1,7 @@
+meta:
+- desc: "use dmcrypt option"
+
+overrides:
+   ceph_ansible:
+     vars:
+        dmcrypt: True
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/ceph-admin-commands.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/ceph-admin-commands.yaml
deleted file mode 100644 (file)
index 33642d5..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-meta:
-- desc: "Run ceph-admin-commands.sh"
-tasks:
-- workunit:
-    clients:
-      client.0:
-        - ceph-tests/ceph-admin-commands.sh
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/cls.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/cls.yaml
deleted file mode 100644 (file)
index 781a4d4..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-meta:
-- desc: "Run the rados cls tests"
-tasks:
-- workunit:
-    clients:
-      client.0:
-        - cls
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/rbd_import_export.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/rbd_import_export.yaml
deleted file mode 100644 (file)
index 9495934..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-meta:
-- desc: "Run the rbd import/export tests"
-tasks:
-- workunit:
-    clients:
-      client.0:
-        - rbd/import_export.sh
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/ceph-admin-commands.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/ceph-admin-commands.yaml
new file mode 100644 (file)
index 0000000..33642d5
--- /dev/null
@@ -0,0 +1,7 @@
+meta:
+- desc: "Run ceph-admin-commands.sh"
+tasks:
+- workunit:
+    clients:
+      client.0:
+        - ceph-tests/ceph-admin-commands.sh
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rbd_import_export.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rbd_import_export.yaml
new file mode 100644 (file)
index 0000000..9495934
--- /dev/null
@@ -0,0 +1,7 @@
+meta:
+- desc: "Run the rbd import/export tests"
+tasks:
+- workunit:
+    clients:
+      client.0:
+        - rbd/import_export.sh
diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml
new file mode 100644 (file)
index 0000000..8e38913
--- /dev/null
@@ -0,0 +1,15 @@
+tasks:
+- exec:
+    mgr.x:
+      - systemctl stop ceph-mgr.target
+      - sleep 5
+      - ceph -s
+- exec:
+    mon.a:
+      - ceph restful create-key admin
+      - ceph restful create-self-signed-cert
+      - ceph restful restart
+- workunit:
+    clients:
+      client.0:
+        - rest/test-restful.sh
diff --git a/ceph/qa/suites/fs/32bits/objectstore b/ceph/qa/suites/fs/32bits/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/32bits/objectstore-ec b/ceph/qa/suites/fs/32bits/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
index f9e423e2fc3cc4c3e02db97e42297b5bcfa7ed31..1c540a4ef4662ec2e9b3e39be77647693aa371ff 100644 (file)
@@ -1,6 +1,6 @@
 roles:
-- [mon.a, mgr.x, osd.0, mds.a, mds.b, client.1, client.2, client.3]
-- [client.0, osd.1, osd.2]
+- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mds.a, mds.b, client.1, client.2, client.3]
+- [client.0, osd.4, osd.5, osd.6, osd.7]
 openstack:
 - volumes: # attached to each instance
     count: 2
diff --git a/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-ec-root.yaml b/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-ec-root.yaml
new file mode 120000 (symlink)
index 0000000..36a4d69
--- /dev/null
@@ -0,0 +1 @@
+../../../../cephfs/objectstore-ec/bluestore-ec-root.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/basic_workload/objectstore b/ceph/qa/suites/fs/basic_workload/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/basic_workload/objectstore-ec b/ceph/qa/suites/fs/basic_workload/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
index 78b912f725283ae418b202d3d228121dffa87645..a533af5c620129f02c66f48f6c53c495605cfaa4 100644 (file)
@@ -1,5 +1,5 @@
 roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2]
+- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
 - [client.2]
 - [client.1]
 - [client.0]
index 9586e6c8ffd381d7a9dbb1d8c728bec008b78cb9..00f3815cbe55660f7ac71c4a7d6aa0a2a17a1573 100644 (file)
@@ -1,5 +1,5 @@
 roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2]
+- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
 - [client.1]
 - [client.0]
 
diff --git a/ceph/qa/suites/fs/multiclient/objectstore b/ceph/qa/suites/fs/multiclient/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/multiclient/objectstore-ec b/ceph/qa/suites/fs/multiclient/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
index 52c5d7e0171ae86a81dc489699e9efca927d2c20..2ae772c3f27341f231ced6ae422f9e448779514b 100644 (file)
@@ -1,6 +1,6 @@
 roles:
-- [mon.a, mgr.x, osd.0, mon.b, mds.a, mds.b, client.1]
-- [mds.c, mds.d, mon.c, client.0, osd.1, osd.2]
+- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mon.b, mds.a, mds.b, client.1]
+- [mds.c, mds.d, mon.c, client.0, osd.4, osd.5, osd.6, osd.7]
 openstack:
 - volumes: # attached to each instance
     count: 2
diff --git a/ceph/qa/suites/fs/multifs/objectstore b/ceph/qa/suites/fs/multifs/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/multifs/objectstore-ec b/ceph/qa/suites/fs/multifs/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/permission/objectstore b/ceph/qa/suites/fs/permission/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/permission/objectstore-ec b/ceph/qa/suites/fs/permission/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/snaps/objectstore b/ceph/qa/suites/fs/snaps/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/snaps/objectstore-ec b/ceph/qa/suites/fs/snaps/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/thrash/objectstore b/ceph/qa/suites/fs/thrash/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/thrash/objectstore-ec b/ceph/qa/suites/fs/thrash/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/traceless/objectstore b/ceph/qa/suites/fs/traceless/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/traceless/objectstore-ec b/ceph/qa/suites/fs/traceless/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/verify/objectstore b/ceph/qa/suites/fs/verify/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/verify/objectstore-ec b/ceph/qa/suites/fs/verify/objectstore-ec
new file mode 120000 (symlink)
index 0000000..a330d66
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec/
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/cephfs/objectstore b/ceph/qa/suites/kcephfs/cephfs/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/cephfs/objectstore-ec b/ceph/qa/suites/kcephfs/cephfs/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/mixed-clients/objectstore b/ceph/qa/suites/kcephfs/mixed-clients/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/mixed-clients/objectstore-ec b/ceph/qa/suites/kcephfs/mixed-clients/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
index 1d432addb4af25d2acb4ad23e409d0faeab06afc..b1072e3be164bb2b0892e2696919b02c365d1a4c 100644 (file)
@@ -1,6 +1,6 @@
 roles:
-- [mon.a, osd.0, mds.a, mds.c, client.2]
-- [mgr.x, osd.1, osd.2, mds.b, mds.d, client.3]
+- [mon.a, osd.0, osd.1, osd.2, osd.3, mds.a, mds.c, client.2]
+- [mgr.x, osd.4, osd.5, osd.6, osd.7, mds.b, mds.d, client.3]
 - [client.0]
 - [client.1]
 openstack:
diff --git a/ceph/qa/suites/kcephfs/recovery/objectstore b/ceph/qa/suites/kcephfs/recovery/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/recovery/objectstore-ec b/ceph/qa/suites/kcephfs/recovery/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/thrash/objectstore b/ceph/qa/suites/kcephfs/thrash/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/kcephfs/thrash/objectstore-ec b/ceph/qa/suites/kcephfs/thrash/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
diff --git a/ceph/qa/suites/multimds/basic/objectstore b/ceph/qa/suites/multimds/basic/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/multimds/basic/objectstore-ec b/ceph/qa/suites/multimds/basic/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
diff --git a/ceph/qa/suites/multimds/thrash/objectstore b/ceph/qa/suites/multimds/thrash/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/multimds/thrash/objectstore-ec b/ceph/qa/suites/multimds/thrash/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
diff --git a/ceph/qa/suites/multimds/verify/objectstore b/ceph/qa/suites/multimds/verify/objectstore
deleted file mode 120000 (symlink)
index c72da2f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore_cephfs
\ No newline at end of file
diff --git a/ceph/qa/suites/multimds/verify/objectstore-ec b/ceph/qa/suites/multimds/verify/objectstore-ec
new file mode 120000 (symlink)
index 0000000..15dc98f
--- /dev/null
@@ -0,0 +1 @@
+../../../cephfs/objectstore-ec
\ No newline at end of file
diff --git a/ceph/qa/suites/rados/basic/d-require-luminous b/ceph/qa/suites/rados/basic/d-require-luminous
deleted file mode 120000 (symlink)
index 737aee8..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../thrash/d-require-luminous/
\ No newline at end of file
diff --git a/ceph/qa/suites/rados/basic/d-require-luminous/at-end.yaml b/ceph/qa/suites/rados/basic/d-require-luminous/at-end.yaml
new file mode 100644 (file)
index 0000000..ef998cc
--- /dev/null
@@ -0,0 +1,33 @@
+# do not require luminous osds at mkfs time; only set flag at
+# the end of the test run, then do a final scrub (to convert any
+# legacy snapsets), and verify we are healthy.
+tasks:
+- full_sequential_finally:
+  - exec:
+      mon.a:
+        - ceph osd require-osd-release luminous
+        - ceph osd pool application enable base rados || true
+# make sure osds have latest map
+        - rados -p rbd bench 5 write -b 4096
+  - ceph.healthy:
+  - ceph.osd_scrub_pgs:
+      cluster: ceph
+  - exec:
+      mon.a:
+        - sleep 15
+        - ceph osd dump | grep purged_snapdirs
+        - ceph pg dump -f json-pretty
+        - "ceph pg dump sum -f json-pretty | grep num_legacy_snapsets | head -1 | grep ': 0'"
+overrides:
+  ceph:
+    conf:
+      global:
+        mon debug no require luminous: true
+
+# setting luminous triggers peering, which *might* trigger health alerts
+    log-whitelist:
+      - overall HEALTH_
+      - \(PG_AVAILABILITY\)
+      - \(PG_DEGRADED\)
+  thrashosds:
+    chance_thrash_cluster_full: 0
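The fragment above defers the require-osd-release step to the very end of the run and then asserts, via the grep pipeline, that the final scrub converted every legacy SnapSet. A minimal Python sketch of that last check, assuming a luminous-era `ceph pg dump sum -f json-pretty` layout (the field names are assumptions, not confirmed by this commit):

    import json
    import subprocess

    def legacy_snapsets_remaining():
        # 'pg dump sum' is assumed to expose stat totals under
        # pg_stats_sum/stat_sum, as luminous-era builds did
        out = subprocess.check_output(
            ['ceph', 'pg', 'dump', 'sum', '-f', 'json-pretty'])
        stat_sum = json.loads(out).get('pg_stats_sum', {}).get('stat_sum', {})
        return stat_sum.get('num_legacy_snapsets', 0)

    if __name__ == '__main__':
        print('legacy snapsets left:', legacy_snapsets_remaining())

The YAML performs the same check with grep because teuthology exec steps are shell one-liners.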
diff --git a/ceph/qa/suites/rados/basic/d-require-luminous/at-mkfs.yaml b/ceph/qa/suites/rados/basic/d-require-luminous/at-mkfs.yaml
new file mode 100644 (file)
index 0000000..e69de29
index bc950e5afff5c0ad5327bb5d0c75716cb648c720..abc90e22d358787ccb4490649cdf24c17087884e 100644 (file)
@@ -1,6 +1,6 @@
 roles:
 - [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0]
-- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
+- [mgr.y, mgr.z, mon.b, mds.b, osd.1, osd.2, client.1]
 log-rotate:
   ceph-mds: 10G
   ceph-osd: 10G
diff --git a/ceph/qa/suites/rados/mgr/tasks/dashboard.yaml b/ceph/qa/suites/rados/mgr/tasks/dashboard.yaml
new file mode 100644 (file)
index 0000000..3065e11
--- /dev/null
@@ -0,0 +1,16 @@
+
+tasks:
+  - install:
+  - ceph:
+      # tests may leave mgrs broken, so don't try and call into them
+      # to invoke e.g. pg dump during teardown.
+      wait-for-scrub: false
+      log-whitelist:
+        - overall HEALTH_
+        - \(MGR_DOWN\)
+        - \(PG_
+        - replacing it with standby
+        - No standby daemons available
+  - cephfs_test_runner:
+      modules:
+        - tasks.mgr.test_dashboard
diff --git a/ceph/qa/suites/rados/mgr/tasks/module_selftest.yaml b/ceph/qa/suites/rados/mgr/tasks/module_selftest.yaml
new file mode 100644 (file)
index 0000000..ffdfe8b
--- /dev/null
@@ -0,0 +1,19 @@
+
+tasks:
+  - install:
+  - ceph:
+      # tests may leave mgrs broken, so don't try and call into them
+      # to invoke e.g. pg dump during teardown.
+      wait-for-scrub: false
+      log-whitelist:
+        - overall HEALTH_
+        - \(MGR_DOWN\)
+        - \(PG_
+        - replacing it with standby
+        - No standby daemons available
+        - Reduced data availability
+        - Degraded data redundancy
+        - objects misplaced
+  - cephfs_test_runner:
+      modules:
+        - tasks.mgr.test_module_selftest
diff --git a/ceph/qa/suites/rados/mgr/tasks/workunits.yaml b/ceph/qa/suites/rados/mgr/tasks/workunits.yaml
new file mode 100644 (file)
index 0000000..d7261f4
--- /dev/null
@@ -0,0 +1,16 @@
+tasks:
+  - install:
+  - ceph:
+      # tests may leave mgrs broken, so don't try and call into them
+      # to invoke e.g. pg dump during teardown.
+      wait-for-scrub: false
+      log-whitelist:
+        - overall HEALTH_
+        - \(MGR_DOWN\)
+        - \(PG_
+        - replacing it with standby
+        - No standby daemons available
+  - workunit:
+      clients:
+        client.0:
+          - mgr
\ No newline at end of file
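All three new mgr task fragments share the same precautions: wait-for-scrub: false keeps teardown from calling into a mgr the test may have broken, and the log-whitelist entries are regular-expression fragments searched against cluster log warnings. A small sketch of that matching, assuming each entry is applied with re.search (which is how teuthology's ceph task treats whitelist entries):

    import re

    WHITELIST = [
        r'overall HEALTH_',
        r'\(MGR_DOWN\)',
        r'\(PG_',
        r'replacing it with standby',
        r'No standby daemons available',
    ]

    def is_whitelisted(log_line):
        # any entry matching anywhere in the line suppresses the failure
        return any(re.search(pat, log_line) for pat in WHITELIST)

    print(is_whitelisted('cluster [WRN] overall HEALTH_WARN 1 mgr down'))

An entry like \(PG_ therefore covers every PG_* health code at once.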
index 737aee82415f31688760885f520671a2a8724d70..82036c67f956fbc63231904b49a2b6642420b712 120000 (symlink)
@@ -1 +1 @@
-../thrash/d-require-luminous/
\ No newline at end of file
+../basic/d-require-luminous
\ No newline at end of file
index 90906d66eb2aef0da150b5049bcbfb3d99b7a6ab..049532e34cab3223977751b2b423623c762c0189 100644 (file)
@@ -6,6 +6,9 @@ tasks:
     log-whitelist:
       - overall HEALTH_
       - \(MGR_DOWN\)
+      - \(PG_
+      - \(OSD_
+      - \(OBJECT_
 - exec:
     mon.a:
       - ceph restful create-key admin
diff --git a/ceph/qa/suites/rados/rest/rest_test.yaml b/ceph/qa/suites/rados/rest/rest_test.yaml
new file mode 100644 (file)
index 0000000..0fdb9dc
--- /dev/null
@@ -0,0 +1,44 @@
+roles:
+- - mon.a
+  - mgr.x
+  - mds.a
+  - osd.0
+  - osd.1
+- - mon.b
+  - mon.c
+  - osd.2
+  - osd.3
+  - client.0
+
+openstack:
+- volumes: # attached to each instance
+    count: 2
+    size: 10 # GB
+
+tasks:
+- install:
+- ceph:
+    fs: xfs
+    log-whitelist:
+      - overall HEALTH
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(PG_
+      - \(POOL_
+      - \(CACHE_POOL_
+      - \(SMALLER_PGP_NUM\)
+      - \(OBJECT_
+      - \(REQUEST_SLOW\)
+      - \(SLOW_OPS\)
+      - \(TOO_FEW_PGS\)
+      - but it is still running
+    conf:
+      client.rest0:
+        debug ms: 1
+        debug objecter: 20
+        debug rados: 20
+- rest-api: [client.0]
+- workunit:
+    clients:
+      client.0:
+         - rest/test.py
index 3aaca875940323912c0a43ef9f10d6e7531929c0..bbf330b0ba1f936c7493094458b7105ba3b0a959 100644 (file)
@@ -9,6 +9,7 @@ overrides:
       - (OSDMAP_FLAGS)
       - (OSD_FULL)
       - (MDS_READ_ONLY)
+      - (POOL_FULL)
 tasks:
 - install:
 - ceph:
diff --git a/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml
new file mode 100644 (file)
index 0000000..accdd96
--- /dev/null
@@ -0,0 +1,26 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+openstack:
+  - volumes: # attached to each instance
+      count: 2
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd : 2
+        osd max pg per osd hard ratio : 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: True
+    pg_num: 2
diff --git a/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml
new file mode 100644 (file)
index 0000000..1c48ada
--- /dev/null
@@ -0,0 +1,31 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd : 1
+        osd max pg per osd hard ratio : 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+      - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: False
+    pg_num: 1
+    pool_size: 2
+    from_primary: True
diff --git a/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml
new file mode 100644 (file)
index 0000000..0cf37fd
--- /dev/null
@@ -0,0 +1,31 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd : 1
+        osd max pg per osd hard ratio : 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+      - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: False
+    pg_num: 1
+    pool_size: 2
+    from_primary: False
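These fragments exercise the new luminous PG-per-OSD limit from two directions: from-mon expects the monitor itself to refuse a pool whose projected PG count per OSD would exceed `mon max pg per osd`, while the primary/replica variants exercise the OSD-side hard ratio. A back-of-envelope sketch of the projection being limited (the exact formula is an assumption based on the options above):

    def projected_pgs_per_osd(pg_num, pool_size, num_osds):
        # each PG carries pool_size replicas spread over num_osds
        return pg_num * pool_size / float(num_osds)

    # from-mon fragment: 2 OSDs, size 2, mon max pg per osd = 2
    print(projected_pgs_per_osd(pg_num=2, pool_size=2, num_osds=2))

In the from-primary/from-replica cases the limit of 1 is only crossed once OSDs drop out and PGs concentrate on the survivors, which the osd_max_pg_per_osd task presumably arranges.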
index ccd980fded7bbe6c29208c4f56ded5926cf7ba73..815c518eec52c0a5de68407a353308dc9b57cef6 100644 (file)
@@ -17,6 +17,10 @@ tasks:
       osd:
         debug monc: 1
         debug ms: 1
+    log-whitelist:
+      - overall HEALTH
+      - Manager daemon
+      - \(MGR_DOWN\)
 - mon_seesaw:
 - ceph_manager.create_pool:
     kwargs:
diff --git a/ceph/qa/suites/rados/singleton/all/recovery-preemption.yaml b/ceph/qa/suites/rados/singleton/all/recovery-preemption.yaml
new file mode 100644 (file)
index 0000000..7507bf6
--- /dev/null
@@ -0,0 +1,51 @@
+roles:
+- - mon.a
+  - mon.b
+  - mon.c
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 3
+      size: 20 # GB
+tasks:
+- install:
+- ceph:
+    conf:
+      osd:
+        osd recovery sleep: .1
+        osd min pg log entries: 100
+        osd max pg log entries: 1000
+    log-whitelist:
+      - \(POOL_APP_NOT_ENABLED\)
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(OBJECT_
+      - \(PG_
+      - overall HEALTH
+- exec:
+    osd.0:
+      - ceph osd pool create foo 128
+      - ceph osd pool application enable foo foo
+      - rados -p foo bench 30 write -b 4096 --no-cleanup
+      - ceph osd out 0
+      - sleep 5
+      - ceph osd set noup
+- ceph.restart:
+    daemons: [osd.1]
+    wait-for-up: false
+    wait-for-healthy: false
+- exec:
+    osd.0:
+      - rados -p foo bench 3 write -b 4096 --no-cleanup
+      - ceph osd unset noup
+      - sleep 10
+      - ceph tell osd.* config set osd_recovery_sleep 0
+      - ceph tell osd.* config set osd_recovery_max_active 20
+- ceph.healthy:
+- exec:
+    osd.0:
+      - egrep '(defer backfill|defer recovery)' /var/log/ceph/ceph-osd.*.log
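The recovery-preemption test above builds a backlog (bench writes, osd.0 out, osd.1 restarted under noup), then unthrottles recovery and finally greps the OSD logs for deferred backfill/recovery messages. The same final check as a Python sketch (log path and message text taken from the egrep line above):

    import glob
    import re

    def find_preemptions(logdir='/var/log/ceph'):
        hits = []
        for path in glob.glob(logdir + '/ceph-osd.*.log'):
            with open(path) as f:
                for line in f:
                    if re.search(r'defer (backfill|recovery)', line):
                        hits.append((path, line.strip()))
        return hits

    if __name__ == '__main__':
        # the test fails unless at least one preemption was logged
        print('%d preemption(s) found' % len(find_preemptions()))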
diff --git a/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-crush-compat.yaml b/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-crush-compat.yaml
new file mode 100644 (file)
index 0000000..9eb7143
--- /dev/null
@@ -0,0 +1,11 @@
+overrides:
+  ceph:
+    conf:
+      mgr:
+        debug osd: 20
+tasks:
+- exec:
+    mon.a:
+      - while ! ceph balancer status ; do sleep 1 ; done
+      - ceph balancer mode crush-compat
+      - ceph balancer on
diff --git a/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-upmap.yaml b/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-upmap.yaml
new file mode 100644 (file)
index 0000000..a1e0afe
--- /dev/null
@@ -0,0 +1,11 @@
+overrides:
+  ceph:
+    conf:
+      mgr:
+        debug osd: 20
+tasks:
+- exec:
+    mon.a:
+      - while ! ceph balancer status ; do sleep 1 ; done
+      - ceph balancer mode upmap
+      - ceph balancer on
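Both balancer fragments use the same wait-then-enable pattern: poll `ceph balancer status` until the mgr balancer module answers, then select a mode and switch the balancer on. A direct Python transcription (assuming a working `ceph` CLI on the mon node):

    import subprocess
    import time

    def enable_balancer(mode):
        # 'ceph balancer status' exits non-zero until the module is up
        while subprocess.call(['ceph', 'balancer', 'status']) != 0:
            time.sleep(1)
        subprocess.check_call(['ceph', 'balancer', 'mode', mode])
        subprocess.check_call(['ceph', 'balancer', 'on'])

    enable_balancer('upmap')          # or 'crush-compat'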
index 737aee82415f31688760885f520671a2a8724d70..82036c67f956fbc63231904b49a2b6642420b712 120000 (symlink)
@@ -1 +1 @@
-../thrash/d-require-luminous/
\ No newline at end of file
+../basic/d-require-luminous
\ No newline at end of file
index 9ccd57c4a82a732dc4adf67a6c18508e7471d692..51b35e2e17ca1280b50799d01b76a2e2aeb014af 100644 (file)
@@ -3,3 +3,5 @@ tasks:
     clients:
       client.0:
         - cls/test_cls_rbd.sh
+        - cls/test_cls_lock.sh
+        - cls/test_cls_journal.sh
diff --git a/ceph/qa/suites/rest/basic/tasks/rest_test.yaml b/ceph/qa/suites/rest/basic/tasks/rest_test.yaml
deleted file mode 100644 (file)
index 9485456..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-roles:
-- - mon.a
-  - mgr.x
-  - mds.a
-  - osd.0
-  - osd.1
-- - mon.b
-  - mon.c
-  - osd.2
-  - osd.3
-  - client.0
-
-openstack:
-- volumes: # attached to each instance
-    count: 2
-    size: 10 # GB
-
-tasks:
-- install:
-- ceph:
-    fs: xfs
-    log-whitelist:
-    - but it is still running
-    conf:
-      client.rest0:
-        debug ms: 1
-        debug objecter: 20
-        debug rados: 20
-- rest-api: [client.0]
-- workunit:
-    clients:
-      client.0:
-         - rest/test.py
index 171cc66e2b053e465a87e05230bb222fc8eaefa1..1c17a69f73163fabf28e9d90a21cf9af8f6407ea 100644 (file)
@@ -1,6 +1,8 @@
-os_type: centos
-os_version: "7.3"
-machine_type: vps
+machine_type: ovh
+openstack:
+- volumes: # attached to each instance
+    count: 3 
+    size: 10 # GB
 overrides:
     ceph_ansible:
       vars:
@@ -9,15 +11,16 @@ overrides:
             osd default pool size: 2
             osd pool default pg num: 128
             osd pool default pgp num: 128
-            debug rgw: 20 
+            debug rgw: 20
             debug ms: 1
         ceph_test: true
-        ceph_dev: true
-        ceph_dev_key: https://download.ceph.com/keys/autobuild.asc
-        ceph_origin: upstream
         journal_collocation: true
         osd_auto_discovery: false
         journal_size: 1024
+        ceph_stable_release: luminous
+        osd_scenario: collocated
+        ceph_origin: repository
+        ceph_repository: dev
 roles:
 - [mon.a, osd.0, osd.1, osd.2, rgw.0]
 - [osd.3, osd.4, osd.5]
index 4cdded04e768a166f187af236bfba07225bbf73b..da05a5ea13ba7255674044159353ee404fffc6e7 100644 (file)
@@ -4,7 +4,7 @@ tasks:
 - rgw: [client.0]
 - s3tests:
     client.0:
-      force-branch: ceph-master
+      force-branch: ceph-luminous
       rgw_server: client.0
 overrides:
   ceph:
index 45047ea410b4aedc0e34c0d78fb662fe5ef04666..82ac7c197725432f6e7e8c54c1e82a8aa836bf7d 100644 (file)
@@ -1,7 +1,7 @@
 tasks:
 - s3tests:
     client.0:
-      force-branch: ceph-master
+      force-branch: ceph-luminous
       rgw_server: client.0
 overrides:
   ceph:
index bed9f0e1900c2894da558fc3c65c97232af74770..cf413389bae6f9f84f765026a4549e896b79b193 100644 (file)
@@ -10,7 +10,7 @@ tasks:
       valgrind: [--tool=memcheck]
 - s3tests:
     client.0:
-      force-branch: ceph-master
+      force-branch: ceph-luminous
       rgw_server: client.0
 overrides:
   ceph:
diff --git a/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/% b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/centos_latest.yaml b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/centos_latest.yaml
new file mode 120000 (symlink)
index 0000000..b5973b9
--- /dev/null
@@ -0,0 +1 @@
+../../../../../distros/supported/centos_latest.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/ubuntu_latest.yaml b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/ubuntu_latest.yaml
new file mode 120000 (symlink)
index 0000000..cc5b15b
--- /dev/null
@@ -0,0 +1 @@
+../../../../../distros/supported/ubuntu_latest.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/jewel-luminous.yaml b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/jewel-luminous.yaml
new file mode 100644 (file)
index 0000000..9adede7
--- /dev/null
@@ -0,0 +1,82 @@
+meta:
+- desc: |
+    Set up a 4-node ceph cluster using ceph-deploy, with the latest
+    stable jewel as the initial release; upgrade to luminous and
+    also set up mgr nodes after the upgrade, then check that the
+    cluster reaches a healthy state. After the upgrade, run the kernel
+    tar/untar task and the systemd task. This test will detect
+    ceph upgrade issues and systemd issues.
+overrides:
+  ceph-deploy:
+    fs: xfs
+    conf:
+      global:
+        mon pg warn min per osd: 2
+      osd:
+        osd pool default size: 2
+        osd objectstore: filestore
+        osd sloppy crc: true
+      client:
+        rbd default features: 5
+openstack:
+- machine:
+    disk: 100
+- volumes:
+    count: 3
+    size: 30
+#  reluctantly :( hard-coded machine type;
+#  it will override command-line args passed to teuthology-suite
+machine_type: vps
+roles:
+- - mon.a
+  - mds.a
+  - osd.0
+  - osd.1
+  - osd.2
+  - mgr.x
+- - mon.b
+  - mgr.y
+- - mon.c
+  - osd.3
+  - osd.4
+  - osd.5
+- - osd.6
+  - osd.7
+  - osd.8
+  - client.0
+tasks:
+- ssh-keys:
+- print: "**** done ssh-keys"
+- ceph-deploy:
+    branch:
+      stable: jewel
+    skip-mgr: True
+- print: "**** done initial ceph-deploy"
+- ceph-deploy.upgrade:
+    branch:
+      dev: luminous
+    setup-mgr-node: True
+    check-for-healthy: True
+    roles:
+      - mon.a
+      - mon.b
+      - mon.c
+      - osd.6
+- print: "**** done ceph-deploy upgrade"
+- exec:
+     osd.0:
+      - ceph osd require-osd-release luminous
+      - ceph osd set-require-min-compat-client luminous
+- print: "**** done `ceph osd require-osd-release luminous`"
+- workunit:
+    clients:
+      all:
+        - kernel_untar_build.sh
+- print: "**** done kernel_untar_build.sh"
+- systemd:
+- print: "**** done systemd"
+- workunit:
+    clients:
+      all:
+      - rados/load-gen-mix.sh
+- print: "**** done rados/load-gen-mix.sh"
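The branch: mapping in this fragment is turned into a single ceph-deploy install flag by the new tasks in tasks/ceph_deploy.py further down in this commit. A sketch of that translation (mirroring the `(var, val) = branch.items()[0]` logic in upgrade()):

    def branch_flag(branch=None):
        # no branch configured -> upgrade() defaults to master
        if not branch:
            return '--dev=master'
        var, val = list(branch.items())[0]
        return '--{var}={val}'.format(var=var, val=val)

    print(branch_flag({'stable': 'jewel'}))    # --stable=jewel
    print(branch_flag({'dev': 'luminous'}))    # --dev=luminous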
index 3145626320cfae5d745ca4ce060740734e47f1f4..d1f1e1070446aa9f80bd5bacf735462e7c66879e 100644 (file)
@@ -18,6 +18,7 @@ roles:
   - client.1
   - client.2
   - client.3
+- - client.4
 overrides:
   ceph:
     log-whitelist:
index a367ef37c12c5c2fcba0ffc2c2a37a1b83e1a5b6..c64b2cded2947010e7680528d411d36a4cedf9db 100644 (file)
@@ -1,3 +1,22 @@
+overrides:
+  ceph:
+    conf:
+      client.0:
+        debug ms: 1
+        debug client: 10
+        debug monc: 10
+      client.1:
+        debug ms: 1
+        debug client: 10
+        debug monc: 10
+      client.2:
+        debug ms: 1
+        debug client: 10
+        debug monc: 10
+      client.3:
+        debug ms: 1
+        debug client: 10
+        debug monc: 10
 meta:
 - desc: |
    install ceph/jewel latest
index a8e28c52ce0dc0c70120890cae05aa2a375628ce..56eedbd6bb79b31281ad5fcbd83e8728d3b17ea5 100644 (file)
@@ -5,7 +5,7 @@ meta:
 workload:
   full_sequential:
     - sequential:
-      - ceph-fuse:
+      - ceph-fuse: [client.2]
       - print: "**** done ceph-fuse 2-workload"
       - workunit:
           clients:
deleted file mode 120000 (symlink)
index 5283ac73e1b27948229b24e7b91b13dc6b9337a1..0000000000000000000000000000000000000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../../releases/luminous.yaml
\ No newline at end of file
new file mode 100644 (file)
index 0000000000000000000000000000000000000000..e57b37753de0c3da08b75002d691b96954c57551
--- /dev/null
@@ -0,0 +1,23 @@
+# this is the same fragment as ../../../../releases/luminous.yaml
+# but without the line "ceph osd set-require-min-compat-client luminous"
+
+tasks:
+- exec:
+    mgr.x:
+      - mkdir -p /var/lib/ceph/mgr/ceph-x
+      - ceph auth get-or-create-key mgr.x mon 'allow profile mgr'
+      - ceph auth export mgr.x > /var/lib/ceph/mgr/ceph-x/keyring
+- ceph.restart:
+    daemons: [mgr.x]
+    wait-for-healthy: false
+- exec:
+    osd.0:
+      - ceph osd require-osd-release luminous
+- ceph.healthy:
+overrides:
+  ceph:
+    conf:
+      mon:
+        mon warn on osd down out interval zero: false
+    log-whitelist:
+      - no active mgr
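As the header comment says, this fragment bootstraps mgr.x and sets require-osd-release, but deliberately leaves `ceph osd set-require-min-compat-client luminous` out so jewel clients can keep connecting; the later 6.5-crush-compat.yaml then pins min-compat-client to jewel explicitly. A sketch for inspecting the resulting flag (assuming luminous `ceph osd dump` output includes a require_min_compat_client line once it has been set):

    import subprocess

    dump = subprocess.check_output(['ceph', 'osd', 'dump'],
                                   universal_newlines=True)
    print([l for l in dump.splitlines() if 'min_compat_client' in l])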
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/+ b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/+
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/blogbench.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/blogbench.yaml
deleted file mode 100644 (file)
index d2629c0..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-meta:
-- desc: |
-   run a cephfs stress test
-   mount ceph-fuse on client.3 before running workunit
-tasks:
-- sequential:
-  - ceph-fuse:
-  - print: "**** done ceph-fuse 5-final-workload"
-  - workunit:
-      clients:
-         client.3:
-          - suites/blogbench.sh
-  - print: "**** done suites/blogbench.sh 5-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados-snaps-few-objects.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados-snaps-few-objects.yaml
deleted file mode 100644 (file)
index d8b3dcb..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-meta:
-- desc: |
-   randomized correctness test for rados operations on a replicated pool with snapshots
-tasks:
-  - rados:
-      clients: [client.1]
-      ops: 4000
-      objects: 50
-      write_append_excl: false
-      op_weights:
-        read: 100
-        write: 100
-        delete: 50
-        snap_create: 50
-        snap_remove: 50
-        rollback: 50
-  - print: "**** done rados 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_loadgenmix.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_loadgenmix.yaml
deleted file mode 100644 (file)
index 922a9da..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-meta:
-- desc: |
-   generate read/write load with rados objects ranging from 1 byte to 1MB
-tasks:
-  - workunit:
-      clients:
-        client.1:
-          - rados/load-gen-mix.sh
-  - print: "**** done rados/load-gen-mix.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_mon_thrash.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_mon_thrash.yaml
deleted file mode 100644 (file)
index 9b60d2e..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-meta:
-- desc: |
-   librados C and C++ api tests
-overrides:
-  ceph:
-    log-whitelist:
-      - reached quota
-tasks:
-  - mon_thrash:
-      revive_delay: 20
-      thrash_delay: 1
-  - print: "**** done mon_thrash 4-final-workload"
-  - workunit:
-      branch: jewel
-      clients:
-        client.1:
-          - rados/test-upgrade-v11.0.0.sh
-  - print: "**** done rados/test-upgrade-v11.0.0.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_cls.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_cls.yaml
deleted file mode 100644 (file)
index aaf0a37..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-meta:
-- desc: |
-   rbd object class functional tests
-tasks:
-  - workunit:
-      clients:
-        client.1:
-          - cls/test_cls_rbd.sh
-  - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_import_export.yaml
deleted file mode 100644 (file)
index 46e1355..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-meta:
-- desc: |
-   run basic import/export cli tests for rbd
-tasks:
-  - workunit:
-      clients:
-        client.1:
-          - rbd/import_export.sh
-      env:
-        RBD_CREATE_ARGS: --new-format
-  - print: "**** done rbd/import_export.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rgw_swift.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rgw_swift.yaml
deleted file mode 100644 (file)
index 7a7659f..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-meta:
-- desc: |
-   swift api tests for rgw
-overrides:
-  rgw:
-    frontend: civetweb
-tasks:
-  - rgw: [client.1]
-  - print: "**** done rgw 4-final-workload"
-  - swift:
-      client.1:
-        rgw_server: client.1
-  - print: "**** done swift 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-workload.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-workload.yaml
new file mode 100644 (file)
index 0000000..f7e9de4
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   run basic rbd import/export CLI tests on the not-yet-upgraded client.4
+   (covers issue http://tracker.ceph.com/issues/21660)
+tasks:
+  - workunit:
+      branch: jewel
+      clients:
+        client.4:
+          - rbd/import_export.sh
+  - print: "**** done rbd/import_export.sh 5-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/6-luminous-with-mgr.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/6-luminous-with-mgr.yaml
new file mode 120000 (symlink)
index 0000000..5c72153
--- /dev/null
@@ -0,0 +1 @@
+../../../../releases/luminous-with-mgr.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/6.5-crush-compat.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/6.5-crush-compat.yaml
new file mode 100644 (file)
index 0000000..20c0ffd
--- /dev/null
@@ -0,0 +1,8 @@
+tasks:
+- exec:
+    mon.a:
+      - ceph osd set-require-min-compat-client jewel
+      - ceph osd crush set-all-straw-buckets-to-straw2
+      - ceph osd crush weight-set create-compat
+      - ceph osd crush weight-set reweight-compat osd.0 .9
+      - ceph osd crush weight-set reweight-compat osd.1 1.2
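The compat weight-set created above gives the balancer an alternative set of CRUSH weights that pre-luminous clients can still decode; the two reweight-compat lines seed it with non-default values for osd.0 and osd.1. One way to confirm it took effect is to look for choose_args in the CRUSH dump, where compat weight-sets are assumed to be stored:

    import json
    import subprocess

    crush = json.loads(subprocess.check_output(
        ['ceph', 'osd', 'crush', 'dump']))
    # a compat weight-set shows up as a choose_args entry
    print('compat weight-set present:', bool(crush.get('choose_args')))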
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/+ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/blogbench.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/blogbench.yaml
new file mode 100644 (file)
index 0000000..d73459e
--- /dev/null
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   run a cephfs stress test
+   mount ceph-fuse on client.3 before running workunit
+tasks:
+- sequential:
+  - ceph-fuse: [client.3]
+  - print: "**** done ceph-fuse 5-final-workload"
+  - workunit:
+      clients:
+         client.3:
+          - suites/blogbench.sh
+  - print: "**** done suites/blogbench.sh 7-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados-snaps-few-objects.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados-snaps-few-objects.yaml
new file mode 100644 (file)
index 0000000..7dd61c5
--- /dev/null
@@ -0,0 +1,17 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshots
+tasks:
+  - rados:
+      clients: [client.1]
+      ops: 4000
+      objects: 50
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+  - print: "**** done rados 7-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_loadgenmix.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_loadgenmix.yaml
new file mode 100644 (file)
index 0000000..b218b92
--- /dev/null
@@ -0,0 +1,9 @@
+meta:
+- desc: |
+   generate read/write load with rados objects ranging from 1 byte to 1MB
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - rados/load-gen-mix.sh
+  - print: "**** done rados/load-gen-mix.sh 7-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_mon_thrash.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_mon_thrash.yaml
new file mode 100644 (file)
index 0000000..c835a65
--- /dev/null
@@ -0,0 +1,18 @@
+meta:
+- desc: |
+   librados C and C++ api tests
+overrides:
+  ceph:
+    log-whitelist:
+      - reached quota
+tasks:
+  - mon_thrash:
+      revive_delay: 20
+      thrash_delay: 1
+  - print: "**** done mon_thrash 4-final-workload"
+  - workunit:
+      branch: jewel
+      clients:
+        client.1:
+          - rados/test-upgrade-v11.0.0.sh
+  - print: "**** done rados/test-upgrade-v11.0.0.sh 7-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_cls.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_cls.yaml
new file mode 100644 (file)
index 0000000..46bbf76
--- /dev/null
@@ -0,0 +1,9 @@
+meta:
+- desc: |
+   rbd object class functional tests
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - cls/test_cls_rbd.sh
+  - print: "**** done cls/test_cls_rbd.sh 7-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_import_export.yaml
new file mode 100644 (file)
index 0000000..5ae7491
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   run basic import/export cli tests for rbd
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - rbd/import_export.sh
+      env:
+        RBD_CREATE_ARGS: --new-format
+  - print: "**** done rbd/import_export.sh 7-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rgw_swift.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rgw_swift.yaml
new file mode 100644 (file)
index 0000000..780c4ad
--- /dev/null
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   swift api tests for rgw
+overrides:
+  rgw:
+    frontend: civetweb
+tasks:
+  - rgw: [client.1]
+  - print: "**** done rgw 7-final-workload"
+  - swift:
+      client.1:
+        rgw_server: client.1
+  - print: "**** done swift 7-final-workload"
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/8-jewel-workload.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/8-jewel-workload.yaml
new file mode 120000 (symlink)
index 0000000..81df389
--- /dev/null
@@ -0,0 +1 @@
+5-workload.yaml
\ No newline at end of file
index 3033f14be5d717e796e867e7146ee2d4151e7416..d68c258c027f21d5e0151a73df5d4bbf98ccd11a 100644 (file)
@@ -16,13 +16,20 @@ overrides:
     - scrub
     - osd_map_max_advance
     - wrongly marked
+    - overall HEALTH_
+    - \(MGR_DOWN\)
+    - \(OSD_
+    - \(PG_
+    - \(CACHE_
     fs: xfs
     conf:
+      global:
+        mon warn on pool no app: false
       mon:
         mon debug unsafe allow tier with nonempty snaps: true
-        mon warn on pool no app: false
       osd:
         osd map max advance: 1000
+        osd map cache size: 1100
 roles:
 - - mon.a
   - mds.a
@@ -161,7 +168,7 @@ workload_x:
        branch: jewel
        clients:
          client.1:
-         - rados/test-upgrade-v11.0.0.sh
+         - rados/test-upgrade-v11.0.0-noec.sh
          - cls
        env:
          CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
@@ -170,7 +177,7 @@ workload_x:
        branch: jewel
        clients:
          client.0:
-         - rados/test-upgrade-v11.0.0.sh
+         - rados/test-upgrade-v11.0.0-noec.sh
          - cls
    - print: "**** done rados/test-upgrade-v11.0.0.sh &  cls workload_x upgraded client"
    - rgw: [client.1]
diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/6.5-crush-compat.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split/6.5-crush-compat.yaml
new file mode 120000 (symlink)
index 0000000..02263d1
--- /dev/null
@@ -0,0 +1 @@
+../parallel/6.5-crush-compat.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/kraken-x/ceph-deploy/kraken-luminous.yaml b/ceph/qa/suites/upgrade/kraken-x/ceph-deploy/kraken-luminous.yaml
new file mode 100644 (file)
index 0000000..4a55362
--- /dev/null
@@ -0,0 +1,61 @@
+meta:
+- desc: |
+    Set up a 4-node ceph cluster using ceph-deploy, with the latest
+    stable kraken as the initial release; upgrade to luminous and
+    also set up mgr nodes after the upgrade, then check that the
+    cluster reaches a healthy state. After the upgrade, run the kernel
+    tar/untar task and the systemd task. This test will detect
+    ceph upgrade issues and systemd issues.
+overrides:
+  ceph-deploy:
+    fs: xfs
+    conf:
+      global:
+        mon pg warn min per osd: 2
+      osd:
+        osd pool default size: 2
+        osd objectstore: filestore
+        osd sloppy crc: true
+      client:
+        rbd default features: 5
+roles:
+- - mon.a
+  - mds.a
+  - osd.0
+  - osd.1
+  - osd.2
+  - mgr.x
+- - mon.b
+  - mgr.y
+- - mon.c
+  - osd.3
+  - osd.4
+  - osd.5
+- - osd.6
+  - osd.7
+  - osd.8
+  - client.0
+tasks:
+- ssh-keys:
+- ceph-deploy:
+    branch:
+      stable: kraken
+    skip-mgr: True
+- ceph-deploy.upgrade:
+    branch:
+      dev: luminous
+    setup-mgr-node: True
+    check-for-healthy: True
+    roles:
+      - mon.a
+      - mon.b
+      - mon.c
+- workunit:
+    clients:
+      all:
+        - kernel_untar_build.sh
+- systemd:
+- workunit:
+    clients:
+      all:
+      - rados/load-gen-mix.sh
index 0dc9dd2bc3d823f974e6b2d79232584fb56ad114..f5a883a3927df6aa26cf453afbb5c9c601e4ea71 100644 (file)
@@ -18,6 +18,7 @@ roles:
   - client.1
   - client.2
   - client.3
+- - client.4
 overrides:
   ceph:
     log-whitelist:
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml
deleted file mode 120000 (symlink)
index 5c72153..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../../releases/luminous-with-mgr.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml
new file mode 100644 (file)
index 0000000..80c2b9d
--- /dev/null
@@ -0,0 +1,4 @@
+tasks:
+- exec:
+    osd.0:
+      - ceph osd require-osd-release luminous
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/+ b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/+
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/blogbench.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/blogbench.yaml
deleted file mode 100644 (file)
index d2629c0..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-meta:
-- desc: |
-   run a cephfs stress test
-   mount ceph-fuse on client.3 before running workunit
-tasks:
-- sequential:
-  - ceph-fuse:
-  - print: "**** done ceph-fuse 5-final-workload"
-  - workunit:
-      clients:
-         client.3:
-          - suites/blogbench.sh
-  - print: "**** done suites/blogbench.sh 5-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados-snaps-few-objects.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados-snaps-few-objects.yaml
deleted file mode 100644 (file)
index d8b3dcb..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-meta:
-- desc: |
-   randomized correctness test for rados operations on a replicated pool with snapshots
-tasks:
-  - rados:
-      clients: [client.1]
-      ops: 4000
-      objects: 50
-      write_append_excl: false
-      op_weights:
-        read: 100
-        write: 100
-        delete: 50
-        snap_create: 50
-        snap_remove: 50
-        rollback: 50
-  - print: "**** done rados 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_loadgenmix.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_loadgenmix.yaml
deleted file mode 100644 (file)
index 922a9da..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-meta:
-- desc: |
-   generate read/write load with rados objects ranging from 1 byte to 1MB
-tasks:
-  - workunit:
-      clients:
-        client.1:
-          - rados/load-gen-mix.sh
-  - print: "**** done rados/load-gen-mix.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_mon_thrash.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_mon_thrash.yaml
deleted file mode 100644 (file)
index ab6276e..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-meta:
-- desc: |
-   librados C and C++ api tests
-overrides:
-  ceph:
-    log-whitelist:
-      - reached quota
-tasks:
-  - mon_thrash:
-      revive_delay: 20
-      thrash_delay: 1
-  - print: "**** done mon_thrash 4-final-workload"
-  - workunit:
-      branch: kraken
-      clients:
-        client.1:
-          - rados/test.sh
-  - print: "**** done rados/test.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_cls.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_cls.yaml
deleted file mode 100644 (file)
index aaf0a37..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-meta:
-- desc: |
-   rbd object class functional tests
-tasks:
-  - workunit:
-      clients:
-        client.1:
-          - cls/test_cls_rbd.sh
-  - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_import_export.yaml
deleted file mode 100644 (file)
index 46e1355..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-meta:
-- desc: |
-   run basic import/export cli tests for rbd
-tasks:
-  - workunit:
-      clients:
-        client.1:
-          - rbd/import_export.sh
-      env:
-        RBD_CREATE_ARGS: --new-format
-  - print: "**** done rbd/import_export.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rgw_swift.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rgw_swift.yaml
deleted file mode 100644 (file)
index 7a7659f..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-meta:
-- desc: |
-   swift api tests for rgw
-overrides:
-  rgw:
-    frontend: civetweb
-tasks:
-  - rgw: [client.1]
-  - print: "**** done rgw 4-final-workload"
-  - swift:
-      client.1:
-        rgw_server: client.1
-  - print: "**** done swift 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-workload.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-workload.yaml
new file mode 100644 (file)
index 0000000..851c5c8
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   run basic rbd import/export CLI tests on the not-yet-upgraded client.4
+   (covers issue http://tracker.ceph.com/issues/21660)
+tasks:
+  - workunit:
+      branch: kraken
+      clients:
+        client.4:
+          - rbd/import_export.sh
+  - print: "**** done rbd/import_export.sh 5-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/6-luminous-with-mgr.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/6-luminous-with-mgr.yaml
new file mode 120000 (symlink)
index 0000000..5c72153
--- /dev/null
@@ -0,0 +1 @@
+../../../../releases/luminous-with-mgr.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/+ b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/blogbench.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/blogbench.yaml
new file mode 100644 (file)
index 0000000..d2629c0
--- /dev/null
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   run a cephfs stress test
+   mount ceph-fuse on client.3 before running workunit
+tasks:
+- sequential:
+  - ceph-fuse:
+  - print: "**** done ceph-fuse 5-final-workload"
+  - workunit:
+      clients:
+         client.3:
+          - suites/blogbench.sh
+  - print: "**** done suites/blogbench.sh 5-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados-snaps-few-objects.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados-snaps-few-objects.yaml
new file mode 100644 (file)
index 0000000..d8b3dcb
--- /dev/null
@@ -0,0 +1,17 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshots
+tasks:
+  - rados:
+      clients: [client.1]
+      ops: 4000
+      objects: 50
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+  - print: "**** done rados 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_loadgenmix.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_loadgenmix.yaml
new file mode 100644 (file)
index 0000000..922a9da
--- /dev/null
@@ -0,0 +1,9 @@
+meta:
+- desc: |
+   generate read/write load with rados objects ranging from 1 byte to 1MB
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - rados/load-gen-mix.sh
+  - print: "**** done rados/load-gen-mix.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_mon_thrash.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_mon_thrash.yaml
new file mode 100644 (file)
index 0000000..ab6276e
--- /dev/null
@@ -0,0 +1,18 @@
+meta:
+- desc: |
+   librados C and C++ api tests
+overrides:
+  ceph:
+    log-whitelist:
+      - reached quota
+tasks:
+  - mon_thrash:
+      revive_delay: 20
+      thrash_delay: 1
+  - print: "**** done mon_thrash 4-final-workload"
+  - workunit:
+      branch: kraken
+      clients:
+        client.1:
+          - rados/test.sh
+  - print: "**** done rados/test.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_cls.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_cls.yaml
new file mode 100644 (file)
index 0000000..aaf0a37
--- /dev/null
@@ -0,0 +1,9 @@
+meta:
+- desc: |
+   rbd object class functional tests
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - cls/test_cls_rbd.sh
+  - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_import_export.yaml
new file mode 100644 (file)
index 0000000..46e1355
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   run basic import/export cli tests for rbd
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - rbd/import_export.sh
+      env:
+        RBD_CREATE_ARGS: --new-format
+  - print: "**** done rbd/import_export.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rgw_swift.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rgw_swift.yaml
new file mode 100644 (file)
index 0000000..7a7659f
--- /dev/null
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   swift api tests for rgw
+overrides:
+  rgw:
+    frontend: civetweb
+tasks:
+  - rgw: [client.1]
+  - print: "**** done rgw 4-final-workload"
+  - swift:
+      client.1:
+        rgw_server: client.1
+  - print: "**** done swift 4-final-workload"
index 1e8d5a58ddfcb5fc24acc5ec51037cbc1fdd760f..3684b1e0a0af721d37b9811ec9985a09de6b810f 100644 (file)
@@ -18,6 +18,7 @@ roles:
   - client.1
   - client.2
   - client.3
+- - client.4
 overrides:
   ceph:
     log-whitelist:
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml
deleted file mode 100644 (file)
index 46e1355..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-meta:
-- desc: |
-   run basic import/export cli tests for rbd
-tasks:
-  - workunit:
-      clients:
-        client.1:
-          - rbd/import_export.sh
-      env:
-        RBD_CREATE_ARGS: --new-format
-  - print: "**** done rbd/import_export.sh 4-final-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_no_upgrated.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_no_upgrated.yaml
new file mode 100644 (file)
index 0000000..5de8a23
--- /dev/null
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   run basic rbd import/export CLI tests
+   on the non-upgraded client
+tasks:
+  - workunit:
+      branch: luminous
+      clients:
+        client.4:
+          - rbd/import_export.sh
+      env:
+        RBD_CREATE_ARGS: --new-format
+  - print: "**** done rbd/import_export.sh 4-final-workload on NO upgrated client"
diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_upgrated.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_upgrated.yaml
new file mode 100644 (file)
index 0000000..2c7c484
--- /dev/null
@@ -0,0 +1,12 @@
+meta:
+- desc: |
+   run basic rbd import/export CLI tests
+   on the upgraded client
+tasks:
+  - workunit:
+      clients:
+        client.1:
+          - rbd/import_export.sh
+      env:
+        RBD_CREATE_ARGS: --new-format
+  - print: "**** done rbd/import_export.sh 4-final-workload  on upgrated client"
diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/1-ceph-install b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/1-ceph-install
new file mode 120000 (symlink)
index 0000000..0479ac5
--- /dev/null
@@ -0,0 +1 @@
+../stress-split/1-ceph-install/
\ No newline at end of file
index a37fec1c070374e88612306dd9797a88219637f7..72f265375e1c65157fae44a6e8286fdad691ae20 100644 (file)
@@ -43,7 +43,9 @@ def generate_caps(type_):
             osd='allow *',
         ),
         mgr=dict(
-            mon='allow *',
+            mon='allow profile mgr',
+            osd='allow *',
+            mds='allow *',
         ),
         mds=dict(
             mon='allow *',
@@ -338,17 +340,18 @@ def create_rbd_pool(ctx, config):
         remote=mon_remote,
         ceph_cluster=cluster_name,
     )
-    log.info('Creating RBD pool')
-    mon_remote.run(
-        args=['sudo', 'ceph', '--cluster', cluster_name,
-              'osd', 'pool', 'create', 'rbd', '8'])
-    mon_remote.run(
-        args=[
-            'sudo', 'ceph', '--cluster', cluster_name,
-            'osd', 'pool', 'application', 'enable',
-            'rbd', 'rbd', '--yes-i-really-mean-it'
-        ],
-        check_status=False)
+    if config.get('create_rbd_pool', True):
+        log.info('Creating RBD pool')
+        mon_remote.run(
+            args=['sudo', 'ceph', '--cluster', cluster_name,
+                  'osd', 'pool', 'create', 'rbd', '8'])
+        mon_remote.run(
+            args=[
+                'sudo', 'ceph', '--cluster', cluster_name,
+                'osd', 'pool', 'application', 'enable',
+                'rbd', 'rbd', '--yes-i-really-mean-it'
+            ],
+            check_status=False)
     yield
 
 @contextlib.contextmanager
@@ -365,7 +368,8 @@ def cephfs_setup(ctx, config):
     if mdss.remotes:
         log.info('Setting up CephFS filesystem...')
 
-        fs = Filesystem(ctx, name='cephfs', create=True)
+        fs = Filesystem(ctx, name='cephfs', create=True,
+                        ec_profile=config.get('cephfs_ec_profile', None))
 
         is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
         all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
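Both hunks in this file follow the same pattern: previously unconditional behavior becomes optional, keyed off the task config with a default that preserves the old behavior (create_rbd_pool defaults to True; cephfs_ec_profile defaults to None). A stripped-down sketch of the gating, where the run_cmd callable is a hypothetical stand-in for the remote-execution plumbing:

    def create_rbd_pool_if_wanted(config, run_cmd):
        # default True keeps existing suites unchanged
        if config.get('create_rbd_pool', True):
            run_cmd(['ceph', 'osd', 'pool', 'create', 'rbd', '8'])

    create_rbd_pool_if_wanted({'create_rbd_pool': False}, print)  # skipped
    create_rbd_pool_if_wanted({}, print)                          # runs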
index b22c32113ced9305c82bcf99e5440f527284cc0e..38fbe43c2158b60f0f7b0e5c198e9cf6fe9f6e51 100644 (file)
@@ -15,6 +15,7 @@ from teuthology.config import config as teuth_config
 from teuthology.task import install as install_fn
 from teuthology.orchestra import run
 from tasks.cephfs.filesystem import Filesystem
+from teuthology.misc import wait_until_healthy
 
 log = logging.getLogger(__name__)
 
@@ -27,7 +28,8 @@ def download_ceph_deploy(ctx, config):
     will use that instead. The `bootstrap` script is run, with the argument
     obtained from `python_version`, if specified.
     """
-    ceph_admin = ctx.cluster.only(teuthology.get_first_mon(ctx, config))
+    # use mon.a for ceph_admin
+    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys()
 
     try:
         py_ver = str(config['python_version'])
@@ -41,8 +43,7 @@ def download_ceph_deploy(ctx, config):
             ))
 
         log.info("Installing Python")
-        for admin in ceph_admin.remotes:
-            system_type = teuthology.get_system_type(admin)
+        system_type = teuthology.get_system_type(ceph_admin)
 
         if system_type == 'rpm':
             package = 'python34' if py_ver == '3' else 'python'
@@ -145,7 +146,7 @@ def get_nodes_using_role(ctx, target_role):
 
     # Prepare a modified version of cluster.remotes with ceph-deploy-ized names
     modified_remotes = {}
-
+    ceph_deploy_mapped = dict()
     for _remote, roles_for_host in ctx.cluster.remotes.iteritems():
         modified_remotes[_remote] = []
         for svc_id in roles_for_host:
@@ -156,13 +157,16 @@ def get_nodes_using_role(ctx, target_role):
                     nodes_of_interest.append(fqdn)
                 else:
                     nodes_of_interest.append(nodename)
-
-                modified_remotes[_remote].append(
-                    "{0}.{1}".format(target_role, nodename))
+                mapped_role = "{0}.{1}".format(target_role, nodename)
+                modified_remotes[_remote].append(mapped_role)
+                # keep dict of mapped role for later use by tasks
+                # eg. mon.a => mon.node1
+                ceph_deploy_mapped[svc_id] = mapped_role
             else:
                 modified_remotes[_remote].append(svc_id)
 
     ctx.cluster.remotes = modified_remotes
+    ctx.cluster.mapped_role = ceph_deploy_mapped
 
     return nodes_of_interest
 
@@ -213,8 +217,8 @@ def build_ceph_cluster(ctx, config):
     # Expect to find ceph_admin on the first mon by ID, same place that the download task
     # puts it.  Remember this here, because subsequently IDs will change from those in
     # the test config to those that ceph-deploy invents.
-    (ceph_admin,) = ctx.cluster.only(
-        teuthology.get_first_mon(ctx, config)).remotes.iterkeys()
+
+    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys()
 
     def execute_ceph_deploy(cmd):
         """Remotely execute a ceph_deploy command"""
@@ -241,10 +245,16 @@ def build_ceph_cluster(ctx, config):
         mds_nodes = " ".join(mds_nodes)
         mon_node = get_nodes_using_role(ctx, 'mon')
         mon_nodes = " ".join(mon_node)
-        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
-        mgr_nodes = " ".join(mgr_nodes)
+        # skip mgr based on a config item
+        # this is needed when the test uses the latest code to install
+        # old ceph versions
+        skip_mgr = config.get('skip-mgr', False)
+        if not skip_mgr:
+            mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+            mgr_nodes = " ".join(mgr_nodes)
         new_mon = './ceph-deploy new' + " " + mon_nodes
-        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+        if not skip_mgr:
+            mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
         mon_hostname = mon_nodes.split(' ')[0]
         mon_hostname = str(mon_hostname)
         gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
@@ -307,7 +317,8 @@ def build_ceph_cluster(ctx, config):
 
         estatus_gather = execute_ceph_deploy(gather_keys)
 
-        execute_ceph_deploy(mgr_create)
+        if not skip_mgr:
+            execute_ceph_deploy(mgr_create)
 
         if mds_nodes:
             estatus_mds = execute_ceph_deploy(deploy_mds)
@@ -334,7 +345,7 @@ def build_ceph_cluster(ctx, config):
             # first check for filestore, default is bluestore with ceph-deploy
             if config.get('filestore') is not None:
                 osd_create_cmd += '--filestore '
-            else:
+            elif config.get('bluestore') is not None:
                 osd_create_cmd += '--bluestore '
             if config.get('dmcrypt') is not None:
                 osd_create_cmd += '--dmcrypt '
@@ -414,7 +425,7 @@ def build_ceph_cluster(ctx, config):
 
             if mds_nodes:
                 log.info('Configuring CephFS...')
-                ceph_fs = Filesystem(ctx, create=True)
+                Filesystem(ctx, create=True)
         elif not config.get('only_mon'):
             raise RuntimeError(
                 "The cluster is NOT operational due to insufficient OSDs")
@@ -524,7 +535,7 @@ def cli_test(ctx, config):
         """Either use git path or repo path """
         args = ['cd', conf_dir, run.Raw(';')]
         if path:
-            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path));
+            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
         else:
             args.append('ceph-deploy')
         args.append(run.Raw(cmd))
@@ -608,11 +619,11 @@ def cli_test(ctx, config):
     log.info("Waiting for cluster to become healthy")
     with contextutil.safe_while(sleep=10, tries=6,
                                 action='check health') as proceed:
-       while proceed():
-           r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
-           out = r.stdout.getvalue()
-           if (out.split(None,1)[0] == 'HEALTH_OK'):
-               break
+        while proceed():
+            r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
+            out = r.stdout.getvalue()
+            if (out.split(None, 1)[0] == 'HEALTH_OK'):
+                break
     rgw_install = 'install {branch} --rgw {node}'.format(
         branch=test_branch,
         node=nodename,
@@ -679,6 +690,108 @@ def single_node_test(ctx, config):
             yield
 
 
+@contextlib.contextmanager
+def upgrade(ctx, config):
+    """
+     Upgrade using ceph-deploy
+     eg:
+       ceph-deploy.upgrade:
+          # to upgrade to a specific branch, use
+          branch:
+             stable: jewel
+          # to set up the mgr node, use
+          setup-mgr-node: True
+          # to wait for the cluster to be healthy after the whole upgrade, use
+          wait-for-healthy: True
+          roles: (the roles below are upgraded serially)
+              mon.a
+              mon.b
+              osd.0
+     """
+    roles = config.get('roles')
+    # get the roles that are mapped as per ceph-deploy
+    # roles are mapped for mon/mds, e.g. mon.a => mon.host_short_name
+    mapped_role = ctx.cluster.mapped_role
+    if config.get('branch'):
+        branch = config.get('branch')
+        (var, val) = branch.items()[0]
+        ceph_branch = '--{var}={val}'.format(var=var, val=val)
+    else:
+        # default to master
+        ceph_branch = '--dev=master'
+    # get the node used for initial deployment which is mon.a
+    mon_a = mapped_role.get('mon.a')
+    (ceph_admin,) = ctx.cluster.only(mon_a).remotes.iterkeys()
+    testdir = teuthology.get_testdir(ctx)
+    cmd = './ceph-deploy install ' + ceph_branch
+    for role in roles:
+        # check if this role is mapped (mon or mds)
+        if mapped_role.get(role):
+            role = mapped_role.get(role)
+        remotes_and_roles = ctx.cluster.only(role).remotes
+        for remote, _roles in remotes_and_roles.iteritems():
+            nodename = remote.shortname
+            cmd = cmd + ' ' + nodename
+            log.info("Upgrading ceph on %s", nodename)
+            ceph_admin.run(
+                args=[
+                    'cd',
+                    '{tdir}/ceph-deploy'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    run.Raw(cmd),
+                ],
+            )
+            # restart all ceph services; ideally the upgrade would, but it does not
+            remote.run(
+                args=[
+                    'sudo', 'systemctl', 'restart', 'ceph.target'
+                ]
+            )
+            ceph_admin.run(args=['sudo', 'ceph', '-s'])
+
+    # workaround for http://tracker.ceph.com/issues/20950
+    # write the correct mgr key to disk
+    if config.get('setup-mgr-node', None):
+        mons = ctx.cluster.only(teuthology.is_type('mon'))
+        for remote, roles in mons.remotes.iteritems():
+            remote.run(
+                args=[
+                    run.Raw('sudo ceph auth get client.bootstrap-mgr'),
+                    run.Raw('|'),
+                    run.Raw('sudo tee'),
+                    run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring')
+                ]
+            )
+
+    if config.get('setup-mgr-node', None):
+        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+        mgr_nodes = " ".join(mgr_nodes)
+        mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes
+        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+        # install mgr
+        ceph_admin.run(
+            args=[
+                'cd',
+                '{tdir}/ceph-deploy'.format(tdir=testdir),
+                run.Raw('&&'),
+                run.Raw(mgr_install),
+                ],
+            )
+        # create mgr
+        ceph_admin.run(
+            args=[
+                'cd',
+                '{tdir}/ceph-deploy'.format(tdir=testdir),
+                run.Raw('&&'),
+                run.Raw(mgr_create),
+                ],
+            )
+        ceph_admin.run(args=['sudo', 'ceph', '-s'])
+    if config.get('wait-for-healthy', None):
+        wait_until_healthy(ctx, ceph_admin, use_sudo=True)
+    yield
+
+
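As a quick illustration of how the branch mapping above becomes a ceph-deploy flag, a minimal sketch (Python 2, matching the task code above; the config values are invented):

    # the one-entry dict produced by the yaml "branch: {stable: jewel}"
    config = {'branch': {'stable': 'jewel'}}
    var, val = config['branch'].items()[0]   # ('stable', 'jewel') in py2
    ceph_branch = '--{var}={val}'.format(var=var, val=val)
    assert ceph_branch == '--stable=jewel'
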
 @contextlib.contextmanager
 def task(ctx, config):
     """
@@ -694,12 +807,15 @@ def task(ctx, config):
              branch:
                 stable: bobtail
              mon_initial_members: 1
+             ceph-deploy-branch: my-ceph-deploy-branch
              only_mon: true
              keep_running: true
              # either choose bluestore or filestore, default is bluestore
              bluestore: True
              # or
              filestore: True
+             # skip installing mgr for old releases using the flag below
+             skip-mgr: True  (default is False)
 
         tasks:
         - install:
index 9da03bdd9082324689e018ebadd48aea84c2e423..5a89f235f2d7e4f2e1554c65cfe44b01e14ae5ad 100644 (file)
@@ -111,12 +111,12 @@ class Thrasher:
         self.stopping = False
         self.logger = logger
         self.config = config
-        self.revive_timeout = self.config.get("revive_timeout", 150)
+        self.revive_timeout = self.config.get("revive_timeout", 360)
         self.pools_to_fix_pgp_num = set()
         if self.config.get('powercycle'):
             self.revive_timeout += 120
         self.clean_wait = self.config.get('clean_wait', 0)
-        self.minin = self.config.get("min_in", 3)
+        self.minin = self.config.get("min_in", 4)
         self.chance_move_pg = self.config.get('chance_move_pg', 1.0)
         self.sighup_delay = self.config.get('sighup_delay')
         self.optrack_toggle_delay = self.config.get('optrack_toggle_delay')
@@ -286,6 +286,7 @@ class Thrasher:
                                         pg=pg,
                                         id=exp_osd))
             # export
+            # Can't use new export-remove op since this is part of upgrade testing
             cmd = prefix + "--op export --pgid {pg} --file {file}"
             cmd = cmd.format(id=exp_osd, pg=pg, file=exp_path)
             proc = exp_remote.run(args=cmd)
@@ -294,7 +295,7 @@ class Thrasher:
                                 "export failure with status {ret}".
                                 format(ret=proc.exitstatus))
             # remove
-            cmd = prefix + "--op remove --pgid {pg}"
+            cmd = prefix + "--force --op remove --pgid {pg}"
             cmd = cmd.format(id=exp_osd, pg=pg)
             proc = exp_remote.run(args=cmd)
             if proc.exitstatus:
@@ -767,7 +768,7 @@ class Thrasher:
         osd_debug_skip_full_check_in_backfill_reservation to force
         the more complicated check in do_scan to be exercised.
 
-        Then, verify that all backfills stop.
+        Then, verify that all 'backfilling' states stop.
         """
         self.log("injecting backfill full")
         for i in self.live_osds:
@@ -779,13 +780,13 @@ class Thrasher:
                                      check_status=True, timeout=30, stdout=DEVNULL)
         for i in range(30):
             status = self.ceph_manager.compile_pg_status()
-            if 'backfill' not in status.keys():
+            if 'backfilling' not in status.keys():
                 break
             self.log(
-                "waiting for {still_going} backfills".format(
-                    still_going=status.get('backfill')))
+                "waiting for {still_going} backfillings".format(
+                    still_going=status.get('backfilling')))
             time.sleep(1)
-        assert('backfill' not in self.ceph_manager.compile_pg_status().keys())
+        assert('backfilling' not in self.ceph_manager.compile_pg_status().keys())
         for i in self.live_osds:
             self.ceph_manager.set_config(
                 i,
@@ -2043,7 +2044,7 @@ class CephManager:
         for pg in pgs:
             if (pg['state'].count('active') and
                     not pg['state'].count('recover') and
-                    not pg['state'].count('backfill') and
+                    not pg['state'].count('backfilling') and
                     not pg['state'].count('stale')):
                 num += 1
         return num
@@ -2217,6 +2218,8 @@ class CephManager:
                 else:
                     self.log("no progress seen, keeping timeout for now")
                     if now - start >= timeout:
+                        if self.is_recovered():
+                            break
                         self.log('dumping pgs')
                         out = self.raw_cluster_cmd('pg', 'dump')
                         self.log(out)
@@ -2317,6 +2320,30 @@ class CephManager:
             time.sleep(3)
         self.log("active!")
 
+    def wait_till_pg_convergence(self, timeout=None):
+        start = time.time()
+        old_stats = None
+        active_osds = [osd['osd'] for osd in self.get_osd_dump()
+                       if osd['in'] and osd['up']]
+        while True:
+            # strictly speaking, there is no need to wait for the mon, but
+            # due to the "ms inject socket failures" setting the osdmap could
+            # be delayed, so the mgr is likely to ignore pg-stat messages for
+            # pgs serving newly created pools it does not yet know about. to
+            # make sure the mgr is updated with the latest pg-stats, waiting
+            # for mon/mgr is necessary.
+            self.flush_pg_stats(active_osds)
+            new_stats = dict((stat['pgid'], stat['state'])
+                             for stat in self.get_pg_stats())
+            if old_stats == new_stats:
+                return old_stats
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to reach convergence before %d secs' % timeout
+            old_stats = new_stats
+            # longer than mgr_stats_period
+            time.sleep(5 + 1)
+
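The loop in wait_till_pg_convergence is a fixed-point poll; a condensed sketch, with snapshot_pg_states() as a hypothetical stand-in for flush_pg_stats() plus get_pg_stats():

    # poll until two consecutive snapshots of pg states are identical
    old_stats = None
    while True:
        new_stats = snapshot_pg_states()   # hypothetical helper
        if new_stats == old_stats:
            break                          # converged: nothing changed
        old_stats = new_stats
        time.sleep(5 + 1)                  # longer than mgr_stats_period (5s)
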
     def mark_out_osd(self, osd):
         """
         Wrapper to mark osd out.
@@ -2368,7 +2395,7 @@ class CephManager:
         time.sleep(2)
         self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
 
-    def revive_osd(self, osd, timeout=150, skip_admin_check=False):
+    def revive_osd(self, osd, timeout=360, skip_admin_check=False):
         """
         Revive osds by either power cycling (if indicated by the config)
         or by restarting.
index 3dc49624c2cb6f9e6811fce158144fd2231c3a68..912577317b40277cd6f0e8118246cd57d394b562 100644 (file)
@@ -591,7 +591,7 @@ def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False):
                 continue
 
             for pg in pgs[osdid]:
-                cmd = ((prefix + "--op remove --pgid {pg}").
+                cmd = ((prefix + "--force --op remove --pgid {pg}").
                        format(pg=pg, id=osdid))
                 proc = remote.run(args=cmd, check_status=False,
                                   stdout=StringIO())
index 44f6cbaf16dbf0e9e9e2418dfed9f5dd9ca4acbd..9638fd55c96ab02f9651d3cbf33d802d449a3949 100644 (file)
@@ -374,10 +374,12 @@ class Filesystem(MDSCluster):
     This object is for driving a CephFS filesystem.  The MDS daemons driven by
     MDSCluster may be shared with other Filesystems.
     """
-    def __init__(self, ctx, fscid=None, name=None, create=False):
+    def __init__(self, ctx, fscid=None, name=None, create=False,
+                 ec_profile=None):
         super(Filesystem, self).__init__(ctx)
 
         self.name = name
+        self.ec_profile = ec_profile
         self.id = None
         self.metadata_pool_name = None
         self.metadata_overlay = False
@@ -473,8 +475,22 @@ class Filesystem(MDSCluster):
                                              self.name, self.metadata_pool_name, data_pool_name,
                                              '--allow-dangerous-metadata-overlay')
         else:
-            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
-                                             data_pool_name, pgs_per_fs_pool.__str__())
+            if self.ec_profile:
+                log.info("EC profile is %s", self.ec_profile)
+                cmd = ['osd', 'erasure-code-profile', 'set', data_pool_name]
+                cmd.extend(self.ec_profile)
+                self.mon_manager.raw_cluster_cmd(*cmd)
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'create',
+                    data_pool_name, pgs_per_fs_pool.__str__(), 'erasure',
+                    data_pool_name)
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'set',
+                    data_pool_name, 'allow_ec_overwrites', 'true')
+            else:
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'create',
+                    data_pool_name, pgs_per_fs_pool.__str__())
             self.mon_manager.raw_cluster_cmd('fs', 'new',
                                              self.name, self.metadata_pool_name, data_pool_name)
         self.check_pool_application(self.metadata_pool_name)
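For reference, ec_profile is expected to be a list of key=value strings passed straight to 'osd erasure-code-profile set'; the profile below is illustrative, not taken from this changeset:

    # hypothetical profile; any erasure-code-profile key=value pairs fit here
    fs = Filesystem(ctx, create=True,
                    ec_profile=['m=2', 'k=2', 'crush-failure-domain=osd'])
    # which drives, roughly:
    #   ceph osd erasure-code-profile set <data_pool> m=2 k=2 crush-failure-domain=osd
    #   ceph osd pool create <data_pool> <pgs> erasure <data_pool>
    #   ceph osd pool set <data_pool> allow_ec_overwrites true
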
index b06d2a1d233fa0dc5fcecb18fe134942f8171750..cb5e3a4625243510d5216f94234b7d60e65b1f74 100644 (file)
@@ -29,7 +29,7 @@ class TestClientLimits(CephFSTestCase):
     REQUIRE_KCLIENT_REMOTE = True
     CLIENTS_REQUIRED = 2
 
-    def _test_client_pin(self, use_subdir):
+    def _test_client_pin(self, use_subdir, open_files):
         """
         When a client pins an inode in its cache, for example because the file is held open,
         it should reject requests from the MDS to trim these caps.  The MDS should complain
@@ -39,13 +39,16 @@ class TestClientLimits(CephFSTestCase):
         :param use_subdir: whether to put test files in a subdir or use root
         """
 
-        cache_size = 100
-        open_files = 200
+        cache_size = open_files / 2
 
         self.set_conf('mds', 'mds cache size', cache_size)
         self.fs.mds_fail_restart()
         self.fs.wait_for_daemons()
 
+        mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+        self.assertTrue(open_files >= mds_min_caps_per_client)
+        mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
+
         mount_a_client_id = self.mount_a.get_global_id()
         path = "subdir/mount_a" if use_subdir else "mount_a"
         open_proc = self.mount_a.open_n_background(path, open_files)
@@ -62,8 +65,7 @@ class TestClientLimits(CephFSTestCase):
         # MDS should not be happy about that, as the client is failing to comply
         # with the SESSION_RECALL messages it is being sent
         mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
-        self.wait_for_health("MDS_CLIENT_RECALL",
-                mds_recall_state_timeout + 10)
+        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
 
         # We can also test that the MDS health warning for oversized
         # cache is functioning as intended.
@@ -82,19 +84,31 @@ class TestClientLimits(CephFSTestCase):
 
         # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
         # which depend on the caps outstanding, cache size and overall ratio
-        self.wait_until_equal(
-            lambda: self.get_session(mount_a_client_id)['num_caps'],
-            int(open_files * 0.2),
-            timeout=30,
-            reject_fn=lambda x: x < int(open_files*0.2))
+        recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
+        def expected_caps():
+            num_caps = self.get_session(mount_a_client_id)['num_caps']
+            if num_caps < mds_min_caps_per_client:
+                raise RuntimeError("client caps fell below min!")
+            elif num_caps == mds_min_caps_per_client:
+                return True
+            elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+                return True
+            else:
+                return False
+
+        self.wait_until_true(expected_caps, timeout=60)
 
     @needs_trimming
     def test_client_pin_root(self):
-        self._test_client_pin(False)
+        self._test_client_pin(False, 400)
 
     @needs_trimming
     def test_client_pin(self):
-        self._test_client_pin(True)
+        self._test_client_pin(True, 800)
+
+    @needs_trimming
+    def test_client_pin_mincaps(self):
+        self._test_client_pin(True, 200)
 
     def test_client_release_bug(self):
         """
index 65dc9a9eb856161e54a5e189aa1fc74e27f39061..0876af96efe0b985aabeda3abeb2afeb2279ad4c 100644 (file)
@@ -355,11 +355,11 @@ vc.disconnect()
         :return:
         """
 
-        # Because the teuthology config template sets mon_pg_warn_max_per_osd to
+        # Because the teuthology config template sets mon_max_pg_per_osd to
         # 10000 (i.e. it just tries to ignore health warnings), reset it to something
         # sane before using volume_client, to avoid creating pools with absurdly large
         # numbers of PGs.
-        self.set_conf("global", "mon pg warn max per osd", "300")
+        self.set_conf("global", "mon max pg per osd", "300")
         for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'):
             mon_daemon_state.restart()
 
@@ -368,7 +368,7 @@ vc.disconnect()
 
         # Calculate how many PGs we'll expect the new volume pool to have
         osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
-        max_per_osd = int(self.fs.get_config('mon_pg_warn_max_per_osd'))
+        max_per_osd = int(self.fs.get_config('mon_max_pg_per_osd'))
         osd_count = len(osd_map['osds'])
         max_overall = osd_count * max_per_osd
 
@@ -764,7 +764,7 @@ vc.disconnect()
         # auth ID belongs to, the auth ID's authorized access levels
         # for different volumes, versioning details, etc.
         expected_auth_metadata = {
-            u"version": 1,
+            u"version": 2,
             u"compat_version": 1,
             u"dirty": False,
             u"tenant_id": u"tenant1",
@@ -791,7 +791,7 @@ vc.disconnect()
         # Verify that the volume metadata file stores info about auth IDs
         # and their access levels to the volume, versioning details, etc.
         expected_vol_metadata = {
-            u"version": 1,
+            u"version": 2,
             u"compat_version": 1,
             u"auths": {
                 u"guest": {
@@ -905,3 +905,112 @@ vc.disconnect()
             volume_id=volume_id,
             auth_id=guestclient["auth_id"],
         )))
+
+    def test_put_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_1'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self._volume_client_python(vc_mount, dedent("""
+            vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+        read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name)
+        self.assertEqual(obj_data, read_data)
+
+    def test_get_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_ob_2'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.get_object("{pool_name}", "{obj_name}")
+            assert data_read == b"{obj_data}"
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+    def test_delete_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_3'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+        with self.assertRaises(CommandFailedError):
+            self.fs.rados(['stat', obj_name], pool=pool_name)
+
+        # Check idempotency -- no error raised trying to delete non-existent
+        # object
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+    def test_21501(self):
+        """
+        Reproducer for #21501 "ceph_volume_client: sets invalid caps for
+        existing IDs with no caps" (http://tracker.ceph.com/issues/21501)
+        """
+
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+
+        # Configure vc_mount as the handle for driving volumeclient
+        self._configure_vc_auth(vc_mount, "manila")
+
+        # Create a volume
+        group_id = "grpid"
+        volume_id = "volid"
+        mount_path = self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10)
+            print create_result['mount_path']
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Create an auth ID with no caps
+        guest_id = '21501'
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'get-or-create', 'client.{0}'.format(guest_id))
+
+        guest_mount = self.mounts[2]
+        guest_mount.umount_wait()
+
+        # Set auth caps for the auth ID using the volumeclient
+        self._configure_guest_auth(vc_mount, guest_mount, guest_id, mount_path)
+
+        # Mount the volume in the guest using the auth ID to assert that the
+        # auth caps are valid
+        guest_mount.mount(mount_path=mount_path)
index 0e645c7c4c35b3d753c3609329edca26f02295e6..0ed753278b4116709c03b65e248bf90477db2bc1 100644 (file)
@@ -156,13 +156,7 @@ def task(ctx, config):
               format(fpath=FSPATH, jpath=JPATH))
     pid = os.getpid()
     expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
-    cmd = ((prefix + "--op export --pgid 2.0 --file {file}").
-           format(id=divergent, file=expfile))
-    proc = exp_remote.run(args=cmd, wait=True,
-                          check_status=False, stdout=StringIO())
-    assert proc.exitstatus == 0
-
-    cmd = ((prefix + "--op remove --pgid 2.0").
+    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
            format(id=divergent, file=expfile))
     proc = exp_remote.run(args=cmd, wait=True,
                           check_status=False, stdout=StringIO())
index a5531d33ed2e18079525b2175b33092b390dd48d..ec3f98d28bb41398451353cfd91759309c2000b6 100644 (file)
@@ -1,14 +1,18 @@
 
 from unittest import case
 import json
+import logging
 
 from teuthology import misc
 from tasks.ceph_test_case import CephTestCase
 
-# TODO move definition of CephCluster
+# TODO move definition of CephCluster away from the CephFS stuff
 from tasks.cephfs.filesystem import CephCluster
 
 
+log = logging.getLogger(__name__)
+
+
 class MgrCluster(CephCluster):
     def __init__(self, ctx):
         super(MgrCluster, self).__init__(ctx)
@@ -43,6 +47,12 @@ class MgrCluster(CephCluster):
     def get_standby_ids(self):
         return [s['name'] for s in self.get_mgr_map()["standbys"]]
 
+    def set_module_localized_conf(self, module, mgr_id, key, val):
+        self.mon_manager.raw_cluster_cmd("config-key", "set",
+                                         "mgr/{0}/{1}/{2}".format(
+                                             module, mgr_id, key
+                                         ), val)
+
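A sketch of the key layout set_module_localized_conf produces (module and id values are illustrative):

    mgr_cluster.set_module_localized_conf('dashboard', 'x',
                                          'server_port', '7789')
    # issues the equivalent of:
    #   ceph config-key set mgr/dashboard/x/server_port 7789
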
 
 class MgrTestCase(CephTestCase):
     MGRS_REQUIRED = 1
@@ -77,3 +87,84 @@ class MgrTestCase(CephTestCase):
         self.wait_until_true(
             lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
             timeout=20)
+
+    def _load_module(self, module_name):
+        loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                   "mgr", "module", "ls"))['enabled_modules']
+        if module_name in loaded:
+            # The enable command is idempotent, but our wait for a restart
+            # isn't, so let's return now if it's already loaded
+            return
+
+        initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+                                         module_name)
+
+        # Wait for the module to load
+        def has_restarted():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+            if done:
+                log.info("Restarted after module load (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        self.wait_until_true(has_restarted, timeout=30)
+
+
+    def _get_uri(self, service_name):
+        # Little dict hack so that I can assign into this from
+        # the get_or_none function
+        mgr_map = {'x': None}
+
+        def _get_or_none():
+            mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+            result = mgr_map['x']['services'].get(service_name, None)
+            return result
+
+        self.wait_until_true(lambda: _get_or_none() is not None, 30)
+
+        uri = mgr_map['x']['services'][service_name]
+
+        log.info("Found {0} at {1} (daemon {2}/{3})".format(
+            service_name, uri, mgr_map['x']['active_name'],
+            mgr_map['x']['active_gid']))
+
+        return uri
+
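The "dict hack" noted above is the usual Python 2 workaround for the missing nonlocal keyword; a minimal standalone illustration:

    state = {'x': None}

    def probe():
        state['x'] = 42     # mutating the closed-over dict is allowed;
        return state['x']   # rebinding a bare name from here would not be

    probe()
    assert state['x'] == 42
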
+
+    def _assign_ports(self, module_name, config_name, min_port=7789):
+        """
+        To avoid the need to run lots of hosts in teuthology tests to
+        get different URLs per mgr, we will hand out different ports
+        to each mgr here.
+
+        This is already taken care of for us when running in a vstart
+        environment.
+        """
+        # Start handing out ports well above Ceph's range.
+        assign_port = min_port
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_stop(mgr_id)
+            self.mgr_cluster.mgr_fail(mgr_id)
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            log.info("Using port {0} for {1} on mgr.{2}".format(
+                assign_port, module_name, mgr_id
+            ))
+            self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+                                                       config_name,
+                                                       str(assign_port))
+            assign_port += 1
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_restart(mgr_id)
+
+        def is_available():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['available']
+            if done:
+                log.info("Available after assign ports (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        self.wait_until_true(is_available, timeout=30)
diff --git a/ceph/qa/tasks/mgr/test_dashboard.py b/ceph/qa/tasks/mgr/test_dashboard.py
new file mode 100644 (file)
index 0000000..3b8a2cc
--- /dev/null
@@ -0,0 +1,70 @@
+
+
+from mgr_test_case import MgrTestCase
+
+import logging
+import requests
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
+    MGRS_REQUIRED = 3
+
+    def test_standby(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+
+        original_uri = self._get_uri("dashboard")
+        log.info("Originally running at {0}".format(original_uri))
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        failed_over_uri = self._get_uri("dashboard")
+        log.info("After failover running at {0}".format(failed_over_uri))
+
+        self.assertNotEqual(original_uri, failed_over_uri)
+
+        # The original active daemon should have come back up as a standby
+        # and be doing redirects to the new active daemon
+        r = requests.get(original_uri, allow_redirects=False)
+        self.assertEqual(r.status_code, 303)
+        self.assertEqual(r.headers['Location'], failed_over_uri)
+
+    def test_urls(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        base_uri = self._get_uri("dashboard")
+
+        # This is a very simple smoke test to check that the dashboard can
+        # give us a 200 response to requests.  We're not testing that
+        # the content is correct or even renders!
+
+        urls = [
+            "/health",
+            "/servers",
+            "/osd/",
+            "/osd/perf/0",
+            "/rbd_mirroring",
+            "/rbd_iscsi"
+        ]
+
+        failures = []
+
+        for url in urls:
+            r = requests.get(base_uri + url, allow_redirects=False)
+            if r.status_code >= 300 and r.status_code < 400:
+                log.error("Unexpected redirect to: {0} (from {1})".format(
+                    r.headers['Location'], base_uri))
+            if r.status_code != 200:
+                failures.append(url)
+
+            log.info("{0}: {1} ({2} bytes)".format(
+                url, r.status_code, len(r.content)
+            ))
+
+        self.assertListEqual(failures, [])
diff --git a/ceph/qa/tasks/mgr/test_module_selftest.py b/ceph/qa/tasks/mgr/test_module_selftest.py
new file mode 100644 (file)
index 0000000..2776fb8
--- /dev/null
@@ -0,0 +1,74 @@
+
+import time
+import requests
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+class TestModuleSelftest(MgrTestCase):
+    """
+    That modules with a self-test command can be loaded and execute it
+    without errors.
+
+    This is not a substitute for really testing the modules, but it
+    is quick and is designed to catch regressions that could occur
+    if data structures change in a way that breaks how the modules
+    touch them.
+    """
+    MGRS_REQUIRED = 1
+
+    def _selftest_plugin(self, module_name):
+        self._load_module(module_name)
+
+        # Execute the module's self-test routine
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
+
+    def test_zabbix(self):
+        self._selftest_plugin("zabbix")
+
+    def test_prometheus(self):
+        self._selftest_plugin("prometheus")
+
+    def test_influx(self):
+        self._selftest_plugin("influx")
+
+    def test_selftest_run(self):
+        self._load_module("selftest")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+    def test_selftest_command_spam(self):
+        # Use the selftest module to stress the mgr daemon
+        self._load_module("selftest")
+
+        # Use the dashboard to test that the mgr is still able to do its job
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "start",
+                                                     "command_spam")
+
+        dashboard_uri = self._get_uri("dashboard")
+
+        delay = 10
+        periods = 10
+        for i in range(0, periods):
+            t1 = time.time()
+            # Check that an HTTP module remains responsive
+            r = requests.get(dashboard_uri)
+            self.assertEqual(r.status_code, 200)
+
+            # Check that a native non-module command remains responsive
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+            time.sleep(max(0, delay - (time.time() - t1)))
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "stop")
+
+        # Check that all mgr daemons are still running
+        self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+        self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())
diff --git a/ceph/qa/tasks/osd_max_pg_per_osd.py b/ceph/qa/tasks/osd_max_pg_per_osd.py
new file mode 100644 (file)
index 0000000..b4e2aa4
--- /dev/null
@@ -0,0 +1,126 @@
+import logging
+import random
+
+
+log = logging.getLogger(__name__)
+
+
+def pg_num_in_all_states(pgs, *states):
+    return sum(1 for state in pgs.itervalues()
+               if all(s in state for s in states))
+
+
+def pg_num_in_any_state(pgs, *states):
+    return sum(1 for state in pgs.itervalues()
+               if any(s in state for s in states))
+
+
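A quick self-contained reading of the two counters above (toy pg map; note that membership is a substring check on the state string):

    pgs = {'1.0': 'active+clean', '1.1': 'creating', '1.2': 'unknown'}
    assert pg_num_in_all_states(pgs, 'active', 'clean') == 1
    assert pg_num_in_any_state(pgs, 'unknown', 'creating') == 2
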
+def test_create_from_mon(ctx, config):
+    """
+    osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting
+    How it works::
+    0. set the hard limit of pg-per-osd to "2"
+    1. create pool.a with pg_num=2
+       # all pgs should be active+clean
+    2. create pool.b with pg_num=2
+       # new pgs belonging to this pool should be unknown (the primary osd
+       reaches the limit) or creating (the replica osd reaches the limit)
+    3. remove pool.a
+    4. all pgs belonging to pool.b should be active+clean
+    """
+    pg_num = config.get('pg_num', 2)
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    manager.wait_for_clean()
+    assert manager.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    pool_b = manager.create_pool_with_unique_name(pg_num)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+    pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+    assert pg_pending == pg_num
+
+    log.info('3. removing pool.a')
+    manager.remove_pool(pool_a)
+    pg_states = manager.wait_till_pg_convergence(300)
+    assert len(pg_states) == pg_num
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+
+    # cleanup
+    manager.remove_pool(pool_b)
+
+
+def test_create_from_peer(ctx, config):
+    """
+    osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting
+
+    How it works::
+    0. create 4 OSDs.
+    1. create pool.a with pg_num=1, size=2
+       pg will be mapped to osd.0, and osd.1, and it should be active+clean
+    2. create pool.b with pg_num=1, size=2.
+       if the pgs get stuck in creating, delete the pool and try again;
+       eventually we'll get the pool to land on the other 2 osds that
+       aren't occupied by pool.a. (this will also verify that pgs for deleted
+       pools get cleaned out of the creating wait list.)
+    3. mark an osd out. verify that some pgs get stuck stale or peering.
+    4. delete a pool, verify pgs go active.
+    """
+    pg_num = config.get('pg_num', 1)
+    pool_size = config.get('pool_size', 2)
+    from_primary = config.get('from_primary', True)
+
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    manager.wait_for_clean()
+    assert manager.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    while True:
+        pool_b = manager.create_pool_with_unique_name(pg_num)
+        pg_states = manager.wait_till_pg_convergence(300)
+        pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+        assert pg_created >= pg_num
+        pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+        assert pg_pending == pg_num * 2 - pg_created
+        if pg_created == pg_num * 2:
+            break
+        manager.remove_pool(pool_b)
+
+    log.info('3. mark an osd out')
+    pg_stats = manager.get_pg_stats()
+    pg = random.choice(pg_stats)
+    if from_primary:
+        victim = pg['acting'][-1]
+    else:
+        victim = pg['acting'][0]
+    manager.mark_out_osd(victim)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering')
+    assert pg_stuck > 0
+
+    log.info('4. removing pool.b')
+    manager.remove_pool(pool_b)
+    manager.wait_for_clean(30)
+
+    # cleanup
+    manager.remove_pool(pool_a)
+
+
+def task(ctx, config):
+    assert isinstance(config, dict), \
+        'osd_max_pg_per_osd task only accepts a dict for config'
+    manager = ctx.managers['ceph']
+    if config.get('test_create_from_mon', True):
+        test_create_from_mon(ctx, config)
+    else:
+        test_create_from_peer(ctx, config)
index 50e3a8b33d54a03f834115459bb8eacc492bfde7..f2486238449549c7da2e70edd8bc90cc2010c149 100644 (file)
@@ -174,19 +174,12 @@ def task(ctx, config):
               format(fpath=FSPATH, jpath=JPATH))
     pid = os.getpid()
     expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
-    cmd = ((prefix + "--op export --pgid 2.0 --file {file}").
+    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
            format(id=divergent, file=expfile))
     proc = exp_remote.run(args=cmd, wait=True,
                           check_status=False, stdout=StringIO())
     assert proc.exitstatus == 0
 
-    # Remove the same pg that was exported
-    cmd = ((prefix + "--op remove --pgid 2.0").
-           format(id=divergent))
-    proc = exp_remote.run(args=cmd, wait=True,
-                          check_status=False, stdout=StringIO())
-    assert proc.exitstatus == 0
-
     # Kill one of non-divergent OSDs
     log.info('killing osd.%d' % non_divergent[0])
     manager.kill_osd(non_divergent[0])
@@ -194,7 +187,7 @@ def task(ctx, config):
     # manager.mark_out_osd(non_divergent[0])
 
     # An empty collection for pg 2.0 might need to be cleaned up
-    cmd = ((prefix + "--op remove --pgid 2.0").
+    cmd = ((prefix + "--force --op remove --pgid 2.0").
            format(id=non_divergent[0]))
     proc = exp_remote.run(args=cmd, wait=True,
                           check_status=False, stdout=StringIO())
index b969a36a8aa4ca835d3ea010bc84528281a34e6a..c01fe1dda005465fdd15046bb65ea6f832eaf7a3 100644 (file)
@@ -82,7 +82,9 @@ def task(ctx, config):
     fix_rgw_config(rgw_node, dnsmasq_name)
     setup_user_bucket(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir)
     if hadoop_ver.startswith('2.8'):
-        test_options = '-Dit.test=ITestS3A* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify'
+        # run all ITest cases but skip the AWS test that uses the public
+        # bucket landsat-pds, which is not reachable from within this test
+        test_options = '-Dit.test=ITestS3A* -Dit.test=\!ITestS3AAWSCredentialsProvider* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify'
     else:
         test_options = 'test -Dtest=S3a*,TestS3A*'
     try:
index dbca056a08609c6060377ac61d78d08929858e04..420b7355908d52832d46330637a00d603a8a4780 100644 (file)
@@ -24,7 +24,7 @@ def task(ctx, config):
 
     cluster: (default 'ceph') the name of the cluster to thrash
 
-    min_in: (default 3) the minimum number of OSDs to keep in the
+    min_in: (default 4) the minimum number of OSDs to keep in the
        cluster
 
     min_out: (default 0) the minimum number of OSDs to keep out of the
index 86c4b53892135e53d4b02fef8c06a965a29a4b4f..a83f9e19082d41abc3cf69f9feda741ea36996a1 100644 (file)
@@ -34,7 +34,7 @@ def create_ec_pool(remote, name, profile_name, pgnum, profile={}, cluster_name="
     if application:
         remote.run(args=[
             'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
-        ])
+        ], check_status=False)  # may fail with EINVAL when run in the jewel upgrade test
 
 def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application=None):
     remote.run(args=[
@@ -43,7 +43,7 @@ def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application
     if application:
         remote.run(args=[
             'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
-        ])
+        ], check_status=False)
 
 def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"):
     remote.run(args=[
index efc080dc0eaf67d5273727b9dc391df72c1d496f..637fa90ebdd231ec299ba79f2f25a61204a8cde1 100644 (file)
@@ -113,7 +113,7 @@ class CephDisk:
         LOG.debug(self.unused_disks('sd.'))
         if self.unused_disks('sd.'):
             return
-        modprobe = "modprobe scsi_debug vpd_use_hostno=0 add_host=1 dev_size_mb=200 ; udevadm settle"
+        modprobe = "modprobe scsi_debug vpd_use_hostno=0 add_host=1 dev_size_mb=300 ; udevadm settle"
         try:
             self.sh(modprobe)
         except:
index 7a795b925d68cb8e70f0fd1f7f3c7a68b847c8a1..7102efba181cd811c3332f5f39ef5abd077546e3 100755 (executable)
@@ -35,7 +35,7 @@ if ! ${PYTHON} -m pytest --version > /dev/null 2>&1; then
     exit 1
 fi
 
-sudo env PATH=$(dirname $0):$(dirname $0)/..:$PATH ${PYTHON} -m pytest -s -v $(dirname $0)/ceph-disk-test.py
+sudo env PATH=$(dirname $0):$(dirname $0)/..:$PATH PYTHONWARNINGS=ignore ${PYTHON} -m pytest -s -v $(dirname $0)/ceph-disk-test.py
 result=$?
 
 sudo rm -f /lib/udev/rules.d/60-ceph-by-partuuid.rules
index f5a313ea2a5aff846efba6baaecd69ede8a9df59..15344172a966cd972e851308566361c1b33bd00d 100755 (executable)
@@ -1593,16 +1593,7 @@ function test_mon_osd()
   # When CEPH_CLI_TEST_DUP_COMMAND is set, osd create
   # is repeated and consumes two osd id, not just one.
   #
-  local next_osd
-  if test "$CEPH_CLI_TEST_DUP_COMMAND" ; then
-      next_osd=$((gap_start + 1))
-  else
-      next_osd=$gap_start
-  fi
-  id=`ceph osd create`
-  [ "$id" = "$next_osd" ]
-
-  next_osd=$((id + 1))
+  local next_osd=$gap_start
   id=`ceph osd create $(uuidgen)`
   [ "$id" = "$next_osd" ]
 
@@ -2162,9 +2153,12 @@ function test_mon_osd_erasure_code()
   ceph osd erasure-code-profile set fooprofile a=b c=d e=f --force
   ceph osd erasure-code-profile set fooprofile a=b c=d e=f
   expect_false ceph osd erasure-code-profile set fooprofile a=b c=d e=f g=h
-  #
-  # cleanup by removing profile 'fooprofile'
+  # legacy ruleset-* option names are accepted on luminous only
+  ceph osd erasure-code-profile set barprofile ruleset-failure-domain=host
+  ceph osd erasure-code-profile set barprofile crush-failure-domain=host
+  # clean up
   ceph osd erasure-code-profile rm fooprofile
+  ceph osd erasure-code-profile rm barprofile
 }
 
 function test_mon_osd_misc()
diff --git a/ceph/qa/workunits/cls/test_cls_journal.sh b/ceph/qa/workunits/cls/test_cls_journal.sh
new file mode 100755 (executable)
index 0000000..9aa7450
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh -e
+
+GTEST_FILTER=${CLS_JOURNAL_GTEST_FILTER:-*}
+ceph_test_cls_journal --gtest_filter=${GTEST_FILTER}
+
+exit 0
diff --git a/ceph/qa/workunits/mgr/test_localpool.sh b/ceph/qa/workunits/mgr/test_localpool.sh
new file mode 100755 (executable)
index 0000000..c5a56a6
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/sh -ex
+
+ceph config-key set mgr/localpool/subtree host
+ceph config-key set mgr/localpool/failure_domain osd
+ceph mgr module enable localpool
+
+while ! ceph osd pool ls | grep '^by-host-'
+do
+    sleep 5
+done
+
+ceph mgr module disable localpool
+for p in `ceph osd pool ls | grep '^by-host-'`
+do
+    ceph osd pool rm $p $p --yes-i-really-really-mean-it
+done
+
+ceph config-key rm mgr/localpool/subtree
+ceph config-key rm mgr/localpool/failure_domain
+
+echo OK
index 6a3ebe0b22e2ddda1a0fedd0ef6b3834861bbac1..87c86ee69430870a52be7df2073c920c85ce031e 100755 (executable)
@@ -346,7 +346,7 @@ test_rmobj() {
     $CEPH_TOOL osd pool set-quota $p max_objects 1
     V1=`mktemp fooattrXXXXXXX`
     $RADOS_TOOL put $OBJ $V1 -p $p
-    while ! $CEPH_TOOL osd dump | grep 'full max_objects'
+    while ! $CEPH_TOOL osd dump | grep 'full_no_quota max_objects'
     do
        sleep 2
     done
index 04a03a66e5cfc82bed406a43f7ecd48199ecb7b0..5195e6cf3e91ea85d3bf47b384a3e8c206015c6f 100755 (executable)
@@ -111,6 +111,18 @@ wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image1} 'up+replaying'
 admin_daemon ${CLUSTER1} rbd mirror flush
 admin_daemon ${CLUSTER1} rbd mirror status
 
+testlog "TEST: test image rename"
+new_name="${image}_RENAMED"
+rename_image ${CLUSTER2} ${POOL} ${image} ${new_name}
+wait_for_image_replay_started ${CLUSTER1} ${POOL} ${new_name}
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${new_name} 'up+replaying'
+admin_daemon ${CLUSTER1} rbd mirror status ${POOL}/${new_name}
+admin_daemon ${CLUSTER1} rbd mirror restart ${POOL}/${new_name}
+wait_for_image_replay_started ${CLUSTER1} ${POOL} ${new_name}
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${new_name} 'up+replaying'
+rename_image ${CLUSTER2} ${POOL} ${new_name} ${image}
+wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
+
 testlog "TEST: failover and failback"
 start_mirror ${CLUSTER2}
 
index 23216711e6ac3c86f49e082c74b2a88e133fbffe..325353b91bc7e87b4520e45ce33d6a3da1ce53d2 100755 (executable)
@@ -593,6 +593,16 @@ set_image_meta()
     rbd --cluster ${cluster} -p ${pool} image-meta set ${image} $key $val
 }
 
+rename_image()
+{
+    local cluster=$1
+    local pool=$2
+    local image=$3
+    local new_name=$4
+
+    rbd --cluster=${cluster} -p ${pool} rename ${image} ${new_name}
+}
+
 remove_image()
 {
     local cluster=$1
index 552f73601cd9262e83c59bfc9c6876e6f5c8d085..0a9349803b12831eb72b266d4e74d5ac38c3e98a 100644 (file)
@@ -106,7 +106,7 @@ files_manage_generic_locks(ceph_t)
 
 allow ceph_t sysfs_t:dir read;
 allow ceph_t sysfs_t:file { read getattr open };
-allow ceph_t sysfs_t:lnk_file read;
+allow ceph_t sysfs_t:lnk_file { read getattr };
 
 allow ceph_t random_device_t:chr_file getattr;
 allow ceph_t urandom_device_t:chr_file getattr;
index 4af7022267304529383fce7d4ac048f1105c73b4..9b4bb5c8be8830a2ff83eaf3970387d52c6ee47b 100644 (file)
@@ -1,2 +1,2 @@
-3e7492b9ada8bdc9a5cd0feafd42fbca27f9c38e
-v12.2.1
+cf0baeeeeba3b47f9427c6c97e2144b094b7e5ba
+v12.2.2
diff --git a/ceph/src/90-ceph-osd.conf b/ceph/src/90-ceph-osd.conf
new file mode 100644 (file)
index 0000000..c5c64bb
--- /dev/null
@@ -0,0 +1 @@
+fs.aio-max-nr = 1048576
index 3cdcb95be4b9b78daace47edbba3c3eab1bd449e..3d3d2f7af04fc4d2aad310ae68bd82420dd2716a 100644 (file)
@@ -540,8 +540,11 @@ set(libcommon_files
   ${auth_files}
   ${mds_files})
 
+CHECK_C_COMPILER_FLAG("-fvar-tracking-assignments" HAS_VTA)
 if(HAS_VTA)
-  set_source_files_properties(common/config.cc
+  set_source_files_properties(
+    common/config.cc
+    common/options.cc
     PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
 endif()
 
@@ -691,12 +694,18 @@ if (WITH_MGR)
       mgr/DaemonState.cc
       mgr/DaemonServer.cc
       mgr/ClusterState.cc
-      mgr/PyModules.cc
+      mgr/ActivePyModules.cc
+      mgr/StandbyPyModules.cc
+      mgr/PyModuleRegistry.cc
+      mgr/PyModuleRunner.cc
       mgr/PyFormatter.cc
-      mgr/PyState.cc
-      mgr/MgrPyModule.cc
+      mgr/PyOSDMap.cc
+      mgr/BaseMgrModule.cc
+      mgr/BaseMgrStandbyModule.cc
+      mgr/ActivePyModule.cc
       mgr/MgrStandby.cc
       mgr/Mgr.cc
+      mgr/Gil.cc
       mgr/mgr_commands.cc)
   add_executable(ceph-mgr ${mgr_srcs}
                  $<TARGET_OBJECTS:heap_profiler_objs>)
@@ -726,7 +735,6 @@ add_subdirectory(ceph-volume)
 add_subdirectory(ceph-detect-init)
 
 ## dencoder
-CHECK_C_COMPILER_FLAG("-fvar-tracking-assignments" HAS_VTA)
 if(HAS_VTA)
   set_source_files_properties(test/encoding/ceph_dencoder.cc
     PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
index e3cfb5c594fd34064cca70bb660da8436c7f4d30..a8562f59674161bd4a81bce81e997191429bf967 100644 (file)
@@ -1,3 +1,4 @@
+#include "acconfig.h"
 #include "arch/probe.h"
 
 /* flags we export */
@@ -45,10 +46,8 @@ int ceph_arch_arm_probe(void)
        ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
 #elif __aarch64__ && __linux__
        ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
-# ifdef HWCAP_CRC32
+# if defined(HAVE_ARMV8_CRC) && defined(HWCAP_CRC32)
        ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
-# else
-       ceph_arch_aarch64_crc32 = 0;  // sorry!
 # endif
 #else
        if (0)
old mode 100755 (executable)
new mode 100644 (file)
index 8b0c5db..6516750
@@ -24,6 +24,7 @@ import argparse
 import base64
 import errno
 import fcntl
+import functools
 import json
 import logging
 import os
@@ -41,12 +42,23 @@ import pwd
 import grp
 import textwrap
 import glob
+import warnings
 
 CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
 CEPH_LOCKBOX_ONDISK_MAGIC = 'ceph lockbox volume v001'
 
 KEY_MANAGEMENT_MODE_V1 = 'ceph-mon v1'
 
+DEPRECATION_WARNING = """
+*******************************************************************************
+This tool is now deprecated in favor of ceph-volume.
+It is recommended to use ceph-volume for OSD deployments. For details see:
+
+    http://docs.ceph.com/docs/master/ceph-volume/#migrating
+
+*******************************************************************************
+"""
+
 PTYPE = {
     'regular': {
         'journal': {
@@ -721,6 +733,21 @@ def get_partition_mpath(dev, pnum):
         return None
 
 
+def retry(on_error=Exception, max_tries=10, wait=0.2, backoff=0):
+    def wrapper(func):
+        @functools.wraps(func)
+        def repeat(*args, **kwargs):
+            for tries in range(max_tries - 1):
+                try:
+                    return func(*args, **kwargs)
+                except on_error:
+                    time.sleep(wait + backoff * tries)
+            return func(*args, **kwargs)
+        return repeat
+    return wrapper
+
+
+@retry(Error)
 def get_partition_dev(dev, pnum):
     """
     get the device name for a partition
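The new retry decorator above is generic; a minimal sketch of its behavior on a hypothetical flaky callable (not part of this changeset):

    attempts = []

    @retry(IOError, max_tries=3, wait=0.1, backoff=0.1)
    def flaky_probe():
        attempts.append(1)
        if len(attempts) < 3:
            raise IOError('transient failure')
        return 'ok'

    # sleeps 0.1s after the 1st failure and 0.2s after the 2nd; the final
    # attempt either succeeds or propagates the exception unchanged
    assert flaky_probe() == 'ok'
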
@@ -732,36 +759,25 @@ def get_partition_dev(dev, pnum):
        sda 1 -> sda1
        cciss/c0d1 1 -> cciss!c0d1p1
     """
-    max_retry = 10
-    for retry in range(0, max_retry + 1):
-        partname = None
-        error_msg = ""
-        if is_mpath(dev):
-            partname = get_partition_mpath(dev, pnum)
-        else:
-            name = get_dev_name(os.path.realpath(dev))
-            sys_entry = os.path.join(BLOCKDIR, name)
-            error_msg = " in %s" % sys_entry
-            for f in os.listdir(sys_entry):
-                if f.startswith(name) and f.endswith(str(pnum)):
-                    # we want the shortest name that starts with the base name
-                    # and ends with the partition number
-                    if not partname or len(f) < len(partname):
-                        partname = f
-        if partname:
-            if retry:
-                LOG.info('Found partition %d for %s after %d tries' %
-                         (pnum, dev, retry))
-            return get_dev_path(partname)
-        else:
-            if retry < max_retry:
-                LOG.info('Try %d/%d : partition %d for %s does not exist%s' %
-                         (retry + 1, max_retry, pnum, dev, error_msg))
-                time.sleep(.2)
-                continue
-            else:
-                raise Error('partition %d for %s does not appear to exist%s' %
-                            (pnum, dev, error_msg))
+    partname = None
+    error_msg = ""
+    if is_mpath(dev):
+        partname = get_partition_mpath(dev, pnum)
+    else:
+        name = get_dev_name(os.path.realpath(dev))
+        sys_entry = os.path.join(BLOCKDIR, name)
+        error_msg = " in %s" % sys_entry
+        for f in os.listdir(sys_entry):
+            if f.startswith(name) and f.endswith(str(pnum)):
+                # we want the shortest name that starts with the base name
+                # and ends with the partition number
+                if not partname or len(f) < len(partname):
+                    partname = f
+    if partname:
+        return get_dev_path(partname)
+    else:
+        raise Error('partition %d for %s does not appear to exist%s' %
+                    (pnum, dev, error_msg))
 
 
 def list_all_partitions():
@@ -1374,22 +1390,14 @@ def _dmcrypt_map(
         raise Error('unable to map device', rawdev, e)
 
 
-def dmcrypt_unmap(
-    _uuid
-):
+@retry(Error, max_tries=10, wait=0.5, backoff=1.0)
+def dmcrypt_unmap(_uuid):
     if not os.path.exists('/dev/mapper/' + _uuid):
         return
-    retries = 0
-    while True:
-        try:
-            command_check_call(['cryptsetup', 'remove', _uuid])
-            break
-        except subprocess.CalledProcessError as e:
-            if retries == 10:
-                raise Error('unable to unmap device', _uuid, e)
-            else:
-                time.sleep(0.5 + retries * 1.0)
-                retries += 1
+    try:
+        command_check_call(['cryptsetup', 'remove', _uuid])
+    except subprocess.CalledProcessError as e:
+        raise Error('unable to unmap device', _uuid, e)
 
 
 def mount(
@@ -1451,6 +1459,7 @@ def mount(
     return path
 
 
+@retry(UnmountError, max_tries=3, wait=0.5, backoff=1.0)
 def unmount(
     path,
     do_rm=True,
@@ -1458,25 +1467,17 @@ def unmount(
     """
     Unmount and removes the given mount point.
     """
-    retries = 0
-    while True:
-        try:
-            LOG.debug('Unmounting %s', path)
-            command_check_call(
-                [
-                    '/bin/umount',
-                    '--',
-                    path,
-                ],
-            )
-            break
-        except subprocess.CalledProcessError as e:
-            # on failure, retry 3 times with incremental backoff
-            if retries == 3:
-                raise UnmountError(e)
-            else:
-                time.sleep(0.5 + retries * 1.0)
-                retries += 1
+    try:
+        LOG.debug('Unmounting %s', path)
+        command_check_call(
+            [
+                '/bin/umount',
+                '--',
+                path,
+            ],
+        )
+    except subprocess.CalledProcessError as e:
+        raise UnmountError(e)
     if not do_rm:
         return
     os.rmdir(path)
@@ -1855,6 +1856,7 @@ class DevicePartition(object):
         return self.ptype_map[name]['ready']
 
     @staticmethod
+    @retry(OSError)
     def factory(path, dev, args):
         dmcrypt_type = CryptHelpers.get_dmcrypt_type(args)
         if ((path is not None and is_mpath(path)) or
@@ -3248,7 +3250,7 @@ def systemd_start(
     osd_id,
 ):
     systemd_disable(path, osd_id)
-    if is_mounted(path):
+    if os.path.ismount(path):
         style = ['--runtime']
     else:
         style = []
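Swapping the local helper for os.path.ismount leans on stdlib behavior: it returns False for nonexistent paths instead of raising. For example:

    import os
    os.path.ismount('/')             # True on any Linux host
    os.path.ismount('/nonexistent')  # False, no exception raised
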
@@ -3760,6 +3762,20 @@ def main_activate(args):
             )
             osd_data = get_mount_point(cluster, osd_id)
 
+            args.cluster = cluster
+            if args.dmcrypt:
+                for name in Space.NAMES:
+                    # Check for an encrypted journal/block device link
+                    dev_path = os.path.join(osd_data, name + '_dmcrypt')
+                    if not os.path.exists(dev_path):
+                        continue
+                    partition = DevicePartition.factory(
+                        path=None,
+                        dev=dev_path,
+                        args=args)
+                    partition.rawdev = args.path
+                    partition.map()
+
         elif stat.S_ISDIR(mode):
             (cluster, osd_id) = activate_dir(
                 path=args.path,
@@ -5633,6 +5649,8 @@ def make_zap_parser(subparsers):
 
 
 def main(argv):
+    # warn about the deprecation as early as possible
+    warnings.warn(DEPRECATION_WARNING)
     args = parse_args(argv)
 
     setup_logging(args.verbose, args.log_stdout)
@@ -5652,10 +5670,20 @@ def main(argv):
     CEPH_PREF_GROUP = args.setgroup
 
     if args.verbose:
-        args.func(args)
+        try:
+            args.func(args)
+        except Exception:
+            # warn on any exception when running with verbosity
+            warnings.warn(DEPRECATION_WARNING)
+            # but still raise the original issue
+            raise
+
     else:
         main_catch(args.func, args)
 
+    # if there weren't any errors, still warn again at the very end
+    warnings.warn(DEPRECATION_WARNING)
+
 
 def setup_logging(verbose, log_stdout):
     loglevel = logging.WARNING
@@ -5682,6 +5710,8 @@ def main_catch(func, args):
         func(args)
 
     except Error as e:
+        # warn on generic 'error' exceptions
+        warnings.warn(DEPRECATION_WARNING)
         raise SystemExit(
             '{prog}: {msg}'.format(
                 prog=args.prog,
@@ -5690,6 +5720,8 @@ def main_catch(func, args):
         )
 
     except CephDiskException as error:
+        # warn on ceph-disk exceptions
+        warnings.warn(DEPRECATION_WARNING)
         exc_name = error.__class__.__name__
         raise SystemExit(
             '{prog} {exc_name}: {msg}'.format(
index a2bc483a23b32d6a1c989beef34be1a71402a6d8..bbf1e21c66319b4a32021f3703ef266e21992b8e 100644 (file)
@@ -25,4 +25,4 @@ commands = coverage run --append --source=ceph_disk {envbindir}/py.test -vv {tox
            coverage report --show-missing
 
 [testenv:flake8]
-commands = flake8 --ignore=H105,H405,E127 ceph_disk tests
+commands = flake8 --ignore=H105,H405,E127,E722 ceph_disk tests
diff --git a/ceph/src/ceph-volume/ceph_volume/api/__init__.py b/ceph/src/ceph-volume/ceph_volume/api/__init__.py
new file mode 100644 (file)
index 0000000..ecc9712
--- /dev/null
@@ -0,0 +1,3 @@
+"""
+Device API that can be shared among other implementations.
+"""
diff --git a/ceph/src/ceph-volume/ceph_volume/api/lvm.py b/ceph/src/ceph-volume/ceph_volume/api/lvm.py
new file mode 100644 (file)
index 0000000..d82aee6
--- /dev/null
@@ -0,0 +1,762 @@
+"""
+API for CRUD lvm tag operations. Follows the Ceph LVM tag naming convention
+that prefixes tags with ``ceph.`` and uses ``=`` for assignment, and provides
+a set of utilities for interacting with LVM.
+"""
+from ceph_volume import process
+from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
+
+
+def _output_parser(output, fields):
+    """
+    Newer versions of LVM allow ``--reportformat=json``, but older versions,
+    like the one included in Xenial, do not. LVM has the ability to filter and
+    format its output, so we assume the output will be in a format this parser
+    can handle: ';' as the field separator, with the requested field names
+    given as a ','-delimited string.
+
+    :param fields: A string, possibly using ',' to group many items, as it
+                   would be used on the CLI
+    :param output: The CLI output from the LVM call
+    """
+    field_items = fields.split(',')
+    report = []
+    for line in output:
+        # clear the leading/trailing whitespace
+        line = line.strip()
+
+        # remove the extra '"' in each field
+        line = line.replace('"', '')
+
+        # prevent moving forward with empty contents
+        if not line:
+            continue
+
+        # splitting on ';' because that is what the lvm call uses as
+        # '--separator'
+        output_items = [i.strip() for i in line.split(';')]
+        # map the output to the fields
+        report.append(
+            dict(zip(field_items, output_items))
+        )
+
+    return report
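
A quick illustration of the parsing contract above (a hypothetical snippet, not part of the patch): each ';'-separated output line is zipped against the ','-separated field names requested with ``-o``.

fields = 'lv_tags,lv_path,lv_name,vg_name'
output = ['  ;/dev/vg0/root;root;vg0  ']
assert _output_parser(output, fields) == [{
    'lv_tags': '',
    'lv_path': '/dev/vg0/root',
    'lv_name': 'root',
    'vg_name': 'vg0',
}]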
+
+
+def parse_tags(lv_tags):
+    """
+    Return a dictionary mapping of all the tags associated with
+    a Volume from the comma-separated tags coming from the LVM API
+
+    Input looks like::
+
+       "ceph.osd_fsid=aaa-fff-bbbb,ceph.osd_id=0"
+
+    For the above example, the expected return value would be::
+
+        {
+            "ceph.osd_fsid": "aaa-fff-bbbb",
+            "ceph.osd_id": "0"
+        }
+    """
+    if not lv_tags:
+        return {}
+    tag_mapping = {}
+    tags = lv_tags.split(',')
+    for tag_assignment in tags:
+        key, value = tag_assignment.split('=', 1)
+        tag_mapping[key] = value
+
+    return tag_mapping
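
A round-trip sanity check of the behavior documented above (hypothetical, not part of the patch):

assert parse_tags('ceph.osd_fsid=aaa-fff-bbbb,ceph.osd_id=0') == {
    'ceph.osd_fsid': 'aaa-fff-bbbb',
    'ceph.osd_id': '0',
}
# values containing '=' survive because the code uses split('=', 1)
assert parse_tags('ceph.opt=a=b') == {'ceph.opt': 'a=b'}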
+
+
+def get_api_vgs():
+    """
+    Return the list of volume groups available in the system, using flags to
+    include common metadata associated with them.
+
+    Command and sample delimited output should look like::
+
+        $ sudo vgs --noheadings --separator=';' \
+          -o vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free
+          ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m
+          osd_vg;3;1;0;wz--n-;29.21g;9.21g
+
+    """
+    fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
+    stdout, stderr, returncode = process.call(
+        ['sudo', 'vgs', '--noheadings', '--separator=";"', '-o', fields]
+    )
+    return _output_parser(stdout, fields)
+
+
+def get_api_lvs():
+    """
+    Return the list of logical volumes available in the system, using flags to
+    include common metadata associated with them.
+
+    Command and delimited output should look like::
+
+        $ sudo lvs --noheadings --separator=';' -o lv_tags,lv_path,lv_name,vg_name
+          ;/dev/ubuntubox-vg/root;root;ubuntubox-vg
+          ;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
+
+    """
+    fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
+    stdout, stderr, returncode = process.call(
+        ['sudo', 'lvs', '--noheadings', '--separator=";"', '-o', fields]
+    )
+    return _output_parser(stdout, fields)
+
+
+def get_api_pvs():
+    """
+    Return the list of physical volumes configured for lvm and available in the
+    system, using flags to include common metadata associated with them, like
+    the uuid.
+
+    Command and delimited output should look like::
+
+        $ sudo pvs --noheadings --separator=';' -o pv_name,pv_tags,pv_uuid
+          /dev/sda1;;
+          /dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
+
+    """
+    fields = 'pv_name,pv_tags,pv_uuid'
+
+    # note the use of `pvs -a` which will return every physical volume including
+    # ones that have not been initialized as "pv" by LVM
+    stdout, stderr, returncode = process.call(
+        ['sudo', 'pvs', '-a', '--noheadings', '--separator=";"', '-o', fields]
+    )
+
+    return _output_parser(stdout, fields)
+
+
+def get_lv_from_argument(argument):
+    """
+    Helper proxy function that consumes a possible logical volume passed in from the CLI
+    in the form of `vg/lv`, but with some validation so that an argument that is a full
+    path to a device can be ignored
+    """
+    if argument.startswith('/'):
+        lv = get_lv(lv_path=argument)
+        return lv
+    try:
+        vg_name, lv_name = argument.split('/')
+    except (ValueError, AttributeError):
+        return None
+    return get_lv(lv_name=lv_name, vg_name=vg_name)
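
The two accepted argument shapes, spelled out (hypothetical values, not part of the patch):

# an absolute path is looked up directly by lv_path
get_lv_from_argument('/dev/vg0/osd-data')  # -> get_lv(lv_path='/dev/vg0/osd-data')
# 'vg/lv' is split into volume group and logical volume names
get_lv_from_argument('vg0/osd-data')       # -> get_lv(lv_name='osd-data', vg_name='vg0')
# anything without a single '/' separator returns None
get_lv_from_argument('osd-data')           # -> None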
+
+
+def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+    """
+    Return a matching lv for the current system, requiring ``lv_name``,
+    ``vg_name``, ``lv_path`` or ``tags``. Raises an error if more than one lv
+    is found.
+
+    It is useful to use ``tags`` when trying to find a specific logical volume,
+    but it can also lead to multiple lvs being found, since a lot of metadata
+    is shared between lvs of a distinct OSD.
+    """
+    if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+        return None
+    lvs = Volumes()
+    return lvs.get(
+        lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_uuid=lv_uuid,
+        lv_tags=lv_tags
+    )
+
+
+def get_pv(pv_name=None, pv_uuid=None, pv_tags=None):
+    """
+    Return a matching pv (physical volume) for the current system, requiring
+    ``pv_name``, ``pv_uuid``, or ``pv_tags``. Raises an error if more than one
+    pv is found.
+    """
+    if not any([pv_name, pv_uuid, pv_tags]):
+        return None
+    pvs = PVolumes()
+    return pvs.get(pv_name=pv_name, pv_uuid=pv_uuid, pv_tags=pv_tags)
+
+
+def create_pv(device):
+    """
+    Create a physical volume from a device, useful when devices need to be later mapped
+    to journals.
+    """
+    process.run([
+        'sudo',
+        'pvcreate',
+        '-v',  # verbose
+        '-f',  # force it
+        '--yes',  # answer yes to any prompts
+        device
+    ])
+
+
+def create_vg(name, *devices):
+    """
+    Create a Volume Group. Command looks like::
+
+        vgcreate --force --yes group_name device
+
+    Once created the volume group is returned as a ``VolumeGroup`` object
+    """
+    process.run([
+        'sudo',
+        'vgcreate',
+        '--force',
+        '--yes',
+        name] + list(devices)
+    )
+
+    vg = get_vg(vg_name=name)
+    return vg
+
+
+def remove_lv(path):
+    """
+    Removes a logical volume given its absolute path.
+
+    Returns True if the lv is successfully removed, or
+    raises a RuntimeError if the removal fails.
+    """
+    stdout, stderr, returncode = process.call(
+        [
+            'sudo',
+            'lvremove',
+            '-v',  # verbose
+            '-f',  # force it
+            path
+        ],
+        show_command=True,
+        terminal_verbose=True,
+    )
+    if returncode != 0:
+        raise RuntimeError("Unable to remove %s".format(path))
+    return True
+
+
+def create_lv(name, group, size=None, tags=None):
+    """
+    Create a Logical Volume in a Volume Group. Command looks like::
+
+        lvcreate -L 50G -n gfslv vg0
+
+    ``name`` and ``group`` are required. If ``size`` is provided it must follow
+    lvm's size notation (like 1G, or 20M). ``tags`` is an optional dictionary
+    whose keys are expected to follow the convention of a "ceph." prefix, like::
+
+        {"ceph.block_device": "/dev/ceph/osd-1"}
+    """
+    # XXX add CEPH_VOLUME_LVM_DEBUG to enable -vvvv on lv operations
+    # ``tags`` is optional per the docstring, so guard against None before the
+    # set_tags/get calls below
+    tags = tags or {}
+    type_path_tag = {
+        'journal': 'ceph.journal_device',
+        'data': 'ceph.data_device',
+        'block': 'ceph.block_device',
+        'wal': 'ceph.wal_device',
+        'db': 'ceph.db_device',
+        'lockbox': 'ceph.lockbox_device',  # XXX might not ever need this lockbox sorcery
+    }
+    if size:
+        process.run([
+            'sudo',
+            'lvcreate',
+            '--yes',
+            '-L',
+            '%s' % size,
+            '-n', name, group
+        ])
+    # create the lv with all the space available, this is needed because the
+    # system call is different for LVM
+    else:
+        process.run([
+            'sudo',
+            'lvcreate',
+            '--yes',
+            '-l',
+            '100%FREE',
+            '-n', name, group
+        ])
+
+    lv = get_lv(lv_name=name, vg_name=group)
+    lv.set_tags(tags)
+
+    # when creating a distinct type, the caller doesn't know what the path will
+    # be so this function will set it after creation using the mapping
+    path_tag = type_path_tag.get(tags.get('ceph.type'))
+    if path_tag:
+        lv.set_tags(
+            {path_tag: lv.lv_path}
+        )
+    return lv
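
A minimal usage sketch, assuming a volume group named ``osd_vg`` already exists (names are illustrative, not part of the patch):

lv = create_lv(
    'osd-block-0',  # name of the new logical volume
    'osd_vg',       # existing volume group to allocate from
    size='10G',     # lvm size notation; omit to consume 100%FREE
    tags={'ceph.type': 'block', 'ceph.osd_id': '0'},
)
# because ceph.type is 'block', create_lv also records the resulting
# path under the ceph.block_device tag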
+
+
+def get_vg(vg_name=None, vg_tags=None):
+    """
+    Return a matching vg for the current system, requires ``vg_name`` or
+    ``tags``. Raises an error if more than one vg is found.
+
+    It is useful to use ``tags`` when trying to find a specific volume group,
+    but it can also lead to multiple vgs being found.
+    """
+    if not any([vg_name, vg_tags]):
+        return None
+    vgs = VolumeGroups()
+    return vgs.get(vg_name=vg_name, vg_tags=vg_tags)
+
+
+class VolumeGroups(list):
+    """
+    A list of all known volume groups for the current system, with the ability
+    to filter them via keyword arguments.
+    """
+
+    def __init__(self):
+        self._populate()
+
+    def _populate(self):
+        # get all the vgs in the current system
+        for vg_item in get_api_vgs():
+            self.append(VolumeGroup(**vg_item))
+
+    def _purge(self):
+        """
+        Deplete all the items in the list, used internally only so that we can
+        dynamically allocate the items when filtering without the concern of
+        messing up the contents
+        """
+        self[:] = []
+
+    def _filter(self, vg_name=None, vg_tags=None):
+        """
+        The actual method that filters using a new list. Useful so that other
+        methods that do not want to alter the contents of the list (e.g.
+        ``self.find``) can operate safely.
+
+        .. note:: ``vg_tags`` is not yet implemented
+        """
+        filtered = [i for i in self]
+        if vg_name:
+            filtered = [i for i in filtered if i.vg_name == vg_name]
+
+        # at this point, `filtered` has either all the volumes in self or is an
+        # actual filtered list if any filters were applied
+        if vg_tags:
+            tag_filtered = []
+            for volume in filtered:
+                matches = all(volume.tags.get(k) == str(v) for k, v in vg_tags.items())
+                if matches:
+                    tag_filtered.append(volume)
+            return tag_filtered
+
+        return filtered
+
+    def filter(self, vg_name=None, vg_tags=None):
+        """
+        Filter out groups on top level attributes like ``vg_name`` or by
+        ``vg_tags`` where a dict is required. For example, to find a Ceph group
+        with dmcache as the type, the filter would look like::
+
+            vg_tags={'ceph.type': 'dmcache'}
+
+        .. warning:: These tags are not documented because they are currently
+                     unused, but are here to maintain API consistency
+        """
+        if not any([vg_name, vg_tags]):
+            raise TypeError('.filter() requires vg_name or vg_tags (none given)')
+        # first find the filtered volumes with the values in self
+        filtered_groups = self._filter(
+            vg_name=vg_name,
+            vg_tags=vg_tags
+        )
+        # then purge everything
+        self._purge()
+        # and add the filtered items
+        self.extend(filtered_groups)
+
+    def get(self, vg_name=None, vg_tags=None):
+        """
+        This is a bit expensive, since it will filter the list against all
+        the criteria given and return the single matching item.
+
+        This method does *not* alter the list, and it will raise an error if
+        multiple VGs are matched
+
+        It is useful to use ``tags`` when trying to find a specific volume group,
+        but it can also lead to multiple vgs being found (although unlikely)
+        """
+        if not any([vg_name, vg_tags]):
+            return None
+        vgs = self._filter(
+            vg_name=vg_name,
+            vg_tags=vg_tags
+        )
+        if not vgs:
+            return None
+        if len(vgs) > 1:
+            # this is probably never going to happen, but it is here to keep
+            # the API code consistent
+            raise MultipleVGsError(vg_name)
+        return vgs[0]
+
+
+class Volumes(list):
+    """
+    A list of all known (logical) volumes for the current system, with the ability
+    to filter them via keyword arguments.
+    """
+
+    def __init__(self):
+        self._populate()
+
+    def _populate(self):
+        # get all the lvs in the current system
+        for lv_item in get_api_lvs():
+            self.append(Volume(**lv_item))
+
+    def _purge(self):
+        """
+        Deplete all the items in the list, used internally only so that we can
+        dynamically allocate the items when filtering without the concern of
+        messing up the contents
+        """
+        self[:] = []
+
+    def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+        """
+        The actual method that filters using a new list. Useful so that other
+        methods that do not want to alter the contents of the list (e.g.
+        ``self.find``) can operate safely.
+        """
+        filtered = [i for i in self]
+        if lv_name:
+            filtered = [i for i in filtered if i.lv_name == lv_name]
+
+        if vg_name:
+            filtered = [i for i in filtered if i.vg_name == vg_name]
+
+        if lv_uuid:
+            filtered = [i for i in filtered if i.lv_uuid == lv_uuid]
+
+        if lv_path:
+            filtered = [i for i in filtered if i.lv_path == lv_path]
+
+        # at this point, `filtered` has either all the volumes in self or is an
+        # actual filtered list if any filters were applied
+        if lv_tags:
+            tag_filtered = []
+            for volume in filtered:
+                # all the tags we got need to match on the volume
+                matches = all(volume.tags.get(k) == str(v) for k, v in lv_tags.items())
+                if matches:
+                    tag_filtered.append(volume)
+            return tag_filtered
+
+        return filtered
+
+    def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+        """
+        Filter out volumes on top level attributes like ``lv_name`` or by
+        ``lv_tags`` where a dict is required. For example, to find a volume
+        that has an OSD ID of 0, the filter would look like::
+
+            lv_tags={'ceph.osd_id': '0'}
+
+        """
+        if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+            raise TypeError('.filter() requires lv_name, vg_name, lv_path, lv_uuid, or tags (none given)')
+        # first find the filtered volumes with the values in self
+        filtered_volumes = self._filter(
+            lv_name=lv_name,
+            vg_name=vg_name,
+            lv_path=lv_path,
+            lv_uuid=lv_uuid,
+            lv_tags=lv_tags
+        )
+        # then purge everything
+        self._purge()
+        # and add the filtered items
+        self.extend(filtered_volumes)
+
+    def get(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+        """
+        This is a bit expensive, since it will filter the list against all
+        the criteria given and return the single matching item.
+
+        This method does *not* alter the list, and it will raise an error if
+        multiple LVs are matched
+
+        It is useful to use ``tags`` when trying to find a specific logical volume,
+        but it can also lead to multiple lvs being found, since a lot of metadata
+        is shared between lvs of a distinct OSD.
+        """
+        if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+            return None
+        lvs = self._filter(
+            lv_name=lv_name,
+            vg_name=vg_name,
+            lv_path=lv_path,
+            lv_uuid=lv_uuid,
+            lv_tags=lv_tags
+        )
+        if not lvs:
+            return None
+        if len(lvs) > 1:
+            raise MultipleLVsError(lv_name, lv_path)
+        return lvs[0]
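
The key difference between ``filter`` and ``get``, summarized (hypothetical usage, not part of the patch): ``filter`` mutates the list in place, while ``get`` leaves it intact and returns a single match.

lvs = Volumes()
# non-destructive: returns one Volume, None, or raises MultipleLVsError
osd_lv = lvs.get(lv_tags={'ceph.type': 'block'})
# destructive: lvs now only contains volumes tagged with osd_id 0
lvs.filter(lv_tags={'ceph.osd_id': '0'})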
+
+
+class PVolumes(list):
+    """
+    A list of all known (physical) volumes for the current system, with the ability
+    to filter them via keyword arguments.
+    """
+
+    def __init__(self):
+        self._populate()
+
+    def _populate(self):
+        # get all the pvs in the current system
+        for pv_item in get_api_pvs():
+            self.append(PVolume(**pv_item))
+
+    def _purge(self):
+        """
+        Deplete all the items in the list, used internally only so that we can
+        dynamically allocate the items when filtering without the concern of
+        messing up the contents
+        """
+        self[:] = []
+
+    def _filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+        """
+        The actual method that filters using a new list. Useful so that other
+        methods that do not want to alter the contents of the list (e.g.
+        ``self.find``) can operate safely.
+        """
+        filtered = [i for i in self]
+        if pv_name:
+            filtered = [i for i in filtered if i.pv_name == pv_name]
+
+        if pv_uuid:
+            filtered = [i for i in filtered if i.pv_uuid == pv_uuid]
+
+        # at this point, `filtered` has either all the physical volumes in self
+        # or is an actual filtered list if any filters were applied
+        if pv_tags:
+            tag_filtered = []
+            for pvolume in filtered:
+                matches = all(pvolume.tags.get(k) == str(v) for k, v in pv_tags.items())
+                if matches:
+                    tag_filtered.append(pvolume)
+            # return the tag_filtered pvolumes here, the `filtered` list is no
+            # longer usable
+            return tag_filtered
+
+        return filtered
+
+    def filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+        """
+        Filter out volumes on top level attributes like ``pv_name`` or by
+        ``pv_tags`` where a dict is required. For example, to find a physical volume
+        that has an OSD ID of 0, the filter would look like::
+
+            pv_tags={'ceph.osd_id': '0'}
+
+        """
+        if not any([pv_name, pv_uuid, pv_tags]):
+            raise TypeError('.filter() requires pv_name, pv_uuid, or pv_tags (none given)')
+        # first find the filtered volumes with the values in self
+        filtered_volumes = self._filter(
+            pv_name=pv_name,
+            pv_uuid=pv_uuid,
+            pv_tags=pv_tags
+        )
+        # then purge everything
+        self._purge()
+        # and add the filtered items
+        self.extend(filtered_volumes)
+
+    def get(self, pv_name=None, pv_uuid=None, pv_tags=None):
+        """
+        This is a bit expensive, since it will filter the list against all
+        the criteria given and return the single matching item.
+
+        This method does *not* alter the list, and it will raise an error if
+        multiple pvs are matched
+
+        It is useful to use ``tags`` when trying to find a specific physical volume,
+        but it can also lead to multiple pvs being found, since a lot of metadata
+        is shared between pvs of a distinct OSD.
+        """
+        if not any([pv_name, pv_uuid, pv_tags]):
+            return None
+        pvs = self._filter(
+            pv_name=pv_name,
+            pv_uuid=pv_uuid,
+            pv_tags=pv_tags
+        )
+        if not pvs:
+            return None
+        if len(pvs) > 1:
+            raise MultiplePVsError(pv_name)
+        return pvs[0]
+
+
+class VolumeGroup(object):
+    """
+    Represents an LVM group, with some top-level attributes like ``vg_name``
+    """
+
+    def __init__(self, **kw):
+        for k, v in kw.items():
+            setattr(self, k, v)
+        self.name = kw['vg_name']
+        self.tags = parse_tags(kw.get('vg_tags', ''))
+
+    def __str__(self):
+        return '<%s>' % self.name
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class Volume(object):
+    """
+    Represents a Logical Volume from LVM, with some top-level attributes like
+    ``lv_name`` and parsed tags as a dictionary of key/value pairs.
+    """
+
+    def __init__(self, **kw):
+        for k, v in kw.items():
+            setattr(self, k, v)
+        self.lv_api = kw
+        self.name = kw['lv_name']
+        self.tags = parse_tags(kw['lv_tags'])
+
+    def __str__(self):
+        return '<%s>' % self.lv_api['lv_path']
+
+    def __repr__(self):
+        return self.__str__()
+
+    def as_dict(self):
+        obj = {}
+        obj.update(self.lv_api)
+        obj['tags'] = self.tags
+        obj['name'] = self.name
+        obj['type'] = self.tags['ceph.type']
+        obj['path'] = self.lv_path
+        return obj
+
+    def clear_tags(self):
+        """
+        Removes all tags from the Logical Volume.
+        """
+        for k, v in self.tags.items():
+            tag = "%s=%s" % (k, v)
+            process.run(['sudo', 'lvchange', '--deltag', tag, self.lv_path])
+
+    def set_tags(self, tags):
+        """
+        :param tags: A dictionary of tag names and values, like::
+
+            {
+                "ceph.osd_fsid": "aaa-fff-bbbb",
+                "ceph.osd_id": "0"
+            }
+
+        At the end of all modifications, the tags are refreshed to reflect
+        LVM's most current view.
+        """
+        for k, v in tags.items():
+            self.set_tag(k, v)
+        # after setting all the tags, refresh them for the current object, use the
+        # lv_* identifiers to filter because those shouldn't change
+        lv_object = get_lv(lv_name=self.lv_name, lv_path=self.lv_path)
+        self.tags = lv_object.tags
+
+    def set_tag(self, key, value):
+        """
+        Set the key/value pair as an LVM tag. Does not "refresh" the values of
+        the current object for its tags. Meant to be a "fire and forget" type
+        of modification.
+        """
+        # remove it first if it exists
+        if self.tags.get(key):
+            current_value = self.tags[key]
+            tag = "%s=%s" % (key, current_value)
+            process.call(['sudo', 'lvchange', '--deltag', tag, self.lv_api['lv_path']])
+
+        process.call(
+            [
+                'sudo', 'lvchange',
+                '--addtag', '%s=%s' % (key, value), self.lv_path
+            ]
+        )
+
+
+class PVolume(object):
+    """
+    Represents a Physical Volume from LVM, with some top-level attributes like
+    ``pv_name`` and parsed tags as a dictionary of key/value pairs.
+    """
+
+    def __init__(self, **kw):
+        for k, v in kw.items():
+            setattr(self, k, v)
+        self.pv_api = kw
+        self.name = kw['pv_name']
+        self.tags = parse_tags(kw['pv_tags'])
+
+    def __str__(self):
+        return '<%s>' % self.pv_api['pv_name']
+
+    def __repr__(self):
+        return self.__str__()
+
+    def set_tags(self, tags):
+        """
+        :param tags: A dictionary of tag names and values, like::
+
+            {
+                "ceph.osd_fsid": "aaa-fff-bbbb",
+                "ceph.osd_id": "0"
+            }
+
+        At the end of all modifications, the tags are refreshed to reflect
+        LVM's most current view.
+        """
+        for k, v in tags.items():
+            self.set_tag(k, v)
+        # after setting all the tags, refresh them for the current object, use the
+        # pv_* identifiers to filter because those shouldn't change
+        pv_object = get_pv(pv_name=self.pv_name, pv_uuid=self.pv_uuid)
+        self.tags = pv_object.tags
+
+    def set_tag(self, key, value):
+        """
+        Set the key/value pair as an LVM tag. Does not "refresh" the values of
+        the current object for its tags. Meant to be a "fire and forget" type
+        of modification.
+
+        **warning**: Altering tags on a PV has to be done ensuring that the
+        device is actually the one intended. ``pv_name`` is *not* a persistent
+        value, only ``pv_uuid`` is. Using ``pv_uuid`` is the best way to make
+        sure the device getting changed is the one needed.
+        """
+        # remove it first if it exists
+        if self.tags.get(key):
+            current_value = self.tags[key]
+            tag = "%s=%s" % (key, current_value)
+            process.call(['sudo', 'pvchange', '--deltag', tag, self.pv_name])
+
+        process.call(
+            [
+                'sudo', 'pvchange',
+                '--addtag', '%s=%s' % (key, value), self.pv_name
+            ]
+        )
index c1e14bc79088c32601256078dd0eb0500ba3f80e..d0be938172f99d912a6f71870bd1f46ae5e211e8 100644 (file)
@@ -58,6 +58,9 @@ def catches(catch=None, handler=None, exit=True):
             try:
                 return f(*a, **kw)
             except catch as e:
+                import logging
+                logger = logging.getLogger('ceph_volume')
+                logger.exception('exception caught by decorator')
                 if os.environ.get('CEPH_VOLUME_DEBUG'):
                     raise
                 if handler:
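
``logger.exception`` records the active traceback at ERROR level, so the log keeps the full stack even when ``CEPH_VOLUME_DEBUG`` is unset and the user only sees the terse handler output. A hypothetical illustration (not part of the patch):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('ceph_volume')

try:
    1 / 0
except ZeroDivisionError:
    logger.exception('exception caught by decorator')  # logs message plus traceback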
index c77c344d65d11c08c06eb88644529835ea50b755..8af5d1e74e0196b16c80f43daddd7988ffd3e890 100644 (file)
@@ -1 +1 @@
-from . import lvm # noqa
+from . import lvm, simple # noqa
index 5a755672a95dc243e9d7522586c3da00cb13b1e1..0a50e7a33cd8c2c64dbd7114eaef972f550fa280 100644 (file)
@@ -1,16 +1,25 @@
 from __future__ import print_function
 import argparse
+import logging
+import os
 from textwrap import dedent
 from ceph_volume import process, conf, decorators
 from ceph_volume.util import system, disk
+from ceph_volume.util import prepare as prepare_utils
 from ceph_volume.systemd import systemctl
-from . import api
+from ceph_volume.api import lvm as api
+
+
+logger = logging.getLogger(__name__)
 
 
 def activate_filestore(lvs):
     # find the osd
     osd_lv = lvs.get(lv_tags={'ceph.type': 'data'})
+    if not osd_lv:
+        raise RuntimeError('Unable to find a data LV for filestore activation')
     osd_id = osd_lv.tags['ceph.osd_id']
+    conf.cluster = osd_lv.tags['ceph.cluster_name']
     # it may have a volume with a journal
     osd_journal_lv = lvs.get(lv_tags={'ceph.type': 'journal'})
     # TODO: add sensible error reporting if this is ever the case
@@ -29,7 +38,7 @@ def activate_filestore(lvs):
     # mount the osd
     source = osd_lv.lv_path
     destination = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
-    if not system.is_mounted(source, destination=destination):
+    if not system.device_is_mounted(source, destination=destination):
         process.run(['sudo', 'mount', '-v', source, destination])
 
     # always re-do the symlink regardless if it exists, so that the journal
@@ -47,9 +56,76 @@ def activate_filestore(lvs):
     systemctl.start_osd(osd_id)
 
 
+def get_osd_device_path(osd_lv, lvs, device_type):
+    """
+    ``device_type`` can be one of ``db``, ``wal`` or ``block`` so that we can
+    query ``lvs`` (a ``Volumes`` object) for the device, falling back to
+    querying by uuid if the lv is not present.
+
+    Return a path if possible; otherwise return ``None``, since some of these
+    devices are optional.
+    """
+    uuid_tag = 'ceph.%s_uuid' % device_type
+    device_uuid = osd_lv.tags.get(uuid_tag)
+    if not device_uuid:
+        return None
+
+    device_lv = lvs.get(lv_uuid=device_uuid)
+    if device_lv:
+        return device_lv.lv_path
+    else:
+        # this could be a regular device, so query it with blkid
+        physical_device = disk.get_device_from_partuuid(device_uuid)
+        return physical_device or None
+
+
 def activate_bluestore(lvs):
-    # TODO
-    pass
+    # find the osd
+    osd_lv = lvs.get(lv_tags={'ceph.type': 'block'})
+    osd_id = osd_lv.tags['ceph.osd_id']
+    conf.cluster = osd_lv.tags['ceph.cluster_name']
+    osd_fsid = osd_lv.tags['ceph.osd_fsid']
+    db_device_path = get_osd_device_path(osd_lv, lvs, 'db')
+    wal_device_path = get_osd_device_path(osd_lv, lvs, 'wal')
+
+    # mount on tmpfs the osd directory
+    osd_path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
+    if not system.path_is_mounted(osd_path):
+        # mkdir -p and mount as tmpfs
+        prepare_utils.create_osd_path(osd_id, tmpfs=True)
+    # XXX This needs to be removed once ceph-bluestore-tool can deal with
+    # symlinks that exist in the osd dir
+    for link_name in ['block', 'block.db', 'block.wal']:
+        link_path = os.path.join(osd_path, link_name)
+        if os.path.exists(link_path):
+            os.unlink(link_path)
+    # Once symlinks are removed, the osd dir can be 'primed' again.
+    process.run([
+        'sudo', 'ceph-bluestore-tool', '--cluster=%s' % conf.cluster,
+        'prime-osd-dir', '--dev', osd_lv.lv_path,
+        '--path', osd_path])
+    # always re-do the symlink regardless if it exists, so that the block,
+    # block.wal, and block.db devices that may have changed can be mapped
+    # correctly every time
+    process.run(['sudo', 'ln', '-snf', osd_lv.lv_path, os.path.join(osd_path, 'block')])
+    system.chown(os.path.join(osd_path, 'block'))
+    system.chown(osd_path)
+    if db_device_path:
+        destination = os.path.join(osd_path, 'block.db')
+        process.run(['sudo', 'ln', '-snf', db_device_path, destination])
+        system.chown(db_device_path)
+    if wal_device_path:
+        destination = os.path.join(osd_path, 'block.wal')
+        process.run(['sudo', 'ln', '-snf', wal_device_path, destination])
+        system.chown(wal_device_path)
+
+    # enable the ceph-volume unit for this OSD
+    systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
+
+    # start the OSD
+    systemctl.start_osd(osd_id)
 
 
 class Activate(object):
@@ -69,7 +145,22 @@ class Activate(object):
             lvs.filter(lv_tags={'ceph.osd_fsid': args.osd_fsid})
         if not lvs:
             raise RuntimeError('could not find osd.%s with fsid %s' % (args.osd_id, args.osd_fsid))
-        activate_filestore(lvs)
+        # This argument is only available when passed in directly or via
+        # systemd, not when ``create`` is being used
+        if getattr(args, 'auto_detect_objectstore', False):
+            logger.info('auto detecting objectstore')
+            # may get multiple lvs, so can't do lvs.get() calls here
+            for lv in lvs:
+                has_journal = lv.tags.get('ceph.journal_uuid')
+                if has_journal:
+                    logger.info('found a journal associated with the OSD, assuming filestore')
+                    return activate_filestore(lvs)
+            logger.info('unable to find a journal associated with the OSD, assuming bluestore')
+            return activate_bluestore(lvs)
+        if args.bluestore:
+            activate_bluestore(lvs)
+        elif args.filestore:
+            activate_filestore(lvs)
 
     def main(self):
         sub_command_help = dedent("""
@@ -100,18 +191,27 @@ class Activate(object):
             nargs='?',
             help='The FSID of the OSD, similar to a SHA1'
         )
+        parser.add_argument(
+            '--auto-detect-objectstore',
+            action='store_true',
+            help='Autodetect the objectstore by inspecting the OSD',
+        )
         parser.add_argument(
             '--bluestore',
-            action='store_true', default=False,
+            action='store_true',
             help='Use the bluestore objectstore',
         )
         parser.add_argument(
             '--filestore',
-            action='store_true', default=True,
+            action='store_true',
             help='Use the filestore objectstore',
         )
         if len(self.argv) == 0:
             print(sub_command_help)
             return
         args = parser.parse_args(self.argv)
+        # Default to bluestore here since defaulting it in add_argument may
+        # cause both to be True
+        if not args.bluestore and not args.filestore:
+            args.bluestore = True
         self.activate(args)
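
The comment above is worth demonstrating: with ``action='store_true'``, giving ``--filestore`` a ``default=True`` (as the old code did) leaves it True even when the user passes ``--bluestore``, so both flags end up set. A hypothetical demonstration (not part of the patch):

import argparse

p = argparse.ArgumentParser()
p.add_argument('--bluestore', action='store_true')
p.add_argument('--filestore', action='store_true', default=True)
args = p.parse_args(['--bluestore'])
assert args.bluestore and args.filestore  # both True, hence the post-parse default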
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/api.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/api.py
deleted file mode 100644 (file)
index e5bc262..0000000
+++ /dev/null
@@ -1,686 +0,0 @@
-"""
-API for CRUD lvm tag operations. Follows the Ceph LVM tag naming convention
-that prefixes tags with ``ceph.`` and uses ``=`` for assignment, and provides
-set of utilities for interacting with LVM.
-"""
-from ceph_volume import process
-from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
-
-
-def _output_parser(output, fields):
-    """
-    Newer versions of LVM allow ``--reportformat=json``, but older versions,
-    like the one included in Xenial do not. LVM has the ability to filter and
-    format its output so we assume the output will be in a format this parser
-    can handle (using ',' as a delimiter)
-
-    :param fields: A string, possibly using ',' to group many items, as it
-                   would be used on the CLI
-    :param output: The CLI output from the LVM call
-    """
-    field_items = fields.split(',')
-    report = []
-    for line in output:
-        # clear the leading/trailing whitespace
-        line = line.strip()
-
-        # remove the extra '"' in each field
-        line = line.replace('"', '')
-
-        # prevent moving forward with empty contents
-        if not line:
-            continue
-
-        # spliting on ';' because that is what the lvm call uses as
-        # '--separator'
-        output_items = [i.strip() for i in line.split(';')]
-        # map the output to the fiels
-        report.append(
-            dict(zip(field_items, output_items))
-        )
-
-    return report
-
-
-def parse_tags(lv_tags):
-    """
-    Return a dictionary mapping of all the tags associated with
-    a Volume from the comma-separated tags coming from the LVM API
-
-    Input look like::
-
-       "ceph.osd_fsid=aaa-fff-bbbb,ceph.osd_id=0"
-
-    For the above example, the expected return value would be::
-
-        {
-            "ceph.osd_fsid": "aaa-fff-bbbb",
-            "ceph.osd_id": "0"
-        }
-    """
-    if not lv_tags:
-        return {}
-    tag_mapping = {}
-    tags = lv_tags.split(',')
-    for tag_assignment in tags:
-        key, value = tag_assignment.split('=', 1)
-        tag_mapping[key] = value
-
-    return tag_mapping
-
-
-def get_api_vgs():
-    """
-    Return the list of group volumes available in the system using flags to
-    include common metadata associated with them
-
-    Command and sample delimeted output, should look like::
-
-        $ sudo vgs --noheadings --separator=';' \
-          -o vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free
-          ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m
-          osd_vg;3;1;0;wz--n-;29.21g;9.21g
-
-    """
-    fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
-    stdout, stderr, returncode = process.call(
-        ['sudo', 'vgs', '--noheadings', '--separator=";"', '-o', fields]
-    )
-    return _output_parser(stdout, fields)
-
-
-def get_api_lvs():
-    """
-    Return the list of logical volumes available in the system using flags to include common
-    metadata associated with them
-
-    Command and delimeted output, should look like::
-
-        $ sudo lvs --noheadings --separator=';' -o lv_tags,lv_path,lv_name,vg_name
-          ;/dev/ubuntubox-vg/root;root;ubuntubox-vg
-          ;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
-
-    """
-    fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
-    stdout, stderr, returncode = process.call(
-        ['sudo', 'lvs', '--noheadings', '--separator=";"', '-o', fields]
-    )
-    return _output_parser(stdout, fields)
-
-
-def get_api_pvs():
-    """
-    Return the list of physical volumes configured for lvm and available in the
-    system using flags to include common metadata associated with them like the uuid
-
-    Command and delimeted output, should look like::
-
-        $ sudo pvs --noheadings --separator=';' -o pv_name,pv_tags,pv_uuid
-          /dev/sda1;;
-          /dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
-
-    """
-    fields = 'pv_name,pv_tags,pv_uuid'
-
-    # note the use of `pvs -a` which will return every physical volume including
-    # ones that have not been initialized as "pv" by LVM
-    stdout, stderr, returncode = process.call(
-        ['sudo', 'pvs', '-a', '--no-heading', '--separator=";"', '-o', fields]
-    )
-
-    return _output_parser(stdout, fields)
-
-
-def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
-    """
-    Return a matching lv for the current system, requiring ``lv_name``,
-    ``vg_name``, ``lv_path`` or ``tags``. Raises an error if more than one lv
-    is found.
-
-    It is useful to use ``tags`` when trying to find a specific logical volume,
-    but it can also lead to multiple lvs being found, since a lot of metadata
-    is shared between lvs of a distinct OSD.
-    """
-    if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
-        return None
-    lvs = Volumes()
-    return lvs.get(
-        lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_uuid=lv_uuid,
-        lv_tags=lv_tags
-    )
-
-
-def get_pv(pv_name=None, pv_uuid=None, pv_tags=None):
-    """
-    Return a matching pv (physical volume) for the current system, requiring
-    ``pv_name``, ``pv_uuid``, or ``pv_tags``. Raises an error if more than one
-    pv is found.
-    """
-    if not any([pv_name, pv_uuid, pv_tags]):
-        return None
-    pvs = PVolumes()
-    return pvs.get(pv_name=pv_name, pv_uuid=pv_uuid, pv_tags=pv_tags)
-
-
-def create_pv(device):
-    """
-    Create a physical volume from a device, useful when devices need to be later mapped
-    to journals.
-    """
-    process.run([
-        'sudo',
-        'pvcreate',
-        '-v',  # verbose
-        '-f',  # force it
-        '--yes', # answer yes to any prompts
-        device
-    ])
-
-
-def create_lv(name, group, size=None, **tags):
-    """
-    Create a Logical Volume in a Volume Group. Command looks like::
-
-        lvcreate -L 50G -n gfslv vg0
-
-    ``name``, ``group``, and ``size`` are required. Tags are optional and are "translated" to include
-    the prefixes for the Ceph LVM tag API.
-
-    """
-    # XXX add CEPH_VOLUME_LVM_DEBUG to enable -vvvv on lv operations
-    type_path_tag = {
-        'journal': 'ceph.journal_device',
-        'data': 'ceph.data_device',
-        'block': 'ceph.block',
-        'wal': 'ceph.wal',
-        'db': 'ceph.db',
-        'lockbox': 'ceph.lockbox_device',
-    }
-    if size:
-        process.run([
-            'sudo',
-            'lvcreate',
-            '--yes',
-            '-L',
-            '%sG' % size,
-            '-n', name, group
-        ])
-    # create the lv with all the space available, this is needed because the
-    # system call is different for LVM
-    else:
-        process.run([
-            'sudo',
-            'lvcreate',
-            '--yes',
-            '-l',
-            '100%FREE',
-            '-n', name, group
-        ])
-
-    lv = get_lv(lv_name=name, vg_name=group)
-    ceph_tags = {}
-    for k, v in tags.items():
-        ceph_tags['ceph.%s' % k] = v
-    lv.set_tags(ceph_tags)
-
-    # when creating a distinct type, the caller doesn't know what the path will
-    # be so this function will set it after creation using the mapping
-    path_tag = type_path_tag[tags['type']]
-    lv.set_tags(
-        {path_tag: lv.lv_path}
-    )
-    return lv
-
-
-def get_vg(vg_name=None, vg_tags=None):
-    """
-    Return a matching vg for the current system, requires ``vg_name`` or
-    ``tags``. Raises an error if more than one vg is found.
-
-    It is useful to use ``tags`` when trying to find a specific volume group,
-    but it can also lead to multiple vgs being found.
-    """
-    if not any([vg_name, vg_tags]):
-        return None
-    vgs = VolumeGroups()
-    return vgs.get(vg_name=vg_name, vg_tags=vg_tags)
-
-
-class VolumeGroups(list):
-    """
-    A list of all known volume groups for the current system, with the ability
-    to filter them via keyword arguments.
-    """
-
-    def __init__(self):
-        self._populate()
-
-    def _populate(self):
-        # get all the vgs in the current system
-        for vg_item in get_api_vgs():
-            self.append(VolumeGroup(**vg_item))
-
-    def _purge(self):
-        """
-        Deplete all the items in the list, used internally only so that we can
-        dynamically allocate the items when filtering without the concern of
-        messing up the contents
-        """
-        self[:] = []
-
-    def _filter(self, vg_name=None, vg_tags=None):
-        """
-        The actual method that filters using a new list. Useful so that other
-        methods that do not want to alter the contents of the list (e.g.
-        ``self.find``) can operate safely.
-
-        .. note:: ``vg_tags`` is not yet implemented
-        """
-        filtered = [i for i in self]
-        if vg_name:
-            filtered = [i for i in filtered if i.vg_name == vg_name]
-
-        # at this point, `filtered` has either all the volumes in self or is an
-        # actual filtered list if any filters were applied
-        if vg_tags:
-            tag_filtered = []
-            for volume in filtered:
-                matches = all(volume.tags.get(k) == str(v) for k, v in vg_tags.items())
-                if matches:
-                    tag_filtered.append(volume)
-            return tag_filtered
-
-        return filtered
-
-    def filter(self, vg_name=None, vg_tags=None):
-        """
-        Filter out groups on top level attributes like ``vg_name`` or by
-        ``vg_tags`` where a dict is required. For example, to find a Ceph group
-        with dmcache as the type, the filter would look like::
-
-            vg_tags={'ceph.type': 'dmcache'}
-
-        .. warning:: These tags are not documented because they are currently
-                     unused, but are here to maintain API consistency
-        """
-        if not any([vg_name, vg_tags]):
-            raise TypeError('.filter() requires vg_name or vg_tags (none given)')
-        # first find the filtered volumes with the values in self
-        filtered_groups = self._filter(
-            vg_name=vg_name,
-            vg_tags=vg_tags
-        )
-        # then purge everything
-        self._purge()
-        # and add the filtered items
-        self.extend(filtered_groups)
-
-    def get(self, vg_name=None, vg_tags=None):
-        """
-        This is a bit expensive, since it will try to filter out all the
-        matching items in the list, filter them out applying anything that was
-        added and return the matching item.
-
-        This method does *not* alter the list, and it will raise an error if
-        multiple VGs are matched
-
-        It is useful to use ``tags`` when trying to find a specific volume group,
-        but it can also lead to multiple vgs being found (although unlikely)
-        """
-        if not any([vg_name, vg_tags]):
-            return None
-        vgs = self._filter(
-            vg_name=vg_name,
-            vg_tags=vg_tags
-        )
-        if not vgs:
-            return None
-        if len(vgs) > 1:
-            # this is probably never going to happen, but it is here to keep
-            # the API code consistent
-            raise MultipleVGsError(vg_name)
-        return vgs[0]
-
-
-class Volumes(list):
-    """
-    A list of all known (logical) volumes for the current system, with the ability
-    to filter them via keyword arguments.
-    """
-
-    def __init__(self):
-        self._populate()
-
-    def _populate(self):
-        # get all the lvs in the current system
-        for lv_item in get_api_lvs():
-            self.append(Volume(**lv_item))
-
-    def _purge(self):
-        """
-        Deplete all the items in the list, used internally only so that we can
-        dynamically allocate the items when filtering without the concern of
-        messing up the contents
-        """
-        self[:] = []
-
-    def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
-        """
-        The actual method that filters using a new list. Useful so that other
-        methods that do not want to alter the contents of the list (e.g.
-        ``self.find``) can operate safely.
-        """
-        filtered = [i for i in self]
-        if lv_name:
-            filtered = [i for i in filtered if i.lv_name == lv_name]
-
-        if vg_name:
-            filtered = [i for i in filtered if i.vg_name == vg_name]
-
-        if lv_uuid:
-            filtered = [i for i in filtered if i.lv_uuid == lv_uuid]
-
-        if lv_path:
-            filtered = [i for i in filtered if i.lv_path == lv_path]
-
-        # at this point, `filtered` has either all the volumes in self or is an
-        # actual filtered list if any filters were applied
-        if lv_tags:
-            tag_filtered = []
-            for volume in filtered:
-                # all the tags we got need to match on the volume
-                matches = all(volume.tags.get(k) == str(v) for k, v in lv_tags.items())
-                if matches:
-                    tag_filtered.append(volume)
-            return tag_filtered
-
-        return filtered
-
-    def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
-        """
-        Filter out volumes on top level attributes like ``lv_name`` or by
-        ``lv_tags`` where a dict is required. For example, to find a volume
-        that has an OSD ID of 0, the filter would look like::
-
-            lv_tags={'ceph.osd_id': '0'}
-
-        """
-        if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
-            raise TypeError('.filter() requires lv_name, vg_name, lv_path, lv_uuid, or tags (none given)')
-        # first find the filtered volumes with the values in self
-        filtered_volumes = self._filter(
-            lv_name=lv_name,
-            vg_name=vg_name,
-            lv_path=lv_path,
-            lv_uuid=lv_uuid,
-            lv_tags=lv_tags
-        )
-        # then purge everything
-        self._purge()
-        # and add the filtered items
-        self.extend(filtered_volumes)
-
-    def get(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
-        """
-        This is a bit expensive, since it will try to filter out all the
-        matching items in the list, filter them out applying anything that was
-        added and return the matching item.
-
-        This method does *not* alter the list, and it will raise an error if
-        multiple LVs are matched
-
-        It is useful to use ``tags`` when trying to find a specific logical volume,
-        but it can also lead to multiple lvs being found, since a lot of metadata
-        is shared between lvs of a distinct OSD.
-        """
-        if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
-            return None
-        lvs = self._filter(
-            lv_name=lv_name,
-            vg_name=vg_name,
-            lv_path=lv_path,
-            lv_uuid=lv_uuid,
-            lv_tags=lv_tags
-        )
-        if not lvs:
-            return None
-        if len(lvs) > 1:
-            raise MultipleLVsError(lv_name, lv_path)
-        return lvs[0]
-
-
-class PVolumes(list):
-    """
-    A list of all known (physical) volumes for the current system, with the ability
-    to filter them via keyword arguments.
-    """
-
-    def __init__(self):
-        self._populate()
-
-    def _populate(self):
-        # get all the pvs in the current system
-        for pv_item in get_api_pvs():
-            self.append(PVolume(**pv_item))
-
-    def _purge(self):
-        """
-        Deplete all the items in the list, used internally only so that we can
-        dynamically allocate the items when filtering without the concern of
-        messing up the contents
-        """
-        self[:] = []
-
-    def _filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
-        """
-        The actual method that filters using a new list. Useful so that other
-        methods that do not want to alter the contents of the list (e.g.
-        ``self.find``) can operate safely.
-        """
-        filtered = [i for i in self]
-        if pv_name:
-            filtered = [i for i in filtered if i.pv_name == pv_name]
-
-        if pv_uuid:
-            filtered = [i for i in filtered if i.pv_uuid == pv_uuid]
-
-        # at this point, `filtered` has either all the physical volumes in self
-        # or is an actual filtered list if any filters were applied
-        if pv_tags:
-            tag_filtered = []
-            for pvolume in filtered:
-                matches = all(pvolume.tags.get(k) == str(v) for k, v in pv_tags.items())
-                if matches:
-                    tag_filtered.append(pvolume)
-            # return the tag_filtered pvolumes here, the `filtered` list is no
-            # longer useable
-            return tag_filtered
-
-        return filtered
-
-    def filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
-        """
-        Filter out volumes on top level attributes like ``pv_name`` or by
-        ``pv_tags`` where a dict is required. For example, to find a physical volume
-        that has an OSD ID of 0, the filter would look like::
-
-            pv_tags={'ceph.osd_id': '0'}
-
-        """
-        if not any([pv_name, pv_uuid, pv_tags]):
-            raise TypeError('.filter() requires pv_name, pv_uuid, or pv_tags (none given)')
-        # first find the filtered volumes with the values in self
-        filtered_volumes = self._filter(
-            pv_name=pv_name,
-            pv_uuid=pv_uuid,
-            pv_tags=pv_tags
-        )
-        # then purge everything
-        self._purge()
-        # and add the filtered items
-        self.extend(filtered_volumes)
-
-    def get(self, pv_name=None, pv_uuid=None, pv_tags=None):
-        """
-        This is a bit expensive, since it will try to filter out all the
-        matching items in the list, filter them out applying anything that was
-        added and return the matching item.
-
-        This method does *not* alter the list, and it will raise an error if
-        multiple pvs are matched
-
-        It is useful to use ``tags`` when trying to find a specific logical volume,
-        but it can also lead to multiple pvs being found, since a lot of metadata
-        is shared between pvs of a distinct OSD.
-        """
-        if not any([pv_name, pv_uuid, pv_tags]):
-            return None
-        pvs = self._filter(
-            pv_name=pv_name,
-            pv_uuid=pv_uuid,
-            pv_tags=pv_tags
-        )
-        if not pvs:
-            return None
-        if len(pvs) > 1:
-            raise MultiplePVsError(pv_name)
-        return pvs[0]
-
-
-class VolumeGroup(object):
-    """
-    Represents an LVM group, with some top-level attributes like ``vg_name``
-    """
-
-    def __init__(self, **kw):
-        for k, v in kw.items():
-            setattr(self, k, v)
-        self.name = kw['vg_name']
-        self.tags = parse_tags(kw.get('vg_tags', ''))
-
-    def __str__(self):
-        return '<%s>' % self.name
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class Volume(object):
-    """
-    Represents a Logical Volume from LVM, with some top-level attributes like
-    ``lv_name`` and parsed tags as a dictionary of key/value pairs.
-    """
-
-    def __init__(self, **kw):
-        for k, v in kw.items():
-            setattr(self, k, v)
-        self.lv_api = kw
-        self.name = kw['lv_name']
-        self.tags = parse_tags(kw['lv_tags'])
-
-    def __str__(self):
-        return '<%s>' % self.lv_api['lv_path']
-
-    def __repr__(self):
-        return self.__str__()
-
-    def set_tags(self, tags):
-        """
-        :param tags: A dictionary of tag names and values, like::
-
-            {
-                "ceph.osd_fsid": "aaa-fff-bbbb",
-                "ceph.osd_id": "0"
-            }
-
-        At the end of all modifications, the tags are refreshed to reflect
-        LVM's most current view.
-        """
-        for k, v in tags.items():
-            self.set_tag(k, v)
-        # after setting all the tags, refresh them for the current object, use the
-        # lv_* identifiers to filter because those shouldn't change
-        lv_object = get_lv(lv_name=self.lv_name, lv_path=self.lv_path)
-        self.tags = lv_object.tags
-
-    def set_tag(self, key, value):
-        """
-        Set the key/value pair as an LVM tag. Does not "refresh" the values of
-        the current object for its tags. Meant to be a "fire and forget" type
-        of modification.
-        """
-        # remove it first if it exists
-        if self.tags.get(key):
-            current_value = self.tags[key]
-            tag = "%s=%s" % (key, current_value)
-            process.call(['sudo', 'lvchange', '--deltag', tag, self.lv_api['lv_path']])
-
-        process.call(
-            [
-                'sudo', 'lvchange',
-                '--addtag', '%s=%s' % (key, value), self.lv_path
-            ]
-        )
-
-
-class PVolume(object):
-    """
-    Represents a Physical Volume from LVM, with some top-level attributes like
-    ``pv_name`` and parsed tags as a dictionary of key/value pairs.
-    """
-
-    def __init__(self, **kw):
-        for k, v in kw.items():
-            setattr(self, k, v)
-        self.pv_api = kw
-        self.name = kw['pv_name']
-        self.tags = parse_tags(kw['pv_tags'])
-
-    def __str__(self):
-        return '<%s>' % self.pv_api['pv_name']
-
-    def __repr__(self):
-        return self.__str__()
-
-    def set_tags(self, tags):
-        """
-        :param tags: A dictionary of tag names and values, like::
-
-            {
-                "ceph.osd_fsid": "aaa-fff-bbbb",
-                "ceph.osd_id": "0"
-            }
-
-        At the end of all modifications, the tags are refreshed to reflect
-        LVM's most current view.
-        """
-        for k, v in tags.items():
-            self.set_tag(k, v)
-        # after setting all the tags, refresh them for the current object, use the
-        # pv_* identifiers to filter because those shouldn't change
-        pv_object = get_pv(pv_name=self.pv_name, pv_uuid=self.pv_uuid)
-        self.tags = pv_object.tags
-
-    def set_tag(self, key, value):
-        """
-        Set the key/value pair as an LVM tag. Does not "refresh" the values of
-        the current object for its tags. Meant to be a "fire and forget" type
-        of modification.
-
-        **warning**: Altering tags on a PV has to be done ensuring that the
-        device is actually the one intended. ``pv_name`` is *not* a persistent
-        value, only ``pv_uuid`` is. Using ``pv_uuid`` is the best way to make
-        sure the device getting changed is the one needed.
-        """
-        # remove it first if it exists
-        if self.tags.get(key):
-            current_value = self.tags[key]
-            tag = "%s=%s" % (key, current_value)
-            process.call(['sudo', 'pvchange', '--deltag', tag, self.pv_name])
-
-        process.call(
-            [
-                'sudo', 'pvchange',
-                '--addtag', '%s=%s' % (key, value), self.pv_name
-            ]
-        )
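
The set_tag/set_tags methods above emulate key/value updates on top of LVM's flat tag list by deleting the stale tag and re-adding it. A minimal standalone sketch of the same deltag/addtag pattern, with an illustrative device path and tag values::

    import subprocess

    def replace_lvm_tag(lv_path, key, old_value, new_value):
        # LVM tags are plain strings, so 'key=value' pairs are emulated by
        # removing the stale tag before adding the new one
        if old_value is not None:
            subprocess.check_call(
                ['lvchange', '--deltag', '%s=%s' % (key, old_value), lv_path])
        subprocess.check_call(
            ['lvchange', '--addtag', '%s=%s' % (key, new_value), lv_path])

    # hypothetical usage, mirroring Volume.set_tag:
    # replace_lvm_tag('/dev/ceph-vg/osd-data', 'ceph.osd_id', '0', '1')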
index b4e4ee3ad2d5c6626c9655e14562471420b4b170..b2fbbf991edeb72e50c3e888e93683106d7b7b5a 100644 (file)
@@ -15,30 +15,30 @@ def common_parser(prog, description):
     required_args = parser.add_argument_group('required arguments')
     parser.add_argument(
         '--journal',
-        help='A logical volume (vg_name/lv_name), or path to a device',
+        help='(filestore) A logical volume (vg_name/lv_name), or path to a device',
     )
     required_args.add_argument(
         '--data',
         required=True,
         type=arg_validators.LVPath(),
-        help='A logical volume (vg_name/lv_name) for OSD data',
+        help='OSD data path. A physical device or logical volume',
     )
     parser.add_argument(
         '--journal-size',
         default=5,
         metavar='GB',
         type=int,
-        help='Size (in GB) A logical group name or a path to a logical volume',
+        help='(filestore) Size (in GB) for the journal',
     )
     parser.add_argument(
         '--bluestore',
-        action='store_true', default=False,
-        help='Use the bluestore objectstore (not currently supported)',
+        action='store_true',
+        help='Use the bluestore objectstore',
     )
     parser.add_argument(
         '--filestore',
-        action='store_true', default=True,
-        help='Use the filestore objectstore (currently the only supported object store)',
+        action='store_true',
+        help='Use the filestore objectstore',
     )
     parser.add_argument(
         '--osd-id',
@@ -48,6 +48,16 @@ def common_parser(prog, description):
         '--osd-fsid',
         help='Reuse an existing OSD fsid',
     )
+    parser.add_argument(
+        '--block.db',
+        dest='block_db',
+        help='(bluestore) Path to bluestore block.db logical volume or device',
+    )
+    parser.add_argument(
+        '--block.wal',
+        dest='block_wal',
+        help='(bluestore) Path to bluestore block.wal logical volume or device',
+    )
     # Do not parse args, so that consumers can do something before the args get
     # parsed triggering argparse behavior
     return parser
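
The dotted flag names added here rely on the explicit dest: argparse only rewrites dashes to underscores, so without it the value would be stored under an attribute name containing a literal dot. A quick illustration, with a made-up path::

    import argparse

    parser = argparse.ArgumentParser()
    # without dest='block_db' the value would only be reachable via
    # getattr(args, 'block.db'), not as a normal attribute
    parser.add_argument('--block.db', dest='block_db')
    args = parser.parse_args(['--block.db', '/dev/ceph-vg/db-lv'])
    print(args.block_db)  # /dev/ceph-vg/db-lv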
index 8c747f342143cf34697f00a7d0724e57da5cecce..353b26ab4ed8c0afb07b911d3362e6ccb3c0e5dd 100644 (file)
@@ -50,4 +50,8 @@ class Create(object):
             print(sub_command_help)
             return
         args = parser.parse_args(self.argv)
+        # Default to bluestore here: the store_true flags default to False,
+        # and defaulting one of them in add_argument may cause both to be True
+        if not args.bluestore and not args.filestore:
+            args.bluestore = True
         self.create(args)
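
The falsy check works because argparse gives store_true flags a default of False, never None; a short sketch of the fallback behavior::

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--bluestore', action='store_true')
    parser.add_argument('--filestore', action='store_true')

    args = parser.parse_args([])
    print(args.bluestore, args.filestore)  # False False
    if not args.bluestore and not args.filestore:
        args.bluestore = True  # neither flag was given, pick bluestore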
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py
new file mode 100644 (file)
index 0000000..6982f91
--- /dev/null
@@ -0,0 +1,244 @@
+from __future__ import print_function
+import argparse
+import json
+import logging
+from textwrap import dedent
+from ceph_volume import decorators
+from ceph_volume.util import disk
+from ceph_volume.api import lvm as api
+
+logger = logging.getLogger(__name__)
+
+
+osd_list_header_template = """\n
+{osd_id:=^20}"""
+
+
+osd_device_header_template = """
+
+  [{type: >4}]    {path}
+"""
+
+device_metadata_item_template = """
+      {tag_name: <25} {value}"""
+
+
+def readable_tag(tag):
+    actual_name = tag.split('.')[-1]
+    return actual_name.replace('_', ' ')
+
+
+def pretty_report(report):
+    output = []
+    for _id, devices in report.items():
+        output.append(
+            osd_list_header_template.format(osd_id=" osd.%s " % _id)
+        )
+        for device in devices:
+            output.append(
+                osd_device_header_template.format(
+                    type=device['type'],
+                    path=device['path']
+                )
+            )
+            for tag_name, value in device.get('tags', {}).items():
+                output.append(
+                    device_metadata_item_template.format(
+                        tag_name=readable_tag(tag_name),
+                        value=value
+                    )
+                )
+    print(''.join(output))
+
+
+class List(object):
+
+    help = 'list logical volumes and devices associated with Ceph'
+
+    def __init__(self, argv):
+        self.argv = argv
+
+    @decorators.needs_root
+    def list(self, args):
+        # ensure everything is up to date before calling out
+        # to list lv's
+        self.update()
+        report = self.generate(args)
+        if args.format == 'json':
+            # If the report is empty, we don't return a non-zero exit status
+            # because it is assumed this is going to be consumed by automated
+            # systems like ceph-ansible which would be forced to ignore the
+            # non-zero exit status if all they need is the information in the
+            # JSON object
+            print(json.dumps(report, indent=4, sort_keys=True))
+        else:
+            if not report:
+                raise SystemExit('No valid Ceph devices found')
+            pretty_report(report)
+
+    def update(self):
+        """
+        Ensure all referenced devices (journal, block, wal, db) are up to
+        date if they aren't logical volumes
+        """
+        lvs = api.Volumes()
+        for lv in lvs:
+            try:
+                lv.tags['ceph.osd_id']
+            except KeyError:
+                # only consider ceph-based logical volumes, everything else
+                # will get ignored
+                continue
+
+            for device_type in ['journal', 'block', 'wal', 'db']:
+                device_name = 'ceph.%s_device' % device_type
+                device_uuid = lv.tags.get('ceph.%s_uuid' % device_type)
+                if not device_uuid:
+                    # bluestore will not have a journal, filestore will not have
+                    # a block/wal/db, so we must skip if not present
+                    continue
+                disk_device = disk.get_device_from_partuuid(device_uuid)
+                if disk_device:
+                    if lv.tags[device_name] != disk_device:
+                        # this means that the device has changed, so it must be updated
+                        # on the API to reflect this
+                        lv.set_tags({device_name: disk_device})
+
+    def generate(self, args):
+        """
+        Generate reports for an individual device or for all Ceph-related
+        devices, logical or physical, as long as they have been prepared by
+        this tool before and contain enough metadata.
+        """
+        if args.device:
+            return self.single_report(args.device)
+        else:
+            return self.full_report()
+
+    def single_report(self, device):
+        """
+        Generate a report for a single device. This can be either a logical
+        volume in the form of vg/lv or a device with an absolute path like
+        /dev/sda1
+        """
+        lvs = api.Volumes()
+        report = {}
+        lv = api.get_lv_from_argument(device)
+        if lv:
+            try:
+                _id = lv.tags['ceph.osd_id']
+            except KeyError:
+                logger.warning('device is not part of ceph: %s', device)
+                return report
+
+            report.setdefault(_id, [])
+            report[_id].append(
+                lv.as_dict()
+            )
+
+        else:
+            # this has to be a journal/wal/db device (not a logical volume) so try
+            # to find the PARTUUID that should be stored in the OSD logical
+            # volume
+            for device_type in ['journal', 'block', 'wal', 'db']:
+                device_tag_name = 'ceph.%s_device' % device_type
+                device_tag_uuid = 'ceph.%s_uuid' % device_type
+                associated_lv = lvs.get(lv_tags={device_tag_name: device})
+                if associated_lv:
+                    _id = associated_lv.tags['ceph.osd_id']
+                    uuid = associated_lv.tags[device_tag_uuid]
+
+                    report.setdefault(_id, [])
+                    report[_id].append(
+                        {
+                            'tags': {'PARTUUID': uuid},
+                            'type': device_type,
+                            'path': device,
+                        }
+                    )
+        return report
+
+    def full_report(self):
+        """
+        Generate a report for all the logical volumes and associated devices
+        that have been previously prepared by Ceph
+        """
+        lvs = api.Volumes()
+        report = {}
+        for lv in lvs:
+            try:
+                _id = lv.tags['ceph.osd_id']
+            except KeyError:
+                # only consider ceph-based logical volumes, everything else
+                # will get ignored
+                continue
+
+            report.setdefault(_id, [])
+            report[_id].append(
+                lv.as_dict()
+            )
+
+            for device_type in ['journal', 'block', 'wal', 'db']:
+                device_uuid = lv.tags.get('ceph.%s_uuid' % device_type)
+                if not device_uuid:
+                    # bluestore will not have a journal, filestore will not have
+                    # a block/wal/db, so we must skip if not present
+                    continue
+                if not api.get_lv(lv_uuid=device_uuid):
+                    # means we have a regular device, so query blkid
+                    disk_device = disk.get_device_from_partuuid(device_uuid)
+                    if disk_device:
+                        report[_id].append(
+                            {
+                                'tags': {'PARTUUID': device_uuid},
+                                'type': device_type,
+                                'path': disk_device,
+                            }
+                        )
+
+        return report
+
+    def main(self):
+        sub_command_help = dedent("""
+        List devices or logical volumes associated with Ceph. An association is
+        determined if a device has information relating to an OSD. This is
+        verified by querying LVM's metadata and correlating it with devices.
+
+        The lvs associated with the OSD need to have been prepared previously,
+        so that all needed tags and metadata exist.
+
+        Full listing of all system devices associated with a cluster::
+
+            ceph-volume lvm list
+
+        List a particular device, reporting all metadata about it::
+
+            ceph-volume lvm list /dev/sda1
+
+        List a logical volume, along with all its metadata (vg is a volume
+        group, and lv the logical volume name)::
+
+            ceph-volume lvm list {vg/lv}
+        """)
+        parser = argparse.ArgumentParser(
+            prog='ceph-volume lvm list',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description=sub_command_help,
+        )
+
+        parser.add_argument(
+            'device',
+            metavar='DEVICE',
+            nargs='?',
+            help='Path to an lv (as vg/lv) or to a device like /dev/sda1'
+        )
+
+        parser.add_argument(
+            '--format',
+            help='output format, defaults to "pretty"',
+            default='pretty',
+            choices=['json', 'pretty'],
+        )
+
+        args = parser.parse_args(self.argv)
+        self.list(args)
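
For reference, generate() returns a mapping of OSD id to a list of device entries, which is what ``--format json`` serializes. Going by full_report() above, a single bluestore OSD with an external db partition would produce roughly the following (ids, uuids and paths are illustrative)::

    {
        "0": [
            {   # from lv.as_dict() on the block lv
                "path": "/dev/ceph-vg/osd-block-0",
                "type": "block",
                "tags": {"ceph.osd_id": "0", "ceph.type": "block"}
            },
            {   # a plain partition, resolved through blkid by PARTUUID
                "path": "/dev/sdb2",
                "type": "db",
                "tags": {"PARTUUID": "8715beb4-15c5-49de-ba6f-401086ec7b41"}
            }
        ]
    }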
index 59e69329b4e0d5b5a9d0b324acb8ce2812d7fb91..8b698a03f64cab12fe5d4673b956c4fe5964aeeb 100644 (file)
@@ -5,6 +5,8 @@ from . import activate
 from . import prepare
 from . import create
 from . import trigger
+from . import listing
+from . import zap
 
 
 class LVM(object):
@@ -22,6 +24,8 @@ class LVM(object):
         'prepare': prepare.Prepare,
         'create': create.Create,
         'trigger': trigger.Trigger,
+        'list': listing.List,
+        'zap': zap.Zap,
     }
 
     def __init__(self, argv):
index 1ca5b0d88540f86546a49deff5d68de02676df05..5a7daa3dc003b7c8e86105ef2c350b2c79e107f1 100644 (file)
@@ -1,17 +1,17 @@
 from __future__ import print_function
 import json
-import os
+import uuid
 from textwrap import dedent
 from ceph_volume.util import prepare as prepare_utils
 from ceph_volume.util import system, disk
 from ceph_volume import conf, decorators, terminal
-from . import api
+from ceph_volume.api import lvm as api
 from .common import prepare_parser
 
 
 def prepare_filestore(device, journal, secrets, id_=None, fsid=None):
     """
-    :param device: The name of the volume group or lvm to work with
+    :param device: The name of the logical volume to work with
     :param journal: similar to device but can also be a regular/plain disk
     :param secrets: A dict with the secrets needed to create the osd (e.g. cephx)
     :param id_: The OSD id
@@ -25,7 +25,7 @@ def prepare_filestore(device, journal, secrets, id_=None, fsid=None):
     # allow re-using an id, in case a prepare failed
     osd_id = id_ or prepare_utils.create_id(fsid, json_secrets)
     # create the directory
-    prepare_utils.create_path(osd_id)
+    prepare_utils.create_osd_path(osd_id)
     # format the device
     prepare_utils.format_device(device)
     # mount the data device
@@ -35,13 +35,42 @@ def prepare_filestore(device, journal, secrets, id_=None, fsid=None):
     # get the latest monmap
     prepare_utils.get_monmap(osd_id)
     # prepare the osd filesystem
-    prepare_utils.osd_mkfs(osd_id, fsid)
+    prepare_utils.osd_mkfs_filestore(osd_id, fsid)
     # write the OSD keyring if it doesn't exist already
     prepare_utils.write_keyring(osd_id, cephx_secret)
 
 
-def prepare_bluestore():
-    raise NotImplemented()
+def prepare_bluestore(block, wal, db, secrets, id_=None, fsid=None):
+    """
+    :param block: The name of the logical volume for the bluestore data
+    :param wal: a regular/plain disk or logical volume, to be used for block.wal
+    :param db: a regular/plain disk or logical volume, to be used for block.db
+    :param secrets: A dict with the secrets needed to create the osd (e.g. cephx)
+    :param id_: The OSD id
+    :param fsid: The OSD fsid, also known as the OSD UUID
+    """
+    cephx_secret = secrets.get('cephx_secret', prepare_utils.create_key())
+    json_secrets = json.dumps(secrets)
+
+    # allow re-using an existing fsid, in case prepare failed
+    fsid = fsid or system.generate_uuid()
+    # allow re-using an id, in case a prepare failed
+    osd_id = id_ or prepare_utils.create_id(fsid, json_secrets)
+    # create the directory
+    prepare_utils.create_osd_path(osd_id, tmpfs=True)
+    # symlink the block
+    prepare_utils.link_block(block, osd_id)
+    # get the latest monmap
+    prepare_utils.get_monmap(osd_id)
+    # write the OSD keyring if it doesn't exist already
+    prepare_utils.write_keyring(osd_id, cephx_secret)
+    # prepare the osd filesystem
+    prepare_utils.osd_mkfs_bluestore(
+        osd_id, fsid,
+        keyring=cephx_secret,
+        wal=wal,
+        db=db
+    )
 
 
 class Prepare(object):
@@ -51,19 +80,20 @@ class Prepare(object):
     def __init__(self, argv):
         self.argv = argv
 
-    def get_journal_ptuuid(self, argument):
+    def get_ptuuid(self, argument):
         uuid = disk.get_partuuid(argument)
         if not uuid:
             terminal.error('blkid could not detect a PARTUUID for device: %s' % argument)
-            raise RuntimeError('unable to use device for a journal')
+            raise RuntimeError('unable to use device')
         return uuid
 
-    def get_journal_lv(self, argument):
+    def get_lv(self, argument):
         """
-        Perform some parsing of the value of ``--journal`` so that the process
-        can determine correctly if it got a device path or an lv
-        :param argument: The value of ``--journal``, that will need to be split
-        to retrieve the actual lv
+        Perform some parsing of the command-line value so that the process
+        can determine correctly if it got a device path or an lv.
+
+        :param argument: The command-line value that will need to be split to
+                         retrieve the actual lv
         """
         try:
             vg_name, lv_name = argument.split('/')
@@ -71,6 +101,66 @@ class Prepare(object):
             return None
         return api.get_lv(lv_name=lv_name, vg_name=vg_name)
 
+    def setup_device(self, device_type, device_name, tags):
+        """
+        Check if ``device`` is an lv, if so, set the tags, making sure to
+        update the tags with the lv_uuid and lv_path which the incoming tags
+        will not have.
+
+        If the device is not a logical volume, then retrieve the partition UUID
+        by querying ``blkid``
+        """
+        if device_name is None:
+            return '', '', tags
+        tags['ceph.type'] = device_type
+        lv = self.get_lv(device_name)
+        if lv:
+            uuid = lv.lv_uuid
+            path = lv.lv_path
+            tags['ceph.%s_uuid' % device_type] = uuid
+            tags['ceph.%s_device' % device_type] = path
+            lv.set_tags(tags)
+        else:
+            # otherwise assume this is a regular disk partition
+            uuid = self.get_ptuuid(device_name)
+            path = device_name
+            tags['ceph.%s_uuid' % device_type] = uuid
+            tags['ceph.%s_device' % device_type] = path
+        return path, uuid, tags
+
+    def prepare_device(self, arg, device_type, cluster_fsid, osd_fsid):
+        """
+        Check if ``arg`` is a device or partition to create an LV out of it
+        with a distinct volume group name, assigning LV tags on it and
+        ultimately, returning the logical volume object.  Failing to detect
+        a device or partition will result in error.
+
+        :param arg: The value of ``--data`` when parsing args
+        :param device_type: Usually, either ``data`` or ``block`` (filestore vs. bluestore)
+        :param cluster_fsid: The cluster fsid/uuid
+        :param osd_fsid: The OSD fsid/uuid
+        """
+        if disk.is_partition(arg) or disk.is_device(arg):
+            # we must create a vg, and then a single lv
+            vg_name = "ceph-%s" % cluster_fsid
+            if api.get_vg(vg_name=vg_name):
+                # means we already have a group for this, make a different one
+                # XXX this could end up being annoying for an operator, maybe?
+                vg_name = "ceph-%s" % str(uuid.uuid4())
+            api.create_vg(vg_name, arg)
+            lv_name = "osd-%s-%s" % (device_type, osd_fsid)
+            return api.create_lv(
+                lv_name,
+                vg_name,  # the volume group
+                tags={'ceph.type': device_type})
+        else:
+            error = [
+                'Cannot use device (%s).' % arg,
+                'A vg/lv path or an existing device is needed']
+            raise RuntimeError(' '.join(error))
+
     @decorators.needs_root
     def prepare(self, args):
         # FIXME we don't allow re-using a keyring, we always generate one for the
@@ -80,69 +170,66 @@ class Prepare(object):
         secrets = {'cephx_secret': prepare_utils.create_key()}
 
         cluster_fsid = conf.ceph.get('global', 'fsid')
-        fsid = args.osd_fsid or system.generate_uuid()
-        #osd_id = args.osd_id or prepare_utils.create_id(fsid)
+        osd_fsid = args.osd_fsid or system.generate_uuid()
         # allow re-using an id, in case a prepare failed
-        osd_id = args.osd_id or prepare_utils.create_id(fsid, json.dumps(secrets))
-        vg_name, lv_name = args.data.split('/')
+        osd_id = args.osd_id or prepare_utils.create_id(osd_fsid, json.dumps(secrets))
         if args.filestore:
-            data_lv = api.get_lv(lv_name=lv_name, vg_name=vg_name)
-
-            # we must have either an existing data_lv or a newly created, so lets make
-            # sure that the tags are correct
-            if not data_lv:
-                raise RuntimeError('no data logical volume found with: %s' % args.data)
-
             if not args.journal:
                 raise RuntimeError('--journal is required when using --filestore')
 
-            journal_lv = self.get_journal_lv(args.journal)
-            if journal_lv:
-                journal_device = journal_lv.lv_path
-                journal_uuid = journal_lv.lv_uuid
-                # we can only set tags on an lv, the pv (if any) can't as we
-                # aren't making it part of an lvm group (vg)
-                journal_lv.set_tags({
-                    'ceph.type': 'journal',
-                    'ceph.osd_fsid': fsid,
-                    'ceph.osd_id': osd_id,
-                    'ceph.cluster_fsid': cluster_fsid,
-                    'ceph.journal_device': journal_device,
-                    'ceph.journal_uuid': journal_uuid,
-                    'ceph.data_device': data_lv.lv_path,
-                    'ceph.data_uuid': data_lv.lv_uuid,
-                })
-
-            # allow a file
-            elif os.path.isfile(args.journal):
-                journal_uuid = ''
-                journal_device = args.journal
-
-            # otherwise assume this is a regular disk partition
-            else:
-                journal_uuid = self.get_journal_ptuuid(args.journal)
-                journal_device = args.journal
+            data_lv = self.get_lv(args.data)
+            if not data_lv:
+                data_lv = self.prepare_device(args.data, 'data', cluster_fsid, osd_fsid)
 
-            data_lv.set_tags({
-                'ceph.type': 'data',
-                'ceph.osd_fsid': fsid,
+            tags = {
+                'ceph.osd_fsid': osd_fsid,
                 'ceph.osd_id': osd_id,
                 'ceph.cluster_fsid': cluster_fsid,
-                'ceph.journal_device': journal_device,
-                'ceph.journal_uuid': journal_uuid,
+                'ceph.cluster_name': conf.cluster,
                 'ceph.data_device': data_lv.lv_path,
                 'ceph.data_uuid': data_lv.lv_uuid,
-            })
+            }
+
+            journal_device, journal_uuid, tags = self.setup_device('journal', args.journal, tags)
+
+            tags['ceph.type'] = 'data'
+            data_lv.set_tags(tags)
 
             prepare_filestore(
                 data_lv.lv_path,
                 journal_device,
                 secrets,
                 id_=osd_id,
-                fsid=fsid,
+                fsid=osd_fsid,
             )
         elif args.bluestore:
-            prepare_bluestore(args)
+            block_lv = self.get_lv(args.data)
+            if not block_lv:
+                block_lv = self.prepare_device(args.data, 'block', cluster_fsid, osd_fsid)
+
+            tags = {
+                'ceph.osd_fsid': osd_fsid,
+                'ceph.osd_id': osd_id,
+                'ceph.cluster_fsid': cluster_fsid,
+                'ceph.cluster_name': conf.cluster,
+                'ceph.block_device': block_lv.lv_path,
+                'ceph.block_uuid': block_lv.lv_uuid,
+            }
+
+            wal_device, wal_uuid, tags = self.setup_device('wal', args.block_wal, tags)
+            db_device, db_uuid, tags = self.setup_device('db', args.block_db, tags)
+
+            tags['ceph.type'] = 'block'
+            block_lv.set_tags(tags)
+
+            prepare_bluestore(
+                block_lv.lv_path,
+                wal_device,
+                db_device,
+                secrets,
+                id_=osd_id,
+                fsid=osd_fsid,
+            )
 
     def main(self):
         sub_command_help = dedent("""
@@ -166,17 +253,30 @@ class Prepare(object):
 
           Existing logical volume (lv) or device:
 
-              ceph-volume lvm prepare --data {logical volume} --journal /path/to/{lv}|{device}
+              ceph-volume lvm prepare --filestore --data {vg/lv} --journal /path/to/device
 
           Or:
 
-              ceph-volume lvm prepare --data {data volume group} --journal {journal volume group}
+              ceph-volume lvm prepare --filestore --data {vg/lv} --journal {vg/lv}
+
+          Existing block device that will be made into a volume group and logical volume:
+
+              ceph-volume lvm prepare --filestore --data /path/to/device --journal {vg/lv}
+
+        Bluestore
+        ---------
+
+          Existing logical volume (lv):
+
+              ceph-volume lvm prepare --bluestore --data {vg/lv}
+
+          Existing block device that will be made into a volume group and logical volume:
 
-        Collocated (same group) for data and journal
-        --------------------------------------------
+              ceph-volume lvm prepare --bluestore --data /path/to/device
 
-              ceph-volume lvm prepare --data {volume group}
+          Optionally, can consume db and wal devices or logical volumes:
 
+              ceph-volume lvm prepare --bluestore --data {vg/lv} --block.wal {device} --block.db {vg/lv}
         """)
         parser = prepare_parser(
             prog='ceph-volume lvm prepare',
@@ -186,4 +286,8 @@ class Prepare(object):
             print(sub_command_help)
             return
         args = parser.parse_args(self.argv)
+        # Default to bluestore here: the store_true flags default to False,
+        # and defaulting one of them in add_argument may cause both to be True
+        if not args.bluestore and not args.filestore:
+            args.bluestore = True
         self.prepare(args)
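
Putting setup_device() and prepare() together: after a bluestore prepare with an external --block.db, the block lv ends up carrying roughly this tag set (all values illustrative)::

    {
        'ceph.type': 'block',
        'ceph.osd_id': '0',
        'ceph.osd_fsid': 'a9d50838-e823-43d6-b01f-2f8d0a77afc2',
        'ceph.cluster_fsid': '733100b5-04e2-4475-8360-4cc7bf00109a',
        'ceph.cluster_name': 'ceph',
        'ceph.block_device': '/dev/ceph-vg/osd-block-a9d50838',
        'ceph.block_uuid': 'Eub3Br-AcO2-vk1U-x2qc-PaAN-SElc-9OBhSM',
        'ceph.db_device': '/dev/sdc1',
        'ceph.db_uuid': '8715beb4-15c5-49de-ba6f-401086ec7b41',
    }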
index 9111620729cdbc20242455f4dc89be414c632c49..dc57011dfdba5eb4b8e0b41268b3371ec661328e 100644 (file)
@@ -67,4 +67,4 @@ class Trigger(object):
         args = parser.parse_args(self.argv)
         osd_id = parse_osd_id(args.systemd_data)
         osd_uuid = parse_osd_uuid(args.systemd_data)
-        Activate([osd_id, osd_uuid]).main()
+        Activate(['--auto-detect-objectstore', osd_id, osd_uuid]).main()
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py
new file mode 100644 (file)
index 0000000..df19686
--- /dev/null
@@ -0,0 +1,107 @@
+import argparse
+import logging
+
+from textwrap import dedent
+
+from ceph_volume import decorators, terminal, process
+from ceph_volume.api import lvm as api
+
+logger = logging.getLogger(__name__)
+
+
+def wipefs(path):
+    """
+    Removes filesystem signatures from an lv or partition.
+    """
+    process.run([
+        'sudo',
+        'wipefs',
+        '--all',
+        path
+    ])
+
+
+def zap_data(path):
+    """
+    Clears all data from the given path. Path should be
+    an absolute path to an lv or partition.
+
+    10M of data is written to the path to make sure that
+    there is no trace left of any previous filesystem.
+    """
+    process.run([
+        'dd',
+        'if=/dev/zero',
+        'of={path}'.format(path=path),
+        'bs=1M',
+        'count=10',
+    ])
+
+
+class Zap(object):
+
+    help = 'Removes all data and filesystems from a logical volume or partition.'
+
+    def __init__(self, argv):
+        self.argv = argv
+
+    @decorators.needs_root
+    def zap(self, args):
+        device = args.device
+        lv = api.get_lv_from_argument(device)
+        if lv:
+            # we are zapping a logical volume
+            path = lv.lv_path
+        else:
+            # we are zapping a partition
+            #TODO: ensure device is a partition
+            path = device
+
+        logger.info("Zapping: %s", path)
+        terminal.write("Zapping: %s" % path)
+
+        wipefs(path)
+        zap_data(path)
+
+        if lv:
+            # remove all lvm metadata
+            lv.clear_tags()
+
+        terminal.success("Zapping successful for: %s" % path)
+
+    def main(self):
+        sub_command_help = dedent("""
+        Zaps the given logical volume or partition. If given a path to a logical
+        volume it must be in the format of vg/lv. Any filesystems present
+        on the given lv or partition will be removed and all data will be purged.
+
+        However, the lv or partition will be kept intact.
+
+        Example calls for supported scenarios:
+
+          Zapping a logical volume:
+
+              ceph-volume lvm zap {vg name/lv name}
+
+          Zapping a partition:
+
+              ceph-volume lvm zap /dev/sdc1
+
+        """)
+        parser = argparse.ArgumentParser(
+            prog='ceph-volume lvm zap',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description=sub_command_help,
+        )
+
+        parser.add_argument(
+            'device',
+            metavar='DEVICE',
+            nargs='?',
+            help='Path to an lv (as vg/lv) or to a partition like /dev/sda1'
+        )
+        if len(self.argv) == 0:
+            print(sub_command_help)
+            return
+        args = parser.parse_args(self.argv)
+        self.zap(args)
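
A hedged usage sketch with a made-up partition; the two destructive steps amount to a wipefs --all followed by zeroing the first 10MB with dd, while the lv or partition itself is preserved::

    from ceph_volume.devices.lvm.zap import Zap

    # roughly equivalent to:
    #   wipefs --all /dev/sdc1
    #   dd if=/dev/zero of=/dev/sdc1 bs=1M count=10
    Zap(['/dev/sdc1']).main()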
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/__init__.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/__init__.py
new file mode 100644 (file)
index 0000000..280e130
--- /dev/null
@@ -0,0 +1 @@
+from .main import Simple # noqa
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py
new file mode 100644 (file)
index 0000000..fdc50f0
--- /dev/null
@@ -0,0 +1,152 @@
+from __future__ import print_function
+import argparse
+import json
+import logging
+import os
+from textwrap import dedent
+from ceph_volume import process, decorators, terminal
+from ceph_volume.util import system, disk
+from ceph_volume.systemd import systemctl
+
+
+logger = logging.getLogger(__name__)
+
+
+class Activate(object):
+
+    help = 'Enable systemd units to mount configured devices and start a Ceph OSD'
+
+    def __init__(self, argv, systemd=False):
+        self.argv = argv
+        self.systemd = systemd
+
+    @decorators.needs_root
+    def activate(self, args):
+        with open(args.json_config, 'r') as fp:
+            osd_metadata = json.load(fp)
+
+        osd_id = osd_metadata.get('whoami', args.osd_id)
+        osd_fsid = osd_metadata.get('fsid', args.osd_fsid)
+
+        cluster_name = osd_metadata.get('cluster_name', 'ceph')
+        osd_dir = '/var/lib/ceph/osd/%s-%s' % (cluster_name, osd_id)
+        data_uuid = osd_metadata.get('data', {}).get('uuid')
+        if not data_uuid:
+            raise RuntimeError(
+                'Unable to activate OSD %s - no "uuid" key found for data' % args.osd_id
+            )
+        data_device = disk.get_device_from_partuuid(data_uuid)
+        journal_device = disk.get_device_from_partuuid(osd_metadata.get('journal', {}).get('uuid'))
+        block_device = disk.get_device_from_partuuid(osd_metadata.get('block', {}).get('uuid'))
+        block_db_device = disk.get_device_from_partuuid(osd_metadata.get('block.db', {}).get('uuid'))
+        block_wal_device = disk.get_device_from_partuuid(
+            osd_metadata.get('block.wal', {}).get('uuid')
+        )
+
+        if not system.device_is_mounted(data_device, destination=osd_dir):
+            process.run(['sudo', 'mount', '-v', data_device, osd_dir])
+
+        device_map = {
+            'journal': journal_device,
+            'block': block_device,
+            'block.db': block_db_device,
+            'block.wal': block_wal_device
+        }
+
+        for name, device in device_map.items():
+            if not device:
+                continue
+            # always re-do the symlink regardless of whether it exists, so that
+            # a device path that may have changed can be mapped correctly every time
+            destination = os.path.join(osd_dir, name)
+            process.run(['sudo', 'ln', '-snf', device, destination])
+
+            # make sure that the device has proper permissions
+            system.chown(device)
+
+        if not self.systemd:
+            # enable the ceph-volume unit for this OSD
+            systemctl.enable_volume(osd_id, osd_fsid, 'simple')
+
+            # disable any/all ceph-disk units
+            systemctl.mask_ceph_disk()
+
+        # enable the OSD
+        systemctl.enable_osd(osd_id)
+
+        # start the OSD
+        systemctl.start_osd(osd_id)
+
+        if not self.systemd:
+            terminal.success('Successfully activated OSD %s with FSID %s' % (osd_id, osd_fsid))
+            terminal.warning(
+                ('All ceph-disk systemd units have been disabled to '
+                 'prevent OSDs getting triggered by UDEV events')
+            )
+
+    def main(self):
+        sub_command_help = dedent("""
+        Activate OSDs by mounting devices previously configured to their
+        appropriate destination::
+
+            ceph-volume simple activate {ID} {FSID}
+
+        Or using a JSON file directly::
+
+            ceph-volume simple activate --file /etc/ceph/osd/{ID}-{FSID}.json
+
+        The OSD must have been "scanned" previously (see ``ceph-volume simple
+        scan``), so that all needed OSD device information and metadata exist.
+
+        A previously scanned OSD would exist like::
+
+            /etc/ceph/osd/{ID}-{FSID}.json
+
+
+        Environment variables supported:
+
+        CEPH_VOLUME_SIMPLE_JSON_DIR: Directory location for scanned OSD JSON configs
+        """)
+        parser = argparse.ArgumentParser(
+            prog='ceph-volume simple activate',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description=sub_command_help,
+        )
+        parser.add_argument(
+            'osd_id',
+            metavar='ID',
+            nargs='?',
+            help='The ID of the OSD, usually an integer, like 0'
+        )
+        parser.add_argument(
+            'osd_fsid',
+            metavar='FSID',
+            nargs='?',
+            help='The FSID of the OSD, similar to a SHA1'
+        )
+        parser.add_argument(
+            '--file',
+            help='The path to a JSON file, from a scanned OSD'
+        )
+        if len(self.argv) == 0:
+            print(sub_command_help)
+            return
+        args = parser.parse_args(self.argv)
+        if not args.file:
+            if not args.osd_id and not args.osd_fsid:
+                terminal.error('ID and FSID are required to find the right OSD to activate')
+                terminal.error('from a scanned OSD location in /etc/ceph/osd/')
+                raise RuntimeError('Unable to activate without both ID and FSID')
+        # don't allow a CLI flag to specify the JSON dir, because that might
+        # implicitly indicate that it would be possible to activate a json file
+        # at a non-default location which would not work at boot time if the
+        # custom location is not exposed through an ENV var
+        json_dir = os.environ.get('CEPH_VOLUME_SIMPLE_JSON_DIR', '/etc/ceph/osd/')
+        if args.file:
+            json_config = args.file
+        else:
+            json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid))
+        if not os.path.exists(json_config):
+            raise RuntimeError('Expected JSON config path not found: %s' % json_config)
+        args.json_config = json_config
+        self.activate(args)
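
The JSON file consumed by activate() only needs a handful of keys; a minimal filestore example, with made-up uuids (only the 'data' uuid is strictly required, the other device entries are optional)::

    {
        "whoami": "0",
        "fsid": "a9d50838-e823-43d6-b01f-2f8d0a77afc2",
        "cluster_name": "ceph",
        "data": {"uuid": "8715beb4-15c5-49de-ba6f-401086ec7b41"},
        "journal": {"uuid": "66a59e88-9b97-4dbb-8c40-0e40c4e21e9e"}
    }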
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/main.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/main.py
new file mode 100644 (file)
index 0000000..2119963
--- /dev/null
@@ -0,0 +1,41 @@
+import argparse
+from textwrap import dedent
+from ceph_volume import terminal
+from . import scan
+from . import activate
+from . import trigger
+
+
+class Simple(object):
+
+    help = 'Manage already deployed OSDs with ceph-volume'
+
+    _help = dedent("""
+    Take over a deployed OSD, persisting its metadata in /etc/ceph/osd/ so that it can be managed
+    with ceph-volume directly. Avoids UDEV and ceph-disk handling.
+
+    {sub_help}
+    """)
+
+    mapper = {
+        'scan': scan.Scan,
+        'activate': activate.Activate,
+        'trigger': trigger.Trigger,
+    }
+
+    def __init__(self, argv):
+        self.argv = argv
+
+    def print_help(self, sub_help):
+        return self._help.format(sub_help=sub_help)
+
+    def main(self):
+        terminal.dispatch(self.mapper, self.argv)
+        parser = argparse.ArgumentParser(
+            prog='ceph-volume simple',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description=self.print_help(terminal.subhelp(self.mapper)),
+        )
+        parser.parse_args(self.argv)
+        if len(self.argv) <= 1:
+            return parser.print_help()
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py
new file mode 100644 (file)
index 0000000..905baf4
--- /dev/null
@@ -0,0 +1,206 @@
+from __future__ import print_function
+import argparse
+import json
+import logging
+import os
+from textwrap import dedent
+from ceph_volume import decorators, terminal, conf
+from ceph_volume.api import lvm
+from ceph_volume.util import arg_validators, system, disk
+
+
+logger = logging.getLogger(__name__)
+
+
+class Scan(object):
+
+    help = 'Capture metadata from an OSD data partition or directory'
+
+    def __init__(self, argv):
+        self.argv = argv
+        self._etc_path = '/etc/ceph/osd/'
+
+    @property
+    def etc_path(self):
+        if os.path.isdir(self._etc_path):
+            return self._etc_path
+
+        if not os.path.exists(self._etc_path):
+            os.mkdir(self._etc_path)
+            return self._etc_path
+
+        error = "OSD Configuration path (%s) needs to be a directory" % self._etc_path
+        raise RuntimeError(error)
+
+    def get_contents(self, path):
+        with open(path, 'r') as fp:
+            contents = fp.readlines()
+        if len(contents) > 1:
+            return ''.join(contents)
+        return ''.join(contents).strip().strip('\n')
+
+    def scan_device(self, path):
+        device_metadata = {'path': None, 'uuid': None}
+        if not path:
+            return device_metadata
+        # cannot read the symlink if this is tmpfs
+        if os.path.islink(path):
+            device = os.readlink(path)
+        else:
+            device = path
+        lvm_device = lvm.get_lv_from_argument(device)
+        if lvm_device:
+            device_uuid = lvm_device.lv_uuid
+        else:
+            device_uuid = disk.get_partuuid(device)
+
+        device_metadata['uuid'] = device_uuid
+        device_metadata['path'] = device
+
+        return device_metadata
+
+    def scan_directory(self, path):
+        osd_metadata = {'cluster_name': conf.cluster}
+        path_mounts = system.get_mounts(paths=True)
+        for _file in os.listdir(path):
+            file_path = os.path.join(path, _file)
+            if os.path.islink(file_path):
+                osd_metadata[_file] = self.scan_device(file_path)
+            if os.path.isdir(file_path):
+                continue
+            # the check for binary needs to go before the file, to avoid
+            # capturing data from binary files but still be able to capture
+            # contents from actual files later
+            if system.is_binary(file_path):
+                continue
+            if os.path.isfile(file_path):
+                osd_metadata[_file] = self.get_contents(file_path)
+
+        device = path_mounts.get(path)
+        # it is possible to have more than one device, pick the first one, and
+        # warn that it is possible that more than one device is 'data'
+        if not device:
+            terminal.error('Unable to detect device mounted for path: %s' % path)
+            raise RuntimeError('Cannot activate OSD')
+        osd_metadata['data'] = self.scan_device(device[0] if len(device) else None)
+
+        return osd_metadata
+
+    @decorators.needs_root
+    def scan(self, args):
+        osd_metadata = {'cluster_name': conf.cluster}
+        device_mounts = system.get_mounts(devices=True)
+        osd_path = None
+        logger.info('detecting if argument is a device or a directory: %s', args.osd_path)
+        if os.path.isdir(args.osd_path):
+            logger.info('will scan directly, path is a directory')
+            osd_path = args.osd_path
+        else:
+            # assume this is a device, check if it is mounted and use that path
+            logger.info('path is not a directory, will check if mounted')
+            if system.device_is_mounted(args.osd_path):
+                logger.info('argument is a device, which is mounted')
+                mounted_osd_paths = device_mounts.get(args.osd_path)
+                osd_path = mounted_osd_paths[0] if len(mounted_osd_paths) else None
+
+        # argument is not a directory, and it is not a device that is mounted
+        # somewhere so temporarily mount it to poke inside, otherwise, scan
+        # directly
+        if not osd_path:
+            logger.info('device is not mounted, will mount it temporarily to scan')
+            with system.tmp_mount(args.osd_path) as osd_path:
+                osd_metadata = self.scan_directory(osd_path)
+        else:
+            logger.info('will scan OSD directory at path: %s', osd_path)
+            osd_metadata = self.scan_directory(osd_path)
+
+        osd_id = osd_metadata['whoami']
+        osd_fsid = osd_metadata['fsid']
+        filename = '%s-%s.json' % (osd_id, osd_fsid)
+        json_path = os.path.join(self.etc_path, filename)
+        if os.path.exists(json_path) and not args.stdout:
+            if not args.force:
+                raise RuntimeError(
+                    '--force was not used and OSD metadata file exists: %s' % json_path
+                )
+
+        if args.stdout:
+            print(json.dumps(osd_metadata, indent=4, sort_keys=True, ensure_ascii=False))
+        else:
+            with open(json_path, 'w') as fp:
+                json.dump(osd_metadata, fp, indent=4, sort_keys=True, ensure_ascii=False)
+            terminal.success(
+                'OSD %s got scanned and metadata persisted to file: %s' % (
+                    osd_id,
+                    json_path
+                )
+            )
+            terminal.success(
+                'To take over management of this scanned OSD, and disable ceph-disk and udev, run:'
+            )
+            terminal.success('    ceph-volume simple activate %s %s' % (osd_id, osd_fsid))
+
+        if not osd_metadata.get('data'):
+            msg = 'Unable to determine device mounted on %s' % args.osd_path
+            logger.warning(msg)
+            terminal.warning(msg)
+            terminal.warning('OSD will not be able to start without this information:')
+            terminal.warning('    "data": "/path/to/device",')
+
+    def main(self):
+        sub_command_help = dedent("""
+        Scan an OSD directory for files and configurations that will allow
+        ceph-volume to take over the management of the OSD.
+
+        Scanned OSDs will get their configurations stored in
+        /etc/ceph/osd/<id>-<fsid>.json
+
+        For an OSD ID of 0 with fsid of ``a9d50838-e823-43d6-b01f-2f8d0a77afc2``
+        that could mean a scan command that looks like::
+
+            ceph-volume lvm scan /var/lib/ceph/osd/ceph-0
+
+        Which would store the metadata in a JSON file at::
+
+            /etc/ceph/osd/0-a9d50838-e823-43d6-b01f-2f8d0a77afc2.json
+
+        To scan an existing, running OSD::
+
+            ceph-volume simple scan /var/lib/ceph/osd/{cluster}-{osd id}
+
+        And to scan a device (mounted or unmounted) that has OSD data in it, for example /dev/sda1::
+
+            ceph-volume simple scan /dev/sda1
+        """)
+        parser = argparse.ArgumentParser(
+            prog='ceph-volume simple scan',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description=sub_command_help,
+        )
+
+        parser.add_argument(
+            '-f', '--force',
+            action='store_true',
+            help='If OSD has already been scanned, the JSON file will be overwritten'
+        )
+
+        parser.add_argument(
+            '--stdout',
+            action='store_true',
+            help='Do not save to a file, output metadata to stdout'
+        )
+
+        parser.add_argument(
+            'osd_path',
+            metavar='OSD_PATH',
+            type=arg_validators.OSDPath(),
+            nargs='?',
+            help='Path to an existing OSD directory or OSD data partition'
+        )
+
+        if len(self.argv) == 0:
+            print(sub_command_help)
+            return
+        args = parser.parse_args(self.argv)
+        self.scan(args)
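
Per scan_directory() above, plain files in the OSD directory are captured as strings, device symlinks become path/uuid pairs, and the mounted data device is appended last. A scanned filestore OSD would therefore persist something roughly like this (values illustrative)::

    {
        "cluster_name": "ceph",
        "whoami": "0",
        "fsid": "a9d50838-e823-43d6-b01f-2f8d0a77afc2",
        "journal": {"path": "/dev/sdb2", "uuid": "66a59e88-9b97-4dbb-8c40-0e40c4e21e9e"},
        "data": {"path": "/dev/sdb1", "uuid": "8715beb4-15c5-49de-ba6f-401086ec7b41"}
    }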
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/trigger.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/trigger.py
new file mode 100644 (file)
index 0000000..aeb5cf1
--- /dev/null
@@ -0,0 +1,70 @@
+from __future__ import print_function
+import argparse
+from textwrap import dedent
+from ceph_volume.exceptions import SuffixParsingError
+from ceph_volume import decorators
+from .activate import Activate
+
+
+def parse_osd_id(string):
+    osd_id = string.split('-', 1)[0]
+    if not osd_id:
+        raise SuffixParsingError('OSD id', string)
+    if osd_id.isdigit():
+        return osd_id
+    raise SuffixParsingError('OSD id', string)
+
+
+def parse_osd_uuid(string):
+    osd_id = '%s-' % parse_osd_id(string)
+    # remove the id first
+    osd_uuid = string.split(osd_id, 1)[-1]
+    if not osd_uuid:
+        raise SuffixParsingError('OSD uuid', string)
+    return osd_uuid
+
+
+class Trigger(object):
+
+    help = 'systemd helper to activate an OSD'
+
+    def __init__(self, argv):
+        self.argv = argv
+
+    @decorators.needs_root
+    def main(self):
+        sub_command_help = dedent("""
+        ** DO NOT USE DIRECTLY **
+        This tool is meant to help the systemd unit that knows about OSDs.
+
+        Proxy OSD activation to ``ceph-volume simple activate`` by parsing the
+        input from systemd, detecting the UUID and ID associated with an OSD::
+
+            ceph-volume simple trigger {SYSTEMD-DATA}
+
+        The systemd "data" is expected to be in the format of::
+
+            {OSD ID}-{OSD UUID}
+
+        The devices associated with the OSD need to have been scanned previously,
+        so that all needed metadata can be used for starting the OSD process.
+        """)
+        parser = argparse.ArgumentParser(
+            prog='ceph-volume simple trigger',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description=sub_command_help,
+        )
+
+        parser.add_argument(
+            'systemd_data',
+            metavar='SYSTEMD_DATA',
+            nargs='?',
+            help='Data from a systemd unit containing ID and UUID of the OSD, like 0-asdf-lkjh'
+        )
+        if len(self.argv) == 0:
+            print(sub_command_help)
+            return
+        args = parser.parse_args(self.argv)
+        osd_id = parse_osd_id(args.systemd_data)
+        osd_uuid = parse_osd_uuid(args.systemd_data)
+        Activate([osd_id, osd_uuid], systemd=True).main()
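
Following the two parsing helpers above, the systemd instance data is split on the first dash; reusing the made-up value from the help text::

    from ceph_volume.devices.simple.trigger import parse_osd_id, parse_osd_uuid

    parse_osd_id('0-asdf-lkjh')    # '0'
    parse_osd_uuid('0-asdf-lkjh')  # 'asdf-lkjh'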
index d4bee154d97176e0389b3e7b737003906efee701..e7ed5d88cc51182d5172974155fdee1e5942e43d 100644 (file)
@@ -27,7 +27,7 @@ Ceph Conf: {ceph_path}
     """
 
     def __init__(self, argv=None, parse=True):
-        self.mapper = {'lvm': devices.lvm.LVM}
+        self.mapper = {'lvm': devices.lvm.LVM, 'simple': devices.simple.Simple}
         self.plugin_help = "No plugins found/loaded"
         if argv is None:
             self.argv = sys.argv
index bc5047a179c3033f2c06c5fa6edd0e1f9c4f7a9b..4b6a9c284741f50d5cb22644214a0293d19c266f 100644 (file)
@@ -48,6 +48,47 @@ def log_descriptors(reads, process, terminal_logging):
             pass
 
 
+def obfuscate(command_, on=None):
+    """
+    Certain commands that are useful to log might contain information that
+    should be replaced by '*', for example the keyrings passed in when
+    creating OSDs, which should not be logged.
+
+    :param on: A string (will match a flag) or an integer (will match an index)
+
+    If matching on a flag (when ``on`` is a string) it will obfuscate the
+    value for that flag. That is, for a command like ['ls', '-l', '/'],
+    calling `obfuscate(command, on='-l')` will obfuscate '/', which is the
+    value for `-l`.
+
+    ``on`` accepts either a string or an integer because it is easier for
+    ``run`` and ``call`` to just pop a single value to obfuscate, whether
+    that value is an index or a flag.
+    """
+    command = command_[:]
+    msg = "Running command: %s" % ' '.join(command)
+    if on in [None, False]:
+        return msg
+
+    if isinstance(on, int):
+        index = on
+
+    else:
+        try:
+            index = command.index(on) + 1
+        except ValueError:
+            # if the flag just doesn't exist then it doesn't matter just return
+            # the base msg
+            return msg
+
+    try:
+        command[index] = '*' * len(command[index])
+    except IndexError: # the index was completely out of range
+        return msg
+
+    return "Running command: %s" % ' '.join(command)
+
+
 def run(command, **kw):
     """
     A real-time-logging implementation of a remote subprocess.Popen call where
@@ -57,7 +98,7 @@ def run(command, **kw):
     :param stop_on_error: If a nonzero exit status is return, it raises a ``RuntimeError``
     """
     stop_on_error = kw.pop('stop_on_error', True)
-    command_msg = "Running command: %s" % ' '.join(command)
+    command_msg = obfuscate(command, kw.pop('obfuscate', None))
     stdin = kw.pop('stdin', None)
     logger.info(command_msg)
     terminal.write(command_msg)
@@ -115,10 +156,12 @@ def call(command, **kw):
                              it is forcefully set to True if a return code is non-zero
     """
     terminal_verbose = kw.pop('terminal_verbose', False)
+    show_command = kw.pop('show_command', False)
     command_msg = "Running command: %s" % ' '.join(command)
     stdin = kw.pop('stdin', None)
     logger.info(command_msg)
-    terminal.write(command_msg)
+    if show_command:
+        terminal.write(command_msg)
 
     process = subprocess.Popen(
         command,
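
The helper copies the list before masking, and only the single value after the flag (or at the given index) is replaced with one '*' per character. Hedged examples, with a made-up keyring value::

    from ceph_volume.process import obfuscate

    obfuscate(['ls', '-l', '/'])
    # 'Running command: ls -l /'

    obfuscate(['ceph-osd', '--keyring', 'AQBsecret'], on='--keyring')
    # 'Running command: ceph-osd --keyring *********'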
index 9bb4d7d3a0e30b30c899e5c9228e653bc29cb3d3..ab8f3e70ae31d8650b7f5661cebda0a41ad77a55 100644 (file)
@@ -20,6 +20,10 @@ def disable(unit):
     process.run(['sudo', 'systemctl', 'disable', unit])
 
 
+def mask(unit):
+    process.run(['sudo', 'systemctl', 'mask', unit])
+
+
 def start_osd(id_):
     return start(osd_unit % id_)
 
@@ -40,9 +44,20 @@ def enable_volume(id_, fsid, device_type='lvm'):
     return enable(volume_unit % (device_type, id_, fsid))
 
 
+def mask_ceph_disk():
+    # systemctl allows using a glob like '*' for masking, but there was a bug
+    # in that it wouldn't allow this for service templates. This means that
+    # masking ceph-disk@* will not work, so we must link the service directly.
+    # /etc/systemd takes precedence regardless of the location of the unit
+    process.run(
+        ['sudo', 'ln', '-sf', '/dev/null',  '/etc/systemd/system/ceph-disk@.service']
+    )
+
+
 #
 # templates
 #
 
 osd_unit = "ceph-osd@%s"
+ceph_disk_unit = "ceph-disk@%s"
 volume_unit = "ceph-volume@%s-%s-%s"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/api/test_lvm.py b/ceph/src/ceph-volume/ceph_volume/tests/api/test_lvm.py
new file mode 100644 (file)
index 0000000..3639f01
--- /dev/null
@@ -0,0 +1,399 @@
+import pytest
+from ceph_volume import process, exceptions
+from ceph_volume.api import lvm as api
+
+
+class TestParseTags(object):
+
+    def test_no_tags_means_empty_dict(self):
+        result = api.parse_tags('')
+        assert result == {}
+
+    def test_single_tag_gets_parsed(self):
+        result = api.parse_tags('ceph.osd_something=1')
+        assert result == {'ceph.osd_something': '1'}
+
+    def test_multiple_csv_expands_in_dict(self):
+        result = api.parse_tags('ceph.osd_something=1,ceph.foo=2,ceph.fsid=0000')
+        # assert them piecemeal to avoid the un-ordered dict nature
+        assert result['ceph.osd_something'] == '1'
+        assert result['ceph.foo'] == '2'
+        assert result['ceph.fsid'] == '0000'
+
+
+class TestGetAPIVgs(object):
+
+    def test_report_is_empty(self, monkeypatch):
+        monkeypatch.setattr(api.process, 'call', lambda x: ('\n\n', '', 0))
+        assert api.get_api_vgs() == []
+
+    def test_report_has_stuff(self, monkeypatch):
+        report = ['  VolGroup00']
+        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+        assert api.get_api_vgs() == [{'vg_name': 'VolGroup00'}]
+
+    def test_report_has_stuff_with_empty_attrs(self, monkeypatch):
+        report = ['  VolGroup00 ;;;;;;9g']
+        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+        result = api.get_api_vgs()[0]
+        assert len(result.keys()) == 7
+        assert result['vg_name'] == 'VolGroup00'
+        assert result['vg_free'] == '9g'
+
+    def test_report_has_multiple_items(self, monkeypatch):
+        report = ['   VolGroup00;;;;;;;', '    ceph_vg;;;;;;;']
+        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+        result = api.get_api_vgs()
+        assert result[0]['vg_name'] == 'VolGroup00'
+        assert result[1]['vg_name'] == 'ceph_vg'
+
+
+class TestGetAPILvs(object):
+
+    def test_report_is_empty(self, monkeypatch):
+        monkeypatch.setattr(api.process, 'call', lambda x: ('', '', 0))
+        assert api.get_api_lvs() == []
+
+    def test_report_has_stuff(self, monkeypatch):
+        report = ['  ;/path;VolGroup00;root']
+        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+        result = api.get_api_lvs()
+        assert result[0]['lv_name'] == 'VolGroup00'
+
+    def test_report_has_multiple_items(self, monkeypatch):
+        report = ['  ;/path;VolName;root', ';/dev/path;ceph_lv;ceph_vg']
+        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+        result = api.get_api_lvs()
+        assert result[0]['lv_name'] == 'VolName'
+        assert result[1]['lv_name'] == 'ceph_lv'
+
+
+@pytest.fixture
+def volumes(monkeypatch):
+    monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+    volumes = api.Volumes()
+    volumes._purge()
+    # also patch api.Volumes so that when it is called, it will use the newly
+    # created fixture, with whatever the test method wants to append to it
+    monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+    return volumes
+
+
+@pytest.fixture
+def pvolumes(monkeypatch):
+    monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+    pvolumes = api.PVolumes()
+    pvolumes._purge()
+    return pvolumes
+
+
+@pytest.fixture
+def volume_groups(monkeypatch):
+    monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+    vgs = api.VolumeGroups()
+    vgs._purge()
+    return vgs
+
+
+class TestGetLV(object):
+
+    def test_nothing_is_passed_in(self):
+        # so we return a None
+        assert api.get_lv() is None
+
+    def test_single_lv_is_matched(self, volumes, monkeypatch):
+        FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.type=data")
+        volumes.append(FooVolume)
+        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+        assert api.get_lv(lv_name='foo') == FooVolume
+
+    def test_single_lv_is_matched_by_uuid(self, volumes, monkeypatch):
+        FooVolume = api.Volume(
+            lv_name='foo', lv_path='/dev/vg/foo',
+            lv_uuid='1111', lv_tags="ceph.type=data")
+        volumes.append(FooVolume)
+        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+        assert api.get_lv(lv_uuid='1111') == FooVolume
+
+
+class TestGetPV(object):
+
+    def test_nothing_is_passed_in(self):
+        # so we return a None
+        assert api.get_pv() is None
+
+    def test_single_pv_is_not_matched(self, pvolumes, monkeypatch):
+        FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+        pvolumes.append(FooPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        assert api.get_pv(pv_uuid='foo') is None
+
+    def test_single_pv_is_matched(self, pvolumes, monkeypatch):
+        FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+        pvolumes.append(FooPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        assert api.get_pv(pv_uuid='0000') == FooPVolume
+
+    def test_single_pv_is_matched_by_uuid(self, pvolumes, monkeypatch):
+        FooPVolume = api.PVolume(
+            pv_name='/dev/vg/foo',
+            pv_uuid='1111', pv_tags="ceph.type=data")
+        pvolumes.append(FooPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        assert api.get_pv(pv_uuid='1111') == FooPVolume
+
+
+class TestPVolumes(object):
+
+    def test_filter_by_tag_does_not_match_one(self, pvolumes, monkeypatch):
+        pv_tags = "ceph.type=journal,ceph.osd_id=1,ceph.fsid=000-aaa"
+        FooPVolume = api.PVolume(
+            pv_name='/dev/vg/foo',
+            pv_uuid='1111', pv_tags=pv_tags)
+        pvolumes.append(FooPVolume)
+        pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '2'})
+        assert pvolumes == []
+
+    def test_filter_by_tags_matches(self, pvolumes, monkeypatch):
+        pv_tags = "ceph.type=journal,ceph.osd_id=1"
+        FooPVolume = api.PVolume(
+            pv_name='/dev/vg/foo',
+            pv_uuid='1111', pv_tags=pv_tags)
+        pvolumes.append(FooPVolume)
+        pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '1'})
+        assert pvolumes == [FooPVolume]
+
+
+class TestGetVG(object):
+
+    def test_nothing_is_passed_in(self):
+        # nothing is passed in, so None is returned
+        assert api.get_vg() is None
+
+    def test_single_vg_is_matched(self, volume_groups, monkeypatch):
+        FooVG = api.VolumeGroup(vg_name='foo')
+        volume_groups.append(FooVG)
+        monkeypatch.setattr(api, 'VolumeGroups', lambda: volume_groups)
+        assert api.get_vg(vg_name='foo') == FooVG
+
+
+class TestVolumes(object):
+
+    def test_volume_get_has_no_volumes(self, volumes):
+        assert volumes.get() is None
+
+    def test_volume_get_filtered_has_no_volumes(self, volumes):
+        assert volumes.get(lv_name='ceph') is None
+
+    def test_volume_has_multiple_matches(self, volumes):
+        volume1 = volume2 = api.Volume(lv_name='foo', lv_path='/dev/vg/lv', lv_tags='')
+        volumes.append(volume1)
+        volumes.append(volume2)
+        with pytest.raises(exceptions.MultipleLVsError):
+            volumes.get(lv_name='foo')
+
+    def test_as_dict_infers_type_from_tags(self, volumes):
+        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+        volumes.append(osd)
+        result = volumes.get(lv_tags={'ceph.type': 'data'}).as_dict()
+        assert result['type'] == 'data'
+
+    def test_as_dict_populates_path_from_lv_api(self, volumes):
+        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+        volumes.append(osd)
+        result = volumes.get(lv_tags={'ceph.type': 'data'}).as_dict()
+        assert result['path'] == '/dev/vg/lv'
+
+    def test_find_the_correct_one(self, volumes):
+        volume1 = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags='')
+        volume2 = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='')
+        volumes.append(volume1)
+        volumes.append(volume2)
+        assert volumes.get(lv_name='volume1') == volume1
+
+    def test_filter_by_tag(self, volumes):
+        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.type=journal')
+        volumes.append(osd)
+        volumes.append(journal)
+        volumes.filter(lv_tags={'ceph.type': 'data'})
+        assert len(volumes) == 1
+        assert volumes[0].lv_name == 'volume1'
+
+    def test_filter_by_tag_does_not_match_one(self, volumes):
+        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.osd_id=1,ceph.type=journal')
+        volumes.append(osd)
+        volumes.append(journal)
+        # note the different osd_id!
+        volumes.filter(lv_tags={'ceph.type': 'data', 'ceph.osd_id': '2'})
+        assert volumes == []
+
+    def test_filter_by_vg_name(self, volumes):
+        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+        osd = api.Volume(lv_name='volume1', vg_name='ceph_vg', lv_tags=lv_tags)
+        journal = api.Volume(lv_name='volume2', vg_name='system_vg', lv_tags='ceph.type=journal')
+        volumes.append(osd)
+        volumes.append(journal)
+        volumes.filter(vg_name='ceph_vg')
+        assert len(volumes) == 1
+        assert volumes[0].lv_name == 'volume1'
+
+    def test_filter_by_lv_path(self, volumes):
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_tags='')
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_tags='')
+        volumes.append(osd)
+        volumes.append(journal)
+        volumes.filter(lv_path='/dev/volume1')
+        assert len(volumes) == 1
+        assert volumes[0].lv_name == 'volume1'
+
+    def test_filter_by_lv_uuid(self, volumes):
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+        volumes.append(osd)
+        volumes.append(journal)
+        volumes.filter(lv_uuid='1111')
+        assert len(volumes) == 1
+        assert volumes[0].lv_name == 'volume1'
+
+    def test_filter_by_lv_uuid_nothing_found(self, volumes):
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+        volumes.append(osd)
+        volumes.append(journal)
+        volumes.filter(lv_uuid='22222')
+        assert volumes == []
+
+    def test_filter_requires_params(self, volumes):
+        with pytest.raises(TypeError):
+            volumes.filter()
+
+
+class TestVolumeGroups(object):
+
+    def test_volume_get_has_no_volume_groups(self, volume_groups):
+        assert volume_groups.get() is None
+
+    def test_volume_get_filtered_has_no_volumes(self, volume_groups):
+        assert volume_groups.get(vg_name='ceph') is None
+
+    def test_volume_has_multiple_matches(self, volume_groups):
+        volume1 = volume2 = api.VolumeGroup(vg_name='foo', lv_path='/dev/vg/lv', lv_tags='')
+        volume_groups.append(volume1)
+        volume_groups.append(volume2)
+        with pytest.raises(exceptions.MultipleVGsError):
+            volume_groups.get(vg_name='foo')
+
+    def test_find_the_correct_one(self, volume_groups):
+        volume1 = api.VolumeGroup(vg_name='volume1', lv_tags='')
+        volume2 = api.VolumeGroup(vg_name='volume2', lv_tags='')
+        volume_groups.append(volume1)
+        volume_groups.append(volume2)
+        assert volume_groups.get(vg_name='volume1') == volume1
+
+    def test_filter_by_tag(self, volume_groups):
+        vg_tags = "ceph.group=dmcache"
+        osd = api.VolumeGroup(vg_name='volume1', vg_tags=vg_tags)
+        journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.group=plain')
+        volume_groups.append(osd)
+        volume_groups.append(journal)
+        volume_groups.filter(vg_tags={'ceph.group': 'dmcache'})
+        assert len(volume_groups) == 1
+        assert volume_groups[0].vg_name == 'volume1'
+
+    def test_filter_by_tag_does_not_match_one(self, volume_groups):
+        vg_tags = "ceph.group=dmcache,ceph.disk_type=ssd"
+        osd = api.VolumeGroup(vg_name='volume1', vg_path='/dev/vg/lv', vg_tags=vg_tags)
+        volume_groups.append(osd)
+        volume_groups.filter(vg_tags={'ceph.group': 'data', 'ceph.disk_type': 'ssd'})
+        assert volume_groups == []
+
+    def test_filter_by_vg_name(self, volume_groups):
+        vg_tags = "ceph.type=data,ceph.fsid=000-aaa"
+        osd = api.VolumeGroup(vg_name='ceph_vg', vg_tags=vg_tags)
+        journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.type=journal')
+        volume_groups.append(osd)
+        volume_groups.append(journal)
+        volume_groups.filter(vg_name='ceph_vg')
+        assert len(volume_groups) == 1
+        assert volume_groups[0].vg_name == 'ceph_vg'
+
+    def test_filter_requires_params(self, volume_groups):
+        with pytest.raises(TypeError):
+            volume_groups.filter()
+
+
+class TestGetLVFromArgument(object):
+
+    def setup(self):
+        self.foo_volume = api.Volume(
+            lv_name='foo', lv_path='/path/to/lv',
+            vg_name='foo_group', lv_tags=''
+        )
+
+    def test_non_absolute_path_is_not_valid(self, volumes):
+        volumes.append(self.foo_volume)
+        assert api.get_lv_from_argument('foo') is None
+
+    def test_too_many_slashes_is_invalid(self, volumes):
+        volumes.append(self.foo_volume)
+        assert api.get_lv_from_argument('path/to/lv') is None
+
+    def test_absolute_path_is_not_lv(self, volumes):
+        volumes.append(self.foo_volume)
+        assert api.get_lv_from_argument('/path') is None
+
+    def test_absolute_path_is_lv(self, volumes):
+        volumes.append(self.foo_volume)
+        assert api.get_lv_from_argument('/path/to/lv') == self.foo_volume
+
+
+class TestRemoveLV(object):
+
+    def test_removes_lv(self, monkeypatch):
+        def mock_call(cmd, **kw):
+            return ('', '', 0)
+        monkeypatch.setattr(process, 'call', mock_call)
+        assert api.remove_lv("vg/lv")
+
+    def test_fails_to_remove_lv(self, monkeypatch):
+        def mock_call(cmd, **kw):
+            return ('', '', 1)
+        monkeypatch.setattr(process, 'call', mock_call)
+        with pytest.raises(RuntimeError):
+            api.remove_lv("vg/lv")
+
+
+class TestCreateLV(object):
+
+    def setup(self):
+        self.foo_volume = api.Volume(lv_name='foo', lv_path='/path', vg_name='foo_group', lv_tags='')
+
+    def test_uses_size(self, monkeypatch, capture):
+        monkeypatch.setattr(process, 'run', capture)
+        monkeypatch.setattr(process, 'call', capture)
+        monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
+        api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'})
+        expected = ['sudo', 'lvcreate', '--yes', '-L', '5G', '-n', 'foo', 'foo_group']
+        assert capture.calls[0]['args'][0] == expected
+
+    def test_calls_to_set_type_tag(self, monkeypatch, capture):
+        monkeypatch.setattr(process, 'run', capture)
+        monkeypatch.setattr(process, 'call', capture)
+        monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
+        api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'})
+        ceph_tag = ['sudo', 'lvchange', '--addtag', 'ceph.type=data', '/path']
+        assert capture.calls[1]['args'][0] == ceph_tag
+
+    def test_calls_to_set_data_tag(self, monkeypatch, capture):
+        monkeypatch.setattr(process, 'run', capture)
+        monkeypatch.setattr(process, 'call', capture)
+        monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
+        api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'})
+        data_tag = ['sudo', 'lvchange', '--addtag', 'ceph.data_device=/path', '/path']
+        assert capture.calls[2]['args'][0] == data_tag
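The three TestCreateLV assertions above pin down an exact command sequence: one lvcreate issued through process.run, then one lvchange --addtag call per tag, then a ceph.<type>_device tag pointing at the new LV path. A minimal sketch of create_lv and remove_lv that would satisfy these tests and the TestRemoveLV pair — the lvremove flags and the exact helper signatures are assumptions inferred from the assertions, not the actual implementation:

    def create_lv(name, group, size=None, tags=None):
        # sketch consistent with the assertions above; the real function
        # also handles creation without an explicit size
        process.run(['sudo', 'lvcreate', '--yes', '-L', size, '-n', name, group])
        lv = get_lv(lv_name=name, vg_name=group)  # module-level lookup, patched in tests
        tags = tags or {}
        for key, value in tags.items():
            process.call(['sudo', 'lvchange', '--addtag', '%s=%s' % (key, value), lv.lv_path])
        # record the backing device for this lv type (data, journal, ...) as a tag
        type_ = tags.get('ceph.type')
        process.call(['sudo', 'lvchange', '--addtag',
                      'ceph.%s_device=%s' % (type_, lv.lv_path), lv.lv_path])
        return lv

    def remove_lv(path):
        # a non-zero exit status from lvremove is treated as fatal
        stdout, stderr, returncode = process.call(['lvremove', '-v', '-f', path])
        if returncode != 0:
            raise RuntimeError('Unable to remove %s' % path)
        return True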
index 7a580e57c726dc7f55160134e78c9344a6732741..f5803346177d0e4b4ec996f754abb20d2f01e7ce 100644 (file)
@@ -1,5 +1,7 @@
+import os
 import pytest
-from ceph_volume.devices.lvm import api
+from ceph_volume.api import lvm as lvm_api
+
 
 class Capture(object):
 
@@ -12,6 +14,18 @@ class Capture(object):
         self.calls.append({'args': a, 'kwargs': kw})
 
 
+class Factory(object):
+
+    def __init__(self, **kw):
+        for k, v in kw.items():
+            setattr(self, k, v)
+
+
+@pytest.fixture
+def factory():
+    return Factory
+
+
 @pytest.fixture
 def capture():
     return Capture()
@@ -20,7 +34,7 @@ def capture():
 @pytest.fixture
 def volumes(monkeypatch):
     monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
-    volumes = api.Volumes()
+    volumes = lvm_api.Volumes()
     volumes._purge()
     return volumes
 
@@ -28,7 +42,7 @@ def volumes(monkeypatch):
 @pytest.fixture
 def volume_groups(monkeypatch):
     monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
-    vgs = api.VolumeGroups()
+    vgs = lvm_api.VolumeGroups()
     vgs._purge()
     return vgs
 
@@ -40,3 +54,17 @@ def is_root(monkeypatch):
     is root (or is sudoing to superuser) can continue as-is
     """
     monkeypatch.setattr('os.getuid', lambda: 0)
+
+
+@pytest.fixture
+def tmpfile(tmpdir):
+    """
+    Return a callable that creates a temporary file, optionally filled with
+    contents, and returns the file's absolute path
+    """
+    def generate_file(name='file', contents=''):
+        path = os.path.join(str(tmpdir), name)
+        with open(path, 'w') as fp:
+            fp.write(contents)
+        return path
+    return generate_file
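The new factory and tmpfile fixtures compose well: factory builds an ad-hoc args-like object from keyword arguments, and tmpfile writes throwaway files under pytest's tmpdir. A hypothetical test (not part of this commit) showing the intended usage:

    def test_reads_json_config(factory, tmpfile):
        # tmpfile returns an absolute path; factory fakes parsed CLI args
        json_config = tmpfile(name='osd.json', contents='{}')
        args = factory(osd_id='0', json_config=json_config)
        assert args.osd_id == '0'
        with open(args.json_config) as f:
            assert f.read() == '{}'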
index 40df77576a1dcead1723a099341b66cfb21cfcc1..ce623aac98f26f58b2e2bb48657bf2d1ed393adc 100644 (file)
@@ -1,10 +1,15 @@
 import pytest
-from ceph_volume.devices.lvm import activate, api
+from ceph_volume.devices.lvm import activate
+from ceph_volume.api import lvm as api
 
 
 class Args(object):
 
     def __init__(self, **kw):
+        # default flags
+        self.bluestore = False
+        self.filestore = False
+        self.auto_detect_objectstore = None
         for k, v in kw.items():
             setattr(self, k, v)
 
@@ -20,7 +25,16 @@ class TestActivate(object):
         volumes.append(FooVolume)
         monkeypatch.setattr(api, 'Volumes', lambda: volumes)
         monkeypatch.setattr(activate, 'activate_filestore', capture)
-        args = Args(osd_id=None, osd_fsid='1234')
+        args = Args(osd_id=None, osd_fsid='1234', filestore=True)
+        activate.Activate([]).activate(args)
+        assert capture.calls[0]['args'][0] == [FooVolume]
+
+    def test_no_osd_id_matches_fsid_bluestore(self, is_root, volumes, monkeypatch, capture):
+        FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.osd_fsid=1234")
+        volumes.append(FooVolume)
+        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+        monkeypatch.setattr(activate, 'activate_bluestore', capture)
+        args = Args(osd_id=None, osd_fsid='1234', bluestore=True)
         activate.Activate([]).activate(args)
         assert capture.calls[0]['args'][0] == [FooVolume]
 
@@ -32,3 +46,33 @@ class TestActivate(object):
         args = Args(osd_id=None, osd_fsid='1234')
         with pytest.raises(RuntimeError):
             activate.Activate([]).activate(args)
+
+
+class TestActivateFlags(object):
+
+    def test_default_objectstore(self, capture):
+        args = ['0', 'asdf-ljh-asdf']
+        activation = activate.Activate(args)
+        activation.activate = capture
+        activation.main()
+        parsed_args = capture.calls[0]['args'][0]
+        assert parsed_args.filestore is False
+        assert parsed_args.bluestore is True
+
+    def test_uses_filestore(self, capture):
+        args = ['--filestore', '0', 'asdf-ljh-asdf']
+        activation = activate.Activate(args)
+        activation.activate = capture
+        activation.main()
+        parsed_args = capture.calls[0]['args'][0]
+        assert parsed_args.filestore is True
+        assert parsed_args.bluestore is False
+
+    def test_uses_bluestore(self, capture):
+        args = ['--bluestore', '0', 'asdf-ljh-asdf']
+        activation = activate.Activate(args)
+        activation.activate = capture
+        activation.main()
+        parsed_args = capture.calls[0]['args'][0]
+        assert parsed_args.filestore is False
+        assert parsed_args.bluestore is True
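TestActivateFlags encodes the objectstore selection contract: bluestore is the default when neither flag is passed, and passing either flag turns the other off. One way to reproduce that behavior with argparse — a sketch of the convention, not the actual ceph-volume parser:

    import argparse

    def parse_activate_args(argv):
        parser = argparse.ArgumentParser()
        parser.add_argument('osd_id', nargs='?')
        parser.add_argument('osd_fsid', nargs='?')
        parser.add_argument('--bluestore', action='store_true')
        parser.add_argument('--filestore', action='store_true')
        args = parser.parse_args(argv)
        if not args.bluestore and not args.filestore:
            # no explicit choice: bluestore is the default objectstore
            args.bluestore = True
        return args

    assert parse_activate_args(['0', 'asdf-ljh-asdf']).bluestore is True
    assert parse_activate_args(['--filestore', '0', 'asdf']).filestore is True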
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_api.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_api.py
deleted file mode 100644 (file)
index d6aa549..0000000
+++ /dev/null
@@ -1,341 +0,0 @@
-import pytest
-from ceph_volume import process, exceptions
-from ceph_volume.devices.lvm import api
-
-
-class TestParseTags(object):
-
-    def test_no_tags_means_empty_dict(self):
-        result = api.parse_tags('')
-        assert result == {}
-
-    def test_single_tag_gets_parsed(self):
-        result = api.parse_tags('ceph.osd_something=1')
-        assert result == {'ceph.osd_something': '1'}
-
-    def test_multiple_csv_expands_in_dict(self):
-        result = api.parse_tags('ceph.osd_something=1,ceph.foo=2,ceph.fsid=0000')
-        # assert them piecemeal to avoid the un-ordered dict nature
-        assert result['ceph.osd_something'] == '1'
-        assert result['ceph.foo'] == '2'
-        assert result['ceph.fsid'] == '0000'
-
-
-class TestGetAPIVgs(object):
-
-    def test_report_is_emtpy(self, monkeypatch):
-        monkeypatch.setattr(api.process, 'call', lambda x: ('\n\n', '', 0))
-        assert api.get_api_vgs() == []
-
-    def test_report_has_stuff(self, monkeypatch):
-        report = ['  VolGroup00']
-        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
-        assert api.get_api_vgs() == [{'vg_name': 'VolGroup00'}]
-
-    def test_report_has_stuff_with_empty_attrs(self, monkeypatch):
-        report = ['  VolGroup00 ;;;;;;9g']
-        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
-        result = api.get_api_vgs()[0]
-        assert len(result.keys()) == 7
-        assert result['vg_name'] == 'VolGroup00'
-        assert result['vg_free'] == '9g'
-
-    def test_report_has_multiple_items(self, monkeypatch):
-        report = ['   VolGroup00;;;;;;;', '    ceph_vg;;;;;;;']
-        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
-        result = api.get_api_vgs()
-        assert result[0]['vg_name'] == 'VolGroup00'
-        assert result[1]['vg_name'] == 'ceph_vg'
-
-
-class TestGetAPILvs(object):
-
-    def test_report_is_emtpy(self, monkeypatch):
-        monkeypatch.setattr(api.process, 'call', lambda x: ('', '', 0))
-        assert api.get_api_lvs() == []
-
-    def test_report_has_stuff(self, monkeypatch):
-        report = ['  ;/path;VolGroup00;root']
-        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
-        result = api.get_api_lvs()
-        assert result[0]['lv_name'] == 'VolGroup00'
-
-    def test_report_has_multiple_items(self, monkeypatch):
-        report = ['  ;/path;VolName;root', ';/dev/path;ceph_lv;ceph_vg']
-        monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
-        result = api.get_api_lvs()
-        assert result[0]['lv_name'] == 'VolName'
-        assert result[1]['lv_name'] == 'ceph_lv'
-
-
-@pytest.fixture
-def volumes(monkeypatch):
-    monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
-    volumes = api.Volumes()
-    volumes._purge()
-    return volumes
-
-
-@pytest.fixture
-def pvolumes(monkeypatch):
-    monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
-    pvolumes = api.PVolumes()
-    pvolumes._purge()
-    return pvolumes
-
-
-@pytest.fixture
-def volume_groups(monkeypatch):
-    monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
-    vgs = api.VolumeGroups()
-    vgs._purge()
-    return vgs
-
-
-class TestGetLV(object):
-
-    def test_nothing_is_passed_in(self):
-        # so we return a None
-        assert api.get_lv() is None
-
-    def test_single_lv_is_matched(self, volumes, monkeypatch):
-        FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.type=data")
-        volumes.append(FooVolume)
-        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
-        assert api.get_lv(lv_name='foo') == FooVolume
-
-    def test_single_lv_is_matched_by_uuid(self, volumes, monkeypatch):
-        FooVolume = api.Volume(
-            lv_name='foo', lv_path='/dev/vg/foo',
-            lv_uuid='1111', lv_tags="ceph.type=data")
-        volumes.append(FooVolume)
-        monkeypatch.setattr(api, 'Volumes', lambda: volumes)
-        assert api.get_lv(lv_uuid='1111') == FooVolume
-
-
-class TestGetPV(object):
-
-    def test_nothing_is_passed_in(self):
-        # so we return a None
-        assert api.get_pv() is None
-
-    def test_single_pv_is_not_matched(self, pvolumes, monkeypatch):
-        FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
-        pvolumes.append(FooPVolume)
-        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
-        assert api.get_pv(pv_uuid='foo') is None
-
-    def test_single_pv_is_matched(self, pvolumes, monkeypatch):
-        FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
-        pvolumes.append(FooPVolume)
-        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
-        assert api.get_pv(pv_uuid='0000') == FooPVolume
-
-    def test_single_pv_is_matched_by_uuid(self, pvolumes, monkeypatch):
-        FooPVolume = api.PVolume(
-            pv_name='/dev/vg/foo',
-            pv_uuid='1111', pv_tags="ceph.type=data")
-        pvolumes.append(FooPVolume)
-        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
-        assert api.get_pv(pv_uuid='1111') == FooPVolume
-
-
-class TestPVolumes(object):
-
-    def test_filter_by_tag_does_not_match_one(self, pvolumes, monkeypatch):
-        pv_tags = "ceph.type=journal,ceph.osd_id=1,ceph.fsid=000-aaa"
-        FooPVolume = api.PVolume(
-            pv_name='/dev/vg/foo',
-            pv_uuid='1111', pv_tags=pv_tags)
-        pvolumes.append(FooPVolume)
-        pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '2'})
-        assert pvolumes == []
-
-    def test_filter_by_tags_matches(self, pvolumes, monkeypatch):
-        pv_tags = "ceph.type=journal,ceph.osd_id=1"
-        FooPVolume = api.PVolume(
-            pv_name='/dev/vg/foo',
-            pv_uuid='1111', pv_tags=pv_tags)
-        pvolumes.append(FooPVolume)
-        pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '1'})
-        assert pvolumes == [FooPVolume]
-
-
-class TestGetVG(object):
-
-    def test_nothing_is_passed_in(self):
-        # so we return a None
-        assert api.get_vg() is None
-
-    def test_single_vg_is_matched(self, volume_groups, monkeypatch):
-        FooVG = api.VolumeGroup(vg_name='foo')
-        volume_groups.append(FooVG)
-        monkeypatch.setattr(api, 'VolumeGroups', lambda: volume_groups)
-        assert api.get_vg(vg_name='foo') == FooVG
-
-
-class TestVolumes(object):
-
-    def test_volume_get_has_no_volumes(self, volumes):
-        assert volumes.get() is None
-
-    def test_volume_get_filtered_has_no_volumes(self, volumes):
-        assert volumes.get(lv_name='ceph') is None
-
-    def test_volume_has_multiple_matches(self, volumes):
-        volume1 = volume2 = api.Volume(lv_name='foo', lv_path='/dev/vg/lv', lv_tags='')
-        volumes.append(volume1)
-        volumes.append(volume2)
-        with pytest.raises(exceptions.MultipleLVsError):
-            volumes.get(lv_name='foo')
-
-    def test_find_the_correct_one(self, volumes):
-        volume1 = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags='')
-        volume2 = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='')
-        volumes.append(volume1)
-        volumes.append(volume2)
-        assert volumes.get(lv_name='volume1') == volume1
-
-    def test_filter_by_tag(self, volumes):
-        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
-        osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
-        journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.type=journal')
-        volumes.append(osd)
-        volumes.append(journal)
-        volumes.filter(lv_tags={'ceph.type': 'data'})
-        assert len(volumes) == 1
-        assert volumes[0].lv_name == 'volume1'
-
-    def test_filter_by_tag_does_not_match_one(self, volumes):
-        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
-        osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
-        journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.osd_id=1,ceph.type=journal')
-        volumes.append(osd)
-        volumes.append(journal)
-        # note the different osd_id!
-        volumes.filter(lv_tags={'ceph.type': 'data', 'ceph.osd_id': '2'})
-        assert volumes == []
-
-    def test_filter_by_vg_name(self, volumes):
-        lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
-        osd = api.Volume(lv_name='volume1', vg_name='ceph_vg', lv_tags=lv_tags)
-        journal = api.Volume(lv_name='volume2', vg_name='system_vg', lv_tags='ceph.type=journal')
-        volumes.append(osd)
-        volumes.append(journal)
-        volumes.filter(vg_name='ceph_vg')
-        assert len(volumes) == 1
-        assert volumes[0].lv_name == 'volume1'
-
-    def test_filter_by_lv_path(self, volumes):
-        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_tags='')
-        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_tags='')
-        volumes.append(osd)
-        volumes.append(journal)
-        volumes.filter(lv_path='/dev/volume1')
-        assert len(volumes) == 1
-        assert volumes[0].lv_name == 'volume1'
-
-    def test_filter_by_lv_uuid(self, volumes):
-        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
-        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
-        volumes.append(osd)
-        volumes.append(journal)
-        volumes.filter(lv_uuid='1111')
-        assert len(volumes) == 1
-        assert volumes[0].lv_name == 'volume1'
-
-    def test_filter_by_lv_uuid_nothing_found(self, volumes):
-        osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
-        journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
-        volumes.append(osd)
-        volumes.append(journal)
-        volumes.filter(lv_uuid='22222')
-        assert volumes == []
-
-    def test_filter_requires_params(self, volumes):
-        with pytest.raises(TypeError):
-            volumes.filter()
-
-
-class TestVolumeGroups(object):
-
-    def test_volume_get_has_no_volume_groups(self, volume_groups):
-        assert volume_groups.get() is None
-
-    def test_volume_get_filtered_has_no_volumes(self, volume_groups):
-        assert volume_groups.get(vg_name='ceph') is None
-
-    def test_volume_has_multiple_matches(self, volume_groups):
-        volume1 = volume2 = api.VolumeGroup(vg_name='foo', lv_path='/dev/vg/lv', lv_tags='')
-        volume_groups.append(volume1)
-        volume_groups.append(volume2)
-        with pytest.raises(exceptions.MultipleVGsError):
-            volume_groups.get(vg_name='foo')
-
-    def test_find_the_correct_one(self, volume_groups):
-        volume1 = api.VolumeGroup(vg_name='volume1', lv_tags='')
-        volume2 = api.VolumeGroup(vg_name='volume2', lv_tags='')
-        volume_groups.append(volume1)
-        volume_groups.append(volume2)
-        assert volume_groups.get(vg_name='volume1') == volume1
-
-    def test_filter_by_tag(self, volume_groups):
-        vg_tags = "ceph.group=dmcache"
-        osd = api.VolumeGroup(vg_name='volume1', vg_tags=vg_tags)
-        journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.group=plain')
-        volume_groups.append(osd)
-        volume_groups.append(journal)
-        volume_groups.filter(vg_tags={'ceph.group': 'dmcache'})
-        assert len(volume_groups) == 1
-        assert volume_groups[0].vg_name == 'volume1'
-
-    def test_filter_by_tag_does_not_match_one(self, volume_groups):
-        vg_tags = "ceph.group=dmcache,ceph.disk_type=ssd"
-        osd = api.VolumeGroup(vg_name='volume1', vg_path='/dev/vg/lv', vg_tags=vg_tags)
-        volume_groups.append(osd)
-        volume_groups.filter(vg_tags={'ceph.group': 'data', 'ceph.disk_type': 'ssd'})
-        assert volume_groups == []
-
-    def test_filter_by_vg_name(self, volume_groups):
-        vg_tags = "ceph.type=data,ceph.fsid=000-aaa"
-        osd = api.VolumeGroup(vg_name='ceph_vg', vg_tags=vg_tags)
-        journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.type=journal')
-        volume_groups.append(osd)
-        volume_groups.append(journal)
-        volume_groups.filter(vg_name='ceph_vg')
-        assert len(volume_groups) == 1
-        assert volume_groups[0].vg_name == 'ceph_vg'
-
-    def test_filter_requires_params(self, volume_groups):
-        with pytest.raises(TypeError):
-            volume_groups.filter()
-
-
-class TestCreateLV(object):
-
-    def setup(self):
-        self.foo_volume = api.Volume(lv_name='foo', lv_path='/path', vg_name='foo_group', lv_tags='')
-
-    def test_uses_size(self, monkeypatch, capture):
-        monkeypatch.setattr(process, 'run', capture)
-        monkeypatch.setattr(process, 'call', capture)
-        monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
-        api.create_lv('foo', 'foo_group', size=5, type='data')
-        expected = ['sudo', 'lvcreate', '--yes', '-L', '5G', '-n', 'foo', 'foo_group']
-        assert capture.calls[0]['args'][0] == expected
-
-    def test_calls_to_set_type_tag(self, monkeypatch, capture):
-        monkeypatch.setattr(process, 'run', capture)
-        monkeypatch.setattr(process, 'call', capture)
-        monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
-        api.create_lv('foo', 'foo_group', size=5, type='data')
-        ceph_tag = ['sudo', 'lvchange', '--addtag', 'ceph.type=data', '/path']
-        assert capture.calls[1]['args'][0] == ceph_tag
-
-    def test_calls_to_set_data_tag(self, monkeypatch, capture):
-        monkeypatch.setattr(process, 'run', capture)
-        monkeypatch.setattr(process, 'call', capture)
-        monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
-        api.create_lv('foo', 'foo_group', size=5, type='data')
-        data_tag = ['sudo', 'lvchange', '--addtag', 'ceph.data_device=/path', '/path']
-        assert capture.calls[2]['args'][0] == data_tag
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py
new file mode 100644 (file)
index 0000000..b780ea2
--- /dev/null
@@ -0,0 +1,176 @@
+import pytest
+from ceph_volume.devices import lvm
+from ceph_volume.api import lvm as api
+
+
+class TestReadableTag(object):
+
+    def test_dots_get_replaced(self):
+        result = lvm.listing.readable_tag('ceph.foo')
+        assert result == 'foo'
+
+    def test_underscores_are_replaced_with_spaces(self):
+        result = lvm.listing.readable_tag('ceph.long_tag')
+        assert result == 'long tag'
+
+
+class TestPrettyReport(object):
+
+    def test_is_empty(self, capsys):
+        lvm.listing.pretty_report({})
+        stdout, stderr = capsys.readouterr()
+        assert stdout == '\n'
+
+    def test_type_and_path_are_reported(self, capsys):
+        lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+        stdout, stderr = capsys.readouterr()
+        assert '[data]    /dev/sda1' in stdout
+
+    def test_osd_id_header_is_reported(self, capsys):
+        lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+        stdout, stderr = capsys.readouterr()
+        assert '====== osd.0 =======' in stdout
+
+    def test_tags_are_included(self, capsys):
+        lvm.listing.pretty_report(
+            {0: [{
+                'type': 'data',
+                'path': '/dev/sda1',
+                'tags': {'ceph.osd_id': '0'}
+            }]}
+        )
+        stdout, stderr = capsys.readouterr()
+        assert 'osd id' in stdout
+
+
+class TestList(object):
+
+    def test_empty_full_json_zero_exit_status(self, is_root, volumes, factory, capsys):
+        args = factory(format='json', device=None)
+        lvm.listing.List([]).list(args)
+        stdout, stderr = capsys.readouterr()
+        assert stdout == '{}\n'
+
+    def test_empty_device_json_zero_exit_status(self, is_root, volumes, factory, capsys):
+        args = factory(format='json', device='/dev/sda1')
+        lvm.listing.List([]).list(args)
+        stdout, stderr = capsys.readouterr()
+        assert stdout == '{}\n'
+
+    def test_empty_full_zero_exit_status(self, is_root, volumes, factory):
+        args = factory(format='pretty', device=None)
+        with pytest.raises(SystemExit):
+            lvm.listing.List([]).list(args)
+
+    def test_empty_device_zero_exit_status(self, is_root, volumes, factory):
+        args = factory(format='pretty', device='/dev/sda1')
+        with pytest.raises(SystemExit):
+            lvm.listing.List([]).list(args)
+
+
+class TestFullReport(object):
+
+    def test_no_ceph_lvs(self, volumes, monkeypatch):
+        # ceph lvs are detected by looking at their tags
+        osd = api.Volume(lv_name='volume1', lv_path='/dev/VolGroup/lv', lv_tags={})
+        volumes.append(osd)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        result = lvm.listing.List([]).full_report()
+        assert result == {}
+
+    def test_ceph_data_lv_reported(self, volumes, monkeypatch):
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(osd)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        result = lvm.listing.List([]).full_report()
+        assert result['0'][0]['name'] == 'volume1'
+
+    def test_ceph_journal_lv_reported(self, volumes, monkeypatch):
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+        journal_tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=journal'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        journal = api.Volume(
+            lv_name='journal', lv_uuid='x', lv_path='/dev/VolGroup/journal', lv_tags=journal_tags)
+        volumes.append(osd)
+        volumes.append(journal)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        result = lvm.listing.List([]).full_report()
+        assert result['0'][0]['name'] == 'volume1'
+        assert result['0'][1]['name'] == 'journal'
+
+    def test_ceph_wal_lv_reported(self, volumes, monkeypatch):
+        tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=data'
+        wal_tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=wal'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        wal = api.Volume(
+            lv_name='wal', lv_uuid='x', lv_path='/dev/VolGroup/wal', lv_tags=wal_tags)
+        volumes.append(osd)
+        volumes.append(wal)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        result = lvm.listing.List([]).full_report()
+        assert result['0'][0]['name'] == 'volume1'
+        assert result['0'][1]['name'] == 'wal'
+
+    def test_physical_journal_gets_reported(self, volumes, monkeypatch):
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(osd)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        monkeypatch.setattr(lvm.listing.disk, 'get_device_from_partuuid', lambda x: '/dev/sda1')
+        result = lvm.listing.List([]).full_report()
+        assert result['0'][1]['path'] == '/dev/sda1'
+        assert result['0'][1]['tags'] == {'PARTUUID': 'x'}
+        assert result['0'][1]['type'] == 'journal'
+
+    def test_physical_wal_gets_reported(self, volumes, monkeypatch):
+        tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(osd)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        monkeypatch.setattr(lvm.listing.disk, 'get_device_from_partuuid', lambda x: '/dev/sda1')
+        result = lvm.listing.List([]).full_report()
+        assert result['0'][1]['path'] == '/dev/sda1'
+        assert result['0'][1]['tags'] == {'PARTUUID': 'x'}
+        assert result['0'][1]['type'] == 'wal'
+
+
+class TestSingleReport(object):
+
+    def test_not_a_ceph_lv(self, volumes, monkeypatch):
+        # ceph lvs are detected by looking at their tags
+        lv = api.Volume(
+            lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags={})
+        volumes.append(lv)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        result = lvm.listing.List([]).single_report('VolGroup/lv')
+        assert result == {}
+
+    def test_report_a_ceph_lv(self, volumes, monkeypatch):
+        # ceph lvs are detected by looking at their tags
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+        lv = api.Volume(
+            lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(lv)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        result = lvm.listing.List([]).single_report('VolGroup/lv')
+        assert result['0'][0]['name'] == 'lv'
+        assert result['0'][0]['lv_tags'] == tags
+        assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+
+    def test_report_a_ceph_journal_device(self, volumes, monkeypatch):
+        # ceph lvs are detected by looking at their tags
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data,ceph.journal_device=/dev/sda1'
+        lv = api.Volume(
+            lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(lv)
+        monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+        result = lvm.listing.List([]).single_report('/dev/sda1')
+        assert result['0'][0]['tags'] == {'PARTUUID': 'x'}
+        assert result['0'][0]['type'] == 'journal'
+        assert result['0'][0]['path'] == '/dev/sda1'
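The TestReadableTag and TestPrettyReport assertions together fix the report layout: the 'ceph.' namespace is stripped from tags, underscores read as spaces, each OSD gets a '====== osd.N =======' header, and each device line shows '[type]    path'. A minimal sketch consistent with those assertions (the real implementation also handles alignment and terminal output helpers):

    import sys

    def readable_tag(tag):
        # 'ceph.journal_uuid' -> 'journal uuid'
        actual = tag.split('.')[-1]
        return actual.replace('_', ' ')

    def pretty_report(report):
        output = ['']
        for osd_id, devices in report.items():
            output.append('====== osd.%s =======' % osd_id)
            for device in devices:
                output.append('  [%s]    %s' % (device['type'], device['path']))
                for tag, value in device.get('tags', {}).items():
                    output.append('      %s: %s' % (readable_tag(tag), value))
        sys.stdout.write('\n'.join(output) + '\n')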
index fabae296a2b374887dbfa31287783e5efc4f0617..c69394fd69479223b43128b7e9c4dc073aa6f6b0 100644 (file)
@@ -33,8 +33,9 @@ class TestPrepare(object):
         with pytest.raises(SystemExit):
             lvm.prepare.Prepare(argv=['--help']).main()
         stdout, stderr = capsys.readouterr()
-        assert 'required arguments:' in stdout
-        assert 'A logical group name or a path' in stdout
+        assert 'Use the filestore objectstore' in stdout
+        assert 'Use the bluestore objectstore' in stdout
+        assert 'A physical device or logical' in stdout
 
 
 class TestGetJournalLV(object):
@@ -43,13 +44,13 @@ class TestGetJournalLV(object):
     def test_no_journal_on_invalid_path(self, monkeypatch, arg):
         monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: False)
         prepare = lvm.prepare.Prepare([])
-        assert prepare.get_journal_lv(arg) is None
+        assert prepare.get_lv(arg) is None
 
     def test_no_journal_lv_found(self, monkeypatch):
         # patch it with 0 so we know we are getting to get_lv
         monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: 0)
         prepare = lvm.prepare.Prepare([])
-        assert prepare.get_journal_lv('vg/lv') == 0
+        assert prepare.get_lv('vg/lv') == 0
 
 
 class TestActivate(object):
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py
new file mode 100644 (file)
index 0000000..bae3276
--- /dev/null
@@ -0,0 +1,23 @@
+import os
+import pytest
+from ceph_volume.devices.simple import activate
+
+
+class TestActivate(object):
+
+    def test_no_data_uuid(self, factory, tmpfile, is_root, monkeypatch, capture):
+        json_config = tmpfile(contents='{}')
+        args = factory(osd_id='0', osd_fsid='1234', json_config=json_config)
+        with pytest.raises(RuntimeError):
+            activate.Activate([]).activate(args)
+
+    def test_invalid_json_path(self):
+        os.environ['CEPH_VOLUME_SIMPLE_JSON_DIR'] = '/non/existing/path'
+        with pytest.raises(RuntimeError) as error:
+            activate.Activate(['1', 'asdf']).main()
+        assert 'RuntimeError: Expected JSON config path not found' in str(error)
+
+    def test_main_spits_help_with_no_arguments(self, capsys):
+        activate.Activate([]).main()
+        stdout, stderr = capsys.readouterr()
+        assert 'Activate OSDs by mounting devices previously configured' in stdout
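One caveat in test_invalid_json_path: it assigns to os.environ directly, so the override leaks into any test that runs afterwards in the same process. A variant using pytest's monkeypatch.setenv would scope it to the test — a suggestion, not part of this commit:

    def test_invalid_json_path(self, monkeypatch):
        # setenv is undone automatically when the test finishes
        monkeypatch.setenv('CEPH_VOLUME_SIMPLE_JSON_DIR', '/non/existing/path')
        with pytest.raises(RuntimeError) as error:
            activate.Activate(['1', 'asdf']).main()
        assert 'Expected JSON config path not found' in str(error)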
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py
new file mode 100644 (file)
index 0000000..d68fe63
--- /dev/null
@@ -0,0 +1,52 @@
+import os
+import pytest
+from ceph_volume.devices.simple import scan
+
+
+class TestScan(object):
+
+    def test_main_spits_help_with_no_arguments(self, capsys):
+        scan.Scan([]).main()
+        stdout, stderr = capsys.readouterr()
+        assert 'Scan an OSD directory for files' in stdout
+
+
+class TestGetContents(object):
+
+    def test_multiple_lines_are_left_as_is(self, tmpfile):
+        magic_file = tmpfile(contents='first\nsecond\n')
+        scanner = scan.Scan([])
+        assert scanner.get_contents(magic_file) == 'first\nsecond\n'
+
+    def test_extra_whitespace_gets_removed(self, tmpfile):
+        magic_file = tmpfile(contents='first   ')
+        scanner = scan.Scan([])
+        assert scanner.get_contents(magic_file) == 'first'
+
+    def test_single_newline_values_are_trimmed(self, tmpfile):
+        magic_file = tmpfile(contents='first\n')
+        scanner = scan.Scan([])
+        assert scanner.get_contents(magic_file) == 'first'
+
+
+class TestEtcPath(object):
+
+    def test_directory_is_valid(self, tmpdir):
+        path = str(tmpdir)
+        scanner = scan.Scan([])
+        scanner._etc_path = path
+        assert scanner.etc_path == path
+
+    def test_directory_does_not_exist_gets_created(self, tmpdir):
+        path = os.path.join(str(tmpdir), 'subdir')
+        scanner = scan.Scan([])
+        scanner._etc_path = path
+        assert scanner.etc_path == path
+        assert os.path.isdir(path)
+
+    def test_complains_when_file(self, tmpfile):
+        path = tmpfile()
+        scanner = scan.Scan([])
+        scanner._etc_path = path
+        with pytest.raises(RuntimeError):
+            scanner.etc_path
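The scan tests fix two behaviors: get_contents returns multi-line files verbatim but trims whitespace and newlines from single values, and the etc_path property creates its directory on demand while refusing to continue if a file occupies the path. A sketch consistent with those assertions (the class layout and the default _etc_path value are assumptions):

    import os

    class ScanSketch(object):

        _etc_path = '/etc/ceph/osd'  # assumed default, overridden in tests

        def get_contents(self, path):
            # multi-line files come back verbatim; single values are trimmed
            with open(path) as f:
                contents = f.read()
            if len(contents.splitlines()) > 1:
                return contents
            return contents.strip()

        @property
        def etc_path(self):
            # create the directory on demand; a file squatting there is fatal
            if os.path.isdir(self._etc_path):
                return self._etc_path
            if os.path.isfile(self._etc_path):
                raise RuntimeError('%s exists and is a file' % self._etc_path)
            os.makedirs(self._etc_path)
            return self._etc_path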
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_trigger.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_trigger.py
new file mode 100644 (file)
index 0000000..d3220f2
--- /dev/null
@@ -0,0 +1,45 @@
+import pytest
+from ceph_volume import exceptions
+from ceph_volume.devices.simple import trigger
+
+
+class TestParseOSDid(object):
+
+    def test_no_id_found_if_no_digit(self):
+        with pytest.raises(exceptions.SuffixParsingError):
+            trigger.parse_osd_id('asdlj-ljahsdfaslkjhdfa')
+
+    def test_no_id_found(self):
+        with pytest.raises(exceptions.SuffixParsingError):
+            trigger.parse_osd_id('ljahsdfaslkjhdfa')
+
+    def test_id_found(self):
+        result = trigger.parse_osd_id('1-ljahsdfaslkjhdfa')
+        assert result == '1'
+
+
+class TestParseOSDUUID(object):
+
+    def test_uuid_is_parsed(self):
+        result = trigger.parse_osd_uuid('1-asdf-ljkh-asdf-ljkh-asdf')
+        assert result == 'asdf-ljkh-asdf-ljkh-asdf'
+
+    def test_uuid_is_parsed_longer_sha1(self):
+        result = trigger.parse_osd_uuid('1-foo-bar-asdf-ljkh-asdf-ljkh-asdf')
+        assert result == 'foo-bar-asdf-ljkh-asdf-ljkh-asdf'
+
+    def test_uuid_is_not_found(self):
+        with pytest.raises(exceptions.SuffixParsingError):
+            trigger.parse_osd_uuid('ljahsdfaslkjhdfa')
+
+    def test_uuid_is_not_found_missing_id(self):
+        with pytest.raises(exceptions.SuffixParsingError):
+            trigger.parse_osd_uuid('ljahs-dfa-slkjhdfa-foo')
+
+    def test_robust_double_id_in_uuid(self):
+        # the id digit can also appear inside the uuid itself; parsing
+        # should still split on the id prefix only
+        result = trigger.parse_osd_uuid("1-abc959fd-1ec9-4864-b141-3154f9b9f8ed")
+        assert result == 'abc959fd-1ec9-4864-b141-3154f9b9f8ed'
+
+
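The trigger tests define the systemd suffix format: '<id>-<uuid>', where the id must be numeric and the uuid may itself contain digits and dashes. A reconstruction that passes all six tests — the SuffixParsingError arguments are an assumption, since only the exception type is asserted:

    from ceph_volume import exceptions

    def parse_osd_id(string):
        osd_id = string.split('-', 1)[0]
        if osd_id.isdigit():
            return osd_id
        raise exceptions.SuffixParsingError('osd id', string)

    def parse_osd_uuid(string):
        # strip the '<id>-' prefix exactly once; everything after it is the
        # uuid, even when the uuid itself contains the same digit
        osd_id = '%s-' % parse_osd_id(string)
        osd_uuid = string.split(osd_id, 1)[-1]
        if not osd_uuid:
            raise exceptions.SuffixParsingError('osd uuid', string)
        return osd_uuid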
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py
new file mode 100644 (file)
index 0000000..bc26e33
--- /dev/null
@@ -0,0 +1,17 @@
+import pytest
+from ceph_volume.devices import lvm
+
+
+class TestZap(object):
+
+    def test_main_spits_help_with_no_arguments(self, capsys):
+        lvm.zap.Zap([]).main()
+        stdout, stderr = capsys.readouterr()
+        assert 'Zaps the given logical volume or partition' in stdout
+
+    def test_main_shows_full_help(self, capsys):
+        with pytest.raises(SystemExit):
+            lvm.zap.Zap(argv=['--help']).main()
+        stdout, stderr = capsys.readouterr()
+        assert 'optional arguments' in stdout
+        assert 'positional arguments' in stdout
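The two zap tests capture a convention seen across ceph-volume subcommands: a bare invocation prints the overview and returns normally, while --help is handled by argparse and raises SystemExit. A minimal sketch of a main() honoring both, with the description text taken from the assertion above:

    import argparse

    class ZapSketch(object):

        def __init__(self, argv):
            self.argv = argv

        def main(self):
            parser = argparse.ArgumentParser(
                description='Zaps the given logical volume or partition')
            parser.add_argument('device', nargs='?')
            if not self.argv:
                # bare invocation: print the overview, return without exiting
                print(parser.format_help())
                return
            # '--help' exits here via SystemExit, raised by argparse
            return parser.parse_args(self.argv)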
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/Vagrantfile
deleted file mode 120000 (symlink)
index 2572fa2..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/group_vars/all
deleted file mode 100644 (file)
index e7c1f72..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
----
-
-ceph_dev: True
-cluster: ceph
-public_network: "192.168.3.0/24"
-cluster_network: "192.168.4.0/24"
-monitor_interface: eth1
-journal_size: 100
-osd_objectstore: "filestore"
-osd_scenario: lvm
-ceph_origin: 'repository'
-ceph_repository: 'dev'
-copy_admin_key: true
-# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
-lvm_volumes:
-  - data: data-lv1
-    journal: /dev/sdc1
-    data_vg: test_group
-  - data: data-lv2
-    journal: journal1
-    data_vg: test_group
-    journal_vg: journals
-os_tuning_params:
-  - { name: kernel.pid_max, value: 4194303 }
-  - { name: fs.file-max, value: 26234859 }
-ceph_conf_overrides:
-  global:
-    osd_pool_default_pg_num: 8
-    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/hosts
deleted file mode 100644 (file)
index f6a265a..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-[mons]
-mon0
-
-[osds]
-osd0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/vagrant_variables.yml
deleted file mode 100644 (file)
index 7d1a444..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
----
-
-# DEFINE THE NUMBER OF VMS TO RUN
-mon_vms: 1
-osd_vms: 1
-mds_vms: 0
-rgw_vms: 0
-nfs_vms: 0
-rbd_mirror_vms: 0
-client_vms: 0
-iscsi_gw_vms: 0
-mgr_vms: 0
-
-# SUBNETS TO USE FOR THE VMS
-public_subnet: 192.168.3
-cluster_subnet: 192.168.4
-
-# MEMORY
-# set 1024 for CentOS
-memory: 512
-
-# Ethernet interface name
-# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
-eth: 'eth1'
-
-
-# VAGRANT BOX
-# Ceph boxes are *strongly* suggested. They are under better control and will
-# not get updated frequently unless required for build systems. These are (for
-# now):
-#
-# * ceph/ubuntu-xenial
-#
-# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
-# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
-# libvirt CentOS: centos/7
-# parallels Ubuntu: parallels/ubuntu-14.04
-# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
-# For more boxes have a look at:
-#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
-#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
-vagrant_box: centos/7
-#ssh_private_key_path: "~/.ssh/id_rsa"
-# The sync directory changes based on vagrant box
-# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
-#vagrant_sync_dir: /home/vagrant/sync
-#vagrant_sync_dir: /
-# Disables synced folder creation. Not needed for testing, will skip mounting
-# the vagrant directory on the remote box regardless of the provider.
-vagrant_disable_synced_folder: true
-# VAGRANT URL
-# This is a URL to download an image from an alternate location.  vagrant_box
-# above should be set to the filename of the image.
-# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
-# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
-# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all
new file mode 100644 (file)
index 0000000..17e9044
--- /dev/null
@@ -0,0 +1,28 @@
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+lvm_volumes:
+  - data: data-lv1
+    data_vg: test_group
+  - data: data-lv2
+    data_vg: test_group
+    db: journal1
+    db_vg: journals
+  - data: /dev/sdd1
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/hosts
new file mode 100644 (file)
index 0000000..f6a265a
--- /dev/null
@@ -0,0 +1,5 @@
+[mons]
+mon0
+
+[osds]
+osd0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/setup.yml
new file mode 120000 (symlink)
index 0000000..1c1a3ce
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..7d1a444
--- /dev/null
@@ -0,0 +1,56 @@
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/group_vars/all
new file mode 100644 (file)
index 0000000..e7ff18e
--- /dev/null
@@ -0,0 +1,31 @@
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
+lvm_volumes:
+  - data: data-lv1
+    journal: /dev/sdc1
+    data_vg: test_group
+  - data: data-lv2
+    journal: journal1
+    data_vg: test_group
+    journal_vg: journals
+  - data: /dev/sdd1
+    journal: /dev/sdd2
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/hosts
new file mode 100644 (file)
index 0000000..f6a265a
--- /dev/null
@@ -0,0 +1,5 @@
+[mons]
+mon0
+
+[osds]
+osd0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/setup.yml
new file mode 120000 (symlink)
index 0000000..1c1a3ce
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..7d1a444
--- /dev/null
@@ -0,0 +1,56 @@
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial, bento/ubuntu-16.04, ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful: the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on the vagrant box
+# Set to /home/vagrant/sync for the centos/7 box, /home/{ user }/vagrant for OpenStack; defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Synced folders are not needed for testing; this
+# skips mounting the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
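Note that `public_subnet` and `cluster_subnet` are the first three octets of the `public_network`/`cluster_network` CIDRs in the group_vars above; Vagrant appends a host octet per VM. A quick sanity check (the .10 host octet is an assumption for illustration):

    import ipaddress

    pairs = [("192.168.3", "192.168.3.0/24"), ("192.168.4", "192.168.4.0/24")]
    for prefix, cidr in pairs:
        network = ipaddress.ip_network(cidr)
        host = ipaddress.ip_address(prefix + ".10")  # hypothetical VM address
        assert host in network, "%s is outside %s" % (host, cidr)
        print("%s.x lands inside %s" % (prefix, cidr))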
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/setup_partitions.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/setup_partitions.yml
new file mode 100644 (file)
index 0000000..37a4894
--- /dev/null
@@ -0,0 +1,27 @@
+---
+
+- hosts: osds
+  gather_facts: false
+  become: yes
+  tasks:
+
+    - name: partition /dev/sdd for lvm data usage
+      parted:
+        device: /dev/sdd
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        label: gpt
+        state: present
+
+    - name: partition /dev/sdd for lvm journals
+      parted:
+        device: /dev/sdd
+        number: 2
+        part_start: 50%
+        part_end: 100%
+        unit: '%'
+        state: present
+        label: gpt
+
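The playbook splits /dev/sdd at the 50% mark into two GPT partitions so one extra disk can provide both the LVM data partition (/dev/sdd1) and the journal partition (/dev/sdd2) referenced by `lvm_volumes` above. Outside Ansible the same layout could be produced roughly as follows — destructive, so only against a disposable test disk:

    import subprocess

    DISK = "/dev/sdd"  # the throwaway VM disk used by these tests
    subprocess.check_call(["parted", "--script", DISK, "mklabel", "gpt"])
    subprocess.check_call(["parted", "--script", DISK, "mkpart", "primary", "0%", "50%"])
    subprocess.check_call(["parted", "--script", DISK, "mkpart", "primary", "50%", "100%"])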
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini
new file mode 100644 (file)
index 0000000..797138f
--- /dev/null
@@ -0,0 +1,59 @@
+[tox]
+envlist = {centos7,xenial}-{filestore,bluestore}-{create,prepare_activate}
+skipsdist = True
+
+[testenv]
+whitelist_externals =
+    vagrant
+    bash
+    git
+passenv=*
+setenv=
+  ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
+  ANSIBLE_STDOUT_CALLBACK = debug
+  ANSIBLE_RETRY_FILES_ENABLED = False
+  VAGRANT_CWD = {changedir}
+  CEPH_VOLUME_DEBUG = 1
+deps=
+  ansible==2.4.1
+  testinfra==1.7.1
+  pytest-xdist
+changedir=
+  centos7-filestore-create: {toxinidir}/centos7/filestore/create
+  centos7-bluestore-create: {toxinidir}/centos7/bluestore/create
+  xenial-filestore-create: {toxinidir}/xenial/filestore/create
+  xenial-bluestore-create: {toxinidir}/xenial/bluestore/create
+  # TODO: these are placeholders for now, eventually we want to
+  # test the prepare/activate workflow of ceph-volume as well
+  xenial-filestore-prepare_activate: {toxinidir}/xenial/filestore/prepare_activate
+  xenial-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate
+  centos7-filestore-prepare_activate: {toxinidir}/xenial/filestore/prepare_activate
+  centos7-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate
+commands=
+  git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+
+  vagrant up --no-provision {posargs:--provider=virtualbox}
+  bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
+
+  # create logical volumes to test with on the vms
+  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/lvm_setup.yml
+
+  # ad-hoc/local test setup for lvm
+  ansible-playbook -vv -i {changedir}/hosts {changedir}/setup.yml
+
+  # use ceph-ansible to deploy a ceph cluster on the vms
+  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
+
+  # prepare nodes for testing with testinfra
+  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
+
+  # test cluster state using ceph-ansible tests
+  testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+  # reboot all vms
+  vagrant reload --no-provision
+
+  # retest to ensure cluster came back up correctly after rebooting
+  testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+  vagrant destroy --force
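The factor brackets in `envlist` expand to eight environments (distro × objectstore × workflow), each runnable with `tox -e <name>`. The `prepare_activate` changedirs are flagged as placeholders, and note that the centos7 variants currently point at the xenial directories. The expansion itself is easy to reproduce:

    from itertools import product

    factors = (["centos7", "xenial"], ["filestore", "bluestore"],
               ["create", "prepare_activate"])
    for parts in product(*factors):
        print("-".join(parts))
    # eight names, from centos7-filestore-create
    # through xenial-bluestore-prepare_activate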
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all
new file mode 100644 (file)
index 0000000..17e9044
--- /dev/null
@@ -0,0 +1,28 @@
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+lvm_volumes:
+  - data: data-lv1
+    data_vg: test_group
+  - data: data-lv2
+    data_vg: test_group
+    db: journal1
+    db_vg: journals
+  - data: /dev/sdd1
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
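Compared with the filestore variant earlier, bluestore entries use `db`/`db_vg` (an optional RocksDB device) instead of `journal`/`journal_vg`, and an entry may carry only `data`, leaving block.db on the data device. Sketched under the same assumptions as the filestore example:

    lvm_volumes = [
        {"data": "data-lv1", "data_vg": "test_group"},
        {"data": "data-lv2", "data_vg": "test_group",
         "db": "journal1", "db_vg": "journals"},
        {"data": "/dev/sdd1"},
    ]

    for vol in lvm_volumes:
        data = vol["data"] if vol["data"].startswith("/") \
            else "%s/%s" % (vol["data_vg"], vol["data"])
        cmd = "ceph-volume lvm create --bluestore --data %s" % data
        if "db" in vol:
            cmd += " --block.db %s/%s" % (vol["db_vg"], vol["db"])
        print(cmd)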
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/hosts
new file mode 100644 (file)
index 0000000..f6a265a
--- /dev/null
@@ -0,0 +1,5 @@
+[mons]
+mon0
+
+[osds]
+osd0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/setup.yml
new file mode 120000 (symlink)
index 0000000..1c1a3ce
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..7252344
--- /dev/null
@@ -0,0 +1,56 @@
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial, bento/ubuntu-16.04, ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful: the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on the vagrant box
+# Set to /home/vagrant/sync for the centos/7 box, /home/{ user }/vagrant for OpenStack; defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Synced folders are not needed for testing; this
+# skips mounting the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/group_vars/all
new file mode 100644 (file)
index 0000000..e7ff18e
--- /dev/null
@@ -0,0 +1,31 @@
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
+lvm_volumes:
+  - data: data-lv1
+    journal: /dev/sdc1
+    data_vg: test_group
+  - data: data-lv2
+    journal: journal1
+    data_vg: test_group
+    journal_vg: journals
+  - data: /dev/sdd1
+    journal: /dev/sdd2
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/hosts
new file mode 100644 (file)
index 0000000..f6a265a
--- /dev/null
@@ -0,0 +1,5 @@
+[mons]
+mon0
+
+[osds]
+osd0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/setup.yml
new file mode 120000 (symlink)
index 0000000..1c1a3ce
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..82b330e
--- /dev/null
@@ -0,0 +1,54 @@
+---
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial, bento/ubuntu-16.04, ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful: the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on the vagrant box
+# Set to /home/vagrant/sync for the centos/7 box, /home/{ user }/vagrant for OpenStack; defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Synced folders are not needed for testing; this
+# skips mounting the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all
new file mode 100644 (file)
index 0000000..560c8b0
--- /dev/null
@@ -0,0 +1,19 @@
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd0.yml
new file mode 100644 (file)
index 0000000..2e1c7ee
--- /dev/null
@@ -0,0 +1,7 @@
+---
+
+devices:
+  - '/dev/sdb'
+dedicated_devices:
+  - '/dev/sdc'
+osd_scenario: "non-collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd1.yml
new file mode 100644 (file)
index 0000000..7e90071
--- /dev/null
@@ -0,0 +1,6 @@
+---
+
+devices:
+  - '/dev/sdb'
+  - '/dev/sdc'
+osd_scenario: "collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/hosts
new file mode 100644 (file)
index 0000000..e0c08b9
--- /dev/null
@@ -0,0 +1,9 @@
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0 
+osd1
+
+[mgrs]
+mon0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/test.yml
new file mode 100644 (file)
index 0000000..24e2c03
--- /dev/null
@@ -0,0 +1,31 @@
+---
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    - name: list all OSD directories
+      find:
+        paths: /var/lib/ceph/osd
+        file_type: directory
+      register: osd_paths
+
+    - name: scan all OSD directories
+      command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_paths.files }}"
+
+    - name: list all OSD JSON files
+      find:
+        paths: /etc/ceph/osd
+        file_type: file
+      register: osd_configs
+
+    - name: activate all scanned OSDs
+      command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_configs.files }}"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..63700c3
--- /dev/null
@@ -0,0 +1,73 @@
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial, bento/ubuntu-16.04, ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful: the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on the vagrant box
+# Set to /home/vagrant/sync for the centos/7 box, /home/{ user }/vagrant for OpenStack; defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Synced folders are not needed for testing; this
+# skips mounting the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+
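`disks` is a string rather than a YAML list; the shared Vagrantfile presumably evaluates it to attach the extra drives to each OSD VM. The quoting happens to be a valid Python literal as well, which makes the shape easy to inspect:

    import ast

    disks = "[ '/dev/sdb', '/dev/sdc' ]"
    print(ast.literal_eval(disks))  # ['/dev/sdb', '/dev/sdc']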
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all
new file mode 100644 (file)
index 0000000..8902bdd
--- /dev/null
@@ -0,0 +1,19 @@
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd0.yml
new file mode 100644 (file)
index 0000000..2e1c7ee
--- /dev/null
@@ -0,0 +1,7 @@
+---
+
+devices:
+  - '/dev/sdb'
+dedicated_devices:
+  - '/dev/sdc'
+osd_scenario: "non-collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd1.yml
new file mode 100644 (file)
index 0000000..7e90071
--- /dev/null
@@ -0,0 +1,6 @@
+---
+
+devices:
+  - '/dev/sdb'
+  - '/dev/sdc'
+osd_scenario: "collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/hosts
new file mode 100644 (file)
index 0000000..e0c08b9
--- /dev/null
@@ -0,0 +1,9 @@
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0 
+osd1
+
+[mgrs]
+mon0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml
new file mode 100644 (file)
index 0000000..24e2c03
--- /dev/null
@@ -0,0 +1,31 @@
+---
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    - name: list all OSD directories
+      find:
+        paths: /var/lib/ceph/osd
+        file_type: directory
+      register: osd_paths
+
+    - name: scan all OSD directories
+      command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_paths.files }}"
+
+    - name: list all OSD JSON files
+      find:
+        paths: /etc/ceph/osd
+        file_type: file
+      register: osd_configs
+
+    - name: activate all scanned OSDs
+      command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_configs.files }}"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..63700c3
--- /dev/null
@@ -0,0 +1,73 @@
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial, bento/ubuntu-16.04, ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful: the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on the vagrant box
+# Set to /home/vagrant/sync for the centos/7 box, /home/{ user }/vagrant for OpenStack; defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Synced folders are not needed for testing; this
+# skips mounting the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini
new file mode 100644 (file)
index 0000000..0d2e68a
--- /dev/null
@@ -0,0 +1,50 @@
+[tox]
+envlist = {centos7,xenial}-{filestore,bluestore}-{activate}
+skipsdist = True
+
+[testenv]
+whitelist_externals =
+    vagrant
+    bash
+    git
+passenv=*
+setenv=
+  ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
+  ANSIBLE_STDOUT_CALLBACK = debug
+  ANSIBLE_RETRY_FILES_ENABLED = False
+  VAGRANT_CWD = {changedir}
+  CEPH_VOLUME_DEBUG = 1
+deps=
+  ansible==2.4.1
+  testinfra==1.7.1
+  pytest-xdist
+changedir=
+  centos7-filestore-activate: {toxinidir}/centos7/filestore/activate
+  centos7-bluestore-activate: {toxinidir}/centos7/bluestore/activate
+  xenial-filestore-activate: {toxinidir}/xenial/filestore/activate
+  xenial-bluestore-activate: {toxinidir}/xenial/bluestore/activate
+commands=
+  git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+
+  vagrant up --no-provision {posargs:--provider=virtualbox}
+  bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
+
+  # use ceph-ansible to deploy a ceph cluster on the vms
+  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
+
+  # prepare nodes for testing with testinfra
+  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
+
+  # test cluster state using ceph-ansible tests
+  testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+  # make ceph-volume simple take over all the OSDs that got deployed, disabling ceph-disk
+  ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml
+
+  # reboot all vms
+  vagrant reload --no-provision
+
+  # retest to ensure cluster came back up correctly after rebooting
+  testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+  vagrant destroy --force
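A single environment runs as e.g. `tox -e centos7-bluestore-activate`; arguments after `--` replace the `{posargs:--provider=virtualbox}` default, which is how the suite is pointed at another Vagrant provider. A toy model of that substitution (real tox does considerably more):

    def expand(template, posargs):
        default = "--provider=virtualbox"
        value = " ".join(posargs) if posargs else default
        return template.replace("{posargs:%s}" % default, value)

    cmd = "vagrant up --no-provision {posargs:--provider=virtualbox}"
    print(expand(cmd, []))                      # default: virtualbox
    print(expand(cmd, ["--provider=libvirt"]))  # tox -e ... -- --provider=libvirt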
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all
new file mode 100644 (file)
index 0000000..560c8b0
--- /dev/null
@@ -0,0 +1,19 @@
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd0.yml
new file mode 100644 (file)
index 0000000..2e1c7ee
--- /dev/null
@@ -0,0 +1,7 @@
+---
+
+devices:
+  - '/dev/sdb'
+dedicated_devices:
+  - '/dev/sdc'
+osd_scenario: "non-collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd1.yml
new file mode 100644 (file)
index 0000000..7e90071
--- /dev/null
@@ -0,0 +1,6 @@
+---
+
+devices:
+  - '/dev/sdb'
+  - '/dev/sdc'
+osd_scenario: "collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/hosts
new file mode 100644 (file)
index 0000000..e0c08b9
--- /dev/null
@@ -0,0 +1,9 @@
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0 
+osd1
+
+[mgrs]
+mon0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/test.yml
new file mode 100644 (file)
index 0000000..24e2c03
--- /dev/null
@@ -0,0 +1,31 @@
+---
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    - name: list all OSD directories
+      find:
+        paths: /var/lib/ceph/osd
+        file_type: directory
+      register: osd_paths
+
+    - name: scan all OSD directories
+      command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_paths.files }}"
+
+    - name: list all OSD JSON files
+      find:
+        paths: /etc/ceph/osd
+        file_type: file
+      register: osd_configs
+
+    - name: activate all scanned OSDs
+      command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_configs.files }}"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..b4aa759
--- /dev/null
@@ -0,0 +1,73 @@
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial, bento/ubuntu-16.04, ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful: the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on the vagrant box
+# Set to /home/vagrant/sync for the centos/7 box, /home/{ user }/vagrant for OpenStack; defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Synced folders are not needed for testing; this
+# skips mounting the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/Vagrantfile
new file mode 120000 (symlink)
index 0000000..16076e4
--- /dev/null
@@ -0,0 +1 @@
+../../../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all
new file mode 100644 (file)
index 0000000..8902bdd
--- /dev/null
@@ -0,0 +1,19 @@
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+  global:
+    osd_pool_default_pg_num: 8
+    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd0.yml
new file mode 100644 (file)
index 0000000..2e1c7ee
--- /dev/null
@@ -0,0 +1,7 @@
+---
+
+devices:
+  - '/dev/sdb'
+dedicated_devices:
+  - '/dev/sdc'
+osd_scenario: "non-collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd1.yml
new file mode 100644 (file)
index 0000000..7e90071
--- /dev/null
@@ -0,0 +1,6 @@
+---
+
+devices:
+  - '/dev/sdb'
+  - '/dev/sdc'
+osd_scenario: "collocated"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/hosts
new file mode 100644 (file)
index 0000000..e0c08b9
--- /dev/null
@@ -0,0 +1,9 @@
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0 
+osd1
+
+[mgrs]
+mon0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml
new file mode 100644 (file)
index 0000000..24e2c03
--- /dev/null
@@ -0,0 +1,31 @@
+---
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    - name: list all OSD directories
+      find:
+        paths: /var/lib/ceph/osd
+        file_type: directory
+      register: osd_paths
+
+    - name: scan all OSD directories
+      command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_paths.files }}"
+
+    - name: list all OSD JSON files
+      find:
+        paths: /etc/ceph/osd
+        file_type: file
+      register: osd_configs
+
+    - name: activate all scanned OSDs
+      command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+      with_items:
+        - "{{ osd_configs.files }}"
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/vagrant_variables.yml
new file mode 100644 (file)
index 0000000..b4aa759
--- /dev/null
@@ -0,0 +1,73 @@
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial, bento/ubuntu-16.04, ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful: the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on the vagrant box
+# Set to /home/vagrant/sync for the centos/7 box, /home/{ user }/vagrant for OpenStack; defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Synced folders are not needed for testing; this
+# skips mounting the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location.  vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+  - { name: kernel.pid_max, value: 4194303 }
+  - { name: fs.file-max, value: 26234859 }
+
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/tox.ini
deleted file mode 100644 (file)
index 6e0dfbf..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-[tox]
-envlist = {centos7,xenial}-{create,prepare_activate}
-skipsdist = True
-
-[testenv]
-whitelist_externals =
-    vagrant
-    bash
-    git
-passenv=*
-setenv=
-  ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
-  ANSIBLE_STDOUT_CALLBACK = debug
-  ANSIBLE_RETRY_FILES_ENABLED = False
-  VAGRANT_CWD = {changedir}
-  CEPH_VOLUME_DEBUG = 1
-deps=
-  ansible==2.3.1
-  testinfra==1.6.0
-  pytest-xdist
-changedir=
-  centos7-create: {toxinidir}/centos7/create
-  xenial-create: {toxinidir}/xenial/create
-  # TODO: these are placeholders for now, eventually we want to
-  # test the prepare/activate workflow of ceph-volume as well
-  xenial-prepare_activate: {toxinidir}/xenial/prepare_activate
-  centos7-prepare_activate: {toxinidir}/xenial/prepare_activate
-commands=
-  git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
-
-  vagrant up --no-provision {posargs:--provider=virtualbox}
-  bash {toxinidir}/scripts/generate_ssh_config.sh {changedir}
-
-  # create logical volumes to test with on the vms
-  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/lvm_setup.yml
-
-  # use ceph-ansible to deploy a ceph cluster on the vms
-  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
-
-  # prepare nodes for testing with testinfra
-  ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
-
-  # test cluster state using ceph-ansible tests
-  testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
-
-  # reboot all vms
-  vagrant reload --no-provision
-
-  # retest to ensure cluster came back up correctly after rebooting
-  testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
-
-  vagrant destroy --force
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/Vagrantfile
deleted file mode 120000 (symlink)
index 2572fa2..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../Vagrantfile
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/group_vars/all
deleted file mode 100644 (file)
index e7c1f72..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
----
-
-ceph_dev: True
-cluster: ceph
-public_network: "192.168.3.0/24"
-cluster_network: "192.168.4.0/24"
-monitor_interface: eth1
-journal_size: 100
-osd_objectstore: "filestore"
-osd_scenario: lvm
-ceph_origin: 'repository'
-ceph_repository: 'dev'
-copy_admin_key: true
-# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
-lvm_volumes:
-  - data: data-lv1
-    journal: /dev/sdc1
-    data_vg: test_group
-  - data: data-lv2
-    journal: journal1
-    data_vg: test_group
-    journal_vg: journals
-os_tuning_params:
-  - { name: kernel.pid_max, value: 4194303 }
-  - { name: fs.file-max, value: 26234859 }
-ceph_conf_overrides:
-  global:
-    osd_pool_default_pg_num: 8
-    osd_pool_default_size: 1
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/hosts
deleted file mode 100644 (file)
index f6a265a..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-[mons]
-mon0
-
-[osds]
-osd0
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/vagrant_variables.yml
deleted file mode 100644 (file)
index 82b330e..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
----
-# DEFINE THE NUMBER OF VMS TO RUN
-mon_vms: 1
-osd_vms: 1
-mds_vms: 0
-rgw_vms: 0
-nfs_vms: 0
-rbd_mirror_vms: 0
-client_vms: 0
-iscsi_gw_vms: 0
-mgr_vms: 0
-
-# SUBNETS TO USE FOR THE VMS
-public_subnet: 192.168.3
-cluster_subnet: 192.168.4
-
-# MEMORY
-# set 1024 for CentOS
-memory: 512
-
-# Ethernet interface name
-# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
-eth: 'eth1'
-
-# VAGRANT BOX
-# Ceph boxes are *strongly* suggested. They are under better control and will
-# not get updated frequently unless required for build systems. These are (for
-# now):
-#
-# * ceph/ubuntu-xenial
-#
-# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
-# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
-# libvirt CentOS: centos/7
-# parallels Ubuntu: parallels/ubuntu-14.04
-# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
-# For more boxes have a look at:
-#   - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
-#   - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
-vagrant_box: ceph/ubuntu-xenial
-#ssh_private_key_path: "~/.ssh/id_rsa"
-# The sync directory changes based on vagrant box
-# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
-#vagrant_sync_dir: /home/vagrant/sync
-#vagrant_sync_dir: /
-# Disables synced folder creation. Not needed for testing, will skip mounting
-# the vagrant directory on the remote box regardless of the provider.
-vagrant_disable_synced_folder: true
-# VAGRANT URL
-# This is a URL to download an image from an alternate location.  vagrant_box
-# above should be set to the filename of the image.
-# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
-# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
-# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
index 917469128da7e7b4e9c478cb4fe7edc578b54eac..22b962b1d52d7f8f36e627d7e8e61269d27b9ae1 100644 (file)
@@ -1,10 +1,11 @@
 import pytest
 import argparse
+from ceph_volume import exceptions
 from ceph_volume.util import arg_validators
 
 
 invalid_lv_paths = [
-    '', 'lv_name', '///', '/lv_name', 'lv_name/',
+    '', 'lv_name', '/lv_name', 'lv_name/',
     '/dev/lv_group/lv_name'
 ]
 
@@ -22,3 +23,31 @@ class TestLVPath(object):
     def test_is_valid(self):
         path = 'vg/lv'
         assert self.validator(path) == path
+
+    def test_abspath_is_valid(self):
+        path = '/'
+        assert self.validator(path) == path
+
+
+class TestOSDPath(object):
+
+    def setup(self):
+        self.validator = arg_validators.OSDPath()
+
+    def test_is_not_root(self):
+        with pytest.raises(exceptions.SuperUserError):
+            self.validator('')
+
+    def test_path_is_not_a_directory(self, is_root, tmpfile, monkeypatch):
+        monkeypatch.setattr(arg_validators.disk, 'is_partition', lambda x: False)
+        validator = arg_validators.OSDPath()
+        with pytest.raises(argparse.ArgumentError):
+            validator(tmpfile())
+
+    def test_files_are_missing(self, is_root, tmpdir, monkeypatch):
+        tmppath = str(tmpdir)
+        monkeypatch.setattr(arg_validators.disk, 'is_partition', lambda x: False)
+        validator = arg_validators.OSDPath()
+        with pytest.raises(argparse.ArgumentError) as error:
+            validator(tmppath)
+        assert 'Required file (ceph_fsid) was not found in OSD' in str(error)
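The TestOSDPath cases rely on two fixtures that are not part of this hunk, `is_root` and `tmpfile`, presumably defined in the suite's conftest.py. A plausible shape for them — an assumption to make the tests readable, not the repository's actual conftest:

    import os
    import pytest

    @pytest.fixture
    def is_root(monkeypatch):
        # OSDPath is wrapped in @decorators.needs_root; pretend to be uid 0
        # (assumes needs_root consults os.getuid)
        monkeypatch.setattr(os, "getuid", lambda: 0)

    @pytest.fixture
    def tmpfile(tmpdir):
        def create(name="file", contents=""):
            path = os.path.join(str(tmpdir), name)
            with open(path, "w") as f:
                f.write(contents)
            return path
        return create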
index 7cb6a1f14fa8cd300744224a22f164d55369e8c1..56b88b3f4aef731e40e3eeb7a25442b9f9096697 100644 (file)
@@ -1,6 +1,7 @@
 import os
 import pwd
 import getpass
+import pytest
 from textwrap import dedent
 from ceph_volume.util import system
 
@@ -34,7 +35,74 @@ class TestMkdirP(object):
         assert os.path.isdir(path)
 
 
-class TestIsMounted(object):
+@pytest.fixture
+def fake_proc(tmpdir, monkeypatch):
+    PROCDIR = str(tmpdir)
+    proc_path = os.path.join(PROCDIR, 'mounts')
+    with open(proc_path, 'w') as f:
+        f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
+            rootfs / rootfs rw 0 0
+            sysfs /sys sysfs rw,seclabel,nosuid,nodev,noexec,relatime 0 0
+            proc /proc proc rw,nosuid,nodev,noexec,relatime 0 0
+            devtmpfs /dev devtmpfs rw,seclabel,nosuid,size=238292k,nr_inodes=59573,mode=755 0 0
+            securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0
+            tmpfs /dev/shm tmpfs rw,seclabel,nosuid,nodev 0 0
+            devpts /dev/pts devpts rw,seclabel,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 0 0
+            tmpfs /run tmpfs rw,seclabel,nosuid,nodev,mode=755 0 0
+            tmpfs /sys/fs/cgroup tmpfs ro,seclabel,nosuid,nodev,noexec,mode=755 0 0
+            cgroup /sys/fs/cgroup/systemd cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd 0 0
+            cgroup /sys/fs/cgroup/freezer cgroup rw,nosuid,nodev,noexec,relatime,freezer 0 0
+            configfs /sys/kernel/config configfs rw,relatime 0 0
+            /dev/mapper/VolGroup00-LogVol00 / xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
+            selinuxfs /sys/fs/selinux selinuxfs rw,relatime 0 0
+            debugfs /sys/kernel/debug debugfs rw,relatime 0 0
+            hugetlbfs /dev/hugepages hugetlbfs rw,seclabel,relatime 0 0
+            mqueue /dev/mqueue mqueue rw,seclabel,relatime 0 0
+            sunrpc /far/lib/nfs/rpc_pipefs rpc_pipefs rw,relatime 0 0
+            /dev/sde4 /two/field/path
+            nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
+            /dev/sde2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
+            tmpfs /far/lib/ceph/osd/ceph-5 tmpfs rw,seclabel,relatime 0 0
+            tmpfs /far/lib/ceph/osd/ceph-7 tmpfs rw,seclabel,relatime 0 0
+            /dev/sda1 /far/lib/ceph/osd/ceph-0 xfs rw,seclabel,noatime,attr2,inode64,noquota 0 0
+            tmpfs /run/user/1000 tmpfs rw,seclabel,nosuid,nodev,relatime,size=50040k,mode=700,uid=1000,gid=1000 0 0
+            /dev/sdc2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
+            tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
+    monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
+    monkeypatch.setattr(os.path, 'exists', lambda x: True)
+
+
+class TestPathIsMounted(object):
+
+    def test_is_mounted(self, fake_proc):
+        assert system.path_is_mounted('/boot') is True
+
+    def test_is_not_mounted(self, fake_proc):
+        assert system.path_is_mounted('/far/fib/feph') is False
+
+    def test_is_not_mounted_at_destination(self, fake_proc):
+        assert system.path_is_mounted('/boot', destination='/dev/sda1') is False
+
+    def test_is_mounted_at_destination(self, fake_proc):
+        assert system.path_is_mounted('/boot', destination='/dev/sdc2') is True
+
+
+class TestDeviceIsMounted(object):
+
+    def test_is_mounted(self, fake_proc):
+        assert system.device_is_mounted('/dev/sda1') is True
+
+    def test_path_is_not_device(self, fake_proc):
+        assert system.device_is_mounted('/far/lib/ceph/osd/ceph-7') is False
+
+    def test_is_not_mounted_at_destination(self, fake_proc):
+        assert system.device_is_mounted('/dev/sda1', destination='/far/lib/ceph/osd/test-1') is False
+
+    def test_is_mounted_at_destination(self, fake_proc):
+        assert system.device_is_mounted('/dev/sda1', destination='/far/lib/ceph/osd/ceph-7') is False
+
+
+class TestGetMounts(object):
 
     def test_not_mounted(self, tmpdir, monkeypatch):
         PROCDIR = str(tmpdir)
@@ -42,48 +110,47 @@ class TestIsMounted(object):
         with open(proc_path, 'w') as f:
             f.write('')
         monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
-        assert system.is_mounted('sdb') is False
+        assert system.get_mounts() == {}
 
-    def test_is_mounted_(self, tmpdir, monkeypatch):
-        PROCDIR = str(tmpdir)
-        proc_path = os.path.join(PROCDIR, 'mounts')
-        with open(proc_path, 'w') as f:
-            f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
-                    /dev/sdc2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
-                    tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
-        monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
-        monkeypatch.setattr(os.path, 'exists', lambda x: True)
-        assert system.is_mounted('/dev/sdc2') is True
+    def test_is_mounted_(self, fake_proc):
+        result = system.get_mounts()
+        assert result['/dev/sdc2'] == ['/boot']
 
-    def test_ignores_two_fields(self, tmpdir, monkeypatch):
-        PROCDIR = str(tmpdir)
-        proc_path = os.path.join(PROCDIR, 'mounts')
-        with open(proc_path, 'w') as f:
-            f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
-                    /dev/sdc2 /boot
-                    tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
-        monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
-        monkeypatch.setattr(os.path, 'exists', lambda x: True)
-        assert system.is_mounted('/dev/sdc2') is False
+    def test_ignores_two_fields(self, fake_proc):
+        result = system.get_mounts()
+        assert result.get('/dev/sde4') is None
 
-    def test_not_mounted_at_destination(self, tmpdir, monkeypatch):
-        PROCDIR = str(tmpdir)
-        proc_path = os.path.join(PROCDIR, 'mounts')
-        with open(proc_path, 'w') as f:
-            f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
-                    /dev/sdc2 /var/lib/ceph/osd/ceph-9 xfs rw,attr2,inode64,noquota 0 0
-                    tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
-        monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
-        monkeypatch.setattr(os.path, 'exists', lambda x: True)
-        assert system.is_mounted('/dev/sdc2', '/var/lib/ceph/osd/ceph-0') is False
+    def test_tmpfs_is_reported(self, fake_proc):
+        result = system.get_mounts()
+        assert result['tmpfs'][0] == '/dev/shm'
+
+    def test_non_skip_devs_arent_reported(self, fake_proc):
+        result = system.get_mounts()
+        assert result.get('cgroup') is None
+
+    def test_multiple_mounts_are_appended(self, fake_proc):
+        result = system.get_mounts()
+        assert len(result['tmpfs']) == 7
 
-    def test_is_mounted_at_destination(self, tmpdir, monkeypatch):
+    def test_nonexistent_devices_are_skipped(self, tmpdir, monkeypatch):
         PROCDIR = str(tmpdir)
         proc_path = os.path.join(PROCDIR, 'mounts')
         with open(proc_path, 'w') as f:
             f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
-                    /dev/sdc2 /var/lib/ceph/osd/ceph-0 xfs rw,attr2,inode64,noquota 0 0
-                    tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
+                    /dev/sda1 /far/lib/ceph/osd/ceph-0 xfs rw,attr2,inode64,noquota 0 0
+                    /dev/sda2 /far/lib/ceph/osd/ceph-1 xfs rw,attr2,inode64,noquota 0 0"""))
         monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
-        monkeypatch.setattr(os.path, 'exists', lambda x: True)
-        assert system.is_mounted('/dev/sdc2', '/var/lib/ceph/osd/ceph-0') is True
+        monkeypatch.setattr(os.path, 'exists', lambda x: False if x == '/dev/sda1' else True)
+        result = system.get_mounts()
+        assert result.get('/dev/sda1') is None
+
+
+class TestIsBinary(object):
+
+    def test_is_binary(self, tmpfile):
+        binary_path = tmpfile(contents='asd\n\nlkjh\x00')
+        assert system.is_binary(binary_path)
+
+    def test_is_not_binary(self, tmpfile):
+        binary_path = tmpfile(contents='asd\n\nlkjh0')
+        assert system.is_binary(binary_path) is False
index feb4707165a5d864ba46112dbbd8bad2d1c7f32e..349d5da173861dcd4d3766e307b42243ff8d77e3 100644 (file)
@@ -1,4 +1,8 @@
 import argparse
+import os
+from ceph_volume import terminal
+from ceph_volume import decorators
+from ceph_volume.util import disk
 
 
 class LVPath(object):
@@ -7,12 +11,20 @@ class LVPath(object):
 
         <vg name>/<lv name>
 
+    Or a full path to a device, like ``/dev/sda``
+
     This is because for LVM it is better to be specific about which volume
     group an LV belongs to.
     """
 
     def __call__(self, string):
         error = None
+        if string.startswith('/'):
+            if not os.path.exists(string):
+                error = "Argument (device) does not exist: %s" % string
+                raise argparse.ArgumentError(None, error)
+            else:
+                return string
         try:
             vg, lv = string.split('/')
         except ValueError:
@@ -27,3 +39,35 @@ class LVPath(object):
         if error:
             raise argparse.ArgumentError(None, error)
         return string
+
+
+class OSDPath(object):
+    """
+    Validate that the path exists and looks like an OSD directory.
+    """
+
+    @decorators.needs_root
+    def __call__(self, string):
+        if not os.path.exists(string):
+            error = "Path does not exist: %s" % string
+            raise argparse.ArgumentError(None, error)
+
+        arg_is_partition = disk.is_partition(string)
+        if arg_is_partition:
+            return os.path.abspath(string)
+        absolute_path = os.path.abspath(string)
+        if not os.path.isdir(absolute_path):
+            error = "Argument is not a directory or device which is required to scan"
+            raise argparse.ArgumentError(None, error)
+        key_files = ['ceph_fsid', 'fsid', 'keyring', 'ready', 'type', 'whoami']
+        dir_files = os.listdir(absolute_path)
+        for key_file in key_files:
+            if key_file not in dir_files:
+                terminal.error('All following files must exist in path: %s' % ' '.join(key_files))
+                error = "Required file (%s) was not found in OSD dir path: %s" % (
+                    key_file,
+                    absolute_path
+                )
+                raise argparse.ArgumentError(None, error)
+
+        return os.path.abspath(string)
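
Both validators are designed to be plugged into argparse as ``type=`` callables, so bad input fails at parse time with a normal argparse error. Below is a minimal, hedged sketch of that wiring; it assumes the module path ``ceph_volume.util.arg_validators``, and the argument names are invented for illustration:

    import argparse
    from ceph_volume.util import arg_validators

    parser = argparse.ArgumentParser(prog='ceph-volume lvm prepare')
    # LVPath accepts either 'vg/lv' or an absolute device path like /dev/sda
    parser.add_argument('--data', type=arg_validators.LVPath())
    # OSDPath is decorated with needs_root and verifies that the OSD key
    # files (ceph_fsid, fsid, keyring, ready, type, whoami) are all present
    parser.add_argument('--osd-path', type=arg_validators.OSDPath())
    args = parser.parse_args(['--data', 'vg0/osd-data'])
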
index 0d3061d3cd069166024eb6f280de189773537dc7..da3dc93413ced3190a4e74c8ad564d5c9cb30e45 100644 (file)
@@ -1,3 +1,5 @@
+import os
+import stat
 from ceph_volume import process
 
 
@@ -22,3 +24,160 @@ def get_device_from_partuuid(partuuid):
         ['sudo', 'blkid', '-t', 'PARTUUID="%s"' % partuuid, '-o', 'device']
     )
     return ' '.join(out).strip()
+
+
+def _stat_is_device(stat_obj):
+    """
+    Helper function that interprets the ``st_mode`` attribute of an ``os.stat``
+    result, so that other functions can call ``os.stat`` once and interpret it
+    several times
+    """
+    return stat.S_ISBLK(stat_obj)
+
+
+def lsblk(device, columns=None):
+    """
+    Create a dictionary of identifying values for a device using ``lsblk``.
+    Each supported column is a key, in its *raw* format (all uppercase
+    usually).  ``lsblk`` has support for certain "columns" (in blkid these
+    would be labels), and these columns vary between distributions and
+    ``lsblk`` versions. The newer versions support a richer set of columns,
+    while older ones were a bit limited.
+
+    These are the default lsblk columns reported which are safe to use for
+    Ubuntu 14.04.5 LTS:
+
+         NAME  device name
+        KNAME  internal kernel device name
+      MAJ:MIN  major:minor device number
+       FSTYPE  filesystem type
+   MOUNTPOINT  where the device is mounted
+        LABEL  filesystem LABEL
+         UUID  filesystem UUID
+           RO  read-only device
+           RM  removable device
+        MODEL  device identifier
+         SIZE  size of the device
+        STATE  state of the device
+        OWNER  user name
+        GROUP  group name
+         MODE  device node permissions
+    ALIGNMENT  alignment offset
+       MIN-IO  minimum I/O size
+       OPT-IO  optimal I/O size
+      PHY-SEC  physical sector size
+      LOG-SEC  logical sector size
+         ROTA  rotational device
+        SCHED  I/O scheduler name
+      RQ-SIZE  request queue size
+         TYPE  device type
+     DISC-ALN  discard alignment offset
+    DISC-GRAN  discard granularity
+     DISC-MAX  discard max bytes
+    DISC-ZERO  discard zeroes data
+
+    There is a bug in ``lsblk`` where using all the available (supported)
+    columns will result in no output (!). To work around this, the following
+    columns have been removed from the default reporting columns:
+
+    * RQ-SIZE (request queue size)
+    * MIN-IO  minimum I/O size
+    * OPT-IO  optimal I/O size
+
+    These should be available however when using `columns`. For example::
+
+        >>> lsblk('/dev/sda1', columns=['OPT-IO'])
+        {'OPT-IO': '0'}
+
+    Normal CLI output, as filtered by the flags in this function, will look like::
+
+        $ sudo lsblk --nodeps -P -o NAME,KNAME,MAJ:MIN,FSTYPE,MOUNTPOINT
+        NAME="sda1" KNAME="sda1" MAJ:MIN="8:1" FSTYPE="ext4" MOUNTPOINT="/"
+
+    :param columns: A list of columns to report as keys, in their original form.
+    """
+    default_columns = [
+        'NAME', 'KNAME', 'MAJ:MIN', 'FSTYPE', 'MOUNTPOINT', 'LABEL', 'UUID',
+        'RO', 'RM', 'MODEL', 'SIZE', 'STATE', 'OWNER', 'GROUP', 'MODE',
+        'ALIGNMENT', 'PHY-SEC', 'LOG-SEC', 'ROTA', 'SCHED', 'TYPE', 'DISC-ALN',
+        'DISC-GRAN', 'DISC-MAX', 'DISC-ZERO'
+    ]
+    device = device.rstrip('/')
+    columns = columns or default_columns
+    # --nodeps -> Avoid adding children/parents to the device, only give information
+    #             on the actual device we are querying for
+    # -P       -> Produce pairs of COLUMN="value"
+    # -o       -> Use the columns specified or default ones provided by this function
+    command = ['sudo', 'lsblk', '--nodeps', '-P', '-o']
+    command.append(','.join(columns))
+    command.append(device)
+    out, err, rc = process.call(command)
+
+    if rc != 0:
+        return {}
+
+    # parse the COLUMN="value" output to construct the dictionary
+    pairs = ' '.join(out).split()
+    parsed = {}
+    for pair in pairs:
+        try:
+            column, value = pair.split('=')
+        except ValueError:
+            continue
+        parsed[column] = value.strip().strip('"')
+    return parsed
+
+
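One limitation of the whitespace split above: a quoted value that itself contains spaces (say ``LABEL="my disk"``) gets torn apart, and values containing ``=`` are dropped by the ``ValueError`` guard. A hedged alternative sketch using the standard library's ``shlex``, which honors the quoting that ``lsblk -P`` emits; this is illustrative only, not what the diff ships:

    import shlex

    def parse_lsblk_pairs(line):
        # shlex keeps quoted values intact, so LABEL="my disk" survives
        parsed = {}
        for token in shlex.split(line):
            key, sep, value = token.partition('=')
            if sep:  # skip tokens that are not KEY=value pairs
                parsed[key] = value
        return parsed

    parse_lsblk_pairs('NAME="sda1" LABEL="my disk" TYPE="part"')
    # -> {'NAME': 'sda1', 'LABEL': 'my disk', 'TYPE': 'part'}
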
+def _lsblk_type(device):
+    """
+    Helper function that will use the ``TYPE`` label output of ``lsblk`` to determine
+    if a device is a partition or disk.
+    It does not process the output to return a boolean, but it does process it to return the
+    """
+    out, err, rc = process.call(
+        ['sudo', 'blkid', '-s', 'PARTUUID', '-o', 'value', device]
+    )
+    return ' '.join(out).strip()
+
+
+def is_device(dev):
+    """
+    Boolean to determine if a given device is a block device (**not**
+    a partition!)
+
+    For example: /dev/sda would return True, but not /dev/sdc1
+    """
+    if not os.path.exists(dev):
+        return False
+    # use lsblk first, fall back to using stat
+    TYPE = lsblk(dev).get('TYPE')
+    if TYPE:
+        return TYPE == 'disk'
+
+    # fallback to stat
+    return _stat_is_device(os.lstat(dev).st_mode)
+
+
+def is_partition(dev):
+    """
+    Boolean to determine if a given device is a partition, like /dev/sda1
+    """
+    if not os.path.exists(dev):
+        return False
+    # use lsblk first, fall back to using stat
+    TYPE = lsblk(dev).get('TYPE')
+    if TYPE:
+        return TYPE == 'part'
+
+    # fallback to stat: a partition must itself be a block device node, so
+    # anything that is not one cannot be a partition
+    stat_obj = os.stat(dev)
+    if not _stat_is_device(stat_obj.st_mode):
+        return False
+
+    major = os.major(stat_obj.st_rdev)
+    minor = os.minor(stat_obj.st_rdev)
+    if os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor)):
+        return True
+    return False
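
The stat fallback in both helpers hinges on the ``/sys/dev/block/<major>:<minor>/partition`` attribute, which the kernel exposes only for partitions, never for whole disks. A self-contained sketch of just that check, assuming a Linux host:

    import os
    import stat

    def sysfs_is_partition(dev):
        # partitions are block device nodes whose sysfs entry carries a
        # 'partition' attribute; whole disks (e.g. /dev/sda) do not
        st = os.stat(dev)
        if not stat.S_ISBLK(st.st_mode):
            return False
        major, minor = os.major(st.st_rdev), os.minor(st.st_rdev)
        return os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor))

    # sysfs_is_partition('/dev/sda1')  -> True on a typical Linux box
    # sysfs_is_partition('/dev/sda')   -> False
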
index eefa0adc2041526ee1358c50762ae5146e295c77..6b38fe0978d536a1705022888ada328d1b241dc1 100644 (file)
@@ -57,8 +57,21 @@ def create_id(fsid, json_secrets):
     return ' '.join(stdout).strip()
 
 
-def create_path(osd_id):
+def mount_tmpfs(path):
+    process.run([
+        'sudo',
+        'mount',
+        '-t',
+        'tmpfs', 'tmpfs',
+        path
+    ])
+
+
+def create_osd_path(osd_id, tmpfs=False):
+    path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
     system.mkdir_p('/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id))
+    if tmpfs:
+        mount_tmpfs(path)
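
The ``tmpfs`` flag exists because a bluestore OSD directory holds only small metadata files and symlinks that get regenerated at activation time, so it can live on a throwaway in-memory mount. A hedged usage sketch, with an invented OSD id and the default 'ceph' cluster name assumed:

    # roughly equivalent to:
    #   mkdir -p /var/lib/ceph/osd/ceph-0
    #   sudo mount -t tmpfs tmpfs /var/lib/ceph/osd/ceph-0
    create_osd_path('0', tmpfs=True)
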
 
 
 def format_device(device):
@@ -98,15 +111,39 @@ def mount_osd(device, osd_id):
     process.run(command)
 
 
-def link_journal(journal_device, osd_id):
-    journal_path = '/var/lib/ceph/osd/%s-%s/journal' % (
+def _link_device(device, device_type, osd_id):
+    """
+    Allow linking any device type in an OSD directory. ``device`` must be the
+    source, given as an absolute path, and ``device_type`` will be the
+    destination name, like 'journal' or 'block'
+    """
+    device_path = '/var/lib/ceph/osd/%s-%s/%s' % (
         conf.cluster,
-        osd_id
+        osd_id,
+        device_type
     )
-    command = ['sudo', 'ln', '-s', journal_device, journal_path]
+    command = ['sudo', 'ln', '-s', device, device_path]
+    system.chown(device)
+
     process.run(command)
 
 
+def link_journal(journal_device, osd_id):
+    _link_device(journal_device, 'journal', osd_id)
+
+
+def link_block(block_device, osd_id):
+    _link_device(block_device, 'block', osd_id)
+
+
+def link_wal(wal_device, osd_id):
+    _link_device(wal_device, 'block.wal', osd_id)
+
+
+def link_db(db_device, osd_id):
+    _link_device(db_device, 'block.db', osd_id)
+
+
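Each wrapper drops a predictably named symlink inside the OSD directory, which is how ceph-osd later finds its backing devices. A hedged sketch of the resulting layout for OSD 0 with a separate WAL and DB; device paths are invented and the default 'ceph' cluster name is assumed:

    link_block('/dev/vg0/osd-block', '0')  # ceph-0/block     -> /dev/vg0/osd-block
    link_wal('/dev/vg0/osd-wal', '0')      # ceph-0/block.wal -> /dev/vg0/osd-wal
    link_db('/dev/vg0/osd-db', '0')        # ceph-0/block.db  -> /dev/vg0/osd-db
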
 def get_monmap(osd_id):
     """
     Before creating the OSD files, a monmap needs to be retrieved so that it
@@ -130,7 +167,64 @@ def get_monmap(osd_id):
     ])
 
 
-def osd_mkfs(osd_id, fsid):
+def osd_mkfs_bluestore(osd_id, fsid, keyring=None, wal=False, db=False):
+    """
+    Create the files for the OSD to function. A normal call will look like:
+
+          ceph-osd --cluster ceph --mkfs --mkkey -i 0 \
+                   --monmap /var/lib/ceph/osd/ceph-0/activate.monmap \
+                   --osd-data /var/lib/ceph/osd/ceph-0 \
+                   --osd-uuid 8d208665-89ae-4733-8888-5d3bfbeeec6c \
+                   --keyring /var/lib/ceph/osd/ceph-0/keyring \
+                   --setuser ceph --setgroup ceph
+
+    In some cases it is required to use the keyring; when it is passed in as
+    a keyword argument it is used as part of the ceph-osd command
+    """
+    path = '/var/lib/ceph/osd/%s-%s/' % (conf.cluster, osd_id)
+    monmap = os.path.join(path, 'activate.monmap')
+
+    system.chown(path)
+
+    base_command = [
+        'sudo',
+        'ceph-osd',
+        '--cluster', conf.cluster,
+        # undocumented flag, sets the `type` file to contain 'bluestore'
+        '--osd-objectstore', 'bluestore',
+        '--mkfs',
+        '-i', osd_id,
+        '--monmap', monmap,
+    ]
+
+    supplementary_command = [
+        '--osd-data', path,
+        '--osd-uuid', fsid,
+        '--setuser', 'ceph',
+        '--setgroup', 'ceph'
+    ]
+
+    if keyring is not None:
+        base_command.extend(['--key', keyring])
+
+    if wal:
+        base_command.extend(
+            ['--bluestore-block-wal-path', wal]
+        )
+        system.chown(wal)
+
+    if db:
+        base_command.extend(
+            ['--bluestore-block-db-path', db]
+        )
+        system.chown(db)
+
+    command = base_command + supplementary_command
+
+    process.run(command, obfuscate='--key')
+
+
+def osd_mkfs_filestore(osd_id, fsid):
     """
     Create the files for the OSD to function. A normal call will look like:
 
@@ -154,6 +248,8 @@ def osd_mkfs(osd_id, fsid):
         'sudo',
         'ceph-osd',
         '--cluster', conf.cluster,
+        # undocumented flag, sets the `type` file to contain 'filestore'
+        '--osd-objectstore', 'filestore',
         '--mkfs',
         '-i', osd_id,
         '--monmap', monmap,
index 084a0e0d3710042a8b9d38aeaf530d1038c8ffd0..d580a4c28f08f33cb7b5fe12fdf6d04ce8c1be90 100644 (file)
@@ -2,6 +2,7 @@ import errno
 import os
 import pwd
 import platform
+import tempfile
 import uuid
 from ceph_volume import process
 from . import as_string
@@ -68,37 +69,122 @@ def chown(path, recursive=True):
         os.chown(path, uid, gid)
 
 
-def is_mounted(source, destination=None):
+def is_binary(path):
     """
-    Check if the given device is mounted, optionally validating destination.
-    This relies on absolute path devices, it will ignore non-absolute
-    entries like::
+    Detect whether a file is binary. It may falsely report a file as binary
+    when it is utf-16 encoded. In the ceph universe there is no such risk (yet)
+    """
+    with open(path, 'rb') as fp:
+        contents = fp.read(8192)
+    if b'\x00' in contents:  # a null byte may signal binary
+        return True
+    return False
+
+
+class tmp_mount(object):
+    """
+    Temporarily mount a device on a temporary directory,
+    and unmount it upon exit
+    """
+
+    def __init__(self, device):
+        self.device = device
+        self.path = None
+
+    def __enter__(self):
+        self.path = tempfile.mkdtemp()
+        process.run([
+            'sudo',
+            'mount',
+            '-v',
+            self.device,
+            self.path
+        ])
+        return self.path
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        process.run([
+            'sudo',
+            'umount',
+            '-v',
+            self.path
+        ])
+
+
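A hedged usage sketch for the context manager; the device path is invented. Note that only the mount is undone on exit, the temporary directory itself is left behind:

    import os
    from ceph_volume.util import system

    with system.tmp_mount('/dev/sdb1') as mount_point:
        # inspect the mounted filesystem while the context is open
        print(os.listdir(mount_point))
    # on exit the device is unmounted (umount -v <mount_point>)
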
+def path_is_mounted(path, destination=None):
+    """
+    Check if the given path is mounted
+    """
+    mounts = get_mounts(paths=True)
+    realpath = os.path.realpath(path)
+    mounted_locations = mounts.get(realpath, [])
+
+    if destination:
+        if destination.startswith('/'):
+            destination = os.path.realpath(destination)
+        return destination in mounted_locations
+    return mounted_locations != []
+
 
-        tmpfs /run tmpfs rw,seclabel,nosuid,nodev,mode=755 0 0
+def device_is_mounted(dev, destination=None):
+    """
+    Check if the given device is mounted, optionally validating that a
+    destination exists
+    """
+    mounts = get_mounts(devices=True)
+    realpath = os.path.realpath(dev) if dev.startswith('/') else dev
+    destination = os.path.realpath(destination) if destination else None
+    mounted_locations = mounts.get(realpath, [])
+
+    if destination:
+        return destination in mounted_locations
+    return mounted_locations != []
+
+
+def get_mounts(devices=False, paths=False):
+    """
+    Create a mapping of all available system mounts so that other helpers can
+    detect nicely what path or device is mounted
 
-    But will parse paths that are absolute like::
+    It ignores (most) nonexistent devices, but since some setups might need
+    some extra device information, it will make an exception for:
 
-        /dev/sdc2 /boot xfs rw,attr2,inode64,noquota 0 0
+    - tmpfs
+    - devtmpfs
 
-    When destination is passed in, it will check that the entry where the
-    source appears is mounted to where destination defines. This is useful so
-    that an error message can report that a source is not mounted at an
-    expected destination.
+    If ``devices`` is set to ``True`` the mapping will be device-to-path(s);
+    if ``paths`` is set to ``True`` then the mapping will be
+    path-to-device(s)
     """
-    dev = os.path.realpath(source)
-    with open(PROCDIR + '/mounts', 'rb') as proc_mounts:
-        for line in proc_mounts:
-            fields = line.split()
-            if len(fields) < 3:
+    devices_mounted = {}
+    paths_mounted = {}
+    do_not_skip = ['tmpfs', 'devtmpfs']
+    default_to_devices = devices is False and paths is False
+
+    with open(PROCDIR + '/mounts', 'rb') as mounts:
+        proc_mounts = mounts.readlines()
+
+    for line in proc_mounts:
+        fields = [as_string(f) for f in line.split()]
+        if len(fields) < 3:
+            continue
+        device = os.path.realpath(fields[0]) if fields[0].startswith('/') else fields[0]
+        path = os.path.realpath(fields[1])
+        # only care about actual existing devices
+        if not os.path.exists(device) or not device.startswith('/'):
+            if device not in do_not_skip:
                 continue
-            mounted_device = fields[0]
-            mounted_path = fields[1]
-            if os.path.isabs(mounted_device) and os.path.exists(mounted_device):
-                mounted_device = os.path.realpath(mounted_device)
-                if as_string(mounted_device) == dev:
-                    if destination:
-                        destination = os.path.realpath(destination)
-                        return destination == as_string(os.path.realpath(mounted_path))
-                    else:
-                        return True
-    return False
+        if device in devices_mounted.keys():
+            devices_mounted[device].append(path)
+        else:
+            devices_mounted[device] = [path]
+        if path in paths_mounted.keys():
+            paths_mounted[path].append(device)
+        else:
+            paths_mounted[path] = [device]
+
+    # Default to returning device information unless paths was explicitly requested
+    if devices is True or default_to_devices:
+        return devices_mounted
+    else:
+        return paths_mounted
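
Taken together these helpers expose both directions of the mount table; a hedged interactive sketch with invented devices:

    >>> from ceph_volume.util import system
    >>> system.get_mounts()                    # default: device -> paths
    {'/dev/sdc2': ['/boot'], 'tmpfs': ['/dev/shm', '/run']}
    >>> system.get_mounts(paths=True)          # reversed: path -> devices
    {'/boot': ['/dev/sdc2'], '/dev/shm': ['tmpfs'], '/run': ['tmpfs']}
    >>> system.device_is_mounted('/dev/sdc2', destination='/boot')
    True
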
index cb1bd86941950a99012db3e913cde9b90de4f2ca..7c1eda2c0920bdc57d356b18140eba7f0d74531f 100755 (executable)
@@ -48,7 +48,7 @@ PRIO_USEFUL = 5
 PRIO_UNINTERESTING = 2
 PRIO_DEBUGONLY = 0
 
-PRIO_DEFAULT = PRIO_USEFUL
+PRIO_DEFAULT = PRIO_INTERESTING
 
 # Make life easier on developers:
 # If our parent dir contains CMakeCache.txt and bin/init-ceph,
@@ -228,7 +228,7 @@ def validate_target(target):
                   file=sys.stderr)
             return False
 
-        if service_id in exist_ids:
+        if service_id in exist_ids or (len(exist_ids) > 0 and service_id == '*'):
             return True
         else:
             print('WARN: the service id you provided does not exist. service id should '
index 91043f6e81628a1f6e9bf65a5727871a6fbbed03..5e8f6798e3924271ba21ed375b25931ed0cba81e 100644 (file)
 
 #include <Python.h>
 
+#include <pthread.h>
+
 #include "include/types.h"
+#include "include/compat.h"
 #include "common/config.h"
 #include "common/ceph_argparse.h"
 #include "common/errno.h"
@@ -38,6 +41,8 @@ static void usage()
  */
 int main(int argc, const char **argv)
 {
+  ceph_pthread_setname(pthread_self(), "ceph-mgr");
+
   vector<const char*> args;
   argv_to_vec(argc, argv, args);
   env_to_vec(args);
index 3663bb04e4b3c3e71dc20bace01969fcb92eb7ee..41a6ee0eb10a4b246d6f2df8eed6d862165b99a7 100644 (file)
@@ -247,7 +247,6 @@ int main(int argc, const char **argv)
                         flags, "mon_data");
   ceph_heap_profiler_init();
 
-  uuid_d fsid;
   std::string val;
   for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
     if (ceph_argparse_double_dash(args, i)) {
@@ -331,10 +330,11 @@ int main(int argc, const char **argv)
     MonMap monmap;
 
     // load or generate monmap
-    if (g_conf->monmap.length()) {
-      int err = monmapbl.read_file(g_conf->monmap.c_str(), &error);
+    const auto monmap_fn = g_conf->get_val<string>("monmap");
+    if (monmap_fn.length()) {
+      int err = monmapbl.read_file(monmap_fn.c_str(), &error);
       if (err < 0) {
-       derr << argv[0] << ": error reading " << g_conf->monmap << ": " << error << dendl;
+       derr << argv[0] << ": error reading " << monmap_fn << ": " << error << dendl;
        exit(1);
       }
       try {
@@ -342,9 +342,8 @@ int main(int argc, const char **argv)
 
        // always mark seed/mkfs monmap as epoch 0
        monmap.set_epoch(0);
-      }
-      catch (const buffer::error& e) {
-       derr << argv[0] << ": error decoding monmap " << g_conf->monmap << ": " << e.what() << dendl;
+      } catch (const buffer::error& e) {
+       derr << argv[0] << ": error decoding monmap " << monmap_fn << ": " << e.what() << dendl;
        exit(1);
       }      
     } else {
@@ -393,9 +392,10 @@ int main(int argc, const char **argv)
       }
     }
 
-    if (!g_conf->fsid.is_zero()) {
-      monmap.fsid = g_conf->fsid;
-      dout(0) << argv[0] << ": set fsid to " << g_conf->fsid << dendl;
+    const auto fsid = g_conf->get_val<uuid_d>("fsid");
+    if (!fsid.is_zero()) {
+      monmap.fsid = fsid;
+      dout(0) << argv[0] << ": set fsid to " << fsid << dendl;
     }
     
     if (monmap.fsid.is_zero()) {
index d7e54a3a3faf5d8938d9444c4ee89c74bbe5a6f3..1cfda9c1ddf35d71b79490ff954cce94b7f235d8 100644 (file)
@@ -266,29 +266,6 @@ int main(int argc, const char **argv)
   cephd_preload_embedded_plugins();
 #endif
 
-  if (mkfs) {
-    common_init_finish(g_ceph_context);
-    MonClient mc(g_ceph_context);
-    if (mc.build_initial_monmap() < 0)
-      return -1;
-    if (mc.get_monmap_privately() < 0)
-      return -1;
-
-    if (mc.monmap.fsid.is_zero()) {
-      derr << "must specify cluster fsid" << dendl;
-      return -EINVAL;
-    }
-
-    int err = OSD::mkfs(g_ceph_context, store, g_conf->osd_data,
-                       mc.monmap.fsid, whoami);
-    if (err < 0) {
-      derr << TEXT_RED << " ** ERROR: error creating empty object store in "
-          << g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
-      exit(1);
-    }
-    derr << "created object store " << g_conf->osd_data
-        << " for osd." << whoami << " fsid " << mc.monmap.fsid << dendl;
-  }
   if (mkkey) {
     common_init_finish(g_ceph_context);
     KeyRing *keyring = KeyRing::create_empty();
@@ -317,6 +294,29 @@ int main(int argc, const char **argv)
        derr << "created new key in keyring " << g_conf->keyring << dendl;
     }
   }
+  if (mkfs) {
+    common_init_finish(g_ceph_context);
+    MonClient mc(g_ceph_context);
+    if (mc.build_initial_monmap() < 0)
+      return -1;
+    if (mc.get_monmap_privately() < 0)
+      return -1;
+
+    if (mc.monmap.fsid.is_zero()) {
+      derr << "must specify cluster fsid" << dendl;
+      return -EINVAL;
+    }
+
+    int err = OSD::mkfs(g_ceph_context, store, g_conf->osd_data,
+                       mc.monmap.fsid, whoami);
+    if (err < 0) {
+      derr << TEXT_RED << " ** ERROR: error creating empty object store in "
+          << g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
+      exit(1);
+    }
+    derr << "created object store " << g_conf->osd_data
+        << " for osd." << whoami << " fsid " << mc.monmap.fsid << dendl;
+  }
   if (mkfs || mkkey)
     exit(0);
   if (mkjournal) {
index 29a9c49f79faf26e56c5f496734d58fa52dc712d..1d9277a61b6ecfb1d5c01f83d2aedc4418600e3f 100644 (file)
@@ -5944,19 +5944,6 @@ void Client::unmount()
   ldout(cct, 2) << "unmounted." << dendl;
 }
 
-
-
-class C_C_Tick : public Context {
-  Client *client;
-public:
-  explicit C_C_Tick(Client *c) : client(c) {}
-  void finish(int r) override {
-    // Called back via Timer, which takes client_lock for us
-    assert(client->client_lock.is_locked_by_me());
-    client->tick();
-  }
-};
-
 void Client::flush_cap_releases()
 {
   // send any cap releases
@@ -5985,9 +5972,13 @@ void Client::tick()
   }
 
   ldout(cct, 21) << "tick" << dendl;
-  tick_event = new C_C_Tick(this);
-  timer.add_event_after(cct->_conf->client_tick_interval, tick_event);
-
+  tick_event = timer.add_event_after(
+    cct->_conf->client_tick_interval,
+    new FunctionContext([this](int) {
+       // Called back via Timer, which takes client_lock for us
+       assert(client_lock.is_locked_by_me());
+       tick();
+      }));
   utime_t now = ceph_clock_now();
 
   if (!mounted && !mds_requests.empty()) {
index e89a25440506ac0be1dd550cc011d7932251f9f5..16aef0312c15e8cb00632e01bc1b05be1a684d6a 100644 (file)
@@ -498,7 +498,6 @@ protected:
   friend class C_Client_CacheInvalidate;  // calls ino_invalidate_cb
   friend class C_Client_DentryInvalidate;  // calls dentry_invalidate_cb
   friend class C_Block_Sync; // Calls block map and protected helpers
-  friend class C_C_Tick; // Asserts on client_lock
   friend class C_Client_RequestInterrupt;
   friend class C_Client_Remount;
   friend void intrusive_ptr_release(Inode *in);
index f966c07f24aed7c48cc7f04087aed941434d8957..2f1d18d0d79c88a3caa383a915a2885e7d18e322 100644 (file)
@@ -188,6 +188,7 @@ int expire_tags(cls_method_context_t hctx, const std::string *skip_client_id) {
       if (tag.tid >= minimum_tag_tid) {
         // no need to check for tag classes beyond this point
         vals.clear();
+        more = false;
         break;
       }
     }
@@ -1047,6 +1048,7 @@ int journal_tag_list(cls_method_context_t hctx, bufferlist *in,
         // completed calculation of tag class minimums
         if (tag.tid >= minimum_tag_tid) {
           vals.clear();
+          more = false;
           break;
         }
       } else if (tag_pass == TAG_PASS_LIST) {
index 79795dbc37d142f916dacdef6744145257b132a2..90a48a82199a9ab5a38e3148459967ac4e462fbc 100644 (file)
@@ -2460,7 +2460,7 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
     CLS_ERR("object map footer read failed");
     return r;
   }
+
   try {
     bufferlist::iterator it = footer_bl.begin();
     object_map.decode_footer(it);
@@ -2496,13 +2496,14 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
   }
 
   bool updated = false;
-  for (uint64_t object_no = start_object_no; object_no < end_object_no;
-       ++object_no) {
-    uint8_t state = object_map[object_no];
+  auto it = object_map.begin() + start_object_no;
+  auto end_it = object_map.begin() + end_object_no;
+  for (; it != end_it; ++it) {
+    uint8_t state = *it;
     if ((!current_object_state || state == *current_object_state ||
         (*current_object_state == OBJECT_EXISTS &&
          state == OBJECT_EXISTS_CLEAN)) && state != new_object_state) {
-      object_map[object_no] = new_object_state;
+      *it = new_object_state;
       updated = true;
     }
   }
@@ -3167,6 +3168,22 @@ int uuid_get(cls_method_context_t hctx, std::string *mirror_uuid) {
   return 0;
 }
 
+int list_watchers(cls_method_context_t hctx,
+                  std::set<entity_inst_t> *entities) {
+  obj_list_watch_response_t watchers;
+  int r = cls_cxx_list_watchers(hctx, &watchers);
+  if (r < 0 && r != -ENOENT) {
+    CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
+    return r;
+  }
+
+  entities->clear();
+  for (auto &w : watchers.entries) {
+    entities->emplace(w.name, w.addr);
+  }
+  return 0;
+}
+
 int read_peers(cls_method_context_t hctx,
                std::vector<cls::rbd::MirrorPeer> *peers) {
   std::string last_read = PEER_KEY_PREFIX;
@@ -3419,6 +3436,7 @@ int image_status_remove(cls_method_context_t hctx,
 }
 
 int image_status_get(cls_method_context_t hctx, const string &global_image_id,
+                     const std::set<entity_inst_t> &watchers,
                     cls::rbd::MirrorImageStatus *status) {
 
   bufferlist bl;
@@ -3441,23 +3459,9 @@ int image_status_get(cls_method_context_t hctx, const string &global_image_id,
     return -EIO;
   }
 
-  obj_list_watch_response_t watchers;
-  r = cls_cxx_list_watchers(hctx, &watchers);
-  if (r < 0 && r != -ENOENT) {
-    CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
-    return r;
-  }
 
   *status = static_cast<cls::rbd::MirrorImageStatus>(ondisk_status);
-  status->up = false;
-  for (auto &w : watchers.entries) {
-    if (w.name == ondisk_status.origin.name &&
-       w.addr == ondisk_status.origin.addr) {
-      status->up = true;
-      break;
-    }
-  }
-
+  status->up = (watchers.find(ondisk_status.origin) != watchers.end());
   return 0;
 }
 
@@ -3469,11 +3473,17 @@ int image_status_list(cls_method_context_t hctx,
   int max_read = RBD_MAX_KEYS_READ;
   bool more = true;
 
+  std::set<entity_inst_t> watchers;
+  int r = list_watchers(hctx, &watchers);
+  if (r < 0) {
+    return r;
+  }
+
   while (more && mirror_images->size() < max_return) {
     std::map<std::string, bufferlist> vals;
     CLS_LOG(20, "last_read = '%s'", last_read.c_str());
-    int r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read,
-                                 &vals, &more);
+    r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read, &vals,
+                             &more);
     if (r < 0) {
       CLS_ERR("error reading mirror image directory by name: %s",
               cpp_strerror(r).c_str());
@@ -3496,7 +3506,8 @@ int image_status_list(cls_method_context_t hctx,
       (*mirror_images)[image_id] = mirror_image;
 
       cls::rbd::MirrorImageStatus status;
-      int r1 = image_status_get(hctx, mirror_image.global_image_id, &status);
+      int r1 = image_status_get(hctx, mirror_image.global_image_id, watchers,
+                                &status);
       if (r1 < 0) {
        continue;
       }
@@ -3513,20 +3524,12 @@ int image_status_list(cls_method_context_t hctx,
 
 int image_status_get_summary(cls_method_context_t hctx,
        std::map<cls::rbd::MirrorImageStatusState, int> *states) {
-  obj_list_watch_response_t watchers_;
-  int r = cls_cxx_list_watchers(hctx, &watchers_);
+  std::set<entity_inst_t> watchers;
+  int r = list_watchers(hctx, &watchers);
   if (r < 0) {
-    if (r != -ENOENT) {
-      CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
-    }
     return r;
   }
 
-  set<entity_inst_t> watchers;
-  for (auto &w : watchers_.entries) {
-    watchers.insert(entity_inst_t(w.name, w.addr));
-  }
-
   states->clear();
 
   string last_read = IMAGE_KEY_PREFIX;
@@ -3559,7 +3562,7 @@ int image_status_get_summary(cls_method_context_t hctx,
       }
 
       cls::rbd::MirrorImageStatus status;
-      image_status_get(hctx, mirror_image.global_image_id, &status);
+      image_status_get(hctx, mirror_image.global_image_id, watchers, &status);
 
       cls::rbd::MirrorImageStatusState state = status.up ? status.state :
        cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN;
@@ -3575,20 +3578,12 @@ int image_status_get_summary(cls_method_context_t hctx,
 }
 
 int image_status_remove_down(cls_method_context_t hctx) {
-  obj_list_watch_response_t watchers_;
-  int r = cls_cxx_list_watchers(hctx, &watchers_);
+  std::set<entity_inst_t> watchers;
+  int r = list_watchers(hctx, &watchers);
   if (r < 0) {
-    if (r != -ENOENT) {
-      CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
-    }
     return r;
   }
 
-  set<entity_inst_t> watchers;
-  for (auto &w : watchers_.entries) {
-    watchers.insert(entity_inst_t(w.name, w.addr));
-  }
-
   string last_read = STATUS_GLOBAL_KEY_PREFIX;
   int max_read = RBD_MAX_KEYS_READ;
   bool more = true;
@@ -4275,8 +4270,14 @@ int mirror_image_status_get(cls_method_context_t hctx, bufferlist *in,
     return -EINVAL;
   }
 
+  std::set<entity_inst_t> watchers;
+  int r = mirror::list_watchers(hctx, &watchers);
+  if (r < 0) {
+    return r;
+  }
+
   cls::rbd::MirrorImageStatus status;
-  int r = mirror::image_status_get(hctx, global_image_id, &status);
+  r = mirror::image_status_get(hctx, global_image_id, watchers, &status);
   if (r < 0) {
     return r;
   }
index 17a618053a61effcb2185ba148fa93bf411b7bbe..354a132da48813f8a6f985cdb20d56280949ab02 100644 (file)
@@ -445,8 +445,9 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
         CLS_LOG(20, "entry %s[%s] is not valid\n", key.name.c_str(), key.instance.c_str());
         continue;
       }
-
-      if (!op.list_versions && !entry.is_visible()) {
+      
+      // filter out noncurrent versions, delete markers, and initial marker
+      if (!op.list_versions && (!entry.is_visible() || op.start_obj.name == key.name)) {
         CLS_LOG(20, "entry %s[%s] is not visible\n", key.name.c_str(), key.instance.c_str());
         continue;
       }
@@ -935,6 +936,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist
     unaccount_entry(header, remove_entry);
 
     if (op.log_op && !header.syncstopped) {
+      ++header.ver; // increment index version, or we'll overwrite keys previously written
       rc = log_index_operation(hctx, remove_key, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
                                remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace);
       if (rc < 0)
@@ -1863,7 +1865,8 @@ static int rgw_bucket_clear_olh(cls_method_context_t hctx, bufferlist *in, buffe
   return 0;
 }
 
-int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+int rgw_dir_suggest_changes(cls_method_context_t hctx,
+                           bufferlist *in, bufferlist *out)
 {
   CLS_LOG(1, "rgw_dir_suggest_changes()");
 
@@ -1956,8 +1959,21 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis
         }
         break;
       case CEPH_RGW_UPDATE:
+       if (!cur_disk.exists) {
+         // this update would only have been sent by the rgw client
+         // if the rgw_bucket_dir_entry existed; however, between that
+         // check and now the entry has disappeared, so we were likely
+         // in the midst of a delete op and will not recreate the
+         // entry
+         CLS_LOG(10,
+                 "CEPH_RGW_UPDATE not applied because rgw_bucket_dir_entry"
+                 " no longer exists\n");
+         break;
+       }
+
         CLS_LOG(10, "CEPH_RGW_UPDATE name=%s instance=%s total_entries: %" PRId64 " -> %" PRId64 "\n",
                 cur_change.key.name.c_str(), cur_change.key.instance.c_str(), stats.num_entries, stats.num_entries + 1);
+
         stats.num_entries++;
         stats.total_size += cur_change.meta.accounted_size;
         stats.total_size_rounded += cls_rgw_get_rounded_size(cur_change.meta.accounted_size);
@@ -1978,10 +1994,9 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis
           }
         }
         break;
-      }
-    }
-
-  }
+      } // switch(op)
+    } // if (cur_disk.pending_map.empty())
+  } // while (!in_iter.end())
 
   if (header_changed) {
     return write_bucket_header(hctx, &header);
@@ -2900,9 +2915,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64
   bool by_user = !user.empty();
   uint32_t i = 0;
   string user_key;
-
-  if (truncated)
-    *truncated = false;
+  bool truncated_status = false;
 
   if (!by_user) {
     usage_record_prefix_by_time(end, end_key);
@@ -2922,11 +2935,14 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64
   }
 
   CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str());
-  int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, truncated);
+  int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, &truncated_status);
   if (ret < 0)
     return ret;
 
-
+  if (truncated) {
+    *truncated = truncated_status;
+  }
+      
   map<string, bufferlist>::iterator iter = keys.begin();
   if (iter == keys.end())
     return 0;
@@ -2939,11 +2955,17 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64
 
     if (!by_user && key.compare(end_key) >= 0) {
       CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+      if (truncated_status) {
+        key_iter = key;
+      }
       return 0;
     }
 
     if (by_user && key.compare(0, user_key.size(), user_key) != 0) {
       CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+      if (truncated_status) {
+        key_iter = key;
+      }
       return 0;
     }
 
index 7912578aa36c8aed8fdea07c8cb1849e760f0aea..840470e9f4d9b665a6b0d647046759a39f2099bb 100644 (file)
@@ -163,7 +163,6 @@ static int cls_user_set_buckets_info(cls_method_context_t hctx, bufferlist *in,
     if (!op.add){
       apply_entry_stats(update_entry, &entry);
     }
-
     entry.user_stats_sync = true;
 
     ret = write_entry(hctx, key, entry);
index 8595f25fd1cae35bc4a9e2821ce47679809b14da..6ffd933231606e188ed420cf2d4d11ea978c99bf 100644 (file)
@@ -101,14 +101,14 @@ struct cls_user_bucket_entry {
   cls_user_bucket bucket;
   size_t size;
   size_t size_rounded;
-  real_time creation_time;
+  ceph::real_time creation_time;
   uint64_t count;
   bool user_stats_sync;
 
   cls_user_bucket_entry() : size(0), size_rounded(0), count(0), user_stats_sync(false) {}
 
   void encode(bufferlist& bl) const {
-    ENCODE_START(7, 5, bl);
+    ENCODE_START(9, 5, bl);
     uint64_t s = size;
     __u32 mt = ceph::real_clock::to_time_t(creation_time);
     string empty_str;  // originally had the bucket name here, but we encode bucket later
@@ -121,10 +121,11 @@ struct cls_user_bucket_entry {
     ::encode(s, bl);
     ::encode(user_stats_sync, bl);
     ::encode(creation_time, bl);
+    //::encode(placement_rule, bl); removed in v9
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
-    DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+    DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
     __u32 mt;
     uint64_t s;
     string empty_str;  // backward compatibility
@@ -146,6 +147,10 @@ struct cls_user_bucket_entry {
       ::decode(user_stats_sync, bl);
     if (struct_v >= 7)
       ::decode(creation_time, bl);
+    if (struct_v == 8) { // added in v8, removed in v9
+      std::string placement_rule;
+      ::decode(placement_rule, bl);
+    }
     DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
index 28512ac800e374804349dddb326ea5bb881586a7..d5c7a852ddf145f682d92c1ce0a442264e615563 100644 (file)
@@ -18,6 +18,8 @@
 #include "common/Finisher.h"
 #include "common/Formatter.h"
 
+#define rdout(x) lgeneric_subdout(cct,reserver,x)
+
 /**
  * Manages a configurable number of asynchronous reservations.
  *
  */
 template <typename T>
 class AsyncReserver {
+  CephContext *cct;
   Finisher *f;
   unsigned max_allowed;
   unsigned min_priority;
   Mutex lock;
 
-  map<unsigned, list<pair<T, Context*> > > queues;
-  map<T, pair<unsigned, typename list<pair<T, Context*> >::iterator > > queue_pointers;
-  set<T> in_progress;
+  struct Reservation {
+    T item;
+    unsigned prio = 0;
+    Context *grant = 0;
+    Context *preempt = 0;
+    Reservation() {}
+    Reservation(T i, unsigned pr, Context *g, Context *p = 0)
+      : item(i), prio(pr), grant(g), preempt(p) {}
+    void dump(Formatter *f) const {
+      f->dump_stream("item") << item;
+      f->dump_unsigned("prio", prio);
+      f->dump_bool("can_preempt", !!preempt);
+    }
+    friend ostream& operator<<(ostream& out, const Reservation& r) {
+      return out << r.item << "(prio " << r.prio << " grant " << r.grant
+                << " preempt " << r.preempt << ")";
+    }
+  };
+
+  map<unsigned, list<Reservation>> queues;
+  map<T, pair<unsigned, typename list<Reservation>::iterator>> queue_pointers;
+  map<T,Reservation> in_progress;
+  set<pair<unsigned,T>> preempt_by_prio;  ///< in_progress that can be preempted
+
+  void preempt_one() {
+    assert(!preempt_by_prio.empty());
+    auto q = in_progress.find(preempt_by_prio.begin()->second);
+    assert(q != in_progress.end());
+    Reservation victim = q->second;
+    rdout(10) << __func__ << " preempt " << victim << dendl;
+    f->queue(victim.preempt);
+    victim.preempt = nullptr;
+    in_progress.erase(q);
+    preempt_by_prio.erase(preempt_by_prio.begin());
+  }
 
   void do_queues() {
-    typename map<unsigned, list<pair<T, Context*> > >::reverse_iterator it;
-    for (it = queues.rbegin();
-         it != queues.rend() &&
-          in_progress.size() < max_allowed &&
-          it->first >= min_priority;
-         ++it) {
-      while (in_progress.size() < max_allowed &&
-             !it->second.empty()) {
-        pair<T, Context*> p = it->second.front();
-        queue_pointers.erase(p.first);
-        it->second.pop_front();
-        f->queue(p.second);
-        in_progress.insert(p.first);
+    rdout(20) << __func__ << ":\n";
+    JSONFormatter jf(true);
+    jf.open_object_section("queue");
+    _dump(&jf);
+    jf.close_section();
+    jf.flush(*_dout);
+    *_dout << dendl;
+
+    // in case min_priority was adjusted up or max_allowed was adjusted down
+    while (!preempt_by_prio.empty() &&
+          (in_progress.size() > max_allowed ||
+           preempt_by_prio.begin()->first < min_priority)) {
+      preempt_one();
+    }
+
+    while (!queues.empty()) {
+      // choose highest priority queue
+      auto it = queues.end();
+      --it;
+      assert(!it->second.empty());
+      if (it->first < min_priority) {
+       break;
+      }
+      if (in_progress.size() >= max_allowed &&
+         !preempt_by_prio.empty() &&
+         it->first > preempt_by_prio.begin()->first) {
+       preempt_one();
+      }
+      if (in_progress.size() >= max_allowed) {
+       break; // no room
+      }
+      // grant
+      Reservation p = it->second.front();
+      rdout(10) << __func__ << " grant " << p << dendl;
+      queue_pointers.erase(p.item);
+      it->second.pop_front();
+      if (it->second.empty()) {
+       queues.erase(it);
+      }
+      f->queue(p.grant);
+      p.grant = nullptr;
+      in_progress[p.item] = p;
+      if (p.preempt) {
+       preempt_by_prio.insert(make_pair(p.prio, p.item));
       }
     }
   }
 public:
   AsyncReserver(
+    CephContext *cct,
     Finisher *f,
     unsigned max_allowed,
     unsigned min_priority = 0)
-    : f(f),
+    : cct(cct),
+      f(f),
       max_allowed(max_allowed),
       min_priority(min_priority),
       lock("AsyncReserver::lock") {}
@@ -77,27 +145,26 @@ public:
 
   void dump(Formatter *f) {
     Mutex::Locker l(lock);
+    _dump(f);
+  }
+  void _dump(Formatter *f) {
     f->dump_unsigned("max_allowed", max_allowed);
     f->dump_unsigned("min_priority", min_priority);
     f->open_array_section("queues");
-    for (typename map<unsigned, list<pair<T, Context*> > > ::const_iterator p =
-          queues.begin(); p != queues.end(); ++p) {
+    for (auto& p : queues) {
       f->open_object_section("queue");
-      f->dump_unsigned("priority", p->first);
+      f->dump_unsigned("priority", p.first);
       f->open_array_section("items");
-      for (typename list<pair<T, Context*> >::const_iterator q =
-            p->second.begin(); q != p->second.end(); ++q) {
-       f->dump_stream("item") << q->first;
+      for (auto& q : p.second) {
+       f->dump_object("item", q);
       }
       f->close_section();
       f->close_section();
     }
     f->close_section();
     f->open_array_section("in_progress");
-    for (typename set<T>::const_iterator p = in_progress.begin();
-        p != in_progress.end();
-        ++p) {
-      f->dump_stream("item") << *p;
+    for (auto& p : in_progress) {
+      f->dump_object("item", p.second);
     }
     f->close_section();
   }
@@ -113,13 +180,17 @@ public:
   void request_reservation(
     T item,                   ///< [in] reservation key
     Context *on_reserved,     ///< [in] callback to be called on reservation
-    unsigned prio
+    unsigned prio,            ///< [in] priority
+    Context *on_preempt = 0   ///< [in] callback to be called if we are preempted (optional)
     ) {
     Mutex::Locker l(lock);
+    Reservation r(item, prio, on_reserved, on_preempt);
+    rdout(10) << __func__ << " queue " << r << dendl;
     assert(!queue_pointers.count(item) &&
           !in_progress.count(item));
-    queues[prio].push_back(make_pair(item, on_reserved));
-    queue_pointers.insert(make_pair(item, make_pair(prio,--(queues[prio]).end())));
+    queues[prio].push_back(r);
+    queue_pointers.insert(make_pair(item,
+                                   make_pair(prio,--(queues[prio]).end())));
     do_queues();
   }
 
@@ -134,13 +205,31 @@ public:
     T item                   ///< [in] key for reservation to cancel
     ) {
     Mutex::Locker l(lock);
-    if (queue_pointers.count(item)) {
-      unsigned prio = queue_pointers[item].first;
-      delete queue_pointers[item].second->second;
-      queues[prio].erase(queue_pointers[item].second);
-      queue_pointers.erase(item);
+    auto i = queue_pointers.find(item);
+    if (i != queue_pointers.end()) {
+      unsigned prio = i->second.first;
+      const Reservation& r = *i->second.second;
+      rdout(10) << __func__ << " cancel " << r << " (was queued)" << dendl;
+      delete r.grant;
+      delete r.preempt;
+      queues[prio].erase(i->second.second);
+      if (queues[prio].empty()) {
+       queues.erase(prio);
+      }
+      queue_pointers.erase(i);
     } else {
-      in_progress.erase(item);
+      auto p = in_progress.find(item);
+      if (p != in_progress.end()) {
+       rdout(10) << __func__ << " cancel " << p->second
+                 << " (was in progress)" << dendl;
+       if (p->second.preempt) {
+         preempt_by_prio.erase(make_pair(p->second.prio, p->second.item));
+         delete p->second.preempt;
+       }
+       in_progress.erase(p);
+      } else {
+       rdout(10) << __func__ << " cancel " << item << " (not found)" << dendl;
+      }
     }
     do_queues();
   }
@@ -157,4 +246,5 @@ public:
   static const unsigned MAX_PRIORITY = (unsigned)-1;
 };
 
+#undef rdout
 #endif
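
For readers who want the preemption policy of ``do_queues()`` in one screen, here is a hedged Python model of just the decision logic. The real code additionally queues the grant/preempt Contexts on a Finisher, keeps list-iterator bookkeeping, and pre-evicts holders when ``min_priority`` or ``max_allowed`` change; none of that is modeled here.

    class ReserverModel(object):
        """Toy model of AsyncReserver grant/preempt ordering (illustrative)."""

        def __init__(self, max_allowed, min_priority=0):
            self.max_allowed = max_allowed
            self.min_priority = min_priority
            self.queues = {}       # prio -> FIFO list of (item, can_preempt)
            self.in_progress = {}  # item -> (prio, can_preempt)

        def _preemptible(self):
            # (prio, item) for holders that registered a preempt callback
            return sorted((p, i) for i, (p, c) in self.in_progress.items() if c)

        def do_queues(self):
            granted, preempted = [], []
            while self.queues:
                prio = max(self.queues)  # serve highest-priority waiters first
                if prio < self.min_priority:
                    break
                victims = self._preemptible()
                if (len(self.in_progress) >= self.max_allowed and victims
                        and prio > victims[0][0]):
                    # evict the lowest-priority preemptible holder
                    vprio, vitem = victims[0]
                    del self.in_progress[vitem]
                    preempted.append(vitem)
                if len(self.in_progress) >= self.max_allowed:
                    break  # still no room: stop granting
                item, can_preempt = self.queues[prio].pop(0)
                if not self.queues[prio]:
                    del self.queues[prio]
                self.in_progress[item] = (prio, can_preempt)
                granted.append(item)
            return granted, preempted
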
index aeb2f5bfcec82946ac7d1d600fb6668f90e30b67..6157194bb8b3600c20d36ac8b977959e9602d1c8 100644 (file)
@@ -88,7 +88,7 @@ int parse_log_client_options(CephContext *cct,
     return r;
   }
 
-  fsid = cct->_conf->fsid;
+  fsid = cct->_conf->get_val<uuid_d>("fsid");
   host = cct->_conf->host;
   return 0;
 }
index f211a6f8ff8c2171f4921f326ac451e5c82cc283..45305f553fa6c80ed45e23fd94a36ff598b68444 100644 (file)
@@ -114,7 +114,7 @@ void SafeTimer::timer_thread()
   lock.Unlock();
 }
 
-bool SafeTimer::add_event_after(double seconds, Context *callback)
+Context* SafeTimer::add_event_after(double seconds, Context *callback)
 {
   assert(lock.is_locked());
 
@@ -123,14 +123,14 @@ bool SafeTimer::add_event_after(double seconds, Context *callback)
   return add_event_at(when, callback);
 }
 
-bool SafeTimer::add_event_at(utime_t when, Context *callback)
+Context* SafeTimer::add_event_at(utime_t when, Context *callback)
 {
   assert(lock.is_locked());
   ldout(cct,10) << __func__ << " " << when << " -> " << callback << dendl;
   if (stopping) {
     ldout(cct,5) << __func__ << " already shutdown, event not added" << dendl;
     delete callback;
-    return false;
+    return nullptr;
   }
   scheduled_map_t::value_type s_val(when, callback);
   scheduled_map_t::iterator i = schedule.insert(s_val);
@@ -145,7 +145,7 @@ bool SafeTimer::add_event_at(utime_t when, Context *callback)
    * adjust our timeout. */
   if (i == schedule.begin())
     cond.Signal();
-  return true;
+  return callback;
 }
 
 bool SafeTimer::cancel_event(Context *callback)
index 861b239ca32e12637f7fa8b1bfc1aa472ce23416..8fd478a9934999d4bbee05ef8bf26a44f1113178 100644 (file)
@@ -70,8 +70,8 @@ public:
 
   /* Schedule an event in the future
    * Call with the event_lock LOCKED */
-  bool add_event_after(double seconds, Context *callback);
-  bool add_event_at(utime_t when, Context *callback);
+  Context* add_event_after(double seconds, Context *callback);
+  Context* add_event_at(utime_t when, Context *callback);
 
   /* Cancel an event.
    * Call with the event_lock LOCKED
index 6a6e6b7d03e7c86cee913d8578470929a13b4e9a..b010970b3213d4fae377a2bb25bb8e778d7b4951 100644 (file)
@@ -14,6 +14,7 @@
 #include "common/Formatter.h"
 #include "include/assert.h"
 #include "include/encoding.h"
+#include <utility>
 
 namespace ceph {
 
@@ -28,36 +29,150 @@ private:
   // must be power of 2
   BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1)));
   BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE);
-public:
-  static const uint32_t BLOCK_SIZE;
 
-  class ConstReference {
+  template <typename DataIterator>
+  class ReferenceImpl {
+  protected:
+    DataIterator m_data_iterator;
+    uint64_t m_shift;
+
+    ReferenceImpl(const DataIterator& data_iterator, uint64_t shift)
+      : m_data_iterator(data_iterator), m_shift(shift) {
+    }
+    ReferenceImpl(DataIterator&& data_iterator, uint64_t shift)
+      : m_data_iterator(std::move(data_iterator)), m_shift(shift) {
+    }
+
   public:
-    operator uint8_t() const;
+    inline operator uint8_t() const {
+      return (*m_data_iterator >> m_shift) & MASK;
+    }
+  };
+
+public:
+
+  class ConstReference : public ReferenceImpl<bufferlist::const_iterator> {
   private:
     friend class BitVector;
-    const BitVector &m_bit_vector;
-    uint64_t m_offset;
 
-    ConstReference(const BitVector &bit_vector, uint64_t offset);
+    ConstReference(const bufferlist::const_iterator& data_iterator,
+                   uint64_t shift)
+      : ReferenceImpl<bufferlist::const_iterator>(data_iterator, shift) {
+    }
+    ConstReference(bufferlist::const_iterator&& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::const_iterator>(std::move(data_iterator),
+                                                  shift) {
+    }
   };
 
-  class Reference {
+  class Reference : public ReferenceImpl<bufferlist::iterator> {
   public:
-    operator uint8_t() const;
     Reference& operator=(uint8_t v);
+
+  private:
+    friend class BitVector;
+
+    Reference(const bufferlist::iterator& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::iterator>(data_iterator, shift) {
+    }
+    Reference(bufferlist::iterator&& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::iterator>(std::move(data_iterator), shift) {
+    }
+  };
+
+public:
+  template <typename BitVectorT, typename DataIterator>
+  class IteratorImpl {
   private:
     friend class BitVector;
-    BitVector &m_bit_vector;
-    uint64_t m_offset;
 
-    Reference(BitVector &bit_vector, uint64_t offset);
+    uint64_t m_offset = 0;
+    BitVectorT *m_bit_vector;
+
+    // cached derived values
+    uint64_t m_index = 0;
+    uint64_t m_shift = 0;
+    DataIterator m_data_iterator;
+
+    IteratorImpl(BitVectorT *bit_vector, uint64_t offset)
+      : m_bit_vector(bit_vector),
+        m_data_iterator(bit_vector->m_data.begin()) {
+      *this += offset;
+    }
+
+  public:
+    inline IteratorImpl& operator++() {
+      ++m_offset;
+
+      uint64_t index;
+      compute_index(m_offset, &index, &m_shift);
+
+      assert(index == m_index || index == m_index + 1);
+      if (index > m_index) {
+        m_index = index;
+        ++m_data_iterator;
+      }
+      return *this;
+    }
+    inline IteratorImpl& operator+=(uint64_t offset) {
+      m_offset += offset;
+      compute_index(m_offset, &m_index, &m_shift);
+      if (m_offset < m_bit_vector->size()) {
+        m_data_iterator.seek(m_index);
+      } else {
+        m_data_iterator = m_bit_vector->m_data.end();
+      }
+      return *this;
+    }
+
+    inline IteratorImpl operator++(int) {
+      IteratorImpl iterator_impl(*this);
+      ++iterator_impl;
+      return iterator_impl;
+    }
+    inline IteratorImpl operator+(uint64_t offset) {
+      IteratorImpl iterator_impl(*this);
+      iterator_impl += offset;
+      return iterator_impl;
+    }
+
+    inline bool operator==(const IteratorImpl& rhs) const {
+      return (m_offset == rhs.m_offset && m_bit_vector == rhs.m_bit_vector);
+    }
+    inline bool operator!=(const IteratorImpl& rhs) const {
+      return (m_offset != rhs.m_offset || m_bit_vector != rhs.m_bit_vector);
+    }
+
+    inline ConstReference operator*() const {
+      return ConstReference(m_data_iterator, m_shift);
+    }
+    inline Reference operator*() {
+      return Reference(m_data_iterator, m_shift);
+    }
   };
 
+  typedef IteratorImpl<const BitVector,
+                       bufferlist::const_iterator> ConstIterator;
+  typedef IteratorImpl<BitVector, bufferlist::iterator> Iterator;
+
+  static const uint32_t BLOCK_SIZE;
   static const uint8_t BIT_COUNT = _bit_count;
 
   BitVector();
 
+  inline ConstIterator begin() const {
+    return ConstIterator(this, 0);
+  }
+  inline ConstIterator end() const {
+    return ConstIterator(this, m_size);
+  }
+  inline Iterator begin() {
+    return Iterator(this, 0);
+  }
+  inline Iterator end() {
+    return Iterator(this, m_size);
+  }
+
   void set_crc_enabled(bool enabled) {
     m_crc_enabled = enabled;
   }
@@ -345,55 +460,33 @@ bool BitVector<_b>::operator==(const BitVector &b) const {
 
 template <uint8_t _b>
 typename BitVector<_b>::Reference BitVector<_b>::operator[](uint64_t offset) {
-  return Reference(*this, offset);
-}
-
-template <uint8_t _b>
-typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const {
-  return ConstReference(*this, offset);
-}
-
-template <uint8_t _b>
-BitVector<_b>::ConstReference::ConstReference(const BitVector<_b> &bit_vector,
-                                             uint64_t offset)
-  : m_bit_vector(bit_vector), m_offset(offset)
-{
-}
-
-template <uint8_t _b>
-BitVector<_b>::ConstReference::operator uint8_t() const {
   uint64_t index;
   uint64_t shift;
-  this->m_bit_vector.compute_index(this->m_offset, &index, &shift);
+  compute_index(offset, &index, &shift);
 
-  return (this->m_bit_vector.m_data[index] >> shift) & MASK;
+  bufferlist::iterator data_iterator(m_data.begin());
+  data_iterator.seek(index);
+  return Reference(std::move(data_iterator), shift);
 }
 
 template <uint8_t _b>
-BitVector<_b>::Reference::Reference(BitVector<_b> &bit_vector, uint64_t offset)
-  : m_bit_vector(bit_vector), m_offset(offset)
-{
-}
-
-template <uint8_t _b>
-BitVector<_b>::Reference::operator uint8_t() const {
+typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const {
   uint64_t index;
   uint64_t shift;
-  this->m_bit_vector.compute_index(this->m_offset, &index, &shift);
+  compute_index(offset, &index, &shift);
 
-  return (this->m_bit_vector.m_data[index] >> shift) & MASK;
+  bufferlist::const_iterator data_iterator(m_data.begin());
+  data_iterator.seek(index);
+  return ConstReference(std::move(data_iterator), shift);
 }
 
 template <uint8_t _b>
 typename BitVector<_b>::Reference& BitVector<_b>::Reference::operator=(uint8_t v) {
-  uint64_t index;
-  uint64_t shift;
-  this->m_bit_vector.compute_index(this->m_offset, &index, &shift);
-
-  uint8_t mask = MASK << shift;
-  char packed_value = (this->m_bit_vector.m_data[index] & ~mask) |
-                     ((v << shift) & mask);
-  this->m_bit_vector.m_data.copy_in(index, 1, &packed_value);
+  uint8_t mask = MASK << this->m_shift;
+  char packed_value = (*this->m_data_iterator & ~mask) |
+                      ((v << this->m_shift) & mask);
+  bufferlist::iterator it(this->m_data_iterator);
+  it.copy_in(1, &packed_value, true);
   return *this;
 }
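
The point of the new iterators is to avoid recomputing (and reseeking to) the byte index and bit shift on every element access, which the sequential loop in ``object_map_update`` above now exploits. A hedged Python model of the packing and the cached-cursor walk; the byte layout here is illustrative, not bit-for-bit identical to BitVector's on-disk format:

    class PackedVector(object):
        """Toy 2-bit-per-element vector with a cursor-style iterator."""
        BITS = 2
        PER_BYTE = 8 // BITS
        MASK = (1 << BITS) - 1

        def __init__(self, size):
            self.size = size
            self.data = bytearray((size + self.PER_BYTE - 1) // self.PER_BYTE)

        def _index(self, offset):
            # mirrors compute_index(): offset -> (byte index, bit shift)
            return offset // self.PER_BYTE, (offset % self.PER_BYTE) * self.BITS

        def __getitem__(self, offset):
            idx, shift = self._index(offset)
            return (self.data[idx] >> shift) & self.MASK

        def __setitem__(self, offset, v):
            idx, shift = self._index(offset)
            self.data[idx] = ((self.data[idx] & ~(self.MASK << shift)) |
                              ((v & self.MASK) << shift))

        def __iter__(self):
            # sequential walk: advance the byte index only when the shift wraps,
            # instead of recomputing and reseeking for every element
            idx, shift = 0, 0
            for _ in range(self.size):
                yield (self.data[idx] >> shift) & self.MASK
                shift += self.BITS
                if shift == 8:
                    idx, shift = idx + 1, 0
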
 
index b8e87d1eea3dc16b9908cc163f937d0ffd4ffe3b..18ae276cc6faccaf06cf8c6128b4358bb46a5a4d 100644 (file)
@@ -172,17 +172,17 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
     char *data;
     unsigned len;
     std::atomic<unsigned> nref { 0 };
-    int mempool = mempool::mempool_buffer_anon;
+    int mempool;
 
     mutable std::atomic_flag crc_spinlock = ATOMIC_FLAG_INIT;
     map<pair<size_t, size_t>, pair<uint32_t, uint32_t> > crc_map;
 
-    explicit raw(unsigned l)
-      : data(NULL), len(l), nref(0) {
+    explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon)
+      : data(NULL), len(l), nref(0), mempool(mempool) {
       mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
     }
-    raw(char *c, unsigned l)
-      : data(c), len(l), nref(0) {
+    raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon)
+      : data(c), len(l), nref(0), mempool(mempool) {
       mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
     }
     virtual ~raw() {
@@ -281,8 +281,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
   class buffer::raw_combined : public buffer::raw {
     size_t alignment;
   public:
-    raw_combined(char *dataptr, unsigned l, unsigned align=0)
-      : raw(dataptr, l),
+    raw_combined(char *dataptr, unsigned l, unsigned align,
+                int mempool)
+      : raw(dataptr, l, mempool),
        alignment(align) {
       inc_total_alloc(len);
       inc_history_alloc(len);
@@ -294,7 +295,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
       return create(len, alignment);
     }
 
-    static raw_combined *create(unsigned len, unsigned align=0) {
+    static raw_combined *create(unsigned len,
+                               unsigned align,
+                               int mempool = mempool::mempool_buffer_anon) {
       if (!align)
        align = sizeof(size_t);
       size_t rawlen = ROUND_UP_TO(sizeof(buffer::raw_combined),
@@ -314,7 +317,7 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
 
      // actual data first, since it presumably has a larger alignment restriction;
       // then put the raw_combined at the end
-      return new (ptr + datalen) raw_combined(ptr, len, align);
+      return new (ptr + datalen) raw_combined(ptr, len, align, mempool);
     }
 
     static void operator delete(void *ptr) {
@@ -771,6 +774,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
   buffer::raw* buffer::create(unsigned len) {
     return buffer::create_aligned(len, sizeof(size_t));
   }
+  buffer::raw* buffer::create_in_mempool(unsigned len, int mempool) {
+    return buffer::create_aligned_in_mempool(len, sizeof(size_t), mempool);
+  }
   buffer::raw* buffer::claim_char(unsigned len, char *buf) {
     return new raw_claimed_char(len, buf);
   }
@@ -787,7 +793,8 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
     return new raw_claim_buffer(buf, len, std::move(del));
   }
 
-  buffer::raw* buffer::create_aligned(unsigned len, unsigned align) {
+  buffer::raw* buffer::create_aligned_in_mempool(
+    unsigned len, unsigned align, int mempool) {
     // If alignment is a page multiple, use a separate buffer::raw to
     // avoid fragmenting the heap.
     //
@@ -805,7 +812,12 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
       return new raw_hack_aligned(len, align);
 #endif
     }
-    return raw_combined::create(len, align);
+    return raw_combined::create(len, align, mempool);
+  }
+  buffer::raw* buffer::create_aligned(
+    unsigned len, unsigned align) {
+    return create_aligned_in_mempool(len, align,
+                                    mempool::mempool_buffer_anon);
   }
 
   buffer::raw* buffer::create_page_aligned(unsigned len) {
@@ -952,6 +964,24 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
 
   bool buffer::ptr::at_buffer_tail() const { return _off + _len == _raw->len; }
 
+  int buffer::ptr::get_mempool() const {
+    if (_raw) {
+      return _raw->mempool;
+    }
+    return mempool::mempool_buffer_anon;
+  }
+
+  void buffer::ptr::reassign_to_mempool(int pool) {
+    if (_raw) {
+      _raw->reassign_to_mempool(pool);
+    }
+  }
+  void buffer::ptr::try_assign_to_mempool(int pool) {
+    if (_raw) {
+      _raw->try_assign_to_mempool(pool);
+    }
+  }
+
   const char *buffer::ptr::c_str() const {
     assert(_raw);
     if (buffer_track_c_str)
@@ -1493,7 +1523,6 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
   {
     std::swap(_len, other._len);
     std::swap(_memcopy_count, other._memcopy_count);
-    std::swap(_mempool, other._mempool);
     _buffers.swap(other._buffers);
     append_buffer.swap(other.append_buffer);
     //last_p.swap(other.last_p);
@@ -1666,9 +1695,16 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
     return is_aligned(CEPH_PAGE_SIZE);
   }
 
+  int buffer::list::get_mempool() const
+  {
+    if (_buffers.empty()) {
+      return mempool::mempool_buffer_anon;
+    }
+    return _buffers.back().get_mempool();
+  }
+
   void buffer::list::reassign_to_mempool(int pool)
   {
-    _mempool = pool;
     if (append_buffer.get_raw()) {
       append_buffer.get_raw()->reassign_to_mempool(pool);
     }
@@ -1679,7 +1715,6 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
 
   void buffer::list::try_assign_to_mempool(int pool)
   {
-    _mempool = pool;
     if (append_buffer.get_raw()) {
       append_buffer.get_raw()->try_assign_to_mempool(pool);
     }
@@ -1778,10 +1813,7 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
   void buffer::list::reserve(size_t prealloc)
   {
     if (append_buffer.unused_tail_length() < prealloc) {
-      append_buffer = buffer::create(prealloc);
-      if (_mempool >= 0) {
-       append_buffer.get_raw()->reassign_to_mempool(_mempool);
-      }
+      append_buffer = buffer::create_in_mempool(prealloc, get_mempool());
       append_buffer.set_length(0);   // unused, so far.
     }
   }
@@ -1879,11 +1911,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
     unsigned gap = append_buffer.unused_tail_length();
     if (!gap) {
       // make a new append_buffer!
-      append_buffer = raw_combined::create(CEPH_BUFFER_APPEND_SIZE);
+      append_buffer = raw_combined::create(CEPH_BUFFER_APPEND_SIZE, 0,
+                                          get_mempool());
       append_buffer.set_length(0);   // unused, so far.
-      if (_mempool >= 0) {
-       append_buffer.get_raw()->reassign_to_mempool(_mempool);
-      }
     }
     append(append_buffer, append_buffer.append(c) - 1, 1);     // add segment to the list
   }
@@ -1909,11 +1939,8 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
       size_t need = ROUND_UP_TO(len, sizeof(size_t)) + sizeof(raw_combined);
       size_t alen = ROUND_UP_TO(need, CEPH_BUFFER_ALLOC_UNIT) -
        sizeof(raw_combined);
-      append_buffer = raw_combined::create(alen);
+      append_buffer = raw_combined::create(alen, 0, get_mempool());
       append_buffer.set_length(0);   // unused, so far.
-      if (_mempool >= 0) {
-       append_buffer.get_raw()->reassign_to_mempool(_mempool);
-      }
     }
   }
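
With the list-level _mempool member removed (see the swap() and header hunks), the pool now travels with the raw buffers themselves and is recovered from the most recently appended segment. A minimal sketch of the resulting API, using only the calls added in this commit:

    bufferlist bl;
    bl.append("payload");
    int pool = bl.get_mempool();         // mempool_buffer_anon unless retagged
    bl.reassign_to_mempool(pool);        // retags append_buffer and every segment
    bufferptr p(buffer::create_in_mempool(4096, pool));  // new raw accounted to 'pool' from birth
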
 
index 2cf0f7d8c5866a474fe69addc71921afef23a787..64b2ec6747793de6e85e0aa986379b7b3e1edab7 100644 (file)
@@ -259,7 +259,7 @@ public:
     }
 
     if (log->graylog() && changed.count("fsid")) {
-      log->graylog()->set_fsid(conf->fsid);
+      log->graylog()->set_fsid(conf->get_val<uuid_d>("fsid"));
     }
   }
 };
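
Because the fsid OPTION is dropped from config_opts.h later in this commit, direct member access (conf->fsid) gives way to the typed lookup; a minimal sketch of the pattern:

    uuid_d fsid = conf->get_val<uuid_d>("fsid");
    log->graylog()->set_fsid(fsid);
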
index 9cb1b1207fdb9ff1403aa0ec8564a393e6937269..7889f42a0c2da19bd00e67d37667578be09b497c 100644 (file)
@@ -58,6 +58,10 @@ CephContext *common_preinit(const CephInitParameters &iparams,
     conf->set_val_or_die("err_to_stderr", "false");
     conf->set_val_or_die("log_flush_on_exit", "false");
   }
+  if (code_env != CODE_ENVIRONMENT_DAEMON) {
+    // NOTE: disable ms subsystem gathering in clients by default
+    conf->set_val_or_die("debug_ms", "0/0");
+  }
 
   return cct;
 }
index ea372bfb38649e8843bd32801f472bf0a4bdc279..3cbb27e3484c043788d3412786c7a48b9908cea8 100644 (file)
@@ -492,7 +492,10 @@ int md_config_t::parse_argv(std::vector<const char*>& args)
       set_val_or_die("client_mountpoint", val.c_str());
     }
     else {
-      parse_option(args, i, NULL);
+      int r = parse_option(args, i, NULL);
+      if (r < 0) {
+        return r;
+      }
     }
   }
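
Previously a parse_option() failure was silently discarded here; callers can now rely on the return value. A minimal sketch, assuming an md_config_t instance named conf and a hypothetical malformed argument:

    std::vector<const char*> args = { "--debug_ms" };  // hypothetical: required value missing
    int r = conf.parse_argv(args);
    if (r < 0) {
      // the parse error now surfaces here instead of being dropped
    }
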
 
@@ -536,8 +539,16 @@ int md_config_t::parse_option(std::vector<const char*>& args,
     std::string as_option("--");
     as_option += "debug_";
     as_option += subsys.get_name(o);
-    if (ceph_argparse_witharg(args, i, &val,
+    ostringstream err;
+    if (ceph_argparse_witharg(args, i, &val, err,
                              as_option.c_str(), (char*)NULL)) {
+      if (err.tellp()) {
+        if (oss) {
+          *oss << err.str();
+        }
+        ret = -EINVAL;
+        break;
+      }
       int log, gather;
       int r = sscanf(val.c_str(), "%d/%d", &log, &gather);
       if (r >= 1) {
index cb6b406bb12ec1d870787c376969450b57ab449e..e0f0ad7e3e410bb4ba409403061ea7d8e9bd6a69 100644 (file)
 
 /* note: no header guard */
 OPTION(host, OPT_STR) // "" means that ceph will use short hostname
-OPTION(fsid, OPT_UUID)
 OPTION(public_addr, OPT_ADDR)
 OPTION(public_bind_addr, OPT_ADDR)
 OPTION(cluster_addr, OPT_ADDR)
 OPTION(public_network, OPT_STR)
 OPTION(cluster_network, OPT_STR)
-OPTION(monmap, OPT_STR)
-OPTION(mon_host, OPT_STR)
-OPTION(mon_dns_srv_name, OPT_STR)
 OPTION(lockdep, OPT_BOOL)
 OPTION(lockdep_force_backtrace, OPT_BOOL) // always gather current backtrace at every lock
 OPTION(run_dir, OPT_STR)       // the "/var/run/ceph" dir, created on daemon startup
@@ -239,8 +235,6 @@ OPTION(mon_timecheck_interval, OPT_FLOAT) // on leader, timecheck (clock drift c
 OPTION(mon_timecheck_skew_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
 OPTION(mon_pg_stuck_threshold, OPT_INT) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info)
 OPTION(mon_pg_min_inactive, OPT_U64) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
-OPTION(mon_pg_warn_min_per_osd, OPT_INT)  // min # pgs per (in) osd before we warn the admin
-OPTION(mon_pg_warn_max_per_osd, OPT_INT)  // max # pgs per (in) osd before we warn the admin
 OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT) // max skew from the average number of objects per pg
 OPTION(mon_pg_warn_min_objects, OPT_INT)  // do not warn below this object #
 OPTION(mon_pg_warn_min_pool_objects, OPT_INT)  // do not warn on pools below this object #
@@ -267,7 +261,6 @@ OPTION(mon_max_mdsmap_epochs, OPT_INT)
 OPTION(mon_max_osd, OPT_INT)
 OPTION(mon_probe_timeout, OPT_DOUBLE)
 OPTION(mon_client_bytes, OPT_U64)  // client msg data allowed in memory (in bytes)
-OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client
 OPTION(mon_log_max_summary, OPT_U64)
 OPTION(mon_daemon_bytes, OPT_U64)  // mds, osd message memory cap (in bytes)
 OPTION(mon_max_log_entries_per_event, OPT_INT)
@@ -1541,27 +1534,6 @@ OPTION(rgw_shard_warning_threshold, OPT_DOUBLE) // pct of safe max
 
 OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled
 
-OPTION(mgr_module_path, OPT_STR) // where to load python modules from
-OPTION(mgr_initial_modules, OPT_STR)  // Which modules to load
-OPTION(mgr_data, OPT_STR) // where to find keyring etc
-OPTION(mgr_tick_period, OPT_INT)  // How frequently to tick
-OPTION(mgr_stats_period, OPT_INT) // How frequently clients send stats
-OPTION(mgr_client_bytes, OPT_U64) // bytes from clients
-OPTION(mgr_client_messages, OPT_U64)      // messages from clients
-OPTION(mgr_osd_bytes, OPT_U64)   // bytes from osds
-OPTION(mgr_osd_messages, OPT_U64)       // messages from osds
-OPTION(mgr_mds_bytes, OPT_U64)   // bytes from mdss
-OPTION(mgr_mds_messages, OPT_U64)        // messages from mdss
-OPTION(mgr_mon_bytes, OPT_U64)   // bytes from mons
-OPTION(mgr_mon_messages, OPT_U64)        // messages from mons
-
-OPTION(mgr_connect_retry_interval, OPT_DOUBLE)
-OPTION(mgr_service_beacon_grace, OPT_DOUBLE)
-
-OPTION(mon_mgr_digest_period, OPT_INT)  // How frequently to send digests
-OPTION(mon_mgr_beacon_grace, OPT_INT)  // How long to wait to failover
-OPTION(mon_mgr_inactive_grace, OPT_INT) // How long before health WARN -> ERR
-OPTION(mon_mgr_mkfs_grace, OPT_INT) // How long before we complain about MGR_DOWN
 OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl
 OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects
 OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms
index 0be052e1bc551eaf3787bbc205afda66f52b3598..33f9205258886eab706ab0bacda697d59bed5976 100644 (file)
@@ -11,6 +11,9 @@
 #include <boost/lexical_cast.hpp>
 #include <boost/regex.hpp>
 
+// Definitions for enums
+#include "common/perf_counters.h"
+
 
 void Option::dump_value(const char *field_name,
     const Option::value_t &v, Formatter *f) const
@@ -160,12 +163,24 @@ std::vector<Option> get_global_options() {
     Option("public_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .add_service({"mon", "mds", "osd", "mgr"})
     .add_tag("network")
-    .set_description(""),
+    .set_description("Network(s) from which to choose a public address to bind to"),
+
+    Option("public_network_interface", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .add_service({"mon", "mds", "osd", "mgr"})
+    .add_tag("network")
+    .set_description("Interface name(s) from which to choose an address from a public_network to bind to; public_network must also be specified.")
+    .add_see_also("public_network"),
 
     Option("cluster_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .add_service("osd")
     .add_tag("network")
-    .set_description(""),
+    .set_description("Network(s) from which to choose a cluster address to bind to"),
+
+    Option("cluster_network_interface", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .add_service({"mon", "mds", "osd", "mgr"})
+    .add_tag("network")
+    .set_description("Interface name(s) from which to choose an address from a cluster_network to bind to; cluster_network must also be specified.")
+    .add_see_also("cluster_network"),
 
     Option("monmap", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_description("path to MonMap file")
@@ -183,6 +198,7 @@ std::vector<Option> get_global_options() {
     .add_service("common"),
 
     Option("mon_dns_srv_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("ceph-mon")
     .set_description("name of DNS SRV record to check for monitor addresses")
     .add_service("common")
     .add_tag("network")
@@ -521,11 +537,16 @@ std::vector<Option> get_global_options() {
 
     Option("key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description(""),
+    .set_description("Authentication key")
+    .set_long_description("A CephX authentication key, base64 encoded.  It normally looks something like 'AQAtut9ZdMbNJBAAHz6yBAWyJyz2yYRyeMWDag=='.")
+    .add_see_also("keyfile")
+    .add_see_also("keyring"),
 
     Option("keyfile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("")
-    .set_description(""),
+    .set_description("Path to a file containing a key")
+    .set_long_description("The file should contain a CephX authentication key and optionally a trailing newline, but nothing else.")
+    .add_see_also("key"),
 
     Option("keyring", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default(
@@ -537,7 +558,10 @@ std::vector<Option> get_global_options() {
       "/usr/local/etc/ceph/keyring,/usr/local/etc/ceph/keyring.bin," 
   #endif
     )
-    .set_description(""),
+    .set_description("Path to a keyring file.")
+    .set_long_description("A keyring file is an INI-style formatted file where the section names are client or daemon names (e.g., 'osd.0') and each section contains a 'key' property with CephX authentication key as the value.")
+    .add_see_also("key")
+    .add_see_also("keyfile"),
 
     Option("heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(5)
@@ -1011,13 +1035,13 @@ std::vector<Option> get_global_options() {
     .set_default(1)
     .set_description(""),
 
-    Option("mon_pg_warn_min_per_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("mon_pg_warn_min_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(30)
-    .set_description(""),
+    .set_description("minimal number PGs per (in) osd before we warn the admin"),
 
-    Option("mon_pg_warn_max_per_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(300)
-    .set_description(""),
+    Option("mon_max_pg_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(200)
+    .set_description("Max number of PGs per OSD the cluster will allow"),
 
     Option("mon_pg_warn_max_object_skew", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(10.0)
@@ -1127,9 +1151,10 @@ std::vector<Option> get_global_options() {
     .set_default(100ul << 20)
     .set_description(""),
 
-    Option("mon_mgr_proxy_client_bytes_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    Option("mon_mgr_proxy_client_bytes_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(.3)
-    .set_description(""),
+    .set_description("ratio of mon_client_bytes that can be consumed by "
+                     "proxied mgr commands before we error out to client"),
 
     Option("mon_log_max_summary", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(50)
@@ -1273,6 +1298,10 @@ std::vector<Option> get_global_options() {
     .set_default(false)
     .set_description(""),
 
+    Option("mon_fixup_legacy_erasure_code_profiles", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Automatically adjust ruleset-* to crush-* so that legacy apps can set modern erasure code profiles without modification"),
+
     Option("mon_debug_deprecated_as_obsolete", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
     .set_description(""),
@@ -2544,6 +2573,13 @@ std::vector<Option> get_global_options() {
     .set_default(100)
     .set_description(""),
 
+    Option("osd_max_pg_per_osd_hard_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_min(1)
+    .set_description("Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'")
+    .set_long_description("OSD will refuse to instantiate PG if the number of PG it serves exceeds this number.")
+    .add_see_also("mon_max_pg_per_osd"),
+
     Option("osd_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(30)
     .set_description(""),
@@ -2580,6 +2616,10 @@ std::vector<Option> get_global_options() {
     .set_default(false)
     .set_description(""),
 
+    Option("osd_debug_shutdown", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Turn up debug levels during shutdown"),
+
     Option("osd_debug_crash_on_ignored_backoff", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
     .set_description(""),
@@ -3002,6 +3042,10 @@ std::vector<Option> get_global_options() {
     // --------------------------
     // bluestore
 
+    Option("bdev_inject_bad_size", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
     Option("bdev_debug_inflight_ios", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(false)
     .set_description(""),
@@ -3113,6 +3157,10 @@ std::vector<Option> get_global_options() {
     .set_default(1*1024*1024*1024)
     .set_description("minimum disk space allocated to BlueFS (e.g., at mkfs)"),
 
+    Option("bluestore_bluefs_min_free", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1*1024*1024*1024)
+    .set_description("minimum free space allocated to BlueFS"),
+
     Option("bluestore_bluefs_min_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(.02)
     .set_description("Minimum fraction of free space devoted to BlueFS"),
@@ -3626,7 +3674,7 @@ std::vector<Option> get_global_options() {
     // filestore
 
     Option("filestore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
-    .set_default("")
+    .set_default("max_background_compactions=8,compaction_readahead_size=2097152,compression=kNoCompression")
     .set_description(""),
 
     Option("filestore_omap_backend", Option::TYPE_STR, Option::LEVEL_ADVANCED)
@@ -4009,6 +4057,14 @@ std::vector<Option> get_global_options() {
     .set_default(0)
     .set_description(""),
 
+  Option("mgr_stats_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+  .set_default((int64_t)PerfCountersBuilder::PRIO_USEFUL)
+  .set_description("Lowest perfcounter priority collected by mgr")
+  .set_long_description("Daemons only set perf counter data to the manager "
+    "daemon if the counter has a priority higher than this.")
+  .set_min_max((int64_t)PerfCountersBuilder::PRIO_DEBUGONLY,
+               (int64_t)PerfCountersBuilder::PRIO_CRITICAL),
+
     Option("journal_zero_on_create", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
     .set_description(""),
@@ -4043,79 +4099,110 @@ std::vector<Option> get_global_options() {
 
     Option("mgr_module_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default(CEPH_PKGLIBDIR "/mgr")
-    .set_description(""),
+    .add_service("mgr")
+    .set_description("Filesystem path to manager modules."),
 
-    Option("mgr_initial_modules", Option::TYPE_STR, Option::LEVEL_ADVANCED)
-    .set_default("restful status")
-    .set_description(""),
+    Option("mgr_initial_modules", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("restful status balancer")
+    .add_service("mon")
+    .set_description("List of manager modules to enable when the cluster is "
+                     "first started")
+    .set_long_description("This list of module names is read by the monitor "
+        "when the cluster is first started after installation, to populate "
+        "the list of enabled manager modules.  Subsequent updates are done using "
+        "the 'mgr module [enable|disable]' commands.  List may be comma "
+        "or space separated."),
 
     Option("mgr_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("/var/lib/ceph/mgr/$cluster-$id")
-    .set_description(""),
+    .add_service("mgr")
+    .set_description("Filesystem path to the ceph-mgr data directory, used to "
+                     "contain keyring."),
 
     Option("mgr_tick_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(2)
-    .set_description(""),
+    .add_service("mgr")
+    .set_description("Period in seconds of beacon messages to monitor"),
 
-    Option("mgr_stats_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("mgr_stats_period", Option::TYPE_INT, Option::LEVEL_BASIC)
     .set_default(5)
-    .set_description(""),
-
-    Option("mgr_client_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .add_service("mgr")
+    .set_description("Period in seconds of OSD/MDS stats reports to manager")
+    .set_long_description("Use this setting to control the granularity of "
+                          "time series data collection from daemons.  Adjust "
+                          "upwards if the manager CPU load is too high, or "
+                          "if you simply do not require the most up to date "
+                          "performance counter data."),
+
+    Option("mgr_client_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(128*1048576)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_client_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("mgr_client_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(512)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_osd_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("mgr_osd_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(512*1048576)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_osd_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("mgr_osd_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(8192)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_mds_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("mgr_mds_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(128*1048576)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_mds_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("mgr_mds_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(128)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_mon_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("mgr_mon_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(128*1048576)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_mon_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    Option("mgr_mon_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(128)
-    .set_description(""),
+    .add_service("mgr"),
 
-    Option("mgr_connect_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    Option("mgr_connect_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(1.0)
-    .set_description(""),
+    .add_service("common"),
 
     Option("mgr_service_beacon_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(60.0)
-    .set_description(""),
+    .add_service("mgr")
+    .set_description("Period in seconds from last beacon to manager dropping "
+                     "state about a monitored service (RGW, rbd-mirror etc)"),
 
-    Option("mon_mgr_digest_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("mon_mgr_digest_period", Option::TYPE_INT, Option::LEVEL_DEV)
     .set_default(5)
-    .set_description(""),
+    .add_service("mon")
+    .set_description("Period in seconds between monitor-to-manager "
+                     "health/status updates"),
 
     Option("mon_mgr_beacon_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(30)
-    .set_description(""),
+    .add_service("mon")
+    .set_description("Period in seconds from last beacon to monitor marking "
+                     "a manager daemon as failed"),
 
     Option("mon_mgr_inactive_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(60)
-    .set_description(""),
+    .add_service("mon")
+    .set_description("Period in seconds after cluster creation during which "
+                     "cluster may have no active manager")
+    .set_long_description("This grace period enables the cluster to come "
+                          "up cleanly without raising spurious health check "
+                          "failures about managers that aren't online yet"),
 
     Option("mon_mgr_mkfs_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(60)
-    .set_description(""),
+    .add_service("mon")
+    .set_description("Period in seconds that the cluster may have no active "
+                     "manager before this is reported as an ERR rather than "
+                     "a WARN"),
 
     Option("mutex_perf_counter", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
@@ -4165,6 +4252,18 @@ std::vector<Option> get_rgw_options() {
     .set_default(1 * 1024 * 1024)
     .set_description(""),
 
+    Option("rgw_max_attr_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("The maximum length of metadata value. 0 skips the check"),
+
+    Option("rgw_max_attr_name_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("The maximum length of metadata name. 0 skips the check"),
+
+    Option("rgw_max_attrs_num_in_req", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("The maximum number of metadata items that can be put via single request"),
+
     Option("rgw_override_bucket_index_max_shards", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(0)
     .set_description(""),
@@ -5748,6 +5847,14 @@ std::vector<Option> get_mds_options() {
     Option("mds_client_writeable_range_max_inc_objs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1024)
     .set_description(""),
+    Option("mds_min_caps_per_client", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description("minimum number of capabilities a client may hold"),
+
+    Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.8)
+    .set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"),
   });
 }
 
index f3bc4e37860f759c78a7dbb7b60cc49809ea42f3..048c93a64334a20ef59abdbc4eff08d167979ad7 100644 (file)
@@ -53,7 +53,7 @@ void PerfCountersCollection::add(class PerfCounters *l)
     path += ".";
     path += data.name;
 
-    by_path[path] = &data;
+    by_path[path] = {&data, l};
   }
 }
 
@@ -396,12 +396,7 @@ void PerfCounters::dump_formatted_generic(Formatter *f, bool schema,
       } else {
         f->dump_string("nick", "");
       }
-      if (d->prio) {
-       int p = std::max(std::min(d->prio + prio_adjust,
-                                 (int)PerfCountersBuilder::PRIO_CRITICAL),
-                        0);
-       f->dump_int("priority", p);
-      }
+      f->dump_int("priority", get_adjusted_priority(d->prio));
       f->close_section();
     } else {
       if (d->type & PERFCOUNTER_LONGRUNAVG) {
@@ -549,7 +544,7 @@ void PerfCountersBuilder::add_impl(
     assert(strlen(nick) <= 4);
   }
   data.nick = nick;
-  data.prio = prio;
+  data.prio = prio ? prio : prio_default;
   data.type = (enum perfcounter_type_d)ty;
   data.histogram = std::move(histogram);
 }
index e831b73ad68233b4e964fba74fadc4d7fc1b6227..846e407ad2f0d0cd008c9435e88fe35012e0787e 100644 (file)
@@ -42,6 +42,80 @@ enum perfcounter_type_d : uint8_t
 };
 
 
+/* Class for constructing a PerfCounters object.
+ *
+ * This class performs some validation that the parameters we have supplied are
+ * correct in create_perf_counters().
+ *
+ * In the future, we will probably get rid of the first/last arguments, since
+ * PerfCountersBuilder can deduce them itself.
+ */
+class PerfCountersBuilder
+{
+public:
+  PerfCountersBuilder(CephContext *cct, const std::string &name,
+                   int first, int last);
+  ~PerfCountersBuilder();
+
+  // prio values: higher is better, and higher values get included in
+  // 'ceph daemonperf' (and similar) results.
+  // Use of priorities enables us to add large numbers of counters
+  // internally without necessarily overwhelming consumers.
+  enum {
+    PRIO_CRITICAL = 10,
+    // 'interesting' is the default threshold for `daemonperf` output
+    PRIO_INTERESTING = 8,
+    // `useful` is the default threshold for transmission to ceph-mgr
+    // and inclusion in prometheus/influxdb plugin output
+    PRIO_USEFUL = 5,
+    PRIO_UNINTERESTING = 2,
+    PRIO_DEBUGONLY = 0,
+  };
+  void add_u64(int key, const char *name,
+              const char *description=NULL, const char *nick = NULL,
+              int prio=0);
+  void add_u64_counter(int key, const char *name,
+                      const char *description=NULL,
+                      const char *nick = NULL,
+                      int prio=0);
+  void add_u64_avg(int key, const char *name,
+                  const char *description=NULL,
+                  const char *nick = NULL,
+                  int prio=0);
+  void add_time(int key, const char *name,
+               const char *description=NULL,
+               const char *nick = NULL,
+               int prio=0);
+  void add_time_avg(int key, const char *name,
+                   const char *description=NULL,
+                   const char *nick = NULL,
+                   int prio=0);
+  void add_u64_counter_histogram(
+    int key, const char* name,
+    PerfHistogramCommon::axis_config_d x_axis_config,
+    PerfHistogramCommon::axis_config_d y_axis_config,
+    const char *description=NULL,
+    const char* nick = NULL,
+    int prio=0);
+
+  void set_prio_default(int prio_)
+  {
+    prio_default = prio_;
+  }
+
+  PerfCounters* create_perf_counters();
+private:
+  PerfCountersBuilder(const PerfCountersBuilder &rhs);
+  PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
+  void add_impl(int idx, const char *name,
+                const char *description, const char *nick, int prio, int ty,
+                unique_ptr<PerfHistogram<>> histogram = nullptr);
+
+  PerfCounters *m_perf_counters;
+
+  int prio_default = 0;
+};
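+
A minimal sketch of the builder flow with the new set_prio_default(); the enum values and counter names are hypothetical, and counters added with the default prio=0 inherit the builder-wide priority (see add_impl in the .cc hunk above):

    enum { l_test_first, l_test_reads, l_test_last };        // hypothetical index range
    PerfCountersBuilder pcb(cct, "test", l_test_first, l_test_last);
    pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);  // applied wherever prio == 0
    pcb.add_u64_counter(l_test_reads, "reads", "Total reads");
    PerfCounters *pc = pcb.create_perf_counters();
    cct->get_perfcounters_collection()->add(pc);             // assumes the usual CephContext accessor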
+
 /*
  * A PerfCounters object is usually associated with a single subsystem.
  * It contains counters which we modify to track performance and throughput
@@ -96,7 +170,7 @@ public:
     const char *name;
     const char *description;
     const char *nick;
-    int prio = 0;
+    uint8_t prio = 0;
     enum perfcounter_type_d type;
     std::atomic<uint64_t> u64 = { 0 };
     std::atomic<uint64_t> avgcount = { 0 };
@@ -179,6 +253,12 @@ public:
     prio_adjust = p;
   }
 
+  int get_adjusted_priority(int p) const {
+    return std::max(std::min(p + prio_adjust,
+                             (int)PerfCountersBuilder::PRIO_CRITICAL),
+                    0);
+  }
+
 private:
   PerfCounters(CephContext *cct, const std::string &name,
             int lower_bound, int upper_bound);
@@ -240,8 +320,17 @@ public:
     dump_formatted_generic(f, schema, true, logger, counter);
   }
 
+  // A reference to a perf_counter_data_any_d, with an accompanying
+  // pointer to the enclosing PerfCounters, in order that the consumer
+  // can see the prio_adjust
+  class PerfCounterRef
+  {
+    public:
+    PerfCounters::perf_counter_data_any_d *data;
+    PerfCounters *perf_counters;
+  };
   typedef std::map<std::string,
-          PerfCounters::perf_counter_data_any_d *> CounterMap;
+          PerfCounterRef> CounterMap;
 
   void with_counters(std::function<void(const CounterMap &)>) const;
 
@@ -257,71 +346,11 @@ private:
 
   perf_counters_set_t m_loggers;
 
-  std::map<std::string, PerfCounters::perf_counter_data_any_d *> by_path; 
+  CounterMap by_path; 
 
   friend class PerfCountersCollectionTest;
 };
 
-/* Class for constructing a PerfCounters object.
- *
- * This class performs some validation that the parameters we have supplied are
- * correct in create_perf_counters().
- *
- * In the future, we will probably get rid of the first/last arguments, since
- * PerfCountersBuilder can deduce them itself.
- */
-class PerfCountersBuilder
-{
-public:
-  PerfCountersBuilder(CephContext *cct, const std::string &name,
-                   int first, int last);
-  ~PerfCountersBuilder();
-
-  // prio values: higher is better, and higher values get included in
-  // 'ceph daemonperf' (and similar) results.
-  enum {
-    PRIO_CRITICAL = 10,
-    PRIO_INTERESTING = 8,
-    PRIO_USEFUL = 5,
-    PRIO_UNINTERESTING = 2,
-    PRIO_DEBUGONLY = 0,
-  };
-  void add_u64(int key, const char *name,
-              const char *description=NULL, const char *nick = NULL,
-              int prio=0);
-  void add_u64_counter(int key, const char *name,
-                      const char *description=NULL,
-                      const char *nick = NULL,
-                      int prio=0);
-  void add_u64_avg(int key, const char *name,
-                  const char *description=NULL,
-                  const char *nick = NULL,
-                  int prio=0);
-  void add_time(int key, const char *name,
-               const char *description=NULL,
-               const char *nick = NULL,
-               int prio=0);
-  void add_time_avg(int key, const char *name,
-                   const char *description=NULL,
-                   const char *nick = NULL,
-                   int prio=0);
-  void add_u64_counter_histogram(
-    int key, const char* name,
-    PerfHistogramCommon::axis_config_d x_axis_config,
-    PerfHistogramCommon::axis_config_d y_axis_config,
-    const char *description=NULL,
-    const char* nick = NULL,
-    int prio=0);
-
-  PerfCounters* create_perf_counters();
-private:
-  PerfCountersBuilder(const PerfCountersBuilder &rhs);
-  PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
-  void add_impl(int idx, const char *name,
-                const char *description, const char *nick, int prio, int ty,
-                unique_ptr<PerfHistogram<>> histogram = nullptr);
 
-  PerfCounters *m_perf_counters;
-};
 
 #endif
index dfea843e9cb13c34dcb63cd17f84664d66e3c0d2..4bda7ba1bff2390dba7fb77d8b65dffdf0ba8cd6 100644 (file)
 
 #define dout_subsys ceph_subsys_
 
-static const struct sockaddr *find_ip_in_subnet_list(CephContext *cct,
-                                                    const struct ifaddrs *ifa,
-                                                    const std::string &networks)
+const struct sockaddr *find_ip_in_subnet_list(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  const std::string &networks,
+  const std::string &interfaces)
 {
   std::list<string> nets;
   get_str_list(networks, nets);
+  std::list<string> ifs;
+  get_str_list(interfaces, ifs);
+
+  // filter interfaces by name
+  const struct ifaddrs *filtered = 0;
+  if (ifs.empty()) {
+    filtered = ifa;
+  } else {
+    if (nets.empty()) {
+      lderr(cct) << "interface names specified but not network names" << dendl;
+      exit(1);
+    }
+    const struct ifaddrs *t = ifa;
+    struct ifaddrs *head = 0;
+    while (t != NULL) {
+      bool match = false;
+      for (auto& i : ifs) {
+       if (strcmp(i.c_str(), t->ifa_name) == 0) {
+         match = true;
+         break;
+       }
+      }
+      if (match) {
+       struct ifaddrs *n = new ifaddrs;
+       memcpy(n, t, sizeof(*t));
+       n->ifa_next = head;
+       head = n;
+      }
+      t = t->ifa_next;
+    }
+    if (head == NULL) {
+      lderr(cct) << "no interfaces matching " << ifs << dendl;
+      exit(1);
+    }
+    filtered = head;
+  }
 
-  for(std::list<string>::iterator s = nets.begin(); s != nets.end(); ++s) {
-      struct sockaddr_storage net;
-      unsigned int prefix_len;
+  struct sockaddr *r = NULL;
+  for (std::list<string>::iterator s = nets.begin(); s != nets.end(); ++s) {
+    struct sockaddr_storage net;
+    unsigned int prefix_len;
 
-      if (!parse_network(s->c_str(), &net, &prefix_len)) {
-       lderr(cct) << "unable to parse network: " << *s << dendl;
-       exit(1);
-      }
+    if (!parse_network(s->c_str(), &net, &prefix_len)) {
+      lderr(cct) << "unable to parse network: " << *s << dendl;
+      exit(1);
+    }
+
+    const struct ifaddrs *found = find_ip_in_subnet(
+      filtered,
+      (struct sockaddr *) &net, prefix_len);
+    if (found) {
+      r = found->ifa_addr;
+      break;
+    }
+  }
 
-      const struct ifaddrs *found = find_ip_in_subnet(ifa,
-                                      (struct sockaddr *) &net, prefix_len);
-      if (found)
-       return found->ifa_addr;
+  if (filtered != ifa) {
+    while (filtered) {
+      struct ifaddrs *t = filtered->ifa_next;
+      delete filtered;
+      filtered = t;
     }
+  }
 
-  return NULL;
+  return r;
 }
 
 // observe this change
@@ -67,11 +117,14 @@ struct Observer : public md_config_obs_t {
 static void fill_in_one_address(CephContext *cct,
                                const struct ifaddrs *ifa,
                                const string networks,
+                               const string interfaces,
                                const char *conf_var)
 {
-  const struct sockaddr *found = find_ip_in_subnet_list(cct, ifa, networks);
+  const struct sockaddr *found = find_ip_in_subnet_list(cct, ifa, networks,
+                                                       interfaces);
   if (!found) {
-    lderr(cct) << "unable to find any IP address in networks: " << networks << dendl;
+    lderr(cct) << "unable to find any IP address in networks '" << networks
+              << "' interfaces '" << interfaces << "'" << dendl;
     exit(1);
   }
 
@@ -111,22 +164,29 @@ void pick_addresses(CephContext *cct, int needs)
     exit(1);
   }
 
-
   if ((needs & CEPH_PICK_ADDRESS_PUBLIC)
       && cct->_conf->public_addr.is_blank_ip()
       && !cct->_conf->public_network.empty()) {
-    fill_in_one_address(cct, ifa, cct->_conf->public_network, "public_addr");
+    fill_in_one_address(cct, ifa, cct->_conf->public_network,
+                       cct->_conf->get_val<string>("public_network_interface"),
+                       "public_addr");
   }
 
   if ((needs & CEPH_PICK_ADDRESS_CLUSTER)
       && cct->_conf->cluster_addr.is_blank_ip()) {
     if (!cct->_conf->cluster_network.empty()) {
-      fill_in_one_address(cct, ifa, cct->_conf->cluster_network, "cluster_addr");
+      fill_in_one_address(
+       cct, ifa, cct->_conf->cluster_network,
+       cct->_conf->get_val<string>("cluster_network_interface"),
+       "cluster_addr");
     } else {
       if (!cct->_conf->public_network.empty()) {
         lderr(cct) << "Public network was set, but cluster network was not set " << dendl;
         lderr(cct) << "    Using public network also for cluster network" << dendl;
-        fill_in_one_address(cct, ifa, cct->_conf->public_network, "cluster_addr");
+        fill_in_one_address(
+         cct, ifa, cct->_conf->public_network,
+         cct->_conf->get_val<string>("public_network_interface"),
+         "cluster_addr");
       }
     }
   }
index c7c813d640a1b258543fbc42fdfdb937de8daeaf..73020602b7eef303bd1b5c6a2dbe020877d2fe54 100644 (file)
@@ -47,4 +47,11 @@ std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network)
  */
 bool have_local_addr(CephContext *cct, const list<entity_addr_t>& ls, entity_addr_t *match);
 
+
+const struct sockaddr *find_ip_in_subnet_list(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  const std::string &networks,
+  const std::string &interfaces);
+
 #endif
index 105ca99b7e969f6843751268deacda59b7b8a458..6e6c26fa78f0b75590e998d2acf84979c20a75ed 100644 (file)
@@ -53,6 +53,7 @@ SUBSYS(tp, 0, 5)
 SUBSYS(auth, 1, 5)
 SUBSYS(crypto, 1, 5)
 SUBSYS(finisher, 1, 1)
+SUBSYS(reserver, 1, 1)
 SUBSYS(heartbeatmap, 1, 5)
 SUBSYS(perfcounter, 1, 5)
 SUBSYS(rgw, 1, 5)                 // log level for the Rados gateway
index 4a3da9b82ff93d6e3095c2f065f2141cdba52999..6591b1bcdb40749f916fc21228c655114542c8eb 100644 (file)
@@ -68,7 +68,7 @@ namespace CrushTreeDumper {
     explicit Dumper(const CrushWrapper *crush_,
                    const name_map_t& weight_set_names_)
       : crush(crush_), weight_set_names(weight_set_names_) {
-      crush->find_nonshadow_roots(roots);
+      crush->find_nonshadow_roots(&roots);
       root = roots.begin();
     }
     explicit Dumper(const CrushWrapper *crush_,
@@ -76,9 +76,9 @@ namespace CrushTreeDumper {
                     bool show_shadow)
       : crush(crush_), weight_set_names(weight_set_names_) {
       if (show_shadow) {
-        crush->find_roots(roots);
+        crush->find_roots(&roots);
       } else {
-        crush->find_nonshadow_roots(roots);
+        crush->find_nonshadow_roots(&roots);
       }
       root = roots.begin();
     }
index bf6f3cf5ab8d69aa7bea8ad4562d0a1093102b9b..52af91f6f47d1f9c9815dd3f112b659c9878a8aa 100644 (file)
@@ -13,7 +13,7 @@
 
 #define dout_subsys ceph_subsys_crush
 
-bool CrushWrapper::has_legacy_rulesets() const
+bool CrushWrapper::has_legacy_rule_ids() const
 {
   for (unsigned i=0; i<crush->max_rules; i++) {
     crush_rule *r = crush->rules[i];
@@ -25,51 +25,17 @@ bool CrushWrapper::has_legacy_rulesets() const
   return false;
 }
 
-int CrushWrapper::renumber_rules_by_ruleset()
+std::map<int, int> CrushWrapper::renumber_rules()
 {
-  int max_ruleset = 0;
+  std::map<int, int> result;
   for (unsigned i=0; i<crush->max_rules; i++) {
     crush_rule *r = crush->rules[i];
-    if (r && r->mask.ruleset >= max_ruleset) {
-      max_ruleset = r->mask.ruleset + 1;
+    if (r && r->mask.ruleset != i) {
+      result[r->mask.ruleset] = i;
+      r->mask.ruleset = i;
     }
   }
-  struct crush_rule **newrules =
-    (crush_rule**)calloc(1, max_ruleset * sizeof(crush_rule*));
-  for (unsigned i=0; i<crush->max_rules; i++) {
-    crush_rule *r = crush->rules[i];
-    if (!r)
-      continue;
-    if (newrules[r->mask.ruleset]) {
-      // collision, we can't do it.
-      free(newrules);
-      return -EINVAL;
-    }
-    newrules[r->mask.ruleset] = r;
-  }
-
-  // success, swap!
-  free(crush->rules);
-  crush->rules = newrules;
-  crush->max_rules = max_ruleset;
-  return 0;
-}
-
-bool CrushWrapper::has_multirule_rulesets() const
-{
-  for (unsigned i=0; i<crush->max_rules; i++) {
-    crush_rule *r = crush->rules[i];
-    if (!r)
-      continue;
-    for (unsigned j=i+1; j<crush->max_rules; j++) {
-      crush_rule *s = crush->rules[j];
-      if (!s)
-       continue;
-      if (r->mask.ruleset == s->mask.ruleset)
-       return true;
-    }
-  }
-  return false;
+  return result;
 }
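
Per the header documentation added further down, the returned map is old ID -> new ID and the caller must patch anything still using the old IDs. A minimal sketch:

    std::map<int, int> remap = crush.renumber_rules();
    for (const auto& p : remap) {
      // p.first is the legacy ruleset id, p.second the rule's new array position;
      // update any pool (OSDMap) still referencing p.first
    }
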
 
 bool CrushWrapper::has_non_straw2_buckets() const
@@ -318,7 +284,7 @@ int CrushWrapper::rename_rule(const string& srcname,
   return 0;
 }
 
-void CrushWrapper::find_takes(set<int>roots) const
+void CrushWrapper::find_takes(set<int> *roots) const
 {
   for (unsigned i=0; i<crush->max_rules; i++) {
     crush_rule *r = crush->rules[i];
@@ -326,19 +292,19 @@ void CrushWrapper::find_takes(set<int>& roots) const
       continue;
     for (unsigned j=0; j<r->len; j++) {
       if (r->steps[j].op == CRUSH_RULE_TAKE)
-       roots.insert(r->steps[j].arg1);
+       roots->insert(r->steps[j].arg1);
     }
   }
 }
 
-void CrushWrapper::find_roots(set<int>roots) const
+void CrushWrapper::find_roots(set<int> *roots) const
 {
   for (int i = 0; i < crush->max_buckets; i++) {
     if (!crush->buckets[i])
       continue;
     crush_bucket *b = crush->buckets[i];
     if (!_search_item_exists(b->id))
-      roots.insert(b->id);
+      roots->insert(b->id);
   }
 }
 
@@ -1439,7 +1405,7 @@ int CrushWrapper::populate_classes(
   // finish constructing the containing buckets.
   map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> weights
   set<int> roots;
-  find_nonshadow_roots(roots);
+  find_nonshadow_roots(&roots);
   for (auto &r : roots) {
     if (r >= 0)
       continue;
@@ -1457,7 +1423,7 @@ int CrushWrapper::populate_classes(
 int CrushWrapper::trim_roots_with_class()
 {
   set<int> roots;
-  find_shadow_roots(roots);
+  find_shadow_roots(&roots);
   for (auto &r : roots) {
     if (r >= 0)
       continue;
@@ -1499,7 +1465,7 @@ int32_t CrushWrapper::_alloc_class_id() const {
 void CrushWrapper::reweight(CephContext *cct)
 {
   set<int> roots;
-  find_roots(roots);
+  find_roots(&roots);
   for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
     if (*p >= 0)
       continue;
@@ -1627,7 +1593,56 @@ int CrushWrapper::add_simple_rule(
                            rule_type, -1, err);
 }
 
-int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
+float CrushWrapper::_get_take_weight_osd_map(int root,
+                                            map<int,float> *pmap) const
+{
+  float sum = 0.0;
+  list<int> q;
+  q.push_back(root);
+  //breadth first iterate the OSD tree
+  while (!q.empty()) {
+    int bno = q.front();
+    q.pop_front();
+    crush_bucket *b = crush->buckets[-1-bno];
+    assert(b);
+    for (unsigned j=0; j<b->size; ++j) {
+      int item_id = b->items[j];
+      if (item_id >= 0) { //it's an OSD
+       float w = crush_get_bucket_item_weight(b, j);
+       (*pmap)[item_id] = w;
+       sum += w;
+      } else { //not an OSD, expand the child later
+       q.push_back(item_id);
+      }
+    }
+  }
+  return sum;
+}
+
+void CrushWrapper::_normalize_weight_map(float sum,
+                                        const map<int,float>& m,
+                                        map<int,float> *pmap) const
+{
+  for (auto& p : m) {
+    map<int,float>::iterator q = pmap->find(p.first);
+    if (q == pmap->end()) {
+      (*pmap)[p.first] = p.second / sum;
+    } else {
+      q->second += p.second / sum;
+    }
+  }
+}
+
+int CrushWrapper::get_take_weight_osd_map(int root, map<int,float> *pmap) const
+{
+  map<int,float> m;
+  float sum = _get_take_weight_osd_map(root, &m);
+  _normalize_weight_map(sum, m, pmap);
+  return 0;
+}
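+
A minimal sketch of the new entry point; root_id is a hypothetical (negative) bucket id such as a rule's 'take' target:

    map<int, float> weights;
    if (crush.get_take_weight_osd_map(root_id, &weights) == 0) {
      // weights now maps each OSD under root_id to its normalized share (entries sum to ~1.0)
    }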
+
+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
+                                         map<int,float> *pmap) const
 {
   if (ruleno >= crush->max_rules)
     return -ENOENT;
@@ -1650,35 +1665,10 @@ int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
        m[n] = 1.0;
        sum = 1.0;
       } else {
-       list<int> q;
-       q.push_back(n);
-       //breadth first iterate the OSD tree
-       while (!q.empty()) {
-         int bno = q.front();
-         q.pop_front();
-         crush_bucket *b = crush->buckets[-1-bno];
-         assert(b);
-         for (unsigned j=0; j<b->size; ++j) {
-           int item_id = b->items[j];
-           if (item_id >= 0) { //it's an OSD
-             float w = crush_get_bucket_item_weight(b, j);
-             m[item_id] = w;
-             sum += w;
-           } else { //not an OSD, expand the child later
-             q.push_back(item_id);
-           }
-         }
-       }
-      }
-    }
-    for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
-      map<int,float>::iterator q = pmap->find(p->first);
-      if (q == pmap->end()) {
-       (*pmap)[p->first] = p->second / sum;
-      } else {
-       q->second += p->second / sum;
+       sum += _get_take_weight_osd_map(n, &m);
       }
     }
+    _normalize_weight_map(sum, m, pmap);
   }
 
   return 0;
@@ -1839,6 +1829,16 @@ int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
   return 0;
 }
 
+int CrushWrapper::bucket_set_alg(int bid, int alg)
+{
+  crush_bucket *b = get_bucket(bid);
+  if (!b) {
+    return -ENOENT;
+  }
+  b->alg = alg;
+  return 0;
+}
+
 int CrushWrapper::update_device_class(int id,
                                       const string& class_name,
                                       const string& name,
@@ -2054,6 +2054,44 @@ int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
   return 0;
 }
 
+// return rules that might reference the given osd
+int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
+{
+  assert(rules);
+  rules->clear();
+  if (osd < 0) {
+    return -EINVAL;
+  }
+  for (unsigned i = 0; i < crush->max_rules; ++i) {
+    crush_rule *r = crush->rules[i];
+    if (!r)
+      continue;
+    for (unsigned j = 0; j < r->len; ++j) {
+      if (r->steps[j].op == CRUSH_RULE_TAKE) {
+        int step_item = r->steps[j].arg1;
+        list<int> unordered;
+        int rc = _get_leaves(step_item, &unordered);
+        if (rc < 0) {
+          return rc; // propagate fatal errors!
+        }
+        bool match = false;
+        for (auto &o: unordered) {
+          assert(o >= 0);
+          if (o == osd) {
+            match = true;
+            break;
+          }
+        }
+        if (match) {
+          rules->insert(i);
+          break;
+        }
+      }
+    }
+  }
+  return 0;
+}
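+
A minimal usage sketch; as the comment says, the result is the set of rules that might reference the OSD, since membership is tested per TAKE subtree:

    set<int> rules;
    if (crush.get_rules_by_osd(0, &rules) == 0) {
      // rules now holds every rule whose take subtree contains osd.0
    }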
+
 bool CrushWrapper::_class_is_dead(int class_id)
 {
   for (auto &p: class_map) {
@@ -2604,7 +2642,7 @@ namespace {
 
     void dump(Formatter *f) {
       set<int> roots;
-      crush->find_roots(roots);
+      crush->find_roots(&roots);
       for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
        dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
       }
index 384af2c442692357969c9d483b27b5f821b3b2f2..607b7c61e7a47a425c8627e08e200403e5af385e 100644 (file)
@@ -120,14 +120,25 @@ public:
     set_tunables_default();
   }
 
-  /// true if any rule has a ruleset != the rule id
-  bool has_legacy_rulesets() const;
-
-  /// fix rules whose ruleid != ruleset
-  int renumber_rules_by_ruleset();
+  /**
+   * true if any rule has a rule id != its position in the array
+   *
+   * These indicate "ruleset" IDs that were created by older versions
+   * of Ceph.  They are cleaned up in renumber_rules so that eventually
+   * we can remove the code for handling them.
+   */
+  bool has_legacy_rule_ids() const;
 
-  /// true if any ruleset has more than 1 rule
-  bool has_multirule_rulesets() const;
+  /**
+   * fix rules whose ruleid != ruleset
+   *
+   * These rules were created in older versions of Ceph.  The concept
+   * of a ruleset no longer exists.
+   *
+   * Return a map of old ID -> new ID.  Caller must update OSDMap
+   * to use new IDs.
+   */
+  std::map<int, int> renumber_rules();
 
   /// true if any buckets that aren't straw2
   bool has_non_straw2_buckets() const;
@@ -574,25 +585,25 @@ public:
    *
    * Note that these may not be parentless roots.
    */
-  void find_takes(set<int>roots) const;
+  void find_takes(set<int> *roots) const;
 
   /**
    * find tree roots
    *
    * These are parentless nodes in the map.
    */
-  void find_roots(set<int>roots) const;
+  void find_roots(set<int> *roots) const;
 
 
   /**
    * find tree roots that contain shadow (device class) items only
    */
-  void find_shadow_roots(set<int>roots) const {
+  void find_shadow_roots(set<int> *roots) const {
     set<int> all;
-    find_roots(all);
+    find_roots(&all);
     for (auto& p: all) {
       if (is_shadow_item(p)) {
-        roots.insert(p);
+        roots->insert(p);
       }
     }
   }
@@ -603,12 +614,12 @@ public:
    * These are parentless nodes in the map that are not shadow
    * items for device classes.
    */
-  void find_nonshadow_roots(set<int>roots) const {
+  void find_nonshadow_roots(set<int> *roots) const {
     set<int> all;
-    find_roots(all);
+    find_roots(&all);
     for (auto& p: all) {
       if (!is_shadow_item(p)) {
-        roots.insert(p);
+        roots->insert(p);
       }
     }
   }
@@ -973,6 +984,17 @@ public:
       return true;
     return false;
   }
+  bool rule_has_take(unsigned ruleno, int take) const {
+    if (!crush) return false;
+    crush_rule *rule = get_rule(ruleno);
+    for (unsigned i = 0; i < rule->len; ++i) {
+      if (rule->steps[i].op == CRUSH_RULE_TAKE &&
+         rule->steps[i].arg1 == take) {
+       return true;
+      }
+    }
+    return false;
+  }
   int get_rule_len(unsigned ruleno) const {
     crush_rule *r = get_rule(ruleno);
     if (IS_ERR(r)) return PTR_ERR(r);
@@ -1014,6 +1036,12 @@ public:
     return s->arg2;
   }
 
+private:
+  float _get_take_weight_osd_map(int root, map<int,float> *pmap) const;
+  void _normalize_weight_map(float sum, const map<int,float>& m,
+                            map<int,float> *pmap) const;
+
+public:
   /**
    * calculate a map of osds to weights for a given rule
    *
@@ -1024,7 +1052,19 @@ public:
    * @param pmap [out] map of osd to weight
    * @return 0 for success, or negative error code
    */
-  int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
+  int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap) const;
+
+  /**
+   * calculate a map of osds to weights for a given starting root
+   *
+   * Generate a map of which OSDs get how much relative weight for a
+   * given starting root
+   *
+   * @param root starting root node (bucket id)
+   * @param pmap [out] map of osd to weight
+   * @return 0 for success, or negative error code
+   */
+  int get_take_weight_osd_map(int root, map<int,float> *pmap) const;
 
   /* modifiers */
 
@@ -1206,8 +1246,9 @@ public:
   void finalize() {
     assert(crush);
     crush_finalize(crush);
-    have_uniform_rules = !has_legacy_rulesets();
+    have_uniform_rules = !has_legacy_rule_ids();
   }
+  int bucket_set_alg(int id, int alg);
 
   int update_device_class(int id, const string& class_name, const string& name, ostream *ss);
   int remove_device_class(CephContext *cct, int id, ostream *ss);
@@ -1221,6 +1262,7 @@ public:
   int populate_classes(
     const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket);
   int get_rules_by_class(const string &class_name, set<int> *rules);
+  int get_rules_by_osd(int osd, set<int> *rules);
   bool _class_is_dead(int class_id);
   void cleanup_dead_classes();
   int rebuild_roots_with_classes();
@@ -1282,7 +1324,7 @@ public:
   /**
    * Return the lowest numbered ruleset of type `type`
    *
-   * @returns a ruleset ID, or -1 if no matching rulesets found.
+   * @returns a ruleset ID, or -1 if no matching rules are found.
    */
   int find_first_ruleset(int type) const {
     int result = -1;
index d9c92ce64d6eef8375b6fe26209df9ce7d8e8dee..226a60ff189480989436d1520fc8168aa1e10a46 100644 (file)
@@ -151,11 +151,13 @@ namespace buffer CEPH_BUFFER_API {
    */
   raw* copy(const char *c, unsigned len);
   raw* create(unsigned len);
+  raw* create_in_mempool(unsigned len, int mempool);
   raw* claim_char(unsigned len, char *buf);
   raw* create_malloc(unsigned len);
   raw* claim_malloc(unsigned len, char *buf);
   raw* create_static(unsigned len, char *buf);
   raw* create_aligned(unsigned len, unsigned align);
+  raw* create_aligned_in_mempool(unsigned len, unsigned align, int mempool);
   raw* create_page_aligned(unsigned len);
   raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
   raw* create_unshareable(unsigned len);
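
A toy model (assumptions only; real raw buffers are far more involved) of the idea behind the *_in_mempool() factories added above: tag each allocation with a mempool id so per-pool byte usage can be accounted.

    #include <atomic>
    #include <cstdlib>

    static std::atomic<long> g_pool_bytes[8];  // toy per-mempool counters

    static void *alloc_in_mempool(std::size_t len, int mempool) {
      g_pool_bytes[mempool] += static_cast<long>(len);  // charge the chosen pool
      return std::malloc(len);
    }
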
@@ -288,6 +290,10 @@ namespace buffer CEPH_BUFFER_API {
       return have_raw() && (start() > 0 || end() < raw_length());
     }
 
+    int get_mempool() const;
+    void reassign_to_mempool(int pool);
+    void try_assign_to_mempool(int pool);
+
     // accessors
     raw *get_raw() const { return _raw; }
     const char *c_str() const;
@@ -348,7 +354,6 @@ namespace buffer CEPH_BUFFER_API {
     unsigned _len;
     unsigned _memcopy_count; //the total of memcopy using rebuild().
     ptr append_buffer;  // where i put small appends.
-    int _mempool = -1;
 
   public:
     class iterator;
@@ -443,6 +448,7 @@ namespace buffer CEPH_BUFFER_API {
 
       void advance(int o);
       void seek(unsigned o);
+      using iterator_impl<false>::operator*;
       char operator*();
       iterator& operator++();
       ptr get_current_ptr();
@@ -682,7 +688,6 @@ namespace buffer CEPH_BUFFER_API {
       _memcopy_count = other._memcopy_count;
       last_p = begin();
       append_buffer.swap(other.append_buffer);
-      _mempool = other._mempool;
       other.clear();
       return *this;
     }
@@ -691,6 +696,7 @@ namespace buffer CEPH_BUFFER_API {
     const ptr& front() const { return _buffers.front(); }
     const ptr& back() const { return _buffers.back(); }
 
+    int get_mempool() const;
     void reassign_to_mempool(int pool);
     void try_assign_to_mempool(int pool);
 
index 95ca603c9653f98251621a897ac4ff3376e6ab74..5c76c471c64278b45ce2a66def9428a7e0ba35b2 100644 (file)
@@ -27,7 +27,7 @@ extern "C" {
 
 #define LIBRGW_FILE_VER_MAJOR 1
 #define LIBRGW_FILE_VER_MINOR 1
-#define LIBRGW_FILE_VER_EXTRA 4
+#define LIBRGW_FILE_VER_EXTRA 6
 
 #define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
 #define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
@@ -126,6 +126,10 @@ int rgw_mount(librgw_t rgw, const char *uid, const char *key,
              const char *secret, struct rgw_fs **rgw_fs,
              uint32_t flags);
 
+int rgw_mount2(librgw_t rgw, const char *uid, const char *key,
+               const char *secret, const char *root, struct rgw_fs **rgw_fs,
+               uint32_t flags);
+
 /*
  register invalidate callbacks
 */
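
A hedged call sketch for the new rgw_mount2() above: it matches rgw_mount() plus a root path ("/" or a bucket) to mount at. The handle, credential strings, and zero flags value are placeholders, not values from this commit.

    static int mount_at_root(librgw_t rgw, struct rgw_fs **fs) {
      return rgw_mount2(rgw, "uid", "access-key", "secret-key", "/", fs,
                        0 /* flags */);
    }
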
@@ -217,6 +221,12 @@ int rgw_readdir(struct rgw_fs *rgw_fs,
                rgw_readdir_cb rcb, void *cb_arg, bool *eof,
                uint32_t flags);
 
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+                struct rgw_file_handle *parent_fh, const char *name,
+                rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+                uint32_t flags);
+
 /* project offset of dirent name */
 #define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
 
index 56eb92bd6e3c20de3f4fa65895a0c7f6c2d5bee9..f9dc24b1ddc6bdba7210a23cb292b2422a7a9489 100644 (file)
 # define MSG_MORE 0
 #endif
 
+/*
+ * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
+ */
+#ifndef MSG_NOSIGNAL
+# define MSG_NOSIGNAL 0
+# ifdef SO_NOSIGPIPE
+#  define CEPH_USE_SO_NOSIGPIPE
+# else
+#  define CEPH_USE_SIGPIPE_BLOCKER
+#  warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!"
+# endif
+#endif
+
 #endif
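
A sketch of the BSD path the block above selects: without a MSG_NOSIGNAL send() flag, SIGPIPE is suppressed once per socket via SO_NOSIGPIPE.

    #include <sys/socket.h>

    #ifdef SO_NOSIGPIPE
    static int suppress_sigpipe(int fd) {
      int on = 1;
      return setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on));
    }
    #endif
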
index 3d6fcfb2eca4e1091380b17f9cfe048cf9cada7f..4073216bcdfb2eb561cf03fe77f0d58d62a38dc8 100644 (file)
@@ -802,9 +802,9 @@ void JournalMetadata::schedule_commit_task() {
   assert(m_lock.is_locked());
   assert(m_commit_position_ctx != nullptr);
   if (m_commit_position_task_ctx == NULL) {
-    m_commit_position_task_ctx = new C_CommitPositionTask(this);
-    m_timer->add_event_after(m_settings.commit_interval,
-                             m_commit_position_task_ctx);
+    m_commit_position_task_ctx =
+      m_timer->add_event_after(m_settings.commit_interval,
+                              new C_CommitPositionTask(this));
   }
 }
 
index 92dd702615bcd018e749bb5e56fdce688bc98018..8292ebb1abfcb5801fed45ee62df2eceb87fd1c4 100644 (file)
@@ -234,9 +234,12 @@ void ObjectPlayer::schedule_watch() {
   }
 
   ldout(m_cct, 20) << __func__ << ": " << m_oid << " scheduling watch" << dendl;
-  assert(m_watch_task == NULL);
-  m_watch_task = new C_WatchTask(this);
-  m_timer.add_event_after(m_watch_interval, m_watch_task);
+  assert(m_watch_task == nullptr);
+  m_watch_task = m_timer.add_event_after(
+    m_watch_interval,
+    new FunctionContext([this](int) {
+       handle_watch_task();
+      }));
 }
 
 bool ObjectPlayer::cancel_watch() {
@@ -301,10 +304,6 @@ void ObjectPlayer::C_Fetch::finish(int r) {
   on_finish->complete(r);
 }
 
-void ObjectPlayer::C_WatchTask::finish(int r) {
-  object_player->handle_watch_task();
-}
-
 void ObjectPlayer::C_WatchFetch::finish(int r) {
   object_player->handle_watch_fetched(r);
 }
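
The recurring cleanup in this commit (here and in the journal classes nearby) is that the timer's add_event_after() now returns the scheduled event, so callers keep a plain handle for later cancellation instead of defining one-shot Context subclasses. A toy model of that shape, with assumed names:

    #include <functional>

    struct ToyEvent { std::function<void(int)> fn; };

    struct ToyTimer {
      ToyEvent *add_event_after(double /*sec*/, ToyEvent *e) {
        // ... enqueue e for later execution ...
        return e;  // handle the caller stores for cancel_event()
      }
      void cancel_event(ToyEvent *e) { delete e; }  // toy cancellation
    };
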
index 3d495ba7ff7ae0336f4717cdba7752b088a65e1b..a3cbe807332f32182bfca354de051e18e00db344 100644 (file)
@@ -90,12 +90,6 @@ private:
     }
     void finish(int r) override;
   };
-  struct C_WatchTask : public Context {
-    ObjectPlayerPtr object_player;
-    C_WatchTask(ObjectPlayer *o) : object_player(o) {
-    }
-    void finish(int r) override;
-  };
   struct C_WatchFetch : public Context {
     ObjectPlayerPtr object_player;
     C_WatchFetch(ObjectPlayer *o) : object_player(o) {
index a2faeae8aa60e1aadd7e17a1491d09b081d2e330..a87c31ddb29788d40060fdab17ce954807456220 100644 (file)
@@ -28,7 +28,7 @@ ObjectRecorder::ObjectRecorder(librados::IoCtx &ioctx, const std::string &oid,
     m_timer_lock(timer_lock), m_handler(handler), m_order(order),
     m_soft_max_size(1 << m_order), m_flush_interval(flush_interval),
     m_flush_bytes(flush_bytes), m_flush_age(flush_age), m_flush_handler(this),
-    m_append_task(NULL), m_lock(lock), m_append_tid(0), m_pending_bytes(0),
+    m_lock(lock), m_append_tid(0), m_pending_bytes(0),
     m_size(0), m_overflowed(false), m_object_closed(false),
     m_in_flight_flushes(false), m_aio_scheduled(false) {
   m_ioctx.dup(ioctx);
@@ -194,9 +194,11 @@ void ObjectRecorder::cancel_append_task() {
 
 void ObjectRecorder::schedule_append_task() {
   Mutex::Locker locker(m_timer_lock);
-  if (m_append_task == NULL && m_flush_age > 0) {
-    m_append_task = new C_AppendTask(this);
-    m_timer.add_event_after(m_flush_age, m_append_task);
+  if (m_append_task == nullptr && m_flush_age > 0) {
+    m_append_task = m_timer.add_event_after(
+      m_flush_age, new FunctionContext([this](int) {
+         handle_append_task();
+       }));
   }
 }
 
index aad46690134e94e50e38c60b3ecfb9bb1efce914..22a46697c522d8442e598b724581550b143df5cb 100644 (file)
@@ -90,14 +90,6 @@ private:
       object_recorder->flush(future);
     }
   };
-  struct C_AppendTask : public Context {
-    ObjectRecorder *object_recorder;
-    C_AppendTask(ObjectRecorder *o) : object_recorder(o) {
-    }
-    void finish(int r) override {
-      object_recorder->handle_append_task();
-    }
-  };
   struct C_AppendFlush : public Context {
     ObjectRecorder *object_recorder;
     uint64_t tid;
@@ -132,7 +124,7 @@ private:
 
   FlushHandler m_flush_handler;
 
-  C_AppendTask *m_append_task;
+  Context *m_append_task = nullptr;
 
   mutable std::shared_ptr<Mutex> m_lock;
   AppendBuffers m_append_buffers;
index 37a78480f140814a26ccdb72d59521828bf9ed43..818884a1a97edbf5dd6c11ab9978776903d0e934 100644 (file)
@@ -12,6 +12,7 @@
 #include <boost/scoped_ptr.hpp>
 #include "include/encoding.h"
 #include "common/Formatter.h"
+#include "common/perf_counters.h"
 
 using std::string;
 /**
@@ -350,6 +351,15 @@ public:
   virtual void get_statistics(Formatter *f) {
     return;
   }
+
+  /**
+   * Return your perf counters if you have any.  Subclasses are not
+   * required to implement this, and callers must respect a null return
+   * value.
+   */
+  virtual PerfCounters *get_perf_counters() {
+    return nullptr;
+  }
 protected:
   /// List of matching prefixes and merge operators
   std::vector<std::pair<std::string,
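
A sketch of the contract documented above: the base class answers nullptr, so callers must null-check before touching the counters. Stand-in types only.

    #include <cstdio>

    struct PerfCountersLike {};  // opaque stand-in

    struct KVDBLike {
      virtual ~KVDBLike() = default;
      virtual PerfCountersLike *get_perf_counters() { return nullptr; }
    };

    static void log_counters(KVDBLike *db) {
      if (PerfCountersLike *pc = db->get_perf_counters())
        std::printf("counters at %p\n", static_cast<void *>(pc));
      // else: this backend has none, which is a valid answer
    }
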
index be344ff18e6836a6719314162d0d583b3c5b97e6..5a3ced9e4c9b9d565f435274c668b70d2d11e118 100644 (file)
@@ -184,6 +184,11 @@ public:
 
   void close() override;
 
+  PerfCounters *get_perf_counters() override
+  {
+    return logger;
+  }
+
   class LevelDBTransactionImpl : public KeyValueDB::TransactionImpl {
   public:
     leveldb::WriteBatch bat;
index 6a7c0e37772b275cbb90a9ff50c501bd23ee2867..44c99e1b2e9d59b6ec8aaeb48735e09afaace8d3 100644 (file)
@@ -158,6 +158,11 @@ public:
   void split_stats(const std::string &s, char delim, std::vector<std::string> &elems);
   void get_statistics(Formatter *f) override;
 
+  PerfCounters *get_perf_counters() override
+  {
+    return logger;
+  }
+
   struct  RocksWBHandler: public rocksdb::WriteBatch::Handler {
     std::string seen ;
     int num_seen = 0;
index 0930e6a56a4bf1c9b0baef6b242c0771f0ab0a59..257554dc1718b50f51af419fb60558b1fb6eb9aa 100644 (file)
@@ -111,10 +111,10 @@ bool ObjectMap<I>::object_may_exist(uint64_t object_no) const
 }
 
 template <typename I>
-bool ObjectMap<I>::update_required(uint64_t object_no, uint8_t new_state) {
+bool ObjectMap<I>::update_required(const ceph::BitVector<2>::Iterator& it,
+                                   uint8_t new_state) {
   assert(m_image_ctx.object_map_lock.is_wlocked());
-  uint8_t state = (*this)[object_no];
-
+  uint8_t state = *it;
   if ((state == new_state) ||
       (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
       (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) {
@@ -224,7 +224,7 @@ void ObjectMap<I>::detained_aio_update(UpdateOperation &&op) {
 
   BlockGuardCell *cell;
   int r = m_update_guard->detain({op.start_object_no, op.end_object_no},
-                                &op, &cell);
+                                 &op, &cell);
   if (r < 0) {
     lderr(cct) << "failed to detain object map update: " << cpp_strerror(r)
                << dendl;
@@ -297,13 +297,14 @@ void ObjectMap<I>::aio_update(uint64_t snap_id, uint64_t start_object_no,
       return;
     }
 
-    uint64_t object_no;
-    for (object_no = start_object_no; object_no < end_object_no; ++object_no) {
-      if (update_required(object_no, new_state)) {
+    auto it = m_object_map.begin() + start_object_no;
+    auto end_it = m_object_map.begin() + end_object_no;
+    for (; it != end_it; ++it) {
+      if (update_required(it, new_state)) {
         break;
       }
     }
-    if (object_no == end_object_no) {
+    if (it == end_it) {
       ldout(cct, 20) << "object map update not required" << dendl;
       m_image_ctx.op_work_queue->queue(on_finish, 0);
       return;
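
A simplified standalone picture of the scan the iterator rewrite above performs: walk a packed 2-bit state vector once and stop at the first entry whose state differs (the real update_required() also exempts some PENDING/NONEXISTENT transitions).

    #include <cstdint>
    #include <vector>

    static long first_update_needed(const std::vector<uint8_t>& packed,
                                    std::size_t begin, std::size_t end,
                                    uint8_t new_state) {
      for (std::size_t i = begin; i < end; ++i) {
        uint8_t state = (packed[i / 4] >> ((i % 4) * 2)) & 0x3;  // 2-bit entry
        if (state != new_state)
          return static_cast<long>(i);  // first object needing an update
      }
      return -1;  // nothing to do
    }
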
index 427ecdf165a378ea7f4c58144c8815bf360d1971..ebd1a9ba3102dd9056893164a8e134651db5546c 100644 (file)
@@ -69,15 +69,15 @@ public:
                   const ZTracer::Trace &parent_trace, T *callback_object) {
     assert(start_object_no < end_object_no);
     if (snap_id == CEPH_NOSNAP) {
-      uint64_t object_no;
-      for (object_no = start_object_no; object_no < end_object_no;
-           ++object_no) {
-        if (update_required(object_no, new_state)) {
+      auto it = m_object_map.begin() + start_object_no;
+      auto end_it = m_object_map.begin() + end_object_no;
+      for (; it != end_it; ++it) {
+        if (update_required(it, new_state)) {
           break;
         }
       }
 
-      if (object_no == end_object_no) {
+      if (it == end_it) {
         return false;
       }
 
@@ -133,7 +133,8 @@ private:
                   uint64_t end_object_no, uint8_t new_state,
                   const boost::optional<uint8_t> &current_state,
                   const ZTracer::Trace &parent_trace, Context *on_finish);
-  bool update_required(uint64_t object_no, uint8_t new_state);
+  bool update_required(const ceph::BitVector<2>::Iterator &it,
+                       uint8_t new_state);
 
 };
 
index 8661e7a73a35de219e61f51ce58c6744111d56e3..46e5135d3d9eee3c6d4f254f2ed309df9e600a1e 100644 (file)
@@ -60,7 +60,7 @@ int list_mirror_images(librados::IoCtx& io_ctx,
     std::map<std::string, std::string> mirror_images;
     r =  cls_client::mirror_image_list(&io_ctx, last_read, max_read,
                                        &mirror_images);
-    if (r < 0) {
+    if (r < 0 && r != -ENOENT) {
       lderr(cct) << "error listing mirrored image directory: "
                  << cpp_strerror(r) << dendl;
       return r;
@@ -810,6 +810,10 @@ int Mirror<I>::image_status_list(librados::IoCtx& io_ctx,
   for (auto it = images_.begin(); it != images_.end(); ++it) {
     auto &image_id = it->first;
     auto &info = it->second;
+    if (info.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLED) {
+      continue;
+    }
+
     auto &image_name = id_to_name[image_id];
     if (image_name.empty()) {
       lderr(cct) << "failed to find image name for image " << image_id << ", "
@@ -840,7 +844,7 @@ int Mirror<I>::image_status_summary(librados::IoCtx& io_ctx,
 
   std::map<cls::rbd::MirrorImageStatusState, int> states_;
   int r = cls_client::mirror_image_status_get_summary(&io_ctx, &states_);
-  if (r < 0) {
+  if (r < 0 && r != -ENOENT) {
     lderr(cct) << "failed to get mirror status summary: "
                << cpp_strerror(r) << dendl;
     return r;
index 0b2415cf42e8a2f443e9d1f3e42d1bbaaf7c3feb..d6a3905deabafb24a2c1ed6c70cd33c3eb780d23 100644 (file)
@@ -53,6 +53,16 @@ ObjectRequest<I>::create_truncate(I *ictx, const std::string &oid,
                                    object_off, snapc, parent_trace, completion);
 }
 
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_trim(I *ictx, const std::string &oid,
+                              uint64_t object_no, const ::SnapContext &snapc,
+                              bool post_object_map_update,
+                              Context *completion) {
+  return new ObjectTrimRequest(util::get_image_ctx(ictx), oid, object_no,
+                               snapc, post_object_map_update, completion);
+}
+
 template <typename I>
 ObjectRequest<I>*
 ObjectRequest<I>::create_write(I *ictx, const std::string &oid,
index e1ec6dc61cbe0d85544cdacc013e5b6c2556e58b..fa99bda44dc16e854436c35e7f53e2603e1dfe81 100644 (file)
@@ -58,6 +58,11 @@ public:
                                         const ::SnapContext &snapc,
                                        const ZTracer::Trace &parent_trace,
                                         Context *completion);
+  static ObjectRequest* create_trim(ImageCtxT *ictx, const std::string &oid,
+                                    uint64_t object_no,
+                                    const ::SnapContext &snapc,
+                                    bool post_object_map_update,
+                                    Context *completion);
   static ObjectRequest* create_write(ImageCtxT *ictx, const std::string &oid,
                                      uint64_t object_no,
                                      uint64_t object_off,
index 8c0ec69de8d15d42bd8b3172489d79436ef24d1b..9ceb28a177379b1fbeae984d1e09c3d4b99bce52 100644 (file)
@@ -7,28 +7,44 @@
 #include "common/dout.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
 #include "cls/lock/cls_lock_client.h"
 #include <string>
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
-#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: "
+#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: " << this \
+                           << " " << __func__ << ": "
 
 namespace librbd {
 namespace object_map {
 
+namespace {
+
+// keep aligned to bit_vector 4K block sizes
+const uint64_t MAX_OBJECTS_PER_UPDATE = 256 * (1 << 10);
+
+}
+
 template <typename I>
 void UpdateRequest<I>::send() {
+  update_object_map();
+}
+
+template <typename I>
+void UpdateRequest<I>::update_object_map() {
   assert(m_image_ctx.snap_lock.is_locked());
   assert(m_image_ctx.object_map_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
 
-  // safe to update in-memory state first without handling rollback since any
-  // failures will invalidate the object map
+  // break very large requests into manageable batches
+  m_update_end_object_no = MIN(
+    m_end_object_no, m_update_start_object_no + MAX_OBJECTS_PER_UPDATE);
+
   std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
-  ldout(cct, 20) << this << " updating object map"
-                 << ": ictx=" << &m_image_ctx << ", oid=" << oid << ", ["
-                << m_start_object_no << "," << m_end_object_no << ") = "
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", oid=" << oid << ", "
+                 << "[" << m_update_start_object_no << ","
+                        << m_update_end_object_no << ") = "
                 << (m_current_state ?
                       stringify(static_cast<uint32_t>(*m_current_state)) : "")
                 << "->" << static_cast<uint32_t>(m_new_state)
@@ -38,10 +54,12 @@ void UpdateRequest<I>::send() {
   if (m_snap_id == CEPH_NOSNAP) {
     rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
   }
-  cls_client::object_map_update(&op, m_start_object_no, m_end_object_no,
-                               m_new_state, m_current_state);
+  cls_client::object_map_update(&op, m_update_start_object_no,
+                                m_update_end_object_no, m_new_state,
+                                m_current_state);
 
-  librados::AioCompletion *rados_completion = create_callback_completion();
+  auto rados_completion = librbd::util::create_rados_callback<
+    UpdateRequest<I>, &UpdateRequest<I>::handle_update_object_map>(this);
   std::vector<librados::snap_t> snaps;
   int r = m_image_ctx.md_ctx.aio_operate(
     oid, rados_completion, &op, 0, snaps,
@@ -51,26 +69,53 @@ void UpdateRequest<I>::send() {
 }
 
 template <typename I>
-void UpdateRequest<I>::finish_request() {
-  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-  RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
-  ldout(m_image_ctx.cct, 20) << this << " on-disk object map updated"
-                             << dendl;
+void UpdateRequest<I>::handle_update_object_map(int r) {
+  ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+    update_in_memory_object_map();
+
+    if (m_update_end_object_no < m_end_object_no) {
+      m_update_start_object_no = m_update_end_object_no;
+      update_object_map();
+      return;
+    }
+  }
+
+  // no more batch updates to send
+  complete(r);
+}
+
+template <typename I>
+void UpdateRequest<I>::update_in_memory_object_map() {
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert(m_image_ctx.object_map_lock.is_locked());
 
   // rebuilding the object map might update on-disk only
   if (m_snap_id == m_image_ctx.snap_id) {
-    for (uint64_t object_no = m_start_object_no;
-         object_no < MIN(m_end_object_no, m_object_map.size());
-         ++object_no) {
-      uint8_t state = m_object_map[object_no];
+    ldout(m_image_ctx.cct, 20) << dendl;
+
+    auto it = m_object_map.begin() +
+                    MIN(m_update_start_object_no, m_object_map.size());
+    auto end_it = m_object_map.begin() +
+                    MIN(m_update_end_object_no, m_object_map.size());
+    for (; it != end_it; ++it) {
+      auto state_ref = *it;
+      uint8_t state = state_ref;
       if (!m_current_state || state == *m_current_state ||
           (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) {
-        m_object_map[object_no] = m_new_state;
+        state_ref = m_new_state;
       }
     }
   }
 }
 
+template <typename I>
+void UpdateRequest<I>::finish_request() {
+}
+
 } // namespace object_map
 } // namespace librbd
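
A sketch of the batching the rewrite above introduces: cover [start, end) in MAX_OBJECTS_PER_UPDATE-sized chunks, re-arming until the whole range is done.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static const uint64_t kMaxPerUpdate = 256 * (1 << 10);  // value from the diff

    static void update_in_batches(uint64_t start, uint64_t end) {
      while (start < end) {
        uint64_t batch_end = std::min(end, start + kMaxPerUpdate);
        std::printf("object_map_update [%llu, %llu)\n",
                    (unsigned long long)start, (unsigned long long)batch_end);
        start = batch_end;  // handle_update_object_map() re-arms like this
      }
    }
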
 
index 175160752dac7d0cc844cc0f498abd5aab8037e5..cb9804d07c9f0e09dc2f2110ab5fd70c7383d857 100644 (file)
@@ -41,7 +41,8 @@ public:
                const ZTracer::Trace &parent_trace, Context *on_finish)
     : Request(image_ctx, snap_id, on_finish), m_object_map(*object_map),
       m_start_object_no(start_object_no), m_end_object_no(end_object_no),
-      m_new_state(new_state), m_current_state(current_state),
+      m_update_start_object_no(start_object_no), m_new_state(new_state),
+      m_current_state(current_state),
       m_trace(util::create_trace(image_ctx, "update object map", parent_trace))
   {
     m_trace.event("start");
@@ -56,12 +57,35 @@ protected:
   void finish_request() override;
 
 private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    |/------------------\
+   *    v                   | (repeat in batches)
+   * UPDATE_OBJECT_MAP -----/
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
   ceph::BitVector<2> &m_object_map;
   uint64_t m_start_object_no;
   uint64_t m_end_object_no;
+  uint64_t m_update_start_object_no;
+  uint64_t m_update_end_object_no = 0;
   uint8_t m_new_state;
   boost::optional<uint8_t> m_current_state;
   ZTracer::Trace m_trace;
+
+  void update_object_map();
+  void handle_update_object_map(int r);
+
+  void update_in_memory_object_map();
+
 };
 
 } // namespace object_map
index d332d870c15dc5909e9052f1a0d9f213dfe479b0..63ddd8848c4d38ec785745fe045a2a20a223fe00 100644 (file)
@@ -129,7 +129,7 @@ void SnapshotCreateRequest<I>::send_allocate_snap_id() {
   librados::AioCompletion *rados_completion = create_rados_callback<
     SnapshotCreateRequest<I>,
     &SnapshotCreateRequest<I>::handle_allocate_snap_id>(this);
-  image_ctx.md_ctx.aio_selfmanaged_snap_create(&m_snap_id, rados_completion);
+  image_ctx.data_ctx.aio_selfmanaged_snap_create(&m_snap_id, rados_completion);
   rados_completion->release();
 }
 
@@ -255,7 +255,7 @@ void SnapshotCreateRequest<I>::send_release_snap_id() {
   librados::AioCompletion *rados_completion = create_rados_callback<
     SnapshotCreateRequest<I>,
     &SnapshotCreateRequest<I>::handle_release_snap_id>(this);
-  image_ctx.md_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
+  image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
   rados_completion->release();
 }
 
index 4cbfc03b0d497f5954d8f11104007815f9fd04d4..fe19ff5c9c3ea5652b1bdba6bdcd00f238e10173 100644 (file)
@@ -204,7 +204,7 @@ void SnapshotRemoveRequest<I>::send_release_snap_id() {
 
   librados::AioCompletion *rados_completion =
     this->create_callback_completion();
-  image_ctx.md_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
+  image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
   rados_completion->release();
 }
 
index 46ec967b5eddcb3bf8d6aa0db32ce42e82516a3c..28f2deb1af84f9d9f1137fe6faddf3348089f862 100644 (file)
@@ -45,8 +45,8 @@ public:
     string oid = image_ctx.get_object_name(m_object_no);
     ldout(image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
 
-    auto req = new io::ObjectTrimRequest(&image_ctx, oid, m_object_no,
-                                         m_snapc, false, this);
+    auto req = io::ObjectRequest<I>::create_trim(&image_ctx, oid, m_object_no,
+                                                 m_snapc, false, this);
     req->send();
     return 0;
   }
@@ -58,7 +58,7 @@ private:
 template <typename I>
 class C_RemoveObject : public C_AsyncObjectThrottle<I> {
 public:
-  C_RemoveObject(AsyncObjectThrottle<I> &throttle, ImageCtx *image_ctx,
+  C_RemoveObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
                  uint64_t object_no)
     : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no)
   {
@@ -105,6 +105,7 @@ TrimRequest<I>::TrimRequest(I &image_ctx, Context *on_finish,
   m_delete_off = MIN(new_num_periods * period, original_size);
   // first object we can delete free and clear
   m_delete_start = new_num_periods * image_ctx.get_stripe_count();
+  m_delete_start_min = m_delete_start;
   m_num_objects = Striper::get_num_objects(image_ctx.layout, original_size);
 
   CephContext *cct = image_ctx.cct;
@@ -131,33 +132,23 @@ bool TrimRequest<I>::should_complete(int r)
 
   RWLock::RLocker owner_lock(image_ctx.owner_lock);
   switch (m_state) {
-  case STATE_PRE_COPYUP:
-    ldout(cct, 5) << " PRE_COPYUP" << dendl;
+  case STATE_PRE_TRIM:
+    ldout(cct, 5) << " PRE_TRIM" << dendl;
     send_copyup_objects();
     break;
 
   case STATE_COPYUP_OBJECTS:
     ldout(cct, 5) << " COPYUP_OBJECTS" << dendl;
-    send_post_copyup();
-    break;
-
-  case STATE_POST_COPYUP:
-    ldout(cct, 5) << " POST_COPYUP" << dendl;
-    send_pre_remove();
-    break;
-
-  case STATE_PRE_REMOVE:
-    ldout(cct, 5) << " PRE_REMOVE" << dendl;
     send_remove_objects();
     break;
 
   case STATE_REMOVE_OBJECTS:
     ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
-    send_post_remove();
+    send_post_trim();
     break;
 
-  case STATE_POST_REMOVE:
-    ldout(cct, 5) << " POST_OBJECTS" << dendl;
+  case STATE_POST_TRIM:
+    ldout(cct, 5) << " POST_TRIM" << dendl;
     send_clean_boundary();
     break;
 
@@ -180,198 +171,132 @@ bool TrimRequest<I>::should_complete(int r)
 
 template <typename I>
 void TrimRequest<I>::send() {
-  send_pre_copyup();
+  send_pre_trim();
 }
 
 template<typename I>
-void TrimRequest<I>::send_copyup_objects() {
+void TrimRequest<I>::send_pre_trim() {
   I &image_ctx = this->m_image_ctx;
   assert(image_ctx.owner_lock.is_locked());
 
-  ldout(image_ctx.cct, 5) << this << " send_copyup_objects: "
-                          << " start object=" << m_copyup_start << ", "
-                          << " end object=" << m_copyup_end << dendl;
-  m_state = STATE_COPYUP_OBJECTS;
+  if (m_delete_start >= m_num_objects) {
+    send_clean_boundary();
+    return;
+  }
 
-  ::SnapContext snapc;
   {
     RWLock::RLocker snap_locker(image_ctx.snap_lock);
-    RWLock::RLocker parent_locker(image_ctx.parent_lock);
-    snapc = image_ctx.snapc;
-  }
-
-  Context *ctx = this->create_callback_context();
-  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
-    boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(),
-      boost::lambda::_1, &image_ctx, snapc, boost::lambda::_2));
-  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
-    this, image_ctx, context_factory, ctx, &m_prog_ctx, m_copyup_start,
-    m_copyup_end);
-  throttle->start_ops(image_ctx.concurrent_management_ops);
-}
+    if (image_ctx.object_map != nullptr) {
+      ldout(image_ctx.cct, 5) << this << " send_pre_trim: "
+                              << " delete_start_min=" << m_delete_start_min
+                              << " num_objects=" << m_num_objects << dendl;
+      m_state = STATE_PRE_TRIM;
 
-template <typename I>
-void TrimRequest<I>::send_remove_objects() {
-  I &image_ctx = this->m_image_ctx;
-  assert(image_ctx.owner_lock.is_locked());
+      assert(image_ctx.exclusive_lock->is_lock_owner());
 
-  ldout(image_ctx.cct, 5) << this << " send_remove_objects: "
-                           << " delete_start=" << m_delete_start
-                           << " num_objects=" << m_num_objects << dendl;
-  m_state = STATE_REMOVE_OBJECTS;
+      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+      if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
+            CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_PENDING,
+            OBJECT_EXISTS, {}, this)) {
+        return;
+      }
+    }
+  }
 
-  Context *ctx = this->create_callback_context();
-  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
-    boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(),
-      boost::lambda::_1, &image_ctx, boost::lambda::_2));
-  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
-    this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
-    m_num_objects);
-  throttle->start_ops(image_ctx.concurrent_management_ops);
+  send_copyup_objects();
 }
 
 template<typename I>
-void TrimRequest<I>::send_pre_copyup() {
+void TrimRequest<I>::send_copyup_objects() {
   I &image_ctx = this->m_image_ctx;
   assert(image_ctx.owner_lock.is_locked());
 
-  if (m_delete_start >= m_num_objects) {
-    send_clean_boundary();
-    return;
-  }
-
+  ::SnapContext snapc;
   bool has_snapshots;
   uint64_t parent_overlap;
   {
     RWLock::RLocker snap_locker(image_ctx.snap_lock);
     RWLock::RLocker parent_locker(image_ctx.parent_lock);
 
+    snapc = image_ctx.snapc;
     has_snapshots = !image_ctx.snaps.empty();
     int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
     assert(r == 0);
   }
 
   // copyup is only required for portion of image that overlaps parent
-  m_copyup_end = Striper::get_num_objects(image_ctx.layout, parent_overlap);
+  uint64_t copyup_end = Striper::get_num_objects(image_ctx.layout,
+                                                 parent_overlap);
 
   // TODO: protect against concurrent shrink and snap create?
   // skip to remove if no copyup is required.
-  if (m_copyup_end <= m_delete_start || !has_snapshots) {
-    send_pre_remove();
+  if (copyup_end <= m_delete_start || !has_snapshots) {
+    send_remove_objects();
     return;
   }
 
-  m_copyup_start = m_delete_start;
-  m_delete_start = m_copyup_end;
-
-  {
-    RWLock::RLocker snap_locker(image_ctx.snap_lock);
-    if (image_ctx.object_map != nullptr) {
-      ldout(image_ctx.cct, 5) << this << " send_pre_copyup: "
-                              << " copyup_start=" << m_copyup_start
-                              << " copyup_end=" << m_copyup_end << dendl;
-      m_state = STATE_PRE_COPYUP;
-
-      assert(image_ctx.exclusive_lock->is_lock_owner());
+  uint64_t copyup_start = m_delete_start;
+  m_delete_start = copyup_end;
 
-      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
-      if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
-            CEPH_NOSNAP, m_copyup_start, m_copyup_end, OBJECT_PENDING,
-            OBJECT_EXISTS, {}, this)) {
-        return;
-      }
-    }
-  }
+  ldout(image_ctx.cct, 5) << this << " send_copyup_objects: "
+                          << " start object=" << copyup_start << ", "
+                          << " end object=" << copyup_end << dendl;
+  m_state = STATE_COPYUP_OBJECTS;
 
-  send_copyup_objects();
+  Context *ctx = this->create_callback_context();
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(),
+      boost::lambda::_1, &image_ctx, snapc, boost::lambda::_2));
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start,
+    copyup_end);
+  throttle->start_ops(image_ctx.concurrent_management_ops);
 }
 
 template <typename I>
-void TrimRequest<I>::send_pre_remove() {
+void TrimRequest<I>::send_remove_objects() {
   I &image_ctx = this->m_image_ctx;
   assert(image_ctx.owner_lock.is_locked());
-  if (m_delete_start >= m_num_objects) {
-    send_clean_boundary();
-    return;
-  }
-
-  {
-    RWLock::RLocker snap_locker(image_ctx.snap_lock);
-    if (image_ctx.object_map != nullptr) {
-      ldout(image_ctx.cct, 5) << this << " send_pre_remove: "
-                               << " delete_start=" << m_delete_start
-                               << " num_objects=" << m_num_objects << dendl;
-      m_state = STATE_PRE_REMOVE;
-
-      assert(image_ctx.exclusive_lock->is_lock_owner());
 
-      // flag the objects as pending deletion
-      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
-      if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
-            CEPH_NOSNAP, m_delete_start, m_num_objects, OBJECT_PENDING,
-            OBJECT_EXISTS, {}, this)) {
-        return;
-      }
-    }
-  }
+  ldout(image_ctx.cct, 5) << this << " send_remove_objects: "
+                           << " delete_start=" << m_delete_start
+                           << " num_objects=" << m_num_objects << dendl;
+  m_state = STATE_REMOVE_OBJECTS;
 
-  // no object map update required
-  send_remove_objects();
+  Context *ctx = this->create_callback_context();
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(),
+      boost::lambda::_1, &image_ctx, boost::lambda::_2));
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
+    m_num_objects);
+  throttle->start_ops(image_ctx.concurrent_management_ops);
 }
 
 template<typename I>
-void TrimRequest<I>::send_post_copyup() {
-  I &image_ctx = this->m_image_ctx;
-  assert(image_ctx.owner_lock.is_locked());
-
-  {
-    RWLock::RLocker snap_locker(image_ctx.snap_lock);
-    if (image_ctx.object_map != nullptr) {
-      ldout(image_ctx.cct, 5) << this << " send_post_copyup:"
-                              << " copyup_start=" << m_copyup_start
-                              << " copyup_end=" << m_copyup_end << dendl;
-      m_state = STATE_POST_COPYUP;
-
-      assert(image_ctx.exclusive_lock->is_lock_owner());
-
-      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
-      if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
-            CEPH_NOSNAP, m_copyup_start, m_copyup_end, OBJECT_NONEXISTENT,
-            OBJECT_PENDING, {}, this)) {
-        return;
-      }
-    }
-  }
-
-  send_pre_remove();
-}
-
-template <typename I>
-void TrimRequest<I>::send_post_remove() {
+void TrimRequest<I>::send_post_trim() {
   I &image_ctx = this->m_image_ctx;
   assert(image_ctx.owner_lock.is_locked());
 
   {
     RWLock::RLocker snap_locker(image_ctx.snap_lock);
     if (image_ctx.object_map != nullptr) {
-      ldout(image_ctx.cct, 5) << this << " send_post_remove: "
-                               << " delete_start=" << m_delete_start
-                               << " num_objects=" << m_num_objects << dendl;
-      m_state = STATE_POST_REMOVE;
+      ldout(image_ctx.cct, 5) << this << " send_post_trim:"
+                              << " delete_start_min=" << m_delete_start_min
+                              << " num_objects=" << m_num_objects << dendl;
+      m_state = STATE_POST_TRIM;
 
       assert(image_ctx.exclusive_lock->is_lock_owner());
 
-      // flag the pending objects as removed
       RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
       if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
-            CEPH_NOSNAP, m_delete_start, m_num_objects, OBJECT_NONEXISTENT,
+            CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_NONEXISTENT,
             OBJECT_PENDING, {}, this)) {
         return;
       }
     }
   }
 
-  // no object map update required
   send_clean_boundary();
 }
 
@@ -413,13 +338,15 @@ void TrimRequest<I>::send_clean_boundary() {
     ldout(cct, 20) << " ex " << *p << dendl;
     Context *req_comp = new C_ContextCompletion(*completion);
 
-    io::ObjectRequest<> *req;
+    io::ObjectRequest<I> *req;
     if (p->offset == 0) {
-      req = new io::ObjectTrimRequest(&image_ctx, p->oid.name, p->objectno,
-                                      snapc, true, req_comp);
+      req = io::ObjectRequest<I>::create_trim(&image_ctx, p->oid.name,
+                                              p->objectno, snapc, true,
+                                              req_comp);
     } else {
-      req = new io::ObjectTruncateRequest(&image_ctx, p->oid.name, p->objectno,
-                                          p->offset, snapc, {}, req_comp);
+      req = io::ObjectRequest<I>::create_truncate(&image_ctx, p->oid.name,
+                                                  p->objectno, p->offset, snapc,
+                                                  {}, req_comp);
     }
     req->send();
   }
index 5eb9fdffe3868b47f58a002b5c5177f49eda6be4..8526046c94b3e4d4a1ec431a384b2f825800aace 100644 (file)
@@ -24,6 +24,10 @@ public:
                            prog_ctx);
   }
 
+  TrimRequest(ImageCtxT &image_ctx, Context *on_finish,
+             uint64_t original_size, uint64_t new_size,
+             ProgressContext &prog_ctx);
+
   void send() override;
 
 protected:
@@ -33,75 +37,63 @@ protected:
    *
    * @verbatim
    *
-   *     <start> . . . . > STATE_FINISHED . . . . . . . . .
-   *      |    . . . . . . . . . . > . . . . . . . . .    .
-   *      |   /                                      .    .
-   * STATE_PRE_COPYUP ---> STATE_COPYUP_OBJECTS      .    .
-   *                                |                .    .
-   *        /-----------------------/                v    .
-   *        |                                        .    .
-   *        v                                        .    .
-   * STATE_POST_COPYUP. . . > .                      .    .
-   *      |    . . . . . . . . . . < . . . . . . . . .    .
-   *      |    |              .                           .
-   *      v    v              v                           .
-   * STATE_PRE_REMOVE ---> STATE_REMOVE_OBJECTS           .
-   *                                |   .   .             .
-   *        /-----------------------/   .   . . . . . .   .
-   *        |                           .             .   .
-   *        v                           v             v   v
-   * STATE_POST_REMOVE --> STATE_CLEAN_BOUNDARY ---> <finish>
-   *        .                                           ^
-   *        .                                           .
-   *        . . . . . . . . . . . . . . . . . . . . . . .
-   *
-   * @endverbatim
+   *     <start>  . . . . . . . . . . . . . . . . .
+   *        |                                     .
+   *        v (skip if not needed)                .
+   * STATE_PRE_TRIM                               .
+   *        |                                     .
+   *        v (skip if not needed)                .
+   * STATE_COPYUP_OBJECTS                         .
+   *        |                                     .
+   *        v (skip if not needed)                .
+   * STATE_REMOVE_OBJECTS                         .
+   *        |                                     .
+   *        v (skip if not needed)                .
+   * STATE_POST_TRIM                              .
+   *        |                                     .
+   *        v (skip if not needed)                .
+   * STATE_CLEAN_BOUNDARY                         .
+   *        |                                     .
+   *        v                                     .
+   * STATE_FINISHED < . . . . . . . . . . . . . . .
+   *        |
+   *        v
+   *    <finish>
    *
    * The _COPYUP_OBJECTS state is skipped if there is no parent overlap
    * within the new image size and the image does not have any snapshots.
-   * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map
+   * The _PRE_TRIM/_POST_TRIM states are skipped if the object map
    * isn't enabled. The _REMOVE_OBJECTS state is skipped if no whole objects
    * are removed.  The _CLEAN_BOUNDARY state is skipped if no boundary
    * objects are cleaned.  The state machine will immediately transition
    * to _FINISHED state if there are no bytes to trim.
-   */ 
+   */
 
   enum State {
-    STATE_PRE_COPYUP,
+    STATE_PRE_TRIM,
     STATE_COPYUP_OBJECTS,
-    STATE_POST_COPYUP,
-    STATE_PRE_REMOVE,
     STATE_REMOVE_OBJECTS,
-    STATE_POST_REMOVE,
+    STATE_POST_TRIM,
     STATE_CLEAN_BOUNDARY,
     STATE_FINISHED
   };
 
   bool should_complete(int r) override;
 
-  State m_state;
+  State m_state = STATE_PRE_TRIM;
 
 private:
   uint64_t m_delete_start;
+  uint64_t m_delete_start_min = 0;
   uint64_t m_num_objects;
   uint64_t m_delete_off;
   uint64_t m_new_size;
   ProgressContext &m_prog_ctx;
 
-  uint64_t m_copyup_start;
-  uint64_t m_copyup_end;
-
-  TrimRequest(ImageCtxT &image_ctx, Context *on_finish,
-             uint64_t original_size, uint64_t new_size,
-             ProgressContext &prog_ctx);
-
-  void send_pre_copyup();
+  void send_pre_trim();
   void send_copyup_objects();
-  void send_post_copyup();
-
-  void send_pre_remove();
   void send_remove_objects();
-  void send_post_remove();
+  void send_post_trim();
 
   void send_clean_boundary();
   void send_finish(int r);
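
The flattened machine documented above reduces to a strictly linear progression; which stages get skipped is decided by the send_* methods. A mirror of that order as a standalone enum:

    enum class TrimState {
      PRE_TRIM, COPYUP_OBJECTS, REMOVE_OBJECTS, POST_TRIM,
      CLEAN_BOUNDARY, FINISHED
    };

    static TrimState next_trim_state(TrimState s) {
      switch (s) {
      case TrimState::PRE_TRIM:       return TrimState::COPYUP_OBJECTS;
      case TrimState::COPYUP_OBJECTS: return TrimState::REMOVE_OBJECTS;
      case TrimState::REMOVE_OBJECTS: return TrimState::POST_TRIM;
      case TrimState::POST_TRIM:      return TrimState::CLEAN_BOUNDARY;
      default:                        return TrimState::FINISHED;
      }
    }
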
index 10ee8c242a0ba3ab0137d99c9e9df2f40ee34de1..e6bf3930e913281097b74993697f96173be6c10b 100644 (file)
 #define dout_prefix *_dout << "mds.beacon." << name << ' '
 
 
-class Beacon::C_MDS_BeaconSender : public Context {
-public:
-  explicit C_MDS_BeaconSender(Beacon *beacon_) : beacon(beacon_) {}
-  void finish(int r) override {
-    assert(beacon->lock.is_locked_by_me());
-    beacon->sender = NULL;
-    beacon->_send();
-  }
-private:
-  Beacon *beacon;
-};
-
 Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
   Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock),
   name(name_), standby_for_rank(MDS_RANK_NONE),
@@ -53,7 +41,6 @@ Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
   awaiting_seq(-1)
 {
   last_seq = 0;
-  sender = NULL;
   was_laggy = false;
 
   epoch = 0;
@@ -192,8 +179,13 @@ void Beacon::_send()
   if (sender) {
     timer.cancel_event(sender);
   }
-  sender = new C_MDS_BeaconSender(this);
-  timer.add_event_after(g_conf->mds_beacon_interval, sender);
+  sender = timer.add_event_after(
+    g_conf->mds_beacon_interval,
+    new FunctionContext([this](int) {
+       assert(lock.is_locked_by_me());
+       sender = nullptr;
+       _send();
+      }));
 
   if (!cct->get_heartbeat_map()->is_healthy()) {
    /* If anything isn't progressing, let's avoid sending a beacon so that
index 571f7f5599564859437732df165ef768bb42a520..201804def072f593bc3dafaac4ede6b7aff9e4bc 100644 (file)
@@ -102,8 +102,7 @@ private:
   MDSHealth health;
 
   // Ticker
-  class C_MDS_BeaconSender;
-  C_MDS_BeaconSender *sender;
+  Context *sender = nullptr;
 
   version_t awaiting_seq;
   Cond waiting_cond;
index 8d868d0786a76ce2cd1f017b5aa9b3c54a9f4894..f1b371678c084a3d6884d5fcc1d7a60b0fd42b9e 100644 (file)
@@ -208,7 +208,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
   static const int MASK_STATE_EXPORTED =
     (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
   static const int MASK_STATE_EXPORT_KEPT =
-    (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS);
+    (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|STATE_QUEUEDEXPORTPIN);
 
   // -- waiters --
   static const uint64_t WAIT_DIR         = (1<<0);
index 95e78b23425734faa591dd765eb74147f3c60e71..b224e11190d25c9e9edd17e8c36c1a95bfd7fb0a 100644 (file)
@@ -431,17 +431,17 @@ void FSMap::encode(bufferlist& bl, uint64_t features) const
 
 void FSMap::decode(bufferlist::iterator& p)
 {
-  // Because the mon used to store an MDSMap where we now
-  // store an FSMap, FSMap knows how to decode the legacy
-  // MDSMap format (it never needs to encode it though).
-  MDSMap legacy_mds_map;
-  
   // The highest MDSMap encoding version before we changed the
   // MDSMonitor to store an FSMap instead of an MDSMap was
   // 5, so anything older than 6 is decoded as an MDSMap,
   // and anything newer is decoded as an FSMap.
   DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
   if (struct_v < 6) {
+    // Because the mon used to store an MDSMap where we now
+    // store an FSMap, FSMap knows how to decode the legacy
+    // MDSMap format (it never needs to encode it though).
+    MDSMap legacy_mds_map;
+
     // Decoding an MDSMap (upgrade)
     ::decode(epoch, p);
     ::decode(legacy_mds_map.flags, p);
@@ -621,6 +621,12 @@ void FSMap::decode(bufferlist::iterator& p)
   DECODE_FINISH(p);
 }
 
+void FSMap::sanitize(std::function<bool(int64_t pool)> pool_exists)
+{
+  for (auto &fs : filesystems) {
+    fs.second->mds_map.sanitize(pool_exists);
+  }
+}
 
 void Filesystem::encode(bufferlist& bl, uint64_t features) const
 {
index ea102a712740c63045bd1573d59b7a2c6484b0d4..3bb97ee58aae94f737097844671a3e71cecd7286 100644 (file)
@@ -493,6 +493,7 @@ public:
     bufferlist::iterator p = bl.begin();
     decode(p);
   }
+  void sanitize(std::function<bool(int64_t pool)> pool_exists);
 
   void print(ostream& out) const;
   void print_summary(Formatter *f, ostream *out) const;
index a2510f989819d5a66f07eb7822f4dd0c58faf665..b40833fd1cabbee3ebf15906d4df332f8239e677 100644 (file)
@@ -1679,7 +1679,7 @@ void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
       mut->add_cow_inode(oldin);
       if (pcow_inode)
        *pcow_inode = oldin;
-      CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
+      CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, oldin->last);
       oldin->inode.version = olddn->pre_dirty();
       dout(10) << " olddn " << *olddn << dendl;
       bool need_snapflush = !oldin->client_snap_caps.empty();
@@ -6481,8 +6481,8 @@ void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap
       unexpirables.push_back(dn);
     } else {
       trimmed++;
+      if (count > 0) count--;
     }
-    count--;
   }
 
   for (auto &dn : unexpirables) {
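
The one-line trim_lru() fix above matters because `count` is a uint64_t: the old unconditional decrement also ran for entries that were not trimmed, so it could pass zero and wrap, effectively disabling the limit. The guarded form:

    #include <cstdint>

    static void safe_decrement(uint64_t &count) {
      if (count > 0)
        --count;  // never wraps to UINT64_MAX
    }
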
index 087c995a830df2ae31de81c35f682119c892a934..4c30b6747136bb4ecfd9355a630fe7e30d662aff 100644 (file)
 #undef dout_prefix
 #define dout_prefix *_dout << "mds." << name << ' '
 
-
-class MDSDaemon::C_MDS_Tick : public Context {
-  protected:
-    MDSDaemon *mds_daemon;
-public:
-  explicit C_MDS_Tick(MDSDaemon *m) : mds_daemon(m) {}
-  void finish(int r) override {
-    assert(mds_daemon->mds_lock.is_locked_by_me());
-    mds_daemon->tick();
-  }
-};
-
 // cons/des
 MDSDaemon::MDSDaemon(const std::string &n, Messenger *m, MonClient *mc) :
   Dispatcher(m->cct),
@@ -102,7 +90,6 @@ MDSDaemon::MDSDaemon(const std::string &n, Messenger *m, MonClient *mc) :
   mgrc(m->cct, m),
   log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
   mds_rank(NULL),
-  tick_event(0),
   asok_hook(NULL)
 {
   orig_argc = 0;
@@ -545,8 +532,12 @@ void MDSDaemon::reset_tick()
   if (tick_event) timer.cancel_event(tick_event);
 
   // schedule
-  tick_event = new C_MDS_Tick(this);
-  timer.add_event_after(g_conf->mds_tick_interval, tick_event);
+  tick_event = timer.add_event_after(
+    g_conf->mds_tick_interval,
+    new FunctionContext([this](int) {
+       assert(mds_lock.is_locked_by_me());
+       tick();
+      }));
 }
 
 void MDSDaemon::tick()
index 0c7a1a7378a3b89ffd86c943b57f3bad5132d335..0e3bbaf26398f9e7d6ff197e961fee3da8acbe88 100644 (file)
@@ -87,8 +87,7 @@ class MDSDaemon : public Dispatcher, public md_config_obs_t {
                                  const std::set <std::string> &changed) override;
  protected:
   // tick and other timer fun
-  class C_MDS_Tick;
-  C_MDS_Tick *tick_event;
+  Context *tick_event = nullptr;
   void     reset_tick();
 
   void wait_for_omap_osds();
index 1d38f19f498e663b266cd500070bc69a46388a45..9dfce950f5a8d46fa2d410b198122e0d965e11c5 100644 (file)
  * 
  */
 
+#include "common/debug.h"
+#include "mon/health_check.h"
 
 #include "MDSMap.h"
 
 #include <sstream>
 using std::stringstream;
 
-#include "mon/health_check.h"
-
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_
 
 // features
 CompatSet get_mdsmap_compat_set_all() {
@@ -635,6 +637,23 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
   ENCODE_FINISH(bl);
 }
 
+void MDSMap::sanitize(std::function<bool(int64_t pool)> pool_exists)
+{
+  /* Before we did stricter checking, it was possible to remove a data pool
+   * without also deleting it from the MDSMap. Check for that here after
+   * decoding the data pools.
+   */
+
+  for (auto it = data_pools.begin(); it != data_pools.end();) {
+    if (!pool_exists(*it)) {
+      dout(0) << "removed non-existant data pool " << *it << " from MDSMap" << dendl;
+      it = data_pools.erase(it);
+    } else {
+      it++;
+    }
+  }
+}
+
 void MDSMap::decode(bufferlist::iterator& p)
 {
   std::map<mds_rank_t,int32_t> inc;  // Legacy field, parse and drop
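
sanitize() above relies on the erase-while-iterating idiom; a standalone sketch of why it is safe: container erase() returns the next valid iterator, so the traversal survives the removal.

    #include <set>

    static void drop_missing_pools(std::set<long>& pools, bool (*exists)(long)) {
      for (auto it = pools.begin(); it != pools.end();) {
        if (!exists(*it))
          it = pools.erase(it);  // erase advances the iterator for us
        else
          ++it;
      }
    }
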
index 744e6423508f775242bde6d987b91060e57ccc64..454f422dde2288ee1ae4c70e1912f0ffab49bc03 100644 (file)
@@ -660,7 +660,7 @@ public:
     bufferlist::iterator p = bl.begin();
     decode(p);
   }
-
+  void sanitize(std::function<bool(int64_t pool)> pool_exists);
 
   void print(ostream& out) const;
   void print_summary(Formatter *f, ostream *out) const;
index 77f1819788fc5977338a2821ac6c34f04d3317d0..51ae378947f57f30fbb44557670b2ce98e36a775 100644 (file)
@@ -1055,6 +1055,14 @@ void MDSRank::boot_start(BootStep step, int r)
         dout(2) << "boot_start " << step << ": opening mds log" << dendl;
         mdlog->open(gather.new_sub());
 
+       if (is_starting()) {
+         dout(2) << "boot_start " << step << ": opening purge queue" << dendl;
+         purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
+       } else if (!standby_replaying) {
+         dout(2) << "boot_start " << step << ": opening purge queue (async)" << dendl;
+         purge_queue.open(NULL);
+       }
+
         if (mdsmap->get_tableserver() == whoami) {
           dout(2) << "boot_start " << step << ": opening snap table" << dendl;
           snapserver->set_rank(whoami);
@@ -1073,8 +1081,6 @@ void MDSRank::boot_start(BootStep step, int r)
 
         mdcache->open_mydir_inode(gather.new_sub());
 
-        purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
-
         if (is_starting() ||
             whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
           mdcache->open_root_inode(gather.new_sub());
@@ -1087,8 +1093,17 @@ void MDSRank::boot_start(BootStep step, int r)
       break;
     case MDS_BOOT_PREPARE_LOG:
       if (is_any_replay()) {
-        dout(2) << "boot_start " << step << ": replaying mds log" << dendl;
-        mdlog->replay(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+       dout(2) << "boot_start " << step << ": replaying mds log" << dendl;
+       MDSGatherBuilder gather(g_ceph_context,
+           new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+
+       if (!standby_replaying) {
+         dout(2) << "boot_start " << step << ": waiting for purge queue recovered" << dendl;
+         purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub()));
+       }
+
+       mdlog->replay(gather.new_sub());
+       gather.activate();
       } else {
         dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl;
         mdlog->append();
@@ -1214,7 +1229,16 @@ void MDSRank::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
   }
 }
 
-inline void MDSRank::standby_replay_restart()
+class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
+public:
+  explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
+  void finish(int r) override {
+    assert(!r);
+    mds->standby_replay_restart();
+  }
+};
+
+void MDSRank::standby_replay_restart()
 {
   if (standby_replaying) {
     /* Go around for another pass of replaying in standby */
@@ -1227,15 +1251,17 @@ inline void MDSRank::standby_replay_restart()
     /* We are transitioning out of standby: wait for OSD map update
        before making final pass */
     dout(1) << "standby_replay_restart (final takeover pass)" << dendl;
-    Context *fin = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
-    bool const ready =
-      objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
+    Context *fin = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this));
+    bool ready = objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
     if (ready) {
       delete fin;
       mdlog->get_journaler()->reread_head_and_probe(
         new C_MDS_StandbyReplayRestartFinish(
           this,
          mdlog->get_journaler()->get_read_pos()));
+
+      dout(1) << " opening purge queue (async)" << dendl;
+      purge_queue.open(NULL);
     } else {
       dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
               << " (which blacklists prior instance)" << dendl;
@@ -1243,15 +1269,6 @@ inline void MDSRank::standby_replay_restart()
   }
 }
 
-class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
-public:
-  explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
-  void finish(int r) override {
-    assert(!r);
-    mds->standby_replay_restart();
-  }
-};
-
 void MDSRank::replay_done()
 {
   dout(1) << "replay_done" << (standby_replaying ? " (as standby)" : "") << dendl;
index f520240da3dae16647c4a808b9d2bce720bfb68f..49e48b04cc6d12700a4ff97ab149177a8a2f5c2b 100644 (file)
@@ -79,7 +79,8 @@ PurgeQueue::PurgeQueue(
     max_purge_ops(0),
     drain_initial(0),
     draining(false),
-    delayed_flush(nullptr)
+    delayed_flush(nullptr),
+    recovered(false)
 {
   assert(cct != nullptr);
   assert(on_error != nullptr);
@@ -147,11 +148,14 @@ void PurgeQueue::open(Context *completion)
 
   Mutex::Locker l(lock);
 
-  journaler.recover(new FunctionContext([this, completion](int r){
+  if (completion)
+    waiting_for_recovery.push_back(completion);
+
+  journaler.recover(new FunctionContext([this](int r){
     if (r == -ENOENT) {
       dout(1) << "Purge Queue not found, assuming this is an upgrade and "
                  "creating it." << dendl;
-      create(completion);
+      create(NULL);
     } else if (r == 0) {
       Mutex::Locker l(lock);
       dout(4) << "open complete" << dendl;
@@ -162,12 +166,13 @@ void PurgeQueue::open(Context *completion)
       if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
        dout(4) << "recovering write_pos" << dendl;
        journaler.set_read_pos(journaler.last_committed.write_pos);
-       _recover(completion);
+       _recover();
        return;
       }
 
       journaler.set_writeable();
-      completion->complete(0);
+      recovered = true;
+      finish_contexts(g_ceph_context, waiting_for_recovery);
     } else {
       derr << "Error " << r << " loading Journaler" << dendl;
       on_error->complete(r);
@@ -175,8 +180,16 @@ void PurgeQueue::open(Context *completion)
   }));
 }
 
+void PurgeQueue::wait_for_recovery(Context* c)
+{
+  Mutex::Locker l(lock);
+  if (recovered)
+    c->complete(0);
+  else
+    waiting_for_recovery.push_back(c);
+}
 
-void PurgeQueue::_recover(Context *completion)
+void PurgeQueue::_recover()
 {
   assert(lock.is_locked_by_me());
 
@@ -185,9 +198,9 @@ void PurgeQueue::_recover(Context *completion)
     if (!journaler.is_readable() &&
        !journaler.get_error() &&
        journaler.get_read_pos() < journaler.get_write_pos()) {
-      journaler.wait_for_readable(new FunctionContext([this, completion](int r) {
+      journaler.wait_for_readable(new FunctionContext([this](int r) {
         Mutex::Locker l(lock);
-       _recover(completion);
+       _recover();
       }));
       return;
     }
@@ -204,7 +217,8 @@ void PurgeQueue::_recover(Context *completion)
       // restore original read_pos
       journaler.set_read_pos(journaler.last_committed.expire_pos);
       journaler.set_writeable();
-      completion->complete(0);
+      recovered = true;
+      finish_contexts(g_ceph_context, waiting_for_recovery);
       return;
     }
 
@@ -219,11 +233,18 @@ void PurgeQueue::create(Context *fin)
   dout(4) << "creating" << dendl;
   Mutex::Locker l(lock);
 
+  if (fin)
+    waiting_for_recovery.push_back(fin);
+
   file_layout_t layout = file_layout_t::get_default();
   layout.pool_id = metadata_pool;
   journaler.set_writeable();
   journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
-  journaler.write_head(fin);
+  journaler.write_head(new FunctionContext([this](int r) {
+    Mutex::Locker l(lock);
+    recovered = true;
+    finish_contexts(g_ceph_context, waiting_for_recovery);
+  }));
 }
 
 /**
index aed66c94ebcdda1f9ae34c25dd96fa2eea383e01..6a13a57ee30360f14d913d0abc8166718d95f99d 100644 (file)
@@ -113,7 +113,7 @@ protected:
   bool draining;
 
   // recover the journal write_pos (drop any partial written entry)
-  void _recover(Context *completion);
+  void _recover();
 
   /**
    * @return true if we were in a position to try and consume something:
@@ -130,6 +130,8 @@ protected:
   void _execute_item_complete(
       uint64_t expire_to);
 
+  bool recovered;
+  std::list<Context*> waiting_for_recovery;
 
 public:
   void init();
@@ -144,6 +146,8 @@ public:
   // Read the Journaler header for an existing queue and start consuming
   void open(Context *completion);
 
+  void wait_for_recovery(Context *c);
+
   // Submit one entry to the work queue.  Call back when it is persisted
   // to the queue (there is no callback for when it is executed)
   void push(const PurgeItem &pi, Context *completion);
index 3d34bd4c1caa0f45835b4af664da5215d5a5b4fc..38c44523e95c90b8858b4bf8b4c35e90d1e3976c 100644 (file)
@@ -341,6 +341,9 @@ void Server::handle_client_session(MClientSession *m)
        session->is_stale() ||
        session->is_killing()) {
       dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
+      // set client metadata for session opened by prepare_force_open_sessions
+      if (!m->client_meta.empty())
+       session->set_client_metadata(m->client_meta);
       m->put();
       return;
     }
@@ -618,9 +621,7 @@ void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
           << " initial v " << mds->sessionmap.get_version() << dendl;
   
 
-  int sessions_inserted = 0;
   for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
-    sessions_inserted++;
 
     Session *session = mds->sessionmap.get_session(p->second.name);
     assert(session);
@@ -1085,8 +1086,14 @@ void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
 void Server::recall_client_state(void)
 {
   /* try to recall at least 80% of all caps */
-  uint64_t max_caps_per_client = (Capability::count() * .8);
-  uint64_t min_caps_per_client = 100;
+  uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
+  uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
+  if (max_caps_per_client < min_caps_per_client) {
+    dout(0) << "max_caps_per_client " << max_caps_per_client
+            << " < min_caps_per_client " << min_caps_per_client << dendl;
+    max_caps_per_client = min_caps_per_client + 1;
+  }
+
   /* unless this ratio is smaller: */
   /* ratio: determine the amount of caps to recall from each client. Use
    * percentage full over the cache reservation. Cap the ratio at 80% of client
@@ -1109,14 +1116,12 @@ void Server::recall_client_state(void)
             << ", leases " << session->leases.size()
             << dendl;
 
-    if (session->caps.size() > min_caps_per_client) {  
-      uint64_t newlim = MIN((session->caps.size() * ratio), max_caps_per_client);
-      if (session->caps.size() > newlim) {
-          MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
-          m->head.max_caps = newlim;
-          mds->send_message_client(m, session);
-          session->notify_recall_sent(newlim);
-      }
+    uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
+    if (session->caps.size() > newlim) {
+      MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
+      m->head.max_caps = newlim;
+      mds->send_message_client(m, session);
+      session->notify_recall_sent(newlim);
     }
   }
 }
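
The rewrite drops the hard-coded floor of 100 caps and ceiling of 80% in favour of the mds_min_caps_per_client and mds_max_ratio_caps_per_client options (their defaults match the old constants), and every session's new limit is clamped between the two before a recall is considered. A sketch of the computation, with illustrative names:

    #include <algorithm>
    #include <cstdint>

    // Per-session recall limit as computed by the new loop above.
    uint64_t recall_limit(uint64_t session_caps, double ratio,
                          uint64_t total_caps,
                          double max_ratio,   // mds_max_ratio_caps_per_client
                          uint64_t min_caps)  // mds_min_caps_per_client
    {
      uint64_t max_caps = total_caps * max_ratio;
      if (max_caps < min_caps)
        max_caps = min_caps + 1;  // keep the bounds ordered, as above
      return std::max<uint64_t>(
          std::min<uint64_t>(session_caps * ratio, max_caps), min_caps);
    }

With the defaults (max_ratio 0.8, min_caps 100) and 10,000 caps outstanding: a session holding 5,000 caps with ratio 0.8 gets newlim = max(min(4000, 8000), 100) = 4,000 and is sent a recall, while a session holding 90 caps gets newlim = 100 and, since 90 <= 100, is not asked to drop anything. The floor that the old `if (session->caps.size() > min_caps_per_client)` guard provided now lives inside the clamp.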
index c877afc152438491cc6f4c150dfd0da6accf8fef..5def13c1491c34c27c80f934c1a0edf0752f730c 100644 (file)
@@ -23,7 +23,7 @@
 
 class MMgrBeacon : public PaxosServiceMessage {
 
-  static const int HEAD_VERSION = 5;
+  static const int HEAD_VERSION = 6;
   static const int COMPAT_VERSION = 1;
 
 protected:
@@ -35,6 +35,9 @@ protected:
   std::set<std::string> available_modules;
  map<string,string> metadata; ///< misc metadata about this mgr
 
+  // From active daemon to populate MgrMap::services
+  std::map<std::string, std::string> services;
+
   // Only populated during activation
   std::vector<MonCommand> command_descs;
 
@@ -65,6 +68,15 @@ public:
     return metadata;
   }
 
+  const std::map<std::string,std::string>& get_services() const {
+    return services;
+  }
+
+  void set_services(const std::map<std::string, std::string> &svcs)
+  {
+    services = svcs;
+  }
+
   void set_command_descs(const std::vector<MonCommand> &cmds)
   {
     command_descs = cmds;
@@ -98,6 +110,7 @@ public:
     ::encode(available_modules, payload);
     ::encode(command_descs, payload);
     ::encode(metadata, payload);
+    ::encode(services, payload);
   }
   void decode_payload() override {
     bufferlist::iterator p = payload.begin();
@@ -118,6 +131,9 @@ public:
     if (header.version >= 5) {
       ::decode(metadata, p);
     }
+    if (header.version >= 6) {
+      ::decode(services, p);
+    }
   }
 };
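
Bumping HEAD_VERSION while leaving COMPAT_VERSION at 1 is the standard wire-compat move, and the same pattern recurs below for MMgrConfigure's stats_threshold and MOSDPGTemp's forced flag: the new field is always appended on encode, and decode reads it only when header.version says the sender is new enough, leaving the default in place otherwise. A self-contained sketch of the scheme (Beacon and its uint32_t "wire" format are illustrative, not the real ::encode machinery):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Beacon {
      static const int HEAD_VERSION = 2;  // bumped when 'extra' was added
      uint32_t period = 0;                // v1 field
      uint32_t extra = 0;                 // appended in v2

      void encode(std::vector<uint32_t> &p) const {
        p.push_back(period);
        p.push_back(extra);               // new encoders always append it
      }
      void decode(const std::vector<uint32_t> &p, int sender_version) {
        period = p.at(0);
        if (sender_version >= 2)          // old senders never wrote it
          extra = p.at(1);                // otherwise keep the default
      }
    };

    int main() {
      Beacon v1_peer;
      v1_peer.decode({7}, /* sender_version = */ 1);
      assert(v1_peer.extra == 0);         // default survives an old sender

      std::vector<uint32_t> wire;
      Beacon b; b.period = 7; b.extra = 3;
      b.encode(wire);
      Beacon v2_peer;
      v2_peer.decode(wire, Beacon::HEAD_VERSION);
      assert(v2_peer.extra == 3);
      return 0;
    }

COMPAT_VERSION stays at 1 because an old decoder can still parse the leading fields of a new message and simply never looks at the appended tail.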
 
index 27c7cbca4913eb18f56754242bb90bde37755549..10d990807cba70e2ff85987264af5bb49312d2bd 100644 (file)
  */
 class MMgrConfigure : public Message
 {
-  static const int HEAD_VERSION = 1;
+  static const int HEAD_VERSION = 2;
   static const int COMPAT_VERSION = 1;
 
 public:
   uint32_t stats_period;
+  
+  // A default of 0 means unspecified: include all stats
+  uint32_t stats_threshold = 0;
 
   void decode_payload() override
   {
     bufferlist::iterator p = payload.begin();
     ::decode(stats_period, p);
+    if (header.version >= 2) {
+      ::decode(stats_threshold, p);
+    }
   }
 
   void encode_payload(uint64_t features) override {
     ::encode(stats_period, payload);
+    ::encode(stats_threshold, payload);
   }
 
   const char *get_type_name() const override { return "mgrconfigure"; }
   void print(ostream& out) const override {
-    out << get_type_name() << "()";
+    out << get_type_name() << "(period=" << stats_period
+                           << ", threshold=" << stats_threshold << ")";
   }
 
   MMgrConfigure()
index 9b033ec23c23645bf7bf474635a99fa6e93da662..26268927b1a5c99d44e6d02b3170039978dea5f7 100644 (file)
@@ -29,27 +29,36 @@ public:
   std::string nick;
   enum perfcounter_type_d type;
 
+  // For older clients that did not send priority, pretend everything
+  // is "useful" so that mgr plugins filtering on prio will get some
+  // data (albeit probably more than they wanted)
+  uint8_t priority = PerfCountersBuilder::PRIO_USEFUL;
+
   void encode(bufferlist &bl) const
   {
     // TODO: decide whether to drop the per-type
     // encoding here; we could rely on the MgrReport
     // versioning instead.
-    ENCODE_START(1, 1, bl);
+    ENCODE_START(2, 1, bl);
     ::encode(path, bl);
     ::encode(description, bl);
     ::encode(nick, bl);
     static_assert(sizeof(type) == 1, "perfcounter_type_d must be one byte");
     ::encode((uint8_t)type, bl);
+    ::encode(priority, bl);
     ENCODE_FINISH(bl);
   }
   
   void decode(bufferlist::iterator &p)
   {
-    DECODE_START(1, p);
+    DECODE_START(2, p);
     ::decode(path, p);
     ::decode(description, p);
     ::decode(nick, p);
     ::decode((uint8_t&)type, p);
+    if (struct_v >= 2) {
+      ::decode(priority, p);
+    }
     DECODE_FINISH(p);
   }
 };
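
Combined with MMgrConfigure's new stats_threshold above, the priority field lets the mgr tell each daemon how much perf-counter traffic it wants: a reporting daemon only ships counters whose priority clears the advertised threshold, and counters from pre-v2 peers decode as PRIO_USEFUL so existing consumers keep seeing data. A sketch of the sender-side selection this enables (CounterDesc and select_for_report are illustrative; the real filtering lives in the daemons' reporting path):

    #include <cstdint>
    #include <string>
    #include <vector>

    struct CounterDesc {
      std::string path;
      uint8_t priority;  // PRIO_*; absent on the wire implies PRIO_USEFUL
    };

    // Threshold 0 (the MMgrConfigure default) selects everything.
    std::vector<CounterDesc> select_for_report(
        const std::vector<CounterDesc> &all, uint32_t stats_threshold)
    {
      std::vector<CounterDesc> out;
      for (const auto &c : all)
        if (c.priority >= stats_threshold)
          out.push_back(c);
      return out;
    }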
index 72246288a51493ca59db1636abd13f9fad3316aa..865642cf417c30f487fb3a698946a20e86f8075f 100644 (file)
@@ -86,7 +86,8 @@ public:
        (features & CEPH_FEATURE_PGPOOL3) == 0 ||
        (features & CEPH_FEATURE_OSDENC) == 0 ||
         (features & CEPH_FEATURE_OSDMAP_ENC) == 0 ||
-       (features & CEPH_FEATURE_MSG_ADDR2) == 0) {
+       (features & CEPH_FEATURE_MSG_ADDR2) == 0 ||
+       !HAVE_FEATURE(features, SERVER_LUMINOUS)) {
       if ((features & CEPH_FEATURE_PGID64) == 0 ||
          (features & CEPH_FEATURE_PGPOOL3) == 0)
        header.version = 1;  // old old_client version
@@ -112,6 +113,14 @@ public:
          inc.fullmap.clear();
          m.encode(inc.fullmap, features | CEPH_FEATURE_RESERVED);
        }
+       if (inc.crush.length()) {
+         // embedded crush map
+         CrushWrapper c;
+         auto p = inc.crush.begin();
+         c.decode(p);
+         inc.crush.clear();
+         c.encode(inc.crush, features);
+       }
        inc.encode(p->second, features | CEPH_FEATURE_RESERVED);
       }
       for (map<epoch_t,bufferlist>::iterator p = maps.begin();
index 3ea7a211c7f4b971d00e5e246a779b94e17778e1..1cdaed9b2c2ae826296f0ae83d1a87ecbb2eaad8 100644 (file)
@@ -23,9 +23,15 @@ class MOSDPGTemp : public PaxosServiceMessage {
  public:
   epoch_t map_epoch = 0;
   map<pg_t, vector<int32_t> > pg_temp;
+  bool forced = false;
 
-  MOSDPGTemp(epoch_t e) : PaxosServiceMessage(MSG_OSD_PGTEMP, e), map_epoch(e) { }
-  MOSDPGTemp() : PaxosServiceMessage(MSG_OSD_PGTEMP, 0) {}
+  MOSDPGTemp(epoch_t e)
+    : PaxosServiceMessage(MSG_OSD_PGTEMP, e, HEAD_VERSION, COMPAT_VERSION),
+      map_epoch(e)
+  {}
+  MOSDPGTemp()
+    : MOSDPGTemp(0)
+  {}
 private:
   ~MOSDPGTemp() override {}
 
@@ -34,19 +40,25 @@ public:
     paxos_encode();
     ::encode(map_epoch, payload);
     ::encode(pg_temp, payload);
+    ::encode(forced, payload);
   }
   void decode_payload() override {
     bufferlist::iterator p = payload.begin();
     paxos_decode(p);
     ::decode(map_epoch, p);
     ::decode(pg_temp, p);
+    if (header.version >= 2) {
+      ::decode(forced, p);
+    }
   }
 
   const char *get_type_name() const override { return "osd_pgtemp"; }
   void print(ostream &out) const override {
     out << "osd_pgtemp(e" << map_epoch << " " << pg_temp << " v" << version << ")";
   }
-  
+private:
+  static constexpr int HEAD_VERSION = 2;
+  static constexpr int COMPAT_VERSION = 1;
 };
 
 #endif
diff --git a/ceph/src/mgr/ActivePyModule.cc b/ceph/src/mgr/ActivePyModule.cc
new file mode 100644 (file)
index 0000000..90040af
--- /dev/null
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "BaseMgrModule.h"
+
+#include "PyFormatter.h"
+
+#include "common/debug.h"
+
+#include "ActivePyModule.h"
+
+//XXX courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text
+#include <boost/python.hpp>
+#include "include/assert.h"  // boost clobbers this
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+// decode a Python exception into a string
+std::string handle_pyerror()
+{
+    using namespace boost::python;
+    using namespace boost;
+
+    PyObject *exc, *val, *tb;
+    object formatted_list, formatted;
+    PyErr_Fetch(&exc, &val, &tb);
+    handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb));
+    object traceback(import("traceback"));
+    if (!tb) {
+        object format_exception_only(traceback.attr("format_exception_only"));
+        formatted_list = format_exception_only(hexc, hval);
+    } else {
+        object format_exception(traceback.attr("format_exception"));
+        formatted_list = format_exception(hexc,hval, htb);
+    }
+    formatted = str("").join(formatted_list);
+    return extract<std::string>(formatted);
+}
+
+int ActivePyModule::load(ActivePyModules *py_modules)
+{
+  assert(py_modules);
+  Gil gil(pMyThreadState, true);
+
+  // We tell the module the name we use for it, so that its logging
+  // etc. can be consistent with ours.
+  auto pThisPtr = PyCapsule_New(this, nullptr, nullptr);
+  auto pPyModules = PyCapsule_New(py_modules, nullptr, nullptr);
+  auto pModuleName = PyString_FromString(module_name.c_str());
+  auto pArgs = PyTuple_Pack(3, pModuleName, pPyModules, pThisPtr);
+
+  pClassInstance = PyObject_CallObject(pClass, pArgs);
+  Py_DECREF(pClass);
+  Py_DECREF(pModuleName);
+  Py_DECREF(pArgs);
+  if (pClassInstance == nullptr) {
+    derr << "Failed to construct class in '" << module_name << "'" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  } else {
+    dout(1) << "Constructed class from module: " << module_name << dendl;
+  }
+
+  return load_commands();
+}
+
+void ActivePyModule::notify(const std::string &notify_type, const std::string &notify_id)
+{
+  assert(pClassInstance != nullptr);
+
+  Gil gil(pMyThreadState, true);
+
+  // Execute
+  auto pValue = PyObject_CallMethod(pClassInstance,
+       const_cast<char*>("notify"), const_cast<char*>("(ss)"),
+       notify_type.c_str(), notify_id.c_str());
+
+  if (pValue != NULL) {
+    Py_DECREF(pValue);
+  } else {
+    derr << module_name << ".notify:" << dendl;
+    derr << handle_pyerror() << dendl;
+    // FIXME: callers can't be expected to handle a python module
+    // that has spontaneously broken, but Mgr() should provide
+    // a hook to unload misbehaving modules when they have an
+    // error somewhere like this
+  }
+}
+
+void ActivePyModule::notify_clog(const LogEntry &log_entry)
+{
+  assert(pClassInstance != nullptr);
+
+  Gil gil(pMyThreadState, true);
+
+  // Construct python-ized LogEntry
+  PyFormatter f;
+  log_entry.dump(&f);
+  auto py_log_entry = f.get();
+
+  // Execute
+  auto pValue = PyObject_CallMethod(pClassInstance,
+       const_cast<char*>("notify"), const_cast<char*>("(sN)"),
+       "clog", py_log_entry);
+
+  if (pValue != NULL) {
+    Py_DECREF(pValue);
+  } else {
+    derr << module_name << ".notify_clog:" << dendl;
+    derr << handle_pyerror() << dendl;
+    // FIXME: callers can't be expected to handle a python module
+    // that has spontaneously broken, but Mgr() should provide
+    // a hook to unload misbehaving modules when they have an
+    // error somewhere like this
+  }
+}
+
+int ActivePyModule::load_commands()
+{
+  // Don't need a Gil here -- this is called from ActivePyModule::load(),
+  // which already has one.
+  PyObject *command_list = PyObject_GetAttrString(pClassInstance, "COMMANDS");
+  if (command_list == nullptr) {
+    // Even modules that don't define commands should still have the COMMANDS
+    // from the MgrModule definition.  Something is wrong!
+    derr << "Module " << get_name() << " has missing COMMANDS member" << dendl;
+    return -EINVAL;
+  }
+  if (!PyObject_TypeCheck(command_list, &PyList_Type)) {
+    // Relatively easy mistake for a human to make, e.g. defining COMMANDS
+    // as a {} instead of a []
+    derr << "Module " << get_name() << " has COMMANDS member of wrong type ("
+            "should be a list)" << dendl;
+    return -EINVAL;
+  }
+  const size_t list_size = PyList_Size(command_list);
+  for (size_t i = 0; i < list_size; ++i) {
+    PyObject *command = PyList_GetItem(command_list, i);
+    assert(command != nullptr);
+
+    ModuleCommand item;
+
+    PyObject *pCmd = PyDict_GetItemString(command, "cmd");
+    assert(pCmd != nullptr);
+    item.cmdstring = PyString_AsString(pCmd);
+
+    dout(20) << "loaded command " << item.cmdstring << dendl;
+
+    PyObject *pDesc = PyDict_GetItemString(command, "desc");
+    assert(pDesc != nullptr);
+    item.helpstring = PyString_AsString(pDesc);
+
+    PyObject *pPerm = PyDict_GetItemString(command, "perm");
+    assert(pPerm != nullptr);
+    item.perm = PyString_AsString(pPerm);
+
+    item.handler = this;
+
+    commands.push_back(item);
+  }
+  Py_DECREF(command_list);
+
+  dout(10) << "loaded " << commands.size() << " commands" << dendl;
+
+  return 0;
+}
+
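
load_commands() above expects each COMMANDS element to be a dict carrying at least "cmd", "desc" and "perm". A sketch of a well-formed entry, built with the same CPython calls used to read it back (dict_set, make_command_desc and the command string itself are illustrative):

    #include "Python.h"

    static void dict_set(PyObject *d, const char *k, const char *v)
    {
      PyObject *s = PyString_FromString(v);
      PyDict_SetItemString(d, k, s);  // does not steal the reference
      Py_DECREF(s);
    }

    // One COMMANDS entry of the shape load_commands() iterates over.
    PyObject *make_command_desc()
    {
      PyObject *d = PyDict_New();
      dict_set(d, "cmd",  "example self-test name=arg,type=CephString");
      dict_set(d, "desc", "Run a self test");
      dict_set(d, "perm", "r");
      return d;
    }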
+int ActivePyModule::handle_command(
+  const cmdmap_t &cmdmap,
+  std::stringstream *ds,
+  std::stringstream *ss)
+{
+  assert(ss != nullptr);
+  assert(ds != nullptr);
+
+  Gil gil(pMyThreadState, true);
+
+  PyFormatter f;
+  cmdmap_dump(cmdmap, &f);
+  PyObject *py_cmd = f.get();
+
+  auto pResult = PyObject_CallMethod(pClassInstance,
+      const_cast<char*>("handle_command"), const_cast<char*>("(O)"), py_cmd);
+
+  Py_DECREF(py_cmd);
+
+  int r = 0;
+  if (pResult != NULL) {
+    if (PyTuple_Size(pResult) != 3) {
+      r = -EINVAL;
+    } else {
+      r = PyInt_AsLong(PyTuple_GetItem(pResult, 0));
+      *ds << PyString_AsString(PyTuple_GetItem(pResult, 1));
+      *ss << PyString_AsString(PyTuple_GetItem(pResult, 2));
+    }
+
+    Py_DECREF(pResult);
+  } else {
+    *ds << "";
+    *ss << handle_pyerror();
+    r = -EINVAL;
+  }
+
+  return r;
+}
+
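
handle_command() above unpacks the module's return value as a 3-tuple of (return code, output data, status string), and falls back to -EINVAL on anything else. A sketch of building a conforming result with the CPython API (make_result is illustrative):

    #include "Python.h"

    // (r, out, status) in the shape handle_command() unpacks above.
    PyObject *make_result(long r, const char *out, const char *status)
    {
      PyObject *t = PyTuple_New(3);
      PyTuple_SetItem(t, 0, PyInt_FromLong(r));            // -> return code
      PyTuple_SetItem(t, 1, PyString_FromString(out));     // -> *ds
      PyTuple_SetItem(t, 2, PyString_FromString(status));  // -> *ss
      return t;  // PyTuple_SetItem steals references: nothing to DECREF
    }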
+void ActivePyModule::get_health_checks(health_check_map_t *checks)
+{
+  checks->merge(health_checks);
+}
+
diff --git a/ceph/src/mgr/ActivePyModule.h b/ceph/src/mgr/ActivePyModule.h
new file mode 100644 (file)
index 0000000..0c2ee12
--- /dev/null
@@ -0,0 +1,98 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#pragma once
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "Python.h"
+
+#include "common/cmdparse.h"
+#include "common/LogEntry.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "mon/health_check.h"
+#include "mgr/Gil.h"
+
+#include "PyModuleRunner.h"
+
+#include <vector>
+#include <string>
+
+
+class ActivePyModule;
+class ActivePyModules;
+
+/**
+ * A Ceph CLI command description provided from a Python module
+ */
+class ModuleCommand {
+public:
+  std::string cmdstring;
+  std::string helpstring;
+  std::string perm;
+  ActivePyModule *handler;
+};
+
+class ActivePyModule : public PyModuleRunner
+{
+private:
+  health_check_map_t health_checks;
+
+  std::vector<ModuleCommand> commands;
+
+  int load_commands();
+
+  // Optional: URI exposed by plugins that implement serve()
+  std::string uri;
+
+public:
+  ActivePyModule(const std::string &module_name_,
+      PyObject *pClass_,
+      const SafeThreadState &my_ts_)
+    : PyModuleRunner(module_name_, pClass_, my_ts_)
+  {}
+
+  int load(ActivePyModules *py_modules);
+  void notify(const std::string &notify_type, const std::string &notify_id);
+  void notify_clog(const LogEntry &le);
+
+  const std::vector<ModuleCommand> &get_commands() const
+  {
+    return commands;
+  }
+
+  int handle_command(
+    const cmdmap_t &cmdmap,
+    std::stringstream *ds,
+    std::stringstream *ss);
+
+  void set_health_checks(health_check_map_t&& c) {
+    health_checks = std::move(c);
+  }
+  void get_health_checks(health_check_map_t *checks);
+
+  void set_uri(const std::string &str)
+  {
+    uri = str;
+  }
+
+  std::string get_uri() const
+  {
+    return uri;
+  }
+};
+
+std::string handle_pyerror();
+
diff --git a/ceph/src/mgr/ActivePyModules.cc b/ceph/src/mgr/ActivePyModules.cc
new file mode 100644 (file)
index 0000000..6025e62
--- /dev/null
@@ -0,0 +1,737 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+// Include this first to get python headers earlier
+#include "BaseMgrModule.h"
+#include "Gil.h"
+
+#include "common/errno.h"
+#include "include/stringify.h"
+
+#include "PyFormatter.h"
+
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+
+#include "mgr/MgrContext.h"
+
+// For ::config_prefix
+#include "PyModuleRegistry.h"
+
+#include "ActivePyModules.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+
+ActivePyModules::ActivePyModules(PyModuleConfig const &config_,
+          DaemonStateIndex &ds, ClusterState &cs,
+         MonClient &mc, LogChannelRef clog_, Objecter &objecter_,
+          Client &client_, Finisher &f)
+  : config_cache(config_), daemon_state(ds), cluster_state(cs),
+    monc(mc), clog(clog_), objecter(objecter_), client(client_), finisher(f),
+    lock("ActivePyModules")
+{}
+
+ActivePyModules::~ActivePyModules() = default;
+
+void ActivePyModules::dump_server(const std::string &hostname,
+                      const DaemonStateCollection &dmc,
+                      Formatter *f)
+{
+  f->dump_string("hostname", hostname);
+  f->open_array_section("services");
+  std::string ceph_version;
+
+  for (const auto &i : dmc) {
+    Mutex::Locker l(i.second->lock);
+    const auto &key = i.first;
+    const std::string &str_type = key.first;
+    const std::string &svc_name = key.second;
+
+    // TODO: pick the highest version, and make sure that
+    // somewhere else (during health reporting?) we are
+    // indicating to the user if we see mixed versions
+    auto ver_iter = i.second->metadata.find("ceph_version");
+    if (ver_iter != i.second->metadata.end()) {
+      ceph_version = i.second->metadata.at("ceph_version");
+    }
+
+    f->open_object_section("service");
+    f->dump_string("type", str_type);
+    f->dump_string("id", svc_name);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->dump_string("ceph_version", ceph_version);
+}
+
+
+
+PyObject *ActivePyModules::get_server_python(const std::string &hostname)
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+  dout(10) << " (" << hostname << ")" << dendl;
+
+  auto dmc = daemon_state.get_by_server(hostname);
+
+  PyFormatter f;
+  dump_server(hostname, dmc, &f);
+  return f.get();
+}
+
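
The PyEval_SaveThread() / Mutex::Locker / PyEval_RestoreThread() triplet that opens this and most of the methods below is a lock-ordering rule, not boilerplate: the GIL is released before ActivePyModules::lock is taken, so a Python thread holding the GIL can never deadlock against a C++ thread that holds the lock while calling back into Python. A minimal sketch of the ordering with standard types (query_state and both mutexes are illustrative stand-ins):

    #include <mutex>

    // Precondition: the caller holds 'gil', as any CPython entry point does.
    void query_state(std::mutex &gil, std::mutex &state_lock)
    {
      gil.unlock();                                // PyEval_SaveThread()
      std::lock_guard<std::mutex> l(state_lock);   // Mutex::Locker l(lock)
      gil.lock();                                  // PyEval_RestoreThread()
      // ... safe to touch Python objects and shared C++ state; the GIL
      // was never held while waiting for state_lock ...
    }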
+
+PyObject *ActivePyModules::list_servers_python()
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+  dout(10) << " >" << dendl;
+
+  PyFormatter f(false, true);
+  daemon_state.with_daemons_by_server([this, &f]
+      (const std::map<std::string, DaemonStateCollection> &all) {
+    for (const auto &i : all) {
+      const auto &hostname = i.first;
+
+      f.open_object_section("server");
+      dump_server(hostname, i.second, &f);
+      f.close_section();
+    }
+  });
+
+  return f.get();
+}
+
+PyObject *ActivePyModules::get_metadata_python(
+  const std::string &svc_type,
+  const std::string &svc_id)
+{
+  auto metadata = daemon_state.get(DaemonKey(svc_type, svc_id));
+  if (metadata == nullptr) {
+    derr << "Requested missing service " << svc_type << "." << svc_id << dendl;
+    Py_RETURN_NONE;
+  }
+
+  Mutex::Locker l(metadata->lock);
+  PyFormatter f;
+  f.dump_string("hostname", metadata->hostname);
+  for (const auto &i : metadata->metadata) {
+    f.dump_string(i.first.c_str(), i.second);
+  }
+
+  return f.get();
+}
+
+PyObject *ActivePyModules::get_daemon_status_python(
+  const std::string &svc_type,
+  const std::string &svc_id)
+{
+  auto metadata = daemon_state.get(DaemonKey(svc_type, svc_id));
+  if (metadata == nullptr) {
+    derr << "Requested missing service " << svc_type << "." << svc_id << dendl;
+    Py_RETURN_NONE;
+  }
+
+  Mutex::Locker l(metadata->lock);
+  PyFormatter f;
+  for (const auto &i : metadata->service_status) {
+    f.dump_string(i.first.c_str(), i.second);
+  }
+  return f.get();
+}
+
+PyObject *ActivePyModules::get_python(const std::string &what)
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  if (what == "fs_map") {
+    PyFormatter f;
+    cluster_state.with_fsmap([&f](const FSMap &fsmap) {
+      fsmap.dump(&f);
+    });
+    return f.get();
+  } else if (what == "osdmap_crush_map_text") {
+    bufferlist rdata;
+    cluster_state.with_osdmap([&rdata](const OSDMap &osd_map){
+       osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    });
+    std::string crush_text = rdata.to_str();
+    return PyString_FromString(crush_text.c_str());
+  } else if (what.substr(0, 7) == "osd_map") {
+    PyFormatter f;
+    cluster_state.with_osdmap([&f, &what](const OSDMap &osd_map){
+      if (what == "osd_map") {
+        osd_map.dump(&f);
+      } else if (what == "osd_map_tree") {
+        osd_map.print_tree(&f, nullptr);
+      } else if (what == "osd_map_crush") {
+        osd_map.crush->dump(&f);
+      }
+    });
+    return f.get();
+  } else if (what == "config") {
+    PyFormatter f;
+    g_conf->show_config(&f);
+    return f.get();
+  } else if (what == "mon_map") {
+    PyFormatter f;
+    cluster_state.with_monmap(
+      [&f](const MonMap &monmap) {
+        monmap.dump(&f);
+      }
+    );
+    return f.get();
+  } else if (what == "service_map") {
+    PyFormatter f;
+    cluster_state.with_servicemap(
+      [&f](const ServiceMap &service_map) {
+        service_map.dump(&f);
+      }
+    );
+    return f.get();
+  } else if (what == "osd_metadata") {
+    PyFormatter f;
+    auto dmc = daemon_state.get_by_service("osd");
+    for (const auto &i : dmc) {
+      Mutex::Locker l(i.second->lock);
+      f.open_object_section(i.first.second.c_str());
+      f.dump_string("hostname", i.second->hostname);
+      for (const auto &j : i.second->metadata) {
+        f.dump_string(j.first.c_str(), j.second);
+      }
+      f.close_section();
+    }
+    return f.get();
+  } else if (what == "pg_summary") {
+    PyFormatter f;
+    cluster_state.with_pgmap(
+        [&f](const PGMap &pg_map) {
+          std::map<std::string, std::map<std::string, uint32_t> > osds;
+          std::map<std::string, std::map<std::string, uint32_t> > pools;
+          std::map<std::string, uint32_t> all;
+          for (const auto &i : pg_map.pg_stat) {
+            const auto pool = i.first.m_pool;
+            const std::string state = pg_state_string(i.second.state);
+            // Insert to per-pool map
+            pools[stringify(pool)][state]++;
+            for (const auto &osd_id : i.second.acting) {
+              osds[stringify(osd_id)][state]++;
+            }
+            all[state]++;
+          }
+          f.open_object_section("by_osd");
+          for (const auto &i : osds) {
+            f.open_object_section(i.first.c_str());
+            for (const auto &j : i.second) {
+              f.dump_int(j.first.c_str(), j.second);
+            }
+            f.close_section();
+          }
+          f.close_section();
+          f.open_object_section("by_pool");
+          for (const auto &i : pools) {
+            f.open_object_section(i.first.c_str());
+            for (const auto &j : i.second) {
+              f.dump_int(j.first.c_str(), j.second);
+            }
+            f.close_section();
+          }
+          f.close_section();
+          f.open_object_section("all");
+          for (const auto &i : all) {
+            f.dump_int(i.first.c_str(), i.second);
+          }
+          f.close_section();
+        }
+    );
+    return f.get();
+  } else if (what == "pg_status") {
+    PyFormatter f;
+    cluster_state.with_pgmap(
+        [&f](const PGMap &pg_map) {
+         pg_map.print_summary(&f, nullptr);
+        }
+    );
+    return f.get();
+  } else if (what == "pg_dump") {
+    PyFormatter f;
+        cluster_state.with_pgmap(
+        [&f](const PGMap &pg_map) {
+         pg_map.dump(&f);
+        }
+    );
+    return f.get();
+  } else if (what == "df") {
+    PyFormatter f;
+
+    cluster_state.with_osdmap([this, &f](const OSDMap &osd_map){
+      cluster_state.with_pgmap(
+          [&osd_map, &f](const PGMap &pg_map) {
+        pg_map.dump_fs_stats(nullptr, &f, true);
+        pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true);
+      });
+    });
+    return f.get();
+  } else if (what == "osd_stats") {
+    PyFormatter f;
+    cluster_state.with_pgmap(
+        [&f](const PGMap &pg_map) {
+      pg_map.dump_osd_stats(&f);
+    });
+    return f.get();
+  } else if (what == "health" || what == "mon_status") {
+    PyFormatter f;
+    bufferlist json;
+    if (what == "health") {
+      json = cluster_state.get_health();
+    } else if (what == "mon_status") {
+      json = cluster_state.get_mon_status();
+    } else {
+      assert(false);
+    }
+    f.dump_string("json", json.to_str());
+    return f.get();
+  } else if (what == "mgr_map") {
+    PyFormatter f;
+    cluster_state.with_mgrmap([&f](const MgrMap &mgr_map) {
+      mgr_map.dump(&f);
+    });
+    return f.get();
+  } else {
+    derr << "Python module requested unknown data '" << what << "'" << dendl;
+    Py_RETURN_NONE;
+  }
+}
+
+int ActivePyModules::start_one(std::string const &module_name,
+    PyObject *pClass, const SafeThreadState &pMyThreadState)
+{
+  Mutex::Locker l(lock);
+
+  assert(modules.count(module_name) == 0);
+
+  modules[module_name].reset(new ActivePyModule(
+      module_name, pClass,
+      pMyThreadState));
+
+  int r = modules[module_name]->load(this);
+  if (r != 0) {
+    return r;
+  } else {
+    dout(4) << "Starting thread for " << module_name << dendl;
+    // We give Thread the module's module_name member as its char*
+    // thread name, so the thread must not outlive the module instance.
+    modules[module_name]->thread.create(
+        modules[module_name]->get_name().c_str());
+
+    return 0;
+  }
+}
+
+void ActivePyModules::shutdown()
+{
+  Mutex::Locker locker(lock);
+
+  // Signal modules to drop out of serve() and/or tear down resources
+  for (auto &i : modules) {
+    auto module = i.second.get();
+    const auto& name = i.first;
+
+    lock.Unlock();
+    dout(10) << "calling module " << name << " shutdown()" << dendl;
+    module->shutdown();
+    dout(10) << "module " << name << " shutdown() returned" << dendl;
+    lock.Lock();
+  }
+
+  // For modules implementing serve(), finish the threads where we
+  // were running that.
+  for (auto &i : modules) {
+    lock.Unlock();
+    dout(10) << "joining module " << i.first << dendl;
+    i.second->thread.join();
+    dout(10) << "joined module " << i.first << dendl;
+    lock.Lock();
+  }
+
+  modules.clear();
+}
+
+void ActivePyModules::notify_all(const std::string &notify_type,
+                     const std::string &notify_id)
+{
+  Mutex::Locker l(lock);
+
+  dout(10) << __func__ << ": notify_all " << notify_type << dendl;
+  for (auto& i : modules) {
+    auto module = i.second.get();
+    // Send all python calls down a Finisher to avoid blocking
+    // C++ code, and avoid any potential lock cycles.
+    finisher.queue(new FunctionContext([module, notify_type, notify_id](int r){
+      module->notify(notify_type, notify_id);
+    }));
+  }
+}
+
+void ActivePyModules::notify_all(const LogEntry &log_entry)
+{
+  Mutex::Locker l(lock);
+
+  dout(10) << __func__ << ": notify_all (clog)" << dendl;
+  for (auto& i : modules) {
+    auto module = i.second.get();
+    // Send all python calls down a Finisher to avoid blocking
+    // C++ code, and avoid any potential lock cycles.
+    //
+    // Note intentional use of non-reference lambda binding on
+    // log_entry: we take a copy because the caller's instance is
+    // probably ephemeral.
+    finisher.queue(new FunctionContext([module, log_entry](int r){
+      module->notify_clog(log_entry);
+    }));
+  }
+}
+
+bool ActivePyModules::get_config(const std::string &module_name,
+    const std::string &key, std::string *val) const
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  const std::string global_key = PyModuleRegistry::config_prefix
+    + module_name + "/" + key;
+
+  dout(4) << __func__ << "key: " << global_key << dendl;
+
+  if (config_cache.count(global_key)) {
+    *val = config_cache.at(global_key);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+PyObject *ActivePyModules::get_config_prefix(const std::string &module_name,
+    const std::string &prefix) const
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  const std::string base_prefix = PyModuleRegistry::config_prefix
+                                    + module_name + "/";
+  const std::string global_prefix = base_prefix + prefix;
+  dout(4) << __func__ << "prefix: " << global_prefix << dendl;
+
+  PyFormatter f;
+  for (auto p = config_cache.lower_bound(global_prefix);
+       p != config_cache.end() && p->first.find(global_prefix) == 0;
+       ++p) {
+    f.dump_string(p->first.c_str() + base_prefix.size(), p->second);
+  }
+  return f.get();
+}
+
+void ActivePyModules::set_config(const std::string &module_name,
+    const std::string &key, const boost::optional<std::string>& val)
+{
+  const std::string global_key = PyModuleRegistry::config_prefix
+                                   + module_name + "/" + key;
+
+  Command set_cmd;
+  {
+    PyThreadState *tstate = PyEval_SaveThread();
+    Mutex::Locker l(lock);
+    PyEval_RestoreThread(tstate);
+    if (val) {
+      config_cache[global_key] = *val;
+    } else {
+      config_cache.erase(global_key);
+    }
+
+    std::ostringstream cmd_json;
+    JSONFormatter jf;
+    jf.open_object_section("cmd");
+    if (val) {
+      jf.dump_string("prefix", "config-key set");
+      jf.dump_string("key", global_key);
+      jf.dump_string("val", *val);
+    } else {
+      jf.dump_string("prefix", "config-key del");
+      jf.dump_string("key", global_key);
+    }
+    jf.close_section();
+    jf.flush(cmd_json);
+    set_cmd.run(&monc, cmd_json.str());
+  }
+  set_cmd.wait();
+
+  if (set_cmd.r != 0) {
+    // config-key set will fail if mgr's auth key has insufficient
+    // permission to set config keys
+    // FIXME: should this somehow raise an exception back into Python land?
+    dout(0) << "`config-key set " << global_key << " " << val << "` failed: "
+      << cpp_strerror(set_cmd.r) << dendl;
+    dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
+  }
+}
+
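
get_config(), get_config_prefix() and set_config() above all namespace a module's keys in the monitors' config-key store as config_prefix + module_name + "/" + key, so modules cannot clobber one another's settings. A sketch of the key construction, assuming the usual Luminous prefix of "mgr/" (the actual value lives in PyModuleRegistry::config_prefix):

    #include <string>

    // Illustrative: where a module's key lands in the config-key store.
    std::string global_config_key(const std::string &module,
                                  const std::string &key)
    {
      static const std::string config_prefix = "mgr/";  // assumption
      return config_prefix + module + "/" + key;
    }
    // global_config_key("dashboard", "server_port")
    //   == "mgr/dashboard/server_port"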
+std::vector<ModuleCommand> ActivePyModules::get_py_commands() const
+{
+  Mutex::Locker l(lock);
+
+  std::vector<ModuleCommand> result;
+  for (const auto& i : modules) {
+    auto module = i.second.get();
+    auto mod_commands = module->get_commands();
+    for (auto j : mod_commands) {
+      result.push_back(j);
+    }
+  }
+
+  return result;
+}
+
+std::vector<MonCommand> ActivePyModules::get_commands() const
+{
+  std::vector<ModuleCommand> commands = get_py_commands();
+  std::vector<MonCommand> result;
+  for (auto &pyc: commands) {
+    result.push_back({pyc.cmdstring, pyc.helpstring, "mgr",
+                        pyc.perm, "cli", MonCommand::FLAG_MGR});
+  }
+  return result;
+}
+
+
+std::map<std::string, std::string> ActivePyModules::get_services() const
+{
+  std::map<std::string, std::string> result;
+  Mutex::Locker l(lock);
+  for (const auto& i : modules) {
+    const auto &module = i.second.get();
+    std::string svc_str = module->get_uri();
+    if (!svc_str.empty()) {
+      result[module->get_name()] = svc_str;
+    }
+  }
+
+  return result;
+}
+
+PyObject* ActivePyModules::get_counter_python(
+    const std::string &svc_name,
+    const std::string &svc_id,
+    const std::string &path)
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  PyFormatter f;
+  f.open_array_section(path.c_str());
+
+  auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
+  if (metadata) {
+    Mutex::Locker l2(metadata->lock);
+    if (metadata->perf_counters.instances.count(path)) {
+      auto counter_instance = metadata->perf_counters.instances.at(path);
+      const auto &data = counter_instance.get_data();
+      for (const auto &datapoint : data) {
+        f.open_array_section("datapoint");
+        f.dump_unsigned("t", datapoint.t.sec());
+        f.dump_unsigned("v", datapoint.v);
+        f.close_section();
+
+      }
+    } else {
+      dout(4) << "Missing counter: '" << path << "' ("
+              << svc_name << "." << svc_id << ")" << dendl;
+      dout(20) << "Paths are:" << dendl;
+      for (const auto &i : metadata->perf_counters.instances) {
+        dout(20) << i.first << dendl;
+      }
+    }
+  } else {
+    dout(4) << "No daemon state for "
+              << svc_name << "." << svc_id << ")" << dendl;
+  }
+  f.close_section();
+  return f.get();
+}
+
+PyObject* ActivePyModules::get_perf_schema_python(
+    const std::string svc_type,
+    const std::string &svc_id)
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  DaemonStateCollection daemons;
+
+  if (svc_type == "") {
+    daemons = std::move(daemon_state.get_all());
+  } else if (svc_id.empty()) {
+    daemons = std::move(daemon_state.get_by_service(svc_type));
+  } else {
+    auto key = DaemonKey(svc_type, svc_id);
+    // so that the below can be a loop in all cases
+    auto got = daemon_state.get(key);
+    if (got != nullptr) {
+      daemons[key] = got;
+    }
+  }
+
+  PyFormatter f;
+  if (!daemons.empty()) {
+    for (auto statepair : daemons) {
+      auto key = statepair.first;
+      auto state = statepair.second;
+
+      std::ostringstream daemon_name;
+      daemon_name << key.first << "." << key.second;
+      f.open_object_section(daemon_name.str().c_str());
+
+      Mutex::Locker l(state->lock);
+      for (auto ctr_inst_iter : state->perf_counters.instances) {
+        const auto &counter_name = ctr_inst_iter.first;
+       f.open_object_section(counter_name.c_str());
+       auto type = state->perf_counters.types[counter_name];
+       f.dump_string("description", type.description);
+       if (!type.nick.empty()) {
+         f.dump_string("nick", type.nick);
+       }
+       f.dump_unsigned("type", type.type);
+       f.dump_unsigned("priority", type.priority);
+       f.close_section();
+      }
+      f.close_section();
+    }
+  } else {
+    dout(4) << __func__ << ": No daemon state found for "
+              << svc_type << "." << svc_id << ")" << dendl;
+  }
+  return f.get();
+}
+
+PyObject *ActivePyModules::get_context()
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  // Construct a capsule containing ceph context.
+  // Not incrementing/decrementing ref count on the context because
+  // it's the global one and it has process lifetime.
+  auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr);
+  return capsule;
+}
+
+/**
+ * Helper for our wrapped types that take a capsule in their constructor.
+ */
+PyObject *construct_with_capsule(
+    const std::string &module_name,
+    const std::string &clsname,
+    void *wrapped)
+{
+  // Look up the OSDMap type which we will construct
+  PyObject *module = PyImport_ImportModule(module_name.c_str());
+  if (!module) {
+    derr << "Failed to import python module:" << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+  assert(module);
+
+  PyObject *wrapper_type = PyObject_GetAttrString(
+      module, (const char*)clsname.c_str());
+  if (!wrapper_type) {
+    derr << "Failed to get python type:" << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+  assert(wrapper_type);
+
+  // Construct a capsule containing an OSDMap.
+  auto wrapped_capsule = PyCapsule_New(wrapped, nullptr, nullptr);
+  assert(wrapped_capsule);
+
+  // Construct the python OSDMap
+  auto pArgs = PyTuple_Pack(1, wrapped_capsule);
+  auto wrapper_instance = PyObject_CallObject(wrapper_type, pArgs);
+  if (wrapper_instance == nullptr) {
+    derr << "Failed to construct python OSDMap:" << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+  assert(wrapper_instance != nullptr);
+  Py_DECREF(pArgs);
+  Py_DECREF(wrapped_capsule);
+
+  Py_DECREF(wrapper_type);
+  Py_DECREF(module);
+
+  return wrapper_instance;
+}
+
+PyObject *ActivePyModules::get_osdmap()
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  OSDMap *newmap = new OSDMap;
+
+  cluster_state.with_osdmap([&](const OSDMap& o) {
+      newmap->deepish_copy_from(o);
+    });
+
+  return construct_with_capsule("mgr_module", "OSDMap", (void*)newmap);
+}
+
+void ActivePyModules::set_health_checks(const std::string& module_name,
+                                 health_check_map_t&& checks)
+{
+  Mutex::Locker l(lock);
+  auto p = modules.find(module_name);
+  if (p != modules.end()) {
+    p->second->set_health_checks(std::move(checks));
+  }
+}
+
+void ActivePyModules::get_health_checks(health_check_map_t *checks)
+{
+  Mutex::Locker l(lock);
+  for (auto& p : modules) {
+    p.second->get_health_checks(checks);
+  }
+}
+
+void ActivePyModules::set_uri(const std::string& module_name,
+                        const std::string &uri)
+{
+  Mutex::Locker l(lock);
+
+  dout(4) << " module " << module_name << " set URI '" << uri << "'" << dendl;
+
+  modules[module_name]->set_uri(uri);
+}
+
diff --git a/ceph/src/mgr/ActivePyModules.h b/ceph/src/mgr/ActivePyModules.h
new file mode 100644 (file)
index 0000000..21e6529
--- /dev/null
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#pragma once
+
+#include "ActivePyModule.h"
+
+#include "common/Finisher.h"
+#include "common/Mutex.h"
+
+#include "osdc/Objecter.h"
+#include "client/Client.h"
+#include "common/LogClient.h"
+#include "mon/MgrMap.h"
+#include "mon/MonCommand.h"
+
+#include "DaemonState.h"
+#include "ClusterState.h"
+
+class health_check_map_t;
+
+typedef std::map<std::string, std::string> PyModuleConfig;
+
+class ActivePyModules
+{
+
+  std::map<std::string, std::unique_ptr<ActivePyModule>> modules;
+  PyModuleConfig config_cache;
+  DaemonStateIndex &daemon_state;
+  ClusterState &cluster_state;
+  MonClient &monc;
+  LogChannelRef clog;
+  Objecter &objecter;
+  Client   &client;
+  Finisher &finisher;
+
+
+  mutable Mutex lock{"ActivePyModules::lock"};
+
+public:
+  ActivePyModules(PyModuleConfig const &config_,
+            DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+            LogChannelRef clog_, Objecter &objecter_, Client &client_,
+            Finisher &f);
+
+  ~ActivePyModules();
+
+  // FIXME: wrap for send_command?
+  MonClient &get_monc() {return monc;}
+  Objecter  &get_objecter() {return objecter;}
+  Client    &get_client() {return client;}
+
+  PyObject *get_python(const std::string &what);
+  PyObject *get_server_python(const std::string &hostname);
+  PyObject *list_servers_python();
+  PyObject *get_metadata_python(
+    const std::string &svc_type, const std::string &svc_id);
+  PyObject *get_daemon_status_python(
+    const std::string &svc_type, const std::string &svc_id);
+  PyObject *get_counter_python(
+    const std::string &svc_type,
+    const std::string &svc_id,
+    const std::string &path);
+  PyObject *get_perf_schema_python(
+     const std::string svc_type,
+     const std::string &svc_id);
+  PyObject *get_context();
+  PyObject *get_osdmap();
+
+  bool get_config(const std::string &module_name,
+      const std::string &key, std::string *val) const;
+  PyObject *get_config_prefix(const std::string &module_name,
+                             const std::string &prefix) const;
+  void set_config(const std::string &module_name,
+      const std::string &key, const boost::optional<std::string> &val);
+
+  void set_health_checks(const std::string& module_name,
+                        health_check_map_t&& checks);
+  void get_health_checks(health_check_map_t *checks);
+
+  void set_uri(const std::string& module_name, const std::string &uri);
+
+  // Python command definitions, including callback
+  std::vector<ModuleCommand> get_py_commands() const;
+
+  // Monitor command definitions, suitable for CLI
+  std::vector<MonCommand> get_commands() const;
+
+  std::map<std::string, std::string> get_services() const;
+
+  // Public so that MonCommandCompletion can use it
+  // FIXME: for send_command completion notifications,
+  // send it only to the module that sent the command, not everyone
+  void notify_all(const std::string &notify_type,
+                  const std::string &notify_id);
+  void notify_all(const LogEntry &log_entry);
+
+  int init();
+  void shutdown();
+
+  int start_one(std::string const &module_name,
+                PyObject *pClass,
+                const SafeThreadState &pMyThreadState);
+
+  void dump_server(const std::string &hostname,
+                   const DaemonStateCollection &dmc,
+                   Formatter *f);
+};
+
diff --git a/ceph/src/mgr/BaseMgrModule.cc b/ceph/src/mgr/BaseMgrModule.cc
new file mode 100644 (file)
index 0000000..74d1a94
--- /dev/null
@@ -0,0 +1,636 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+/**
+ * The interface we present to python code that runs within
+ * ceph-mgr.  This is implemented as a Python class from which
+ * all modules must inherit -- access to the Ceph state is then
+ * available as methods on that object.
+ */
+
+#include "Python.h"
+
+#include "Mgr.h"
+
+#include "mon/MonClient.h"
+#include "common/errno.h"
+#include "common/version.h"
+
+#include "BaseMgrModule.h"
+#include "Gil.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#define PLACEHOLDER ""
+
+
+typedef struct {
+  PyObject_HEAD
+  ActivePyModules *py_modules;
+  ActivePyModule *this_module;
+} BaseMgrModule;
+
+class MonCommandCompletion : public Context
+{
+  ActivePyModules *py_modules;
+  PyObject *python_completion;
+  const std::string tag;
+  SafeThreadState pThreadState;
+
+public:
+  std::string outs;
+  bufferlist outbl;
+
+  MonCommandCompletion(
+      ActivePyModules *py_modules_, PyObject* ev,
+      const std::string &tag_, PyThreadState *ts_)
+    : py_modules(py_modules_), python_completion(ev),
+      tag(tag_), pThreadState(ts_)
+  {
+    assert(python_completion != nullptr);
+    Py_INCREF(python_completion);
+  }
+
+  ~MonCommandCompletion() override
+  {
+    if (python_completion) {
+      // Usually do this in finish(): this path only applies if we're
+      // being destroyed without completing.
+      Gil gil(pThreadState, true);
+      Py_DECREF(python_completion);
+      python_completion = nullptr;
+    }
+  }
+
+  void finish(int r) override
+  {
+    assert(python_completion != nullptr);
+
+    dout(10) << "MonCommandCompletion::finish()" << dendl;
+    {
+      // Scoped so the Gil is released before calling notify_all()
+      // Create new thread state because this is called via the MonClient
+      // Finisher, not the PyModules finisher.
+      Gil gil(pThreadState, true);
+
+      auto set_fn = PyObject_GetAttrString(python_completion, "complete");
+      assert(set_fn != nullptr);
+
+      auto pyR = PyInt_FromLong(r);
+      auto pyOutBl = PyString_FromString(outbl.to_str().c_str());
+      auto pyOutS = PyString_FromString(outs.c_str());
+      auto args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS);
+      Py_DECREF(pyR);
+      Py_DECREF(pyOutBl);
+      Py_DECREF(pyOutS);
+
+      auto rtn = PyObject_CallObject(set_fn, args);
+      if (rtn != nullptr) {
+       Py_DECREF(rtn);
+      }
+      Py_DECREF(args);
+      Py_DECREF(set_fn);
+
+      Py_DECREF(python_completion);
+      python_completion = nullptr;
+    }
+    py_modules->notify_all("command", tag);
+  }
+};
+
+
+static PyObject*
+ceph_send_command(BaseMgrModule *self, PyObject *args)
+{
+  // Like mon, osd, mds
+  char *type = nullptr;
+
+  // Like "23" for an OSD or "myid" for an MDS
+  char *name = nullptr;
+
+  char *cmd_json = nullptr;
+  char *tag = nullptr;
+  PyObject *completion = nullptr;
+  if (!PyArg_ParseTuple(args, "Ossss:ceph_send_command",
+        &completion, &type, &name, &cmd_json, &tag)) {
+    return nullptr;
+  }
+
+  auto set_fn = PyObject_GetAttrString(completion, "complete");
+  if (set_fn == nullptr) {
+    ceph_abort();  // TODO raise python exception instead
+  } else {
+    assert(PyCallable_Check(set_fn));
+  }
+  Py_DECREF(set_fn);
+
+  auto c = new MonCommandCompletion(self->py_modules,
+      completion, tag, PyThreadState_Get());
+  if (std::string(type) == "mon") {
+    self->py_modules->get_monc().start_mon_command(
+        {cmd_json},
+        {},
+        &c->outbl,
+        &c->outs,
+        c);
+  } else if (std::string(type) == "osd") {
+    std::string err;
+    uint64_t osd_id = strict_strtoll(name, 10, &err);
+    if (!err.empty()) {
+      delete c;
+      string msg("invalid osd_id: ");
+      msg.append("\"").append(name).append("\"");
+      PyErr_SetString(PyExc_ValueError, msg.c_str());
+      return nullptr;
+    }
+
+    ceph_tid_t tid;
+    self->py_modules->get_objecter().osd_command(
+        osd_id,
+        {cmd_json},
+        {},
+        &tid,
+        &c->outbl,
+        &c->outs,
+        c);
+  } else if (std::string(type) == "mds") {
+    int r = self->py_modules->get_client().mds_command(
+        name,
+        {cmd_json},
+        {},
+        &c->outbl,
+        &c->outs,
+        c);
+    if (r != 0) {
+      string msg("failed to send command to mds: ");
+      msg.append(cpp_strerror(r));
+      PyErr_SetString(PyExc_RuntimeError, msg.c_str());
+      return nullptr;
+    }
+  } else if (std::string(type) == "pg") {
+    pg_t pgid;
+    if (!pgid.parse(name)) {
+      delete c;
+      string msg("invalid pgid: ");
+      msg.append("\"").append(name).append("\"");
+      PyErr_SetString(PyExc_ValueError, msg.c_str());
+      return nullptr;
+    }
+
+    ceph_tid_t tid;
+    self->py_modules->get_objecter().pg_command(
+        pgid,
+        {cmd_json},
+        {},
+        &tid,
+        &c->outbl,
+        &c->outs,
+        c);
+    Py_RETURN_NONE;
+  } else {
+    delete c;
+    string msg("unknown service type: ");
+    msg.append(type);
+    PyErr_SetString(PyExc_ValueError, msg.c_str());
+    return nullptr;
+  }
+
+  Py_RETURN_NONE;
+}
+
+static PyObject*
+ceph_set_health_checks(BaseMgrModule *self, PyObject *args)
+{
+  PyObject *checks = NULL;
+  if (!PyArg_ParseTuple(args, "O:ceph_set_health_checks", &checks)) {
+    return NULL;
+  }
+  if (!PyDict_Check(checks)) {
+    derr << __func__ << " arg not a dict" << dendl;
+    Py_RETURN_NONE;
+  }
+  PyObject *checksls = PyDict_Items(checks);
+  health_check_map_t out_checks;
+  for (int i = 0; i < PyList_Size(checksls); ++i) {
+    PyObject *kv = PyList_GET_ITEM(checksls, i);
+    char *check_name = nullptr;
+    PyObject *check_info = nullptr;
+    if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) {
+      derr << __func__ << " dict item " << i
+          << " not a size 2 tuple" << dendl;
+      continue;
+    }
+    if (!PyDict_Check(check_info)) {
+      derr << __func__ << " item " << i << " " << check_name
+          << " value not a dict" << dendl;
+      continue;
+    }
+    health_status_t severity = HEALTH_OK;
+    string summary;
+    list<string> detail;
+    PyObject *infols = PyDict_Items(check_info);
+    for (int j = 0; j < PyList_Size(infols); ++j) {
+      PyObject *pair = PyList_GET_ITEM(infols, j);
+      if (!PyTuple_Check(pair)) {
+       derr << __func__ << " item " << i << " pair " << j
+            << " not a tuple" << dendl;
+       continue;
+      }
+      char *k = nullptr;
+      PyObject *v = nullptr;
+      if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) {
+       derr << __func__ << " item " << i << " pair " << j
+            << " not a size 2 tuple" << dendl;
+       continue;
+      }
+      string ks(k);
+      if (ks == "severity") {
+       if (!PyString_Check(v)) {
+         derr << __func__ << " check " << check_name
+              << " severity value not string" << dendl;
+         continue;
+       }
+       string vs(PyString_AsString(v));
+       if (vs == "warning") {
+         severity = HEALTH_WARN;
+       } else if (vs == "error") {
+         severity = HEALTH_ERR;
+       }
+      } else if (ks == "summary") {
+       if (!PyString_Check(v)) {
+         derr << __func__ << " check " << check_name
+              << " summary value not string" << dendl;
+         continue;
+       }
+       summary = PyString_AsString(v);
+      } else if (ks == "detail") {
+       if (!PyList_Check(v)) {
+         derr << __func__ << " check " << check_name
+              << " detail value not list" << dendl;
+         continue;
+       }
+       for (int k = 0; k < PyList_Size(v); ++k) {
+         PyObject *di = PyList_GET_ITEM(v, k);
+         if (!PyString_Check(di)) {
+           derr << __func__ << " check " << check_name
+                << " detail item " << k << " not a string" << dendl;
+           continue;
+         }
+         detail.push_back(PyString_AsString(di));
+       }
+      } else {
+       derr << __func__ << " check " << check_name
+            << " unexpected key " << k << dendl;
+      }
+    }
+    auto& d = out_checks.add(check_name, severity, summary);
+    d.detail.swap(detail);
+  }
+
+  JSONFormatter jf(true);
+  dout(10) << "module " << self->this_module->get_name()
+          << " health checks:\n";
+  out_checks.dump(&jf);
+  jf.flush(*_dout);
+  *_dout << dendl;
+
+  PyThreadState *tstate = PyEval_SaveThread();
+  self->py_modules->set_health_checks(self->this_module->get_name(),
+                                      std::move(out_checks));
+  PyEval_RestoreThread(tstate);
+  
+  Py_RETURN_NONE;
+}
+
+
+static PyObject*
+ceph_state_get(BaseMgrModule *self, PyObject *args)
+{
+  char *what = NULL;
+  if (!PyArg_ParseTuple(args, "s:ceph_state_get", &what)) {
+    return NULL;
+  }
+
+  return self->py_modules->get_python(what);
+}
+
+
+static PyObject*
+ceph_get_server(BaseMgrModule *self, PyObject *args)
+{
+  char *hostname = NULL;
+  if (!PyArg_ParseTuple(args, "z:ceph_get_server", &hostname)) {
+    return NULL;
+  }
+
+  if (hostname) {
+    return self->py_modules->get_server_python(hostname);
+  } else {
+    return self->py_modules->list_servers_python();
+  }
+}
+
+static PyObject*
+ceph_get_mgr_id(BaseMgrModule *self, PyObject *args)
+{
+  return PyString_FromString(g_conf->name.get_id().c_str());
+}
+
+static PyObject*
+ceph_config_get(BaseMgrModule *self, PyObject *args)
+{
+  char *what = nullptr;
+  if (!PyArg_ParseTuple(args, "s:ceph_config_get", &what)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  std::string value;
+  bool found = self->py_modules->get_config(self->this_module->get_name(),
+      what, &value);
+  if (found) {
+    dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl;
+    return PyString_FromString(value.c_str());
+  } else {
+    dout(4) << "ceph_config_get " << what << " not found " << dendl;
+    Py_RETURN_NONE;
+  }
+}
+
+static PyObject*
+ceph_config_get_prefix(BaseMgrModule *self, PyObject *args)
+{
+  char *prefix = nullptr;
+  if (!PyArg_ParseTuple(args, "s:ceph_config_get", &prefix)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  return self->py_modules->get_config_prefix(self->this_module->get_name(),
+      prefix);
+}
+
+static PyObject*
+ceph_config_set(BaseMgrModule *self, PyObject *args)
+{
+  char *key = nullptr;
+  char *value = nullptr;
+  if (!PyArg_ParseTuple(args, "sz:ceph_config_set", &key, &value)) {
+    return nullptr;
+  }
+  boost::optional<string> val;
+  if (value) {
+    val = value;
+  }
+  self->py_modules->set_config(self->this_module->get_name(), key, val);
+
+  Py_RETURN_NONE;
+}
+
+static PyObject*
+get_metadata(BaseMgrModule *self, PyObject *args)
+{
+  char *svc_name = NULL;
+  char *svc_id = NULL;
+  if (!PyArg_ParseTuple(args, "ss:get_metadata", &svc_name, &svc_id)) {
+    return nullptr;
+  }
+  return self->py_modules->get_metadata_python(svc_name, svc_id);
+}
+
+static PyObject*
+get_daemon_status(BaseMgrModule *self, PyObject *args)
+{
+  char *svc_name = NULL;
+  char *svc_id = NULL;
+  if (!PyArg_ParseTuple(args, "ss:get_daemon_status", &svc_name,
+                       &svc_id)) {
+    return nullptr;
+  }
+  return self->py_modules->get_daemon_status_python(svc_name, svc_id);
+}
+
+static PyObject*
+ceph_log(BaseMgrModule *self, PyObject *args)
+{
+
+  int level = 0;
+  char *record = nullptr;
+  if (!PyArg_ParseTuple(args, "is:log", &level, &record)) {
+    return nullptr;
+  }
+
+  assert(self->this_module);
+
+  self->this_module->log(level, record);
+
+  Py_RETURN_NONE;
+}
+
+static PyObject *
+ceph_get_version(BaseMgrModule *self, PyObject *args)
+{
+  return PyString_FromString(pretty_version_to_str().c_str());
+}
+
+static PyObject *
+ceph_get_context(BaseMgrModule *self, PyObject *args)
+{
+  return self->py_modules->get_context();
+}
+
+static PyObject*
+get_counter(BaseMgrModule *self, PyObject *args)
+{
+  char *svc_name = nullptr;
+  char *svc_id = nullptr;
+  char *counter_path = nullptr;
+  if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name,
+                                                  &svc_id, &counter_path)) {
+    return nullptr;
+  }
+  return self->py_modules->get_counter_python(
+      svc_name, svc_id, counter_path);
+}
+
+static PyObject*
+get_perf_schema(BaseMgrModule *self, PyObject *args)
+{
+  char *type_str = nullptr;
+  char *svc_id = nullptr;
+  if (!PyArg_ParseTuple(args, "ss:get_perf_schema", &type_str,
+                                                    &svc_id)) {
+    return nullptr;
+  }
+
+  return self->py_modules->get_perf_schema_python(type_str, svc_id);
+}
+
+static PyObject *
+ceph_get_osdmap(BaseMgrModule *self, PyObject *args)
+{
+  return self->py_modules->get_osdmap();
+}
+
+static PyObject*
+ceph_set_uri(BaseMgrModule *self, PyObject *args)
+{
+  char *svc_str = nullptr;
+  if (!PyArg_ParseTuple(args, "s:ceph_advertize_service",
+        &svc_str)) {
+    return nullptr;
+  }
+
+  // We call down into ActivePyModules even though we have an
+  // ActivePyModule reference here, because ActivePyModule's fields
+  // are protected by ActivePyModules' lock.
+  PyThreadState *tstate = PyEval_SaveThread();
+  self->py_modules->set_uri(self->this_module->get_name(), svc_str);
+  PyEval_RestoreThread(tstate);
+
+  Py_RETURN_NONE;
+}
+
+
+PyMethodDef BaseMgrModule_methods[] = {
+  {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS,
+   "Get a cluster object"},
+
+  {"_ceph_get_server", (PyCFunction)ceph_get_server, METH_VARARGS,
+   "Get a server object"},
+
+  {"_ceph_get_metadata", (PyCFunction)get_metadata, METH_VARARGS,
+   "Get a service's metadata"},
+
+  {"_ceph_get_daemon_status", (PyCFunction)get_daemon_status, METH_VARARGS,
+   "Get a service's status"},
+
+  {"_ceph_send_command", (PyCFunction)ceph_send_command, METH_VARARGS,
+   "Send a mon command"},
+
+  {"_ceph_set_health_checks", (PyCFunction)ceph_set_health_checks, METH_VARARGS,
+   "Set health checks for this module"},
+
+  {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS,
+   "Get the name of the Mgr daemon where we are running"},
+
+  {"_ceph_get_config", (PyCFunction)ceph_config_get, METH_VARARGS,
+   "Get a configuration value"},
+
+  {"_ceph_get_config_prefix", (PyCFunction)ceph_config_get_prefix, METH_VARARGS,
+   "Get all configuration values with a given prefix"},
+
+  {"_ceph_set_config", (PyCFunction)ceph_config_set, METH_VARARGS,
+   "Set a configuration value"},
+
+  {"_ceph_get_counter", (PyCFunction)get_counter, METH_VARARGS,
+    "Get a performance counter"},
+
+  {"_ceph_get_perf_schema", (PyCFunction)get_perf_schema, METH_VARARGS,
+    "Get the performance counter schema"},
+
+  {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS,
+   "Emit a (local) log message"},
+
+  {"_ceph_get_version", (PyCFunction)ceph_get_version, METH_VARARGS,
+   "Get the ceph version of this process"},
+
+  {"_ceph_get_context", (PyCFunction)ceph_get_context, METH_NOARGS,
+    "Get a CephContext* in a python capsule"},
+
+  {"_ceph_get_osdmap", (PyCFunction)ceph_get_osdmap, METH_NOARGS,
+    "Get an OSDMap* in a python capsule"},
+
+  {"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS,
+    "Advertize a service URI served by this module"},
+
+  {NULL, NULL, 0, NULL}
+};
+
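Registration of the type happens outside this hunk; for a statically defined type like this, the usual Python 2 C API sequence looks like the following sketch (the `ceph_module` module object is assumed):

    // Finalize the type object and expose it from the extension module.
    if (PyType_Ready(&BaseMgrModuleType) < 0) {
      return;  // type initialization failed
    }
    Py_INCREF(&BaseMgrModuleType);
    PyModule_AddObject(ceph_module, "BaseMgrModule",
                       (PyObject*)&BaseMgrModuleType);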
+
+static PyObject *
+BaseMgrModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    BaseMgrModule *self;
+
+    self = (BaseMgrModule *)type->tp_alloc(type, 0);
+
+    return (PyObject *)self;
+}
+
+static int
+BaseMgrModule_init(BaseMgrModule *self, PyObject *args, PyObject *kwds)
+{
+    PyObject *py_modules_capsule = nullptr;
+    PyObject *this_module_capsule = nullptr;
+    static const char *kwlist[] = {"py_modules", "this_module", NULL};
+
+    if (! PyArg_ParseTupleAndKeywords(args, kwds, "OO",
+                                      const_cast<char**>(kwlist),
+                                      &py_modules_capsule,
+                                      &this_module_capsule)) {
+        return -1;
+    }
+
+    self->py_modules = (ActivePyModules*)PyCapsule_GetPointer(
+        py_modules_capsule, nullptr);
+    assert(self->py_modules);
+    self->this_module = (ActivePyModule*)PyCapsule_GetPointer(
+        this_module_capsule, nullptr);
+    assert(self->this_module);
+
+    return 0;
+}
+
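The two capsules consumed here are produced on the C++ side before the Python class is instantiated. A sketch of that producing side (variable names hypothetical; the unnamed capsules match the nullptr name passed to PyCapsule_GetPointer() above):

    PyObject *py_modules_capsule =
        PyCapsule_New(active_py_modules, nullptr, nullptr);
    PyObject *this_module_capsule =
        PyCapsule_New(active_py_module, nullptr, nullptr);
    // ...then passed as the (py_modules, this_module) keyword arguments
    // when instantiating ceph_module.BaseMgrModule.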
+PyTypeObject BaseMgrModuleType = {
+  PyVarObject_HEAD_INIT(NULL, 0)
+  "ceph_module.BaseMgrModule", /* tp_name */
+  sizeof(BaseMgrModule),     /* tp_basicsize */
+  0,                         /* tp_itemsize */
+  0,                         /* tp_dealloc */
+  0,                         /* tp_print */
+  0,                         /* tp_getattr */
+  0,                         /* tp_setattr */
+  0,                         /* tp_compare */
+  0,                         /* tp_repr */
+  0,                         /* tp_as_number */
+  0,                         /* tp_as_sequence */
+  0,                         /* tp_as_mapping */
+  0,                         /* tp_hash */
+  0,                         /* tp_call */
+  0,                         /* tp_str */
+  0,                         /* tp_getattro */
+  0,                         /* tp_setattro */
+  0,                         /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,        /* tp_flags */
+  "ceph-mgr Python Plugin", /* tp_doc */
+  0,                         /* tp_traverse */
+  0,                         /* tp_clear */
+  0,                         /* tp_richcompare */
+  0,                         /* tp_weaklistoffset */
+  0,                         /* tp_iter */
+  0,                         /* tp_iternext */
+  BaseMgrModule_methods,     /* tp_methods */
+  0,                         /* tp_members */
+  0,                         /* tp_getset */
+  0,                         /* tp_base */
+  0,                         /* tp_dict */
+  0,                         /* tp_descr_get */
+  0,                         /* tp_descr_set */
+  0,                         /* tp_dictoffset */
+  (initproc)BaseMgrModule_init,                         /* tp_init */
+  0,                         /* tp_alloc */
+  BaseMgrModule_new,     /* tp_new */
+};
+
diff --git a/ceph/src/mgr/BaseMgrModule.h b/ceph/src/mgr/BaseMgrModule.h
new file mode 100644 (file)
index 0000000..2c2e5de
--- /dev/null
@@ -0,0 +1,7 @@
+
+#pragma once
+
+#include "Python.h"
+
+extern PyTypeObject BaseMgrModuleType;
+
diff --git a/ceph/src/mgr/BaseMgrStandbyModule.cc b/ceph/src/mgr/BaseMgrStandbyModule.cc
new file mode 100644 (file)
index 0000000..b7bd0f6
--- /dev/null
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include "BaseMgrStandbyModule.h"
+
+#include "StandbyPyModules.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+typedef struct {
+  PyObject_HEAD
+  StandbyPyModule *this_module;
+} BaseMgrStandbyModule;
+
+static PyObject *
+BaseMgrStandbyModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    BaseMgrStandbyModule *self;
+
+    self = (BaseMgrStandbyModule *)type->tp_alloc(type, 0);
+
+    return (PyObject *)self;
+}
+
+static int
+BaseMgrStandbyModule_init(BaseMgrStandbyModule *self, PyObject *args, PyObject *kwds)
+{
+    PyObject *this_module_capsule = nullptr;
+    static const char *kwlist[] = {"this_module", NULL};
+
+    if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+                                      const_cast<char**>(kwlist),
+                                      &this_module_capsule)) {
+        return -1;
+    }
+
+    self->this_module = (StandbyPyModule*)PyCapsule_GetPointer(
+        this_module_capsule, nullptr);
+    assert(self->this_module);
+
+    return 0;
+}
+
+static PyObject*
+ceph_get_mgr_id(BaseMgrStandbyModule *self, PyObject *args)
+{
+  return PyString_FromString(g_conf->name.get_id().c_str());
+}
+
+static PyObject*
+ceph_config_get(BaseMgrStandbyModule *self, PyObject *args)
+{
+  char *what = nullptr;
+  if (!PyArg_ParseTuple(args, "s:ceph_config_get", &what)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  std::string value;
+  bool found = self->this_module->get_config(what, &value);
+  if (found) {
+    dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl;
+    return PyString_FromString(value.c_str());
+  } else {
+    dout(4) << "ceph_config_get " << what << " not found " << dendl;
+    Py_RETURN_NONE;
+  }
+}
+
+static PyObject*
+ceph_get_active_uri(BaseMgrStandbyModule *self, PyObject *args)
+{
+  return PyString_FromString(self->this_module->get_active_uri().c_str());
+}
+
+static PyObject*
+ceph_log(BaseMgrStandbyModule *self, PyObject *args)
+{
+  int level = 0;
+  char *record = nullptr;
+  if (!PyArg_ParseTuple(args, "is:log", &level, &record)) {
+    return nullptr;
+  }
+
+  assert(self->this_module);
+
+  self->this_module->log(level, record);
+
+  Py_RETURN_NONE;
+}
+
+PyMethodDef BaseMgrStandbyModule_methods[] = {
+
+  {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS,
+   "Get the name of the Mgr daemon where we are running"},
+
+  {"_ceph_get_config", (PyCFunction)ceph_config_get, METH_VARARGS,
+   "Get a configuration value"},
+
+  {"_ceph_get_active_uri", (PyCFunction)ceph_get_active_uri, METH_NOARGS,
+   "Get the URI of the active instance of this module, if any"},
+
+  {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS,
+   "Emit a log message"},
+
+  {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BaseMgrStandbyModuleType = {
+  PyVarObject_HEAD_INIT(NULL, 0)
+  "ceph_module.BaseMgrStandbyModule", /* tp_name */
+  sizeof(BaseMgrStandbyModule),     /* tp_basicsize */
+  0,                         /* tp_itemsize */
+  0,                         /* tp_dealloc */
+  0,                         /* tp_print */
+  0,                         /* tp_getattr */
+  0,                         /* tp_setattr */
+  0,                         /* tp_compare */
+  0,                         /* tp_repr */
+  0,                         /* tp_as_number */
+  0,                         /* tp_as_sequence */
+  0,                         /* tp_as_mapping */
+  0,                         /* tp_hash */
+  0,                         /* tp_call */
+  0,                         /* tp_str */
+  0,                         /* tp_getattro */
+  0,                         /* tp_setattro */
+  0,                         /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,        /* tp_flags */
+  "ceph-mgr Standby Python Plugin", /* tp_doc */
+  0,                         /* tp_traverse */
+  0,                         /* tp_clear */
+  0,                         /* tp_richcompare */
+  0,                         /* tp_weaklistoffset */
+  0,                         /* tp_iter */
+  0,                         /* tp_iternext */
+  BaseMgrStandbyModule_methods,     /* tp_methods */
+  0,                         /* tp_members */
+  0,                         /* tp_getset */
+  0,                         /* tp_base */
+  0,                         /* tp_dict */
+  0,                         /* tp_descr_get */
+  0,                         /* tp_descr_set */
+  0,                         /* tp_dictoffset */
+  (initproc)BaseMgrStandbyModule_init,                         /* tp_init */
+  0,                         /* tp_alloc */
+  BaseMgrStandbyModule_new,     /* tp_new */
+};
diff --git a/ceph/src/mgr/BaseMgrStandbyModule.h b/ceph/src/mgr/BaseMgrStandbyModule.h
new file mode 100644 (file)
index 0000000..c5c6beb
--- /dev/null
@@ -0,0 +1,7 @@
+
+#pragma once
+
+#include "Python.h"
+
+extern PyTypeObject BaseMgrStandbyModuleType;
+
index 34aac187181889e22e5131a9a5ad6d34eb127f24..321a38ad5349496154d1b4c6338e98fef52f0338 100644 (file)
@@ -41,26 +41,26 @@ DaemonServer::DaemonServer(MonClient *monc_,
                            Finisher &finisher_,
                           DaemonStateIndex &daemon_state_,
                           ClusterState &cluster_state_,
-                          PyModules &py_modules_,
+                          PyModuleRegistry &py_modules_,
                           LogChannelRef clog_,
                           LogChannelRef audit_clog_)
     : Dispatcher(g_ceph_context),
       client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes",
-                                        g_conf->mgr_client_bytes)),
+                                        g_conf->get_val<uint64_t>("mgr_client_bytes"))),
       client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages",
-                                       g_conf->mgr_client_messages)),
+                                       g_conf->get_val<uint64_t>("mgr_client_messages"))),
       osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes",
-                                     g_conf->mgr_osd_bytes)),
+                                     g_conf->get_val<uint64_t>("mgr_osd_bytes"))),
       osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages",
-                                    g_conf->mgr_osd_messages)),
+                                    g_conf->get_val<uint64_t>("mgr_osd_messages"))),
       mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes",
-                                     g_conf->mgr_mds_bytes)),
+                                     g_conf->get_val<uint64_t>("mgr_mds_bytes"))),
       mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages",
-                                    g_conf->mgr_mds_messages)),
+                                    g_conf->get_val<uint64_t>("mgr_mds_messages"))),
       mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes",
-                                     g_conf->mgr_mon_bytes)),
+                                     g_conf->get_val<uint64_t>("mgr_mon_bytes"))),
       mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages",
-                                    g_conf->mgr_mon_messages)),
+                                    g_conf->get_val<uint64_t>("mgr_mon_messages"))),
       msgr(nullptr),
       monc(monc_),
       finisher(finisher_),
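This hunk replaces the generated config members (g_conf->mgr_client_bytes and friends) with string-keyed, typed lookups, so the values are read through the same path that the new config observer watches. The template argument must match the option's declared type, as in these usages elsewhere in this commit:

    uint64_t bytes  = g_conf->get_val<uint64_t>("mgr_client_bytes");
    int64_t  period = g_conf->get_val<int64_t>("mgr_stats_period");
    double   grace  = g_conf->get_val<double>("mgr_service_beacon_grace");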
@@ -73,11 +73,15 @@ DaemonServer::DaemonServer(MonClient *monc_,
                     g_conf->auth_supported.empty() ?
                       g_conf->auth_cluster_required :
                       g_conf->auth_supported),
-      lock("DaemonServer")
-{}
+      lock("DaemonServer"),
+      pgmap_ready(false)
+{
+  g_conf->add_observer(this);
+}
 
 DaemonServer::~DaemonServer() {
   delete msgr;
+  g_conf->remove_observer(this);
 }
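Registering in the constructor and deregistering in the destructor pairs DaemonServer with the md_config_obs_t interface it now implements (see get_tracked_conf_keys()/handle_conf_change() further down in this file). The contract in miniature:

    // Sketch of the md_config_obs_t contract: list the keys of interest,
    // then receive a callback whenever any of them changes.
    struct ExampleObserver : public md_config_obs_t {
      const char** get_tracked_conf_keys() const override {
        static const char *keys[] = {"mgr_stats_period", nullptr};
        return keys;
      }
      void handle_conf_change(const struct md_config_t *conf,
                              const std::set<std::string> &changed) override {
        // react to the new value(s)
      }
    };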
 
 int DaemonServer::init(uint64_t gid, entity_addr_t client_addr)
@@ -232,6 +236,11 @@ bool DaemonServer::ms_handle_reset(Connection *con)
     dout(10) << "unregistering osd." << session->osd_id
             << "  session " << session << " con " << con << dendl;
     osd_cons[session->osd_id].erase(con);
+
+    auto iter = daemon_connections.find(con);
+    if (iter != daemon_connections.end()) {
+      daemon_connections.erase(iter);
+    }
   }
   return false;
 }
@@ -244,8 +253,9 @@ bool DaemonServer::ms_handle_refused(Connection *con)
 
 bool DaemonServer::ms_dispatch(Message *m)
 {
-  Mutex::Locker l(lock);
-
+  // Note that we do *not* take ::lock here, in order to avoid
+  // serializing all message handling.  It's up to each handler
+  // to take whatever locks it needs.
   switch (m->get_type()) {
     case MSG_PGSTATS:
       cluster_state.ingest_pgstats(static_cast<MPGStats*>(m));
@@ -266,29 +276,36 @@ bool DaemonServer::ms_dispatch(Message *m)
 
 void DaemonServer::maybe_ready(int32_t osd_id)
 {
-  if (!pgmap_ready && reported_osds.find(osd_id) == reported_osds.end()) {
-    dout(4) << "initial report from osd " << osd_id << dendl;
-    reported_osds.insert(osd_id);
-    std::set<int32_t> up_osds;
+  if (pgmap_ready.load()) {
+    // Fast path: we don't need to take lock because pgmap_ready
+    // is already set
+  } else {
+    Mutex::Locker l(lock);
 
-    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
-        osdmap.get_up_osds(up_osds);
-    });
+    if (reported_osds.find(osd_id) == reported_osds.end()) {
+      dout(4) << "initial report from osd " << osd_id << dendl;
+      reported_osds.insert(osd_id);
+      std::set<int32_t> up_osds;
 
-    std::set<int32_t> unreported_osds;
-    std::set_difference(up_osds.begin(), up_osds.end(),
-                        reported_osds.begin(), reported_osds.end(),
-                        std::inserter(unreported_osds, unreported_osds.begin()));
+      cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+          osdmap.get_up_osds(up_osds);
+      });
 
-    if (unreported_osds.size() == 0) {
-      dout(4) << "all osds have reported, sending PG state to mon" << dendl;
-      pgmap_ready = true;
-      reported_osds.clear();
-      // Avoid waiting for next tick
-      send_report();
-    } else {
-      dout(4) << "still waiting for " << unreported_osds.size() << " osds"
-                 " to report in before PGMap is ready" << dendl;
+      std::set<int32_t> unreported_osds;
+      std::set_difference(up_osds.begin(), up_osds.end(),
+                          reported_osds.begin(), reported_osds.end(),
+                          std::inserter(unreported_osds, unreported_osds.begin()));
+
+      if (unreported_osds.size() == 0) {
+        dout(4) << "all osds have reported, sending PG state to mon" << dendl;
+        pgmap_ready = true;
+        reported_osds.clear();
+        // Avoid waiting for next tick
+        send_report();
+      } else {
+        dout(4) << "still waiting for " << unreported_osds.size() << " osds"
+                   " to report in before PGMap is ready" << dendl;
+      }
     }
   }
 }
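The rewrite makes pgmap_ready a lock-free fast path: once the flag is set, every subsequent OSD report skips the mutex entirely, and only the warm-up phase (while OSDs are still checking in) pays for serialization. The shape in isolation, as a sketch:

    std::atomic<bool> ready{false};

    void on_event() {
      if (ready.load()) {
        return;               // steady state: no lock taken
      }
      Mutex::Locker l(lock);  // warm-up: serialize the bookkeeping
      // ...track reporters; once all have reported: ready = true;
    }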
@@ -305,6 +322,8 @@ void DaemonServer::shutdown()
 
 bool DaemonServer::handle_open(MMgrOpen *m)
 {
+  Mutex::Locker l(lock);
+
   DaemonKey key;
   if (!m->service_name.empty()) {
     key.first = m->service_name;
@@ -315,9 +334,7 @@ bool DaemonServer::handle_open(MMgrOpen *m)
 
   dout(4) << "from " << m->get_connection() << "  " << key << dendl;
 
-  auto configure = new MMgrConfigure();
-  configure->stats_period = g_conf->mgr_stats_period;
-  m->get_connection()->send_message(configure);
+  _send_configure(m->get_connection());
 
   DaemonStatePtr daemon;
   if (daemon_state.exists(key)) {
@@ -326,7 +343,7 @@ bool DaemonServer::handle_open(MMgrOpen *m)
   if (daemon) {
     dout(20) << "updating existing DaemonState for " << m->daemon_name << dendl;
     Mutex::Locker l(daemon->lock);
-    daemon_state.get(key)->perf_counters.clear();
+    daemon->perf_counters.clear();
   }
 
   if (m->service_daemon) {
@@ -358,6 +375,15 @@ bool DaemonServer::handle_open(MMgrOpen *m)
     }
   }
 
+  if (m->get_connection()->get_peer_type() != entity_name_t::TYPE_CLIENT &&
+      m->service_name.empty())
+  {
+    // Store in the set of daemon/service connections, i.e. those
+    // connections that require an update in the event of stats
+    // configuration changes.
+    daemon_connections.insert(m->get_connection());
+  }
+
   m->put();
   return true;
 }
@@ -384,6 +410,7 @@ bool DaemonServer::handle_report(MMgrReport *m)
     return true;
   }
 
+  // Look up the DaemonState
   DaemonStatePtr daemon;
   if (daemon_state.exists(key)) {
     dout(20) << "updating existing DaemonState for " << key << dendl;
@@ -398,12 +425,26 @@ bool DaemonServer::handle_report(MMgrReport *m)
     // daemons without sessions, and ensuring that session open
     // always contains metadata.
   }
+
+  // Update the DaemonState
   assert(daemon != nullptr);
-  auto &daemon_counters = daemon->perf_counters;
   {
     Mutex::Locker l(daemon->lock);
+    auto &daemon_counters = daemon->perf_counters;
     daemon_counters.update(m);
+
+    if (daemon->service_daemon) {
+      utime_t now = ceph_clock_now();
+      if (m->daemon_status) {
+        daemon->service_status = *m->daemon_status;
+        daemon->service_status_stamp = now;
+      }
+      daemon->last_service_beacon = now;
+    } else if (m->daemon_status) {
+      derr << "got status from non-daemon " << key << dendl;
+    }
   }
+
   // if there are any schema updates, notify the python modules
   if (!m->declare_types.empty() || !m->undeclare_types.empty()) {
     ostringstream oss;
@@ -411,17 +452,6 @@ bool DaemonServer::handle_report(MMgrReport *m)
     py_modules.notify_all("perf_schema_update", oss.str());
   }
 
-  if (daemon->service_daemon) {
-    utime_t now = ceph_clock_now();
-    if (m->daemon_status) {
-      daemon->service_status = *m->daemon_status;
-      daemon->service_status_stamp = now;
-    }
-    daemon->last_service_beacon = now;
-  } else if (m->daemon_status) {
-    derr << "got status from non-daemon " << key << dendl;
-  }
-
   m->put();
   return true;
 }
@@ -496,6 +526,7 @@ bool DaemonServer::_allowed_command(
 
 bool DaemonServer::handle_command(MCommand *m)
 {
+  Mutex::Locker l(lock);
   int r = 0;
   std::stringstream ss;
   std::string prefix;
@@ -705,6 +736,19 @@ bool DaemonServer::handle_command(MCommand *m)
     return true;
   }
 
+  if (prefix == "config set") {
+    std::string key;
+    std::string val;
+    cmd_getval(cct, cmdctx->cmdmap, "key", key);
+    cmd_getval(cct, cmdctx->cmdmap, "value", val);
+    r = cct->_conf->set_val(key, val, true, &ss);
+    if (r == 0) {
+      cct->_conf->apply_changes(nullptr);
+    }
+    cmdctx->reply(0, ss);
+    return true;
+  }
+
   // -----------
   // PG commands
 
@@ -1148,7 +1192,7 @@ bool DaemonServer::handle_command(MCommand *m)
                  }
                  break;
                case OFR_BACKFILL:
-                 if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL)) == 0) {
+                 if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING)) == 0) {
                    ss << "pg " << pstr << " doesn't require backfilling; ";
                    continue;
                  } else  if (workpg.state & PG_STATE_FORCED_BACKFILL) {
@@ -1238,7 +1282,7 @@ bool DaemonServer::handle_command(MCommand *m)
   }
 
   // None of the special native commands, 
-  MgrPyModule *handler = nullptr;
+  ActivePyModule *handler = nullptr;
   auto py_commands = py_modules.get_py_commands();
   for (const auto &pyc : py_commands) {
     auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
@@ -1273,7 +1317,7 @@ bool DaemonServer::handle_command(MCommand *m)
 void DaemonServer::_prune_pending_service_map()
 {
   utime_t cutoff = ceph_clock_now();
-  cutoff -= g_conf->mgr_service_beacon_grace;
+  cutoff -= g_conf->get_val<double>("mgr_service_beacon_grace");
   auto p = pending_service_map.services.begin();
   while (p != pending_service_map.services.end()) {
     auto q = p->second.daemons.begin();
@@ -1313,7 +1357,7 @@ void DaemonServer::_prune_pending_service_map()
 void DaemonServer::send_report()
 {
   if (!pgmap_ready) {
-    if (ceph_clock_now() - started_at > g_conf->mgr_stats_period * 4.0) {
+    if (ceph_clock_now() - started_at > g_conf->get_val<int64_t>("mgr_stats_period") * 4.0) {
       pgmap_ready = true;
       reported_osds.clear();
       dout(1) << "Giving up on OSDs that haven't reported yet, sending "
@@ -1404,3 +1448,48 @@ void DaemonServer::got_service_map()
     daemon_state.cull(p.first, names);
   }
 }
+
+
+const char** DaemonServer::get_tracked_conf_keys() const
+{
+  static const char *KEYS[] = {
+    "mgr_stats_threshold",
+    "mgr_stats_period",
+    nullptr
+  };
+
+  return KEYS;
+}
+
+void DaemonServer::handle_conf_change(const struct md_config_t *conf,
+                                              const std::set <std::string> &changed)
+{
+  dout(4) << "ohai" << dendl;
+  // We may be called within lock (via MCommand `config set`) or outwith the
+  // lock (via admin socket `config set`), so handle either case.
+  const bool initially_locked = lock.is_locked_by_me();
+  if (!initially_locked) {
+    lock.Lock();
+  }
+
+  if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) {
+    dout(4) << "Updating stats threshold/period on "
+            << daemon_connections.size() << " clients" << dendl;
+    // Send a fresh MMgrConfigure to all clients, so that they can follow
+    // the new policy for transmitting stats
+    for (auto &c : daemon_connections) {
+      _send_configure(c);
+    }
+  }
+
+  if (!initially_locked) {
+    lock.Unlock();
+  }
+}
+
+void DaemonServer::_send_configure(ConnectionRef c)
+{
+  assert(lock.is_locked_by_me());
+
+  auto configure = new MMgrConfigure();
+  configure->stats_period = g_conf->get_val<int64_t>("mgr_stats_period");
+  configure->stats_threshold = g_conf->get_val<int64_t>("mgr_stats_threshold");
+  c->send_message(configure);
+}
+
index 6e44832021510462107eb2e02b7df18c884c54d1..fe809833cbd800ce51023672cb7adc0013778e55 100644 (file)
@@ -14,7 +14,7 @@
 #ifndef DAEMON_SERVER_H_
 #define DAEMON_SERVER_H_
 
-#include "PyModules.h"
+#include "PyModuleRegistry.h"
 
 #include <set>
 #include <string>
@@ -42,7 +42,7 @@ struct MonCommand;
  * Server used in ceph-mgr to communicate with Ceph daemons like
  * MDSs and OSDs.
  */
-class DaemonServer : public Dispatcher
+class DaemonServer : public Dispatcher, public md_config_obs_t
 {
 protected:
   boost::scoped_ptr<Throttle> client_byte_throttler;
@@ -59,15 +59,20 @@ protected:
   Finisher  &finisher;
   DaemonStateIndex &daemon_state;
   ClusterState &cluster_state;
-  PyModules &py_modules;
+  PyModuleRegistry &py_modules;
   LogChannelRef clog, audit_clog;
 
   AuthAuthorizeHandlerRegistry auth_registry;
 
+  // Connections for daemons, and clients with service names set
+  // (i.e. those MgrClients that are allowed to send MMgrReports)
+  std::set<ConnectionRef> daemon_connections;
+
   /// connections for osds
   ceph::unordered_map<int,set<ConnectionRef>> osd_cons;
 
   ServiceMap pending_service_map;  // uncommitted
+
   epoch_t pending_service_map_dirty = 0;
 
   Mutex lock;
@@ -90,7 +95,7 @@ private:
   void _prune_pending_service_map();
 
   utime_t started_at;
-  bool pgmap_ready = false;
+  std::atomic<bool> pgmap_ready;
   std::set<int32_t> reported_osds;
   void maybe_ready(int32_t osd_id);
 
@@ -104,7 +109,7 @@ public:
                Finisher &finisher_,
               DaemonStateIndex &daemon_state_,
               ClusterState &cluster_state_,
-              PyModules &py_modules_,
+              PyModuleRegistry &py_modules_,
               LogChannelRef cl,
               LogChannelRef auditcl);
   ~DaemonServer() override;
@@ -128,6 +133,12 @@ public:
   bool handle_command(MCommand *m);
   void send_report();
   void got_service_map();
+
+  void _send_configure(ConnectionRef c);
+
+  virtual const char** get_tracked_conf_keys() const override;
+  virtual void handle_conf_change(const struct md_config_t *conf,
+                          const std::set <std::string> &changed) override;
 };
 
 #endif
index 93fe130190eb6d72b52eb9d59b7a131f954fbf36..a7b8f572e1614b3f0a0b420f7685ccdeccb5ecaa 100644 (file)
@@ -13,6 +13,8 @@
 
 #include "DaemonState.h"
 
+#include "MgrSession.h"
+
 #define dout_context g_ceph_context
 #define dout_subsys ceph_subsys_mgr
 #undef dout_prefix
@@ -20,7 +22,7 @@
 
 void DaemonStateIndex::insert(DaemonStatePtr dm)
 {
-  Mutex::Locker l(lock);
+  RWLock::WLocker l(lock);
 
   if (all.count(dm->key)) {
     _erase(dm->key);
@@ -32,7 +34,7 @@ void DaemonStateIndex::insert(DaemonStatePtr dm)
 
 void DaemonStateIndex::_erase(const DaemonKey& dmk)
 {
-  assert(lock.is_locked_by_me());
+  assert(lock.is_wlocked());
 
   const auto to_erase = all.find(dmk);
   assert(to_erase != all.end());
@@ -49,7 +51,7 @@ void DaemonStateIndex::_erase(const DaemonKey& dmk)
 DaemonStateCollection DaemonStateIndex::get_by_service(
   const std::string& svc) const
 {
-  Mutex::Locker l(lock);
+  RWLock::RLocker l(lock);
 
   DaemonStateCollection result;
 
@@ -65,7 +67,7 @@ DaemonStateCollection DaemonStateIndex::get_by_service(
 DaemonStateCollection DaemonStateIndex::get_by_server(
   const std::string &hostname) const
 {
-  Mutex::Locker l(lock);
+  RWLock::RLocker l(lock);
 
   if (by_server.count(hostname)) {
     return by_server.at(hostname);
@@ -76,16 +78,21 @@ DaemonStateCollection DaemonStateIndex::get_by_server(
 
 bool DaemonStateIndex::exists(const DaemonKey &key) const
 {
-  Mutex::Locker l(lock);
+  RWLock::RLocker l(lock);
 
   return all.count(key) > 0;
 }
 
 DaemonStatePtr DaemonStateIndex::get(const DaemonKey &key)
 {
-  Mutex::Locker l(lock);
+  RWLock::RLocker l(lock);
 
-  return all.at(key);
+  auto iter = all.find(key);
+  if (iter != all.end()) {
+    return iter->second;
+  } else {
+    return nullptr;
+  }
 }
 
 void DaemonStateIndex::cull(const std::string& svc_name,
@@ -93,7 +100,7 @@ void DaemonStateIndex::cull(const std::string& svc_name,
 {
   std::vector<string> victims;
 
-  Mutex::Locker l(lock);
+  RWLock::WLocker l(lock);
   auto begin = all.lower_bound({svc_name, ""});
   auto end = all.end();
   for (auto &i = begin; i != end; ++i) {
@@ -118,14 +125,18 @@ void DaemonPerfCounters::update(MMgrReport *report)
           << types.size() << " types, got "
            << report->packed.length() << " bytes of data" << dendl;
 
+  // Retrieve session state
+  MgrSessionRef session(static_cast<MgrSession*>(
+        report->get_connection()->get_priv()));
+
   // Load any newly declared types
   for (const auto &t : report->declare_types) {
     types.insert(std::make_pair(t.path, t));
-    declared_types.insert(t.path);
+    session->declared_types.insert(t.path);
   }
   // Remove any old types
   for (const auto &t : report->undeclare_types) {
-    declared_types.erase(t);
+    session->declared_types.erase(t);
   }
 
   const auto now = ceph_clock_now();
@@ -133,7 +144,7 @@ void DaemonPerfCounters::update(MMgrReport *report)
   // Parse packed data according to declared set of types
   bufferlist::iterator p = report->packed.begin();
   DECODE_START(1, p);
-  for (const auto &t_path : declared_types) {
+  for (const auto &t_path : session->declared_types) {
     const auto &t = types.at(t_path);
     uint64_t val = 0;
     uint64_t avgcount = 0;
index 98853a2ec2cb628985b028891c7400997c479467..846ce5dd8d9fcbe6e6580eecb55f2c7146c44ed2 100644 (file)
@@ -20,7 +20,7 @@
 #include <set>
 #include <boost/circular_buffer.hpp>
 
-#include "common/Mutex.h"
+#include "common/RWLock.h"
 
 #include "msg/msg_types.h"
 
@@ -74,18 +74,11 @@ class DaemonPerfCounters
 
   std::map<std::string, PerfCounterInstance> instances;
 
-  // FIXME: this state is really local to DaemonServer, it's part
-  // of the protocol rather than being part of what other classes
-  // mgiht want to read.  Maybe have a separate session object
-  // inside DaemonServer instead of stashing session-ish state here?
-  std::set<std::string> declared_types;
-
   void update(MMgrReport *report);
 
   void clear()
   {
     instances.clear();
-    declared_types.clear();
   }
 };
 
@@ -133,38 +126,52 @@ typedef std::map<DaemonKey, DaemonStatePtr> DaemonStateCollection;
 class DaemonStateIndex
 {
   private:
+  mutable RWLock lock = {"DaemonStateIndex", true, true, true};
+
   std::map<std::string, DaemonStateCollection> by_server;
   DaemonStateCollection all;
-
   std::set<DaemonKey> updating;
 
-  mutable Mutex lock;
+  void _erase(const DaemonKey& dmk);
 
   public:
-
-  DaemonStateIndex() : lock("DaemonState") {}
+  DaemonStateIndex() {}
 
   // FIXME: shouldn't really be public, maybe construct DaemonState
   // objects internally to avoid this.
   PerfCounterTypes types;
 
   void insert(DaemonStatePtr dm);
-  void _erase(const DaemonKey& dmk);
-
   bool exists(const DaemonKey &key) const;
   DaemonStatePtr get(const DaemonKey &key);
+
+  // Note that these return by value rather than by reference, to avoid
+  // callers needing to hold the lock while using the result.  Callers must
+  // still take the individual DaemonState::lock on each entry, though.
   DaemonStateCollection get_by_server(const std::string &hostname) const;
   DaemonStateCollection get_by_service(const std::string &svc_name) const;
-
-  const DaemonStateCollection &get_all() const {return all;}
-  const std::map<std::string, DaemonStateCollection> &get_all_servers() const
-  {
-    return by_server;
+  DaemonStateCollection get_all() const {return all;}
+
+  template<typename Callback, typename...Args>
+  auto with_daemons_by_server(Callback&& cb, Args&&... args) const ->
+    decltype(cb(by_server, std::forward<Args>(args)...)) {
+    RWLock::RLocker l(lock);
+    
+    return std::forward<Callback>(cb)(by_server, std::forward<Args>(args)...);
   }
 
-  void notify_updating(const DaemonKey &k) { updating.insert(k); }
-  void clear_updating(const DaemonKey &k) { updating.erase(k); }
-  bool is_updating(const DaemonKey &k) { return updating.count(k) > 0; }
+  void notify_updating(const DaemonKey &k) {
+    RWLock::WLocker l(lock);
+    updating.insert(k);
+  }
+  void clear_updating(const DaemonKey &k) {
+    RWLock::WLocker l(lock);
+    updating.erase(k);
+  }
+  bool is_updating(const DaemonKey &k) {
+    RWLock::RLocker l(lock);
+    return updating.count(k) > 0;
+  }
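A caller sketch for the new accessor: the read lock is held only for the duration of the callback, and per-entry locking remains the caller's responsibility, per the comment above:

    daemon_state.with_daemons_by_server(
      [](const std::map<std::string, DaemonStateCollection> &by_server) {
        for (const auto &i : by_server) {
          // i.first is the hostname; take DaemonState::lock on each
          // entry before reading its fields.
        }
      });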
 
   /**
    * Remove state for all daemons of this type whose names are
diff --git a/ceph/src/mgr/Gil.cc b/ceph/src/mgr/Gil.cc
new file mode 100644 (file)
index 0000000..9489a31
--- /dev/null
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include "Python.h"
+
+#include "common/debug.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+#include "Gil.h"
+
+SafeThreadState::SafeThreadState(PyThreadState *ts_)
+    : ts(ts_)
+{
+  assert(ts != nullptr);
+  thread = pthread_self();
+}
+
+Gil::Gil(SafeThreadState &ts, bool new_thread) : pThreadState(ts)
+{
+  // Acquire the GIL, set the current thread state
+  PyEval_RestoreThread(pThreadState.ts);
+  dout(25) << "GIL acquired for thread state " << pThreadState.ts << dendl;
+
+  //
+  // If called from a separate OS thread (i.e. a thread not created
+  // by Python, that doesn't already have a python thread state that
+  // was created when that thread was active), we need to manually
+  // create and switch to a python thread state specifically for this
+  // OS thread.
+  //
+  // Note that instead of requiring the caller to set new_thread == true
+  // when calling this from a separate OS thread, we could figure out
+  // if this was necessary automatically, as follows:
+  //
+  //   if (pThreadState->thread_id != PyThread_get_thread_ident()) {
+  //
+  // However, this means we're accessing pThreadState->thread_id, but
+  // the Python C API docs say that "The only public data member is
+  // PyInterpreterState *interp", i.e. doing this would violate
+  // something that's meant to be a black box.
+  //
+  if (new_thread) {
+    pNewThreadState = PyThreadState_New(pThreadState.ts->interp);
+    PyThreadState_Swap(pNewThreadState);
+    dout(20) << "Switched to new thread state " << pNewThreadState << dendl;
+  } else {
+    assert(pthread_self() == pThreadState.thread);
+  }
+}
+
+Gil::~Gil()
+{
+  if (pNewThreadState != nullptr) {
+    dout(20) << "Destroying new thread state " << pNewThreadState << dendl;
+    PyThreadState_Swap(pThreadState.ts);
+    PyThreadState_Clear(pNewThreadState);
+    PyThreadState_Delete(pNewThreadState);
+  }
+  // Release the GIL, reset the thread state to NULL
+  PyEval_SaveThread();
+  dout(25) << "GIL released for thread state " << pThreadState.ts << dendl;
+}
+
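Usage sketch for the new pair of classes: SafeThreadState is initialized once, on the thread that owns the interpreter state, and Gil then provides scoped GIL acquisition (pass new_thread = true when entering from an OS thread Python has never seen):

    SafeThreadState ts(PyEval_SaveThread());  // capture state, release GIL

    {
      Gil gil(ts);        // re-acquires the GIL; asserts we are on ts.thread
      // ...call into Python...
    }                     // ~Gil releases the GIL again

    // From a foreign OS thread:
    //   Gil gil(ts, true);  // creates and swaps in a fresh thread state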
index 522d4b0e18add69f85f6d52e9a9d85a19a448548..ef9e76ac108b9b7ad35ae6b6f3485238d991dfbc 100644 (file)
  *
  */
 
-#ifndef GIL_H_
-#define GIL_H_
+#pragma once
 
-#include "Python.h"
+struct _ts;
+typedef struct _ts PyThreadState;
 
-#include "common/debug.h"
+#include <pthread.h>
 
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+/**
+ * Wrap PyThreadState to carry a record of which POSIX thread
+ * the thread state relates to.  This allows the Gil class to
+ * validate that we're being used from the right thread.
+ */
+class SafeThreadState
+{
+  public:
+  SafeThreadState(PyThreadState *ts_);
+
+  SafeThreadState()
+    : ts(nullptr), thread(0)
+  {
+  }
+
+  PyThreadState *ts;
+  pthread_t thread;
+
+  void set(PyThreadState *ts_)
+  {
+    ts = ts_;
+    thread = pthread_self();
+  }
+};
 
 //
 // Use one of these in any scope in which you need to hold Python's
 // If in doubt, explicitly put a scope around the block of code you
 // know you need the GIL in.
 //
-// See the comment below for when to set new_thread == true
+// See the comment in Gil::Gil for when to set new_thread == true
 //
 class Gil {
 public:
   Gil(const Gil&) = delete;
   Gil& operator=(const Gil&) = delete;
 
-  Gil(PyThreadState *ts, bool new_thread = false) : pThreadState(ts)
-  {
-    assert(pThreadState != nullptr);
-
-    // Acquire the GIL, set the current thread state
-    PyEval_RestoreThread(pThreadState);
-    dout(20) << "GIL acquired for thread state " << pThreadState << dendl;
-
-    //
-    // If called from a separate OS thread (i.e. a thread not created
-    // by Python, that does't already have a python thread state that
-    // was created when that thread was active), we need to manually
-    // create and switch to a python thread state specifically for this
-    // OS thread.
-    //
-    // Note that instead of requring the caller to set new_thread == true
-    // when calling this from a separate OS thread, we could figure out
-    // if this was necessary automatically, as follows:
-    //
-    //   if (pThreadState->thread_id != PyThread_get_thread_ident()) {
-    //
-    // However, this means we're accessing pThreadState->thread_id, but
-    // the Python C API docs say that "The only public data member is
-    // PyInterpreterState *interp", i.e. doing this would violate
-    // something that's meant to be a black box.
-    //
-    if (new_thread) {
-      pNewThreadState = PyThreadState_New(pThreadState->interp);
-      PyThreadState_Swap(pNewThreadState);
-      dout(20) << "Switched to new thread state " << pNewThreadState << dendl;
-    }
-  }
-
-  ~Gil()
-  {
-    if (pNewThreadState != nullptr) {
-      dout(20) << "Destroying new thread state " << pNewThreadState << dendl;
-      PyThreadState_Swap(pThreadState);
-      PyThreadState_Clear(pNewThreadState);
-      PyThreadState_Delete(pNewThreadState);
-    }
-    // Release the GIL, reset the thread state to NULL
-    PyEval_SaveThread();
-    dout(20) << "GIL released for thread state " << pThreadState << dendl;
-  }
+  Gil(SafeThreadState &ts, bool new_thread = false);
+  ~Gil();
 
 private:
-  PyThreadState *pThreadState;
+  SafeThreadState &pThreadState;
   PyThreadState *pNewThreadState = nullptr;
 };
 
-#endif
-
index 092b71fdb9fac8a872beac43d87167d996fed989..c17d0418786454d883437e690bd01b14895a7037 100644 (file)
@@ -24,7 +24,7 @@
 #include "mgr/MgrContext.h"
 #include "mgr/mgr_commands.h"
 
-#include "MgrPyModule.h"
+//#include "MgrPyModule.h"
 #include "DaemonServer.h"
 #include "messages/MMgrBeacon.h"
 #include "messages/MMgrDigest.h"
@@ -42,6 +42,7 @@
 
 
 Mgr::Mgr(MonClient *monc_, const MgrMap& mgrmap,
+         PyModuleRegistry *py_module_registry_,
         Messenger *clientm_, Objecter *objecter_,
         Client* client_, LogChannelRef clog_, LogChannelRef audit_clog_) :
   monc(monc_),
@@ -52,11 +53,12 @@ Mgr::Mgr(MonClient *monc_, const MgrMap& mgrmap,
   timer(g_ceph_context, lock),
   finisher(g_ceph_context, "Mgr", "mgr-fin"),
   digest_received(false),
-  py_modules(daemon_state, cluster_state, *monc, clog_, *objecter, *client,
-             finisher),
+  py_module_registry(py_module_registry_),
   cluster_state(monc, nullptr, mgrmap),
-  server(monc, finisher, daemon_state, cluster_state, py_modules,
+  server(monc, finisher, daemon_state, cluster_state, *py_module_registry,
          clog_, audit_clog_),
+  clog(clog_),
+  audit_clog(audit_clog_),
   initialized(false),
   initializing(false)
 {
@@ -220,7 +222,7 @@ void Mgr::init()
   // Preload config keys (`get` for plugins is to be a fast local
   // operation, so we don't have to synchronize these later because
   // all sets will come via mgr)
-  load_config();
+  auto loaded_config = load_config();
 
   // Wait for MgrDigest...
   dout(4) << "waiting for MgrDigest..." << dendl;
@@ -229,9 +231,9 @@ void Mgr::init()
   }
 
   // assume finisher already initialized in background_init
-  dout(4) << "starting PyModules..." << dendl;
-  py_modules.init();
-  py_modules.start();
+  dout(4) << "starting python modules..." << dendl;
+  py_module_registry->active_start(loaded_config, daemon_state, cluster_state, *monc,
+      clog, *objecter, *client, finisher);
 
   dout(4) << "Complete." << dendl;
   initializing = false;
@@ -327,7 +329,7 @@ void Mgr::load_all_metadata()
   }
 }
 
-void Mgr::load_config()
+std::map<std::string, std::string> Mgr::load_config()
 {
   assert(lock.is_locked_by_me());
 
@@ -345,7 +347,7 @@ void Mgr::load_config()
     std::string const key = key_str.get_str();
     dout(20) << "saw key '" << key << "'" << dendl;
 
-    const std::string config_prefix = PyModules::config_prefix;
+    const std::string config_prefix = PyModuleRegistry::config_prefix;
 
     if (key.substr(0, config_prefix.size()) == config_prefix) {
       dout(20) << "fetching '" << key << "'" << dendl;
@@ -361,7 +363,7 @@ void Mgr::load_config()
     }
   }
 
-  py_modules.insert_config(loaded);
+  return loaded;
 }
 
 void Mgr::shutdown()
@@ -377,7 +379,7 @@ void Mgr::shutdown()
       server.shutdown();
     }
     // after the messenger is stopped, signal modules to shutdown via finisher
-    py_modules.shutdown();
+    py_module_registry->active_shutdown();
   }));
 
   // Then stop the finisher to ensure its enqueued contexts aren't going
@@ -460,7 +462,7 @@ void Mgr::handle_osd_map()
 void Mgr::handle_log(MLog *m)
 {
   for (const auto &e : m->entries) {
-    py_modules.notify_all(e);
+    py_module_registry->notify_all(e);
   }
 
   m->put();
@@ -483,18 +485,18 @@ bool Mgr::ms_dispatch(Message *m)
       handle_mgr_digest(static_cast<MMgrDigest*>(m));
       break;
     case CEPH_MSG_MON_MAP:
-      py_modules.notify_all("mon_map", "");
+      py_module_registry->notify_all("mon_map", "");
       m->put();
       break;
     case CEPH_MSG_FS_MAP:
-      py_modules.notify_all("fs_map", "");
+      py_module_registry->notify_all("fs_map", "");
       handle_fs_map((MFSMap*)m);
       return false; // I shall let this pass through for Client
       break;
     case CEPH_MSG_OSD_MAP:
       handle_osd_map();
 
-      py_modules.notify_all("osd_map", "");
+      py_module_registry->notify_all("osd_map", "");
 
       // Continuous subscribe, so that we can generate notifications
       // for our MgrPyModules
@@ -503,7 +505,7 @@ bool Mgr::ms_dispatch(Message *m)
       break;
     case MSG_SERVICE_MAP:
       handle_service_map((MServiceMap*)m);
-      py_modules.notify_all("service_map", "");
+      py_module_registry->notify_all("service_map", "");
       m->put();
       break;
     case MSG_LOG:
@@ -614,12 +616,12 @@ void Mgr::handle_mgr_digest(MMgrDigest* m)
   dout(10) << m->mon_status_json.length() << dendl;
   dout(10) << m->health_json.length() << dendl;
   cluster_state.load_digest(m);
-  py_modules.notify_all("mon_status", "");
-  py_modules.notify_all("health", "");
+  py_module_registry->notify_all("mon_status", "");
+  py_module_registry->notify_all("health", "");
 
   // Hack: use this as a tick/opportunity to prompt python-land that
   // the pgmap might have changed since last time we were here.
-  py_modules.notify_all("pg_summary", "");
+  py_module_registry->notify_all("pg_summary", "");
   dout(10) << "done." << dendl;
 
   m->put();
@@ -641,8 +643,15 @@ std::vector<MonCommand> Mgr::get_command_set() const
   Mutex::Locker l(lock);
 
   std::vector<MonCommand> commands = mgr_commands;
-  std::vector<MonCommand> py_commands = py_modules.get_commands();
+  std::vector<MonCommand> py_commands = py_module_registry->get_commands();
   commands.insert(commands.end(), py_commands.begin(), py_commands.end());
   return commands;
 }
 
+std::map<std::string, std::string> Mgr::get_services() const
+{
+  Mutex::Locker l(lock);
+
+  return py_module_registry->get_services();
+}
+
index 68f2b40b4616d62fdfef22f7f7fad093234586c1..9a6b3974b2915ad20930c7688f05064618129eee 100644 (file)
@@ -32,7 +32,7 @@
 #include "mon/MgrMap.h"
 
 #include "DaemonServer.h"
-#include "PyModules.h"
+#include "PyModuleRegistry.h"
 
 #include "DaemonState.h"
 #include "ClusterState.h"
@@ -44,8 +44,6 @@ class MServiceMap;
 class Objecter;
 class Client;
 
-class MgrPyModule;
-
 class Mgr {
 protected:
   MonClient *monc;
@@ -62,13 +60,16 @@ protected:
   bool digest_received;
   Cond digest_cond;
 
-  PyModules py_modules;
+  PyModuleRegistry *py_module_registry;
   DaemonStateIndex daemon_state;
   ClusterState cluster_state;
 
   DaemonServer server;
 
-  void load_config();
+  LogChannelRef clog;
+  LogChannelRef audit_clog;
+
+  PyModuleConfig load_config();
   void load_all_metadata();
   void init();
 
@@ -77,6 +78,7 @@ protected:
 
 public:
   Mgr(MonClient *monc_, const MgrMap& mgrmap,
+      PyModuleRegistry *py_module_registry_,
       Messenger *clientm_, Objecter *objecter_,
       Client *client_, LogChannelRef clog_, LogChannelRef audit_clog_);
   ~Mgr();
@@ -100,6 +102,7 @@ public:
   void shutdown();
 
   std::vector<MonCommand> get_command_set() const;
+  std::map<std::string, std::string> get_services() const;
 };
 
 #endif
index 849590ba93c7974c5e8630504c0b80ead62917de..c72470d9bca5a93644bcc2d536853bf8ae9cd4f0 100644 (file)
@@ -111,14 +111,15 @@ void MgrClient::reconnect()
   if (last_connect_attempt != utime_t()) {
     utime_t now = ceph_clock_now();
     utime_t when = last_connect_attempt;
-    when += cct->_conf->mgr_connect_retry_interval;
+    when += cct->_conf->get_val<double>("mgr_connect_retry_interval");
     if (now < when) {
       if (!connect_retry_callback) {
-       connect_retry_callback = new FunctionContext([this](int r){
-           connect_retry_callback = nullptr;
-           reconnect();
-         });
-       timer.add_event_at(when, connect_retry_callback);
+       connect_retry_callback = timer.add_event_at(
+         when,
+         new FunctionContext([this](int r){
+             connect_retry_callback = nullptr;
+             reconnect();
+           }));
       }
       ldout(cct, 4) << "waiting to retry connect until " << when << dendl;
       return;
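The reworked branch uses the SafeTimer overload of add_event_at() that returns the registered Context, so scheduling the retry and capturing a cancellable handle happen in one expression. A sketch of why the handle is kept (the cancel_event() call is illustrative, not taken from this call site):

    Context *cb = timer.add_event_at(when,
        new FunctionContext([](int) { /* retry */ }));
    // later, if the event must not fire after all:
    timer.cancel_event(cb);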
@@ -227,22 +228,48 @@ void MgrClient::send_report()
   pcc->with_counters([this, report](
         const PerfCountersCollection::CounterMap &by_path)
   {
+    // Helper for checking whether a counter should be included
+    auto include_counter = [this](
+        const PerfCounters::perf_counter_data_any_d &ctr,
+        const PerfCounters &perf_counters)
+    {
+      return perf_counters.get_adjusted_priority(ctr.prio) >= (int)stats_threshold;
+    };
+
+    // Helper for cases where we want to forget a counter
+    auto undeclare = [report, this](const std::string &path)
+    {
+      report->undeclare_types.push_back(path);
+      ldout(cct,20) << " undeclare " << path << dendl;
+      session->declared.erase(path);
+    };
+
     ENCODE_START(1, 1, report->packed);
+
+    // Find counters that no longer exist, and undeclare them
     for (auto p = session->declared.begin(); p != session->declared.end(); ) {
-      if (by_path.count(*p) == 0) {
-       report->undeclare_types.push_back(*p);
-       ldout(cct,20) << __func__ << " undeclare " << *p << dendl;
-       p = session->declared.erase(p);
-      } else {
-       ++p;
+      const auto &path = *(p++);
+      if (by_path.count(path) == 0) {
+        undeclare(path);
       }
     }
+
     for (const auto &i : by_path) {
       auto& path = i.first;
-      auto& data = *(i.second);
+      auto& data = *(i.second.data);
+      auto& perf_counters = *(i.second.perf_counters);
+
+      // Find counters that still exist, but are no longer permitted by
+      // stats_threshold
+      if (!include_counter(data, perf_counters)) {
+        if (session->declared.count(path)) {
+          undeclare(path);
+        }
+        continue;
+      }
 
       if (session->declared.count(path) == 0) {
-       ldout(cct,20) << __func__ << " declare " << path << dendl;
+       ldout(cct,20) << " declare " << path << dendl;
        PerfCounterType type;
        type.path = path;
        if (data.description) {
@@ -252,6 +279,7 @@ void MgrClient::send_report()
          type.nick = data.nick;
        }
        type.type = data.type;
+        type.priority = perf_counters.get_adjusted_priority(data.prio);
        report->declare_types.push_back(std::move(type));
        session->declared.insert(path);
       }
@@ -264,8 +292,11 @@ void MgrClient::send_report()
     }
     ENCODE_FINISH(report->packed);
 
-    ldout(cct, 20) << by_path.size() << " counters, of which "
-                  << report->declare_types.size() << " new" << dendl;
+    ldout(cct, 20) << "sending " << session->declared.size() << " counters ("
+                      "of possible " << by_path.size() << "), "
+                  << report->declare_types.size() << " new, "
+                   << report->undeclare_types.size() << " removed"
+                   << dendl;
   });
 
   ldout(cct, 20) << "encoded " << report->packed.length() << " bytes" << dendl;
@@ -313,6 +344,11 @@ bool MgrClient::handle_mgr_configure(MMgrConfigure *m)
 
   ldout(cct, 4) << "stats_period=" << m->stats_period << dendl;
 
+  if (stats_threshold != m->stats_threshold) {
+    ldout(cct, 4) << "updated stats threshold: " << m->stats_threshold << dendl;
+    stats_threshold = m->stats_threshold;
+  }
+
   bool starting = (stats_period == 0) && (m->stats_period != 0);
   stats_period = m->stats_period;
   if (starting) {
index 09fe831b3948364988c6c15a8d678eded1e2cf7e..08ff24c23849428219570c0446a88813f927b8a2 100644 (file)
@@ -59,6 +59,7 @@ protected:
   Mutex lock = {"MgrClient::lock"};
 
   uint32_t stats_period = 0;
+  uint32_t stats_threshold = 0;
   SafeTimer timer;
 
   CommandTable<MgrCommand> command_table;
index 1818454e1fcc47df9a5ded7323c5a9c59d5cd55d..79766fafed93acdc1a36b6edb710ece131a3526a 100644 (file)
@@ -131,3 +131,8 @@ COMMAND("service dump",
         "dump service map", "service", "r", "cli,rest")
 COMMAND("service status",
         "dump service state", "service", "r", "cli,rest")
+
+COMMAND("config set " \
+       "name=key,type=CephString name=value,type=CephString",
+       "Set a configuration option at runtime (not persistent)",
+       "mgr", "rw", "cli,rest")
diff --git a/ceph/src/mgr/MgrPyModule.cc b/ceph/src/mgr/MgrPyModule.cc
deleted file mode 100644 (file)
index a2bf73c..0000000
+++ /dev/null
@@ -1,371 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 John Spray <john.spray@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-#include "PyState.h"
-#include "Gil.h"
-
-#include "PyFormatter.h"
-
-#include "common/debug.h"
-
-#include "MgrPyModule.h"
-
-//XXX courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text
-#include <boost/python.hpp>
-#include "include/assert.h"  // boost clobbers this
-
-// decode a Python exception into a string
-std::string handle_pyerror()
-{
-    using namespace boost::python;
-    using namespace boost;
-
-    PyObject *exc, *val, *tb;
-    object formatted_list, formatted;
-    PyErr_Fetch(&exc, &val, &tb);
-    handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb));
-    object traceback(import("traceback"));
-    if (!tb) {
-        object format_exception_only(traceback.attr("format_exception_only"));
-        formatted_list = format_exception_only(hexc, hval);
-    } else {
-        object format_exception(traceback.attr("format_exception"));
-        formatted_list = format_exception(hexc,hval, htb);
-    }
-    formatted = str("").join(formatted_list);
-    return extract<std::string>(formatted);
-}
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr[py] "
-
-namespace {
-  PyObject* log_write(PyObject*, PyObject* args) {
-    char* m = nullptr;
-    if (PyArg_ParseTuple(args, "s", &m)) {
-      auto len = strlen(m);
-      if (len && m[len-1] == '\n') {
-       m[len-1] = '\0';
-      }
-      dout(4) << m << dendl;
-    }
-    Py_RETURN_NONE;
-  }
-
-  PyObject* log_flush(PyObject*, PyObject*){
-    Py_RETURN_NONE;
-  }
-
-  static PyMethodDef log_methods[] = {
-    {"write", log_write, METH_VARARGS, "write stdout and stderr"},
-    {"flush", log_flush, METH_VARARGS, "flush"},
-    {nullptr, nullptr, 0, nullptr}
-  };
-}
-
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
-
-MgrPyModule::MgrPyModule(const std::string &module_name_, const std::string &sys_path, PyThreadState *main_ts_)
-  : module_name(module_name_),
-    pClassInstance(nullptr),
-    pMainThreadState(main_ts_)
-{
-  assert(pMainThreadState != nullptr);
-
-  Gil gil(pMainThreadState);
-
-  pMyThreadState = Py_NewInterpreter();
-  if (pMyThreadState == nullptr) {
-    derr << "Failed to create python sub-interpreter for '" << module_name << '"' << dendl;
-  } else {
-    // Some python modules do not cope with an unpopulated argv, so lets
-    // fake one.  This step also picks up site-packages into sys.path.
-    const char *argv[] = {"ceph-mgr"};
-    PySys_SetArgv(1, (char**)argv);
-
-    if (g_conf->daemonize) {
-      auto py_logger = Py_InitModule("ceph_logger", log_methods);
-#if PY_MAJOR_VERSION >= 3
-      PySys_SetObject("stderr", py_logger);
-      PySys_SetObject("stdout", py_logger);
-#else
-      PySys_SetObject(const_cast<char*>("stderr"), py_logger);
-      PySys_SetObject(const_cast<char*>("stdout"), py_logger);
-#endif
-    }
-    // Populate python namespace with callable hooks
-    Py_InitModule("ceph_state", CephStateMethods);
-
-    PySys_SetPath(const_cast<char*>(sys_path.c_str()));
-  }
-}
-
-MgrPyModule::~MgrPyModule()
-{
-  if (pMyThreadState != nullptr) {
-    Gil gil(pMyThreadState);
-
-    Py_XDECREF(pClassInstance);
-
-    //
-    // Ideally, now, we'd be able to do this:
-    //
-    //    Py_EndInterpreter(pMyThreadState);
-    //    PyThreadState_Swap(pMainThreadState);
-    //
-    // Unfortunately, if the module has any other *python* threads active
-    // at this point, Py_EndInterpreter() will abort with:
-    //
-    //    Fatal Python error: Py_EndInterpreter: not the last thread
-    //
-    // This can happen when using CherryPy in a module, becuase CherryPy
-    // runs an extra thread as a timeout monitor, which spends most of its
-    // life inside a time.sleep(60).  Unless you are very, very lucky with
-    // the timing calling this destructor, that thread will still be stuck
-    // in a sleep, and Py_EndInterpreter() will abort.
-    //
-    // This could of course also happen with a poorly written module which
-    // made no attempt to clean up any additional threads it created.
-    //
-    // The safest thing to do is just not call Py_EndInterpreter(), and
-    // let Py_Finalize() kill everything after all modules are shut down.
-    //
-  }
-}
-
-int MgrPyModule::load()
-{
-  if (pMyThreadState == nullptr) {
-    derr << "No python sub-interpreter exists for module '" << module_name << "'" << dendl;
-    return -EINVAL;
-  }
-
-  Gil gil(pMyThreadState);
-
-  // Load the module
-  PyObject *pName = PyString_FromString(module_name.c_str());
-  auto pModule = PyImport_Import(pName);
-  Py_DECREF(pName);
-  if (pModule == nullptr) {
-    derr << "Module not found: '" << module_name << "'" << dendl;
-    derr << handle_pyerror() << dendl;
-    return -ENOENT;
-  }
-
-  // Find the class
-  // TODO: let them call it what they want instead of just 'Module'
-  auto pClass = PyObject_GetAttrString(pModule, (const char*)"Module");
-  Py_DECREF(pModule);
-  if (pClass == nullptr) {
-    derr << "Class not found in module '" << module_name << "'" << dendl;
-    derr << handle_pyerror() << dendl;
-    return -EINVAL;
-  }
-
-  // Just using the module name as the handle, replace with a
-  // uuidish thing if needed
-  auto pyHandle = PyString_FromString(module_name.c_str());
-  auto pArgs = PyTuple_Pack(1, pyHandle);
-  pClassInstance = PyObject_CallObject(pClass, pArgs);
-  Py_DECREF(pClass);
-  Py_DECREF(pyHandle);
-  Py_DECREF(pArgs);
-  if (pClassInstance == nullptr) {
-    derr << "Failed to construct class in '" << module_name << "'" << dendl;
-    derr << handle_pyerror() << dendl;
-    return -EINVAL;
-  } else {
-    dout(1) << "Constructed class from module: " << module_name << dendl;
-  }
-
-  return load_commands();
-}
-
-int MgrPyModule::serve()
-{
-  assert(pClassInstance != nullptr);
-
-  // This method is called from a separate OS thread (i.e. a thread not
-  // created by Python), so tell Gil to wrap this in a new thread state.
-  Gil gil(pMyThreadState, true);
-
-  auto pValue = PyObject_CallMethod(pClassInstance,
-      const_cast<char*>("serve"), nullptr);
-
-  int r = 0;
-  if (pValue != NULL) {
-    Py_DECREF(pValue);
-  } else {
-    derr << module_name << ".serve:" << dendl;
-    derr << handle_pyerror() << dendl;
-    return -EINVAL;
-  }
-
-  return r;
-}
-
-// FIXME: DRY wrt serve
-void MgrPyModule::shutdown()
-{
-  assert(pClassInstance != nullptr);
-
-  Gil gil(pMyThreadState);
-
-  auto pValue = PyObject_CallMethod(pClassInstance,
-      const_cast<char*>("shutdown"), nullptr);
-
-  if (pValue != NULL) {
-    Py_DECREF(pValue);
-  } else {
-    derr << "Failed to invoke shutdown() on " << module_name << dendl;
-    derr << handle_pyerror() << dendl;
-  }
-}
-
-void MgrPyModule::notify(const std::string &notify_type, const std::string &notify_id)
-{
-  assert(pClassInstance != nullptr);
-
-  Gil gil(pMyThreadState);
-
-  // Execute
-  auto pValue = PyObject_CallMethod(pClassInstance,
-       const_cast<char*>("notify"), const_cast<char*>("(ss)"),
-       notify_type.c_str(), notify_id.c_str());
-
-  if (pValue != NULL) {
-    Py_DECREF(pValue);
-  } else {
-    derr << module_name << ".notify:" << dendl;
-    derr << handle_pyerror() << dendl;
-    // FIXME: callers can't be expected to handle a python module
-    // that has spontaneously broken, but Mgr() should provide
-    // a hook to unload misbehaving modules when they have an
-    // error somewhere like this
-  }
-}
-
-void MgrPyModule::notify_clog(const LogEntry &log_entry)
-{
-  assert(pClassInstance != nullptr);
-
-  Gil gil(pMyThreadState);
-
-  // Construct python-ized LogEntry
-  PyFormatter f;
-  log_entry.dump(&f);
-  auto py_log_entry = f.get();
-
-  // Execute
-  auto pValue = PyObject_CallMethod(pClassInstance,
-       const_cast<char*>("notify"), const_cast<char*>("(sN)"),
-       "clog", py_log_entry);
-
-  if (pValue != NULL) {
-    Py_DECREF(pValue);
-  } else {
-    derr << module_name << ".notify_clog:" << dendl;
-    derr << handle_pyerror() << dendl;
-    // FIXME: callers can't be expected to handle a python module
-    // that has spontaneously broken, but Mgr() should provide
-    // a hook to unload misbehaving modules when they have an
-    // error somewhere like this
-  }
-}
-
-int MgrPyModule::load_commands()
-{
-  // Don't need a Gil here -- this is called from MgrPyModule::load(),
-  // which already has one.
-  PyObject *command_list = PyObject_GetAttrString(pClassInstance, "COMMANDS");
-  assert(command_list != nullptr);
-  const size_t list_size = PyList_Size(command_list);
-  for (size_t i = 0; i < list_size; ++i) {
-    PyObject *command = PyList_GetItem(command_list, i);
-    assert(command != nullptr);
-
-    ModuleCommand item;
-
-    PyObject *pCmd = PyDict_GetItemString(command, "cmd");
-    assert(pCmd != nullptr);
-    item.cmdstring = PyString_AsString(pCmd);
-
-    dout(20) << "loaded command " << item.cmdstring << dendl;
-
-    PyObject *pDesc = PyDict_GetItemString(command, "desc");
-    assert(pDesc != nullptr);
-    item.helpstring = PyString_AsString(pDesc);
-
-    PyObject *pPerm = PyDict_GetItemString(command, "perm");
-    assert(pPerm != nullptr);
-    item.perm = PyString_AsString(pPerm);
-
-    item.handler = this;
-
-    commands.push_back(item);
-  }
-  Py_DECREF(command_list);
-
-  dout(10) << "loaded " << commands.size() << " commands" << dendl;
-
-  return 0;
-}
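
A note on the reference counting that load_commands() relies on: in the CPython C API, PyObject_GetAttrString returns a new reference that the caller must release, while PyDict_GetItemString and PyList_GetItem return borrowed references that must be left alone. That is why command_list gets a Py_DECREF above but pCmd/pDesc/pPerm do not. A minimal sketch (obj and dict are stand-ins for arbitrary live objects):

    #include <Python.h>

    static void refcount_sketch(PyObject *obj, PyObject *dict)
    {
      PyObject *attr = PyObject_GetAttrString(obj, "COMMANDS"); // new reference
      // ... use attr ...
      Py_XDECREF(attr);                                         // caller releases it

      PyObject *item = PyDict_GetItemString(dict, "cmd");       // borrowed reference
      (void)item;  // use it, but never Py_DECREF: the dict still owns it
    }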
-
-int MgrPyModule::handle_command(
-  const cmdmap_t &cmdmap,
-  std::stringstream *ds,
-  std::stringstream *ss)
-{
-  assert(ss != nullptr);
-  assert(ds != nullptr);
-
-  Gil gil(pMyThreadState);
-
-  PyFormatter f;
-  cmdmap_dump(cmdmap, &f);
-  PyObject *py_cmd = f.get();
-
-  auto pResult = PyObject_CallMethod(pClassInstance,
-      const_cast<char*>("handle_command"), const_cast<char*>("(O)"), py_cmd);
-
-  Py_DECREF(py_cmd);
-
-  int r = 0;
-  if (pResult != NULL) {
-    if (PyTuple_Size(pResult) != 3) {
-      r = -EINVAL;
-    } else {
-      r = PyInt_AsLong(PyTuple_GetItem(pResult, 0));
-      *ds << PyString_AsString(PyTuple_GetItem(pResult, 1));
-      *ss << PyString_AsString(PyTuple_GetItem(pResult, 2));
-    }
-
-    Py_DECREF(pResult);
-  } else {
-    *ds << "";
-    *ss << handle_pyerror();
-    r = -EINVAL;
-  }
-
-  return r;
-}
-
-void MgrPyModule::get_health_checks(health_check_map_t *checks)
-{
-  checks->merge(health_checks);
-}
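
The Gil guard used throughout this file is defined in src/mgr/Gil.h, which is not part of this hunk. As a rough sketch of the RAII pattern it implements, inferred from how it is used above rather than copied from the real class:

    #include <Python.h>
    #include <cassert>

    // Hypothetical guard: acquire the GIL and install the module's
    // sub-interpreter thread state for the duration of the scope.
    class GilSketch {
    public:
      explicit GilSketch(PyThreadState *ts) {
        assert(ts != nullptr);
        PyEval_RestoreThread(ts);  // take the GIL, make ts current
      }
      ~GilSketch() {
        PyEval_SaveThread();       // drop the GIL, thread state becomes NULL
      }
    };

The two-argument form Gil(pMyThreadState, true) used in serve() additionally wraps the call in a fresh PyThreadState, because serve() runs on an OS thread that CPython has never seen.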
diff --git a/ceph/src/mgr/MgrPyModule.h b/ceph/src/mgr/MgrPyModule.h
deleted file mode 100644 (file)
index 6eea29e..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 John Spray <john.spray@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-
-#ifndef MGR_PY_MODULE_H_
-#define MGR_PY_MODULE_H_
-
-// Python.h comes first because otherwise it clobbers ceph's assert
-#include "Python.h"
-
-#include "common/cmdparse.h"
-#include "common/LogEntry.h"
-#include "common/Mutex.h"
-#include "mon/health_check.h"
-
-#include <vector>
-#include <string>
-
-
-class MgrPyModule;
-
-/**
- * A Ceph CLI command description provided from a Python module
- */
-class ModuleCommand {
-public:
-  std::string cmdstring;
-  std::string helpstring;
-  std::string perm;
-  MgrPyModule *handler;
-};
-
-class MgrPyModule
-{
-private:
-  const std::string module_name;
-  PyObject *pClassInstance;
-  PyThreadState *pMainThreadState;
-  PyThreadState *pMyThreadState = nullptr;
-
-  health_check_map_t health_checks;
-
-  std::vector<ModuleCommand> commands;
-
-  int load_commands();
-
-public:
-  MgrPyModule(const std::string &module_name, const std::string &sys_path, PyThreadState *main_ts);
-  ~MgrPyModule();
-
-  int load();
-  int serve();
-  void shutdown();
-  void notify(const std::string &notify_type, const std::string &notify_id);
-  void notify_clog(const LogEntry &le);
-
-  const std::vector<ModuleCommand> &get_commands() const
-  {
-    return commands;
-  }
-
-  const std::string &get_name() const
-  {
-    return module_name;
-  }
-
-  int handle_command(
-    const cmdmap_t &cmdmap,
-    std::stringstream *ds,
-    std::stringstream *ss);
-
-  void set_health_checks(health_check_map_t&& c) {
-    health_checks = std::move(c);
-  }
-  void get_health_checks(health_check_map_t *checks);
-};
-
-std::string handle_pyerror();
-
-#endif
-
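For context, a sketch of how a registered ModuleCommand is presumably dispatched (the real routing lives outside this diff; the helper name is invented, and the ModuleCommand/cmdmap_t types from the header above are assumed in scope):

    #include <sstream>

    int dispatch_module_command(const ModuleCommand &cmd, const cmdmap_t &cmdmap)
    {
      std::stringstream ds;  // data output: the command's payload
      std::stringstream ss;  // status output: human-readable message
      int r = cmd.handler->handle_command(cmdmap, &ds, &ss);
      // r, ds and ss mirror the (retval, stdout, stderr) 3-tuple that the
      // Python module's handle_command() returns.
      return r;
    }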
index 72afd7a676728f0066c65ebd5b71413ba52e612e..c52e2e177761ac979eeea3791d9c69aa05c7f764 100644 (file)
@@ -23,6 +23,8 @@ struct MgrSession : public RefCountedObject {
   // mon caps are suitably generic for mgr
   MonCap caps;
 
+  std::set<std::string> declared_types;
+
   MgrSession(CephContext *cct) : RefCountedObject(cct, 0) {}
   ~MgrSession() override {}
 };
index 99df69c909fdea44512973aabe69e3e45990d525..171add02f8761106d2e972d391ac9288705ca171 100644 (file)
@@ -46,6 +46,7 @@ MgrStandby::MgrStandby(int argc, const char **argv) :
   audit_clog(log_client.create_channel(CLOG_CHANNEL_AUDIT)),
   lock("MgrStandby::lock"),
   timer(g_ceph_context, lock),
+  py_module_registry(clog),
   active_mgr(nullptr),
   orig_argc(argc),
   orig_argv(argv),
@@ -151,15 +152,14 @@ void MgrStandby::send_beacon()
   dout(1) << state_str() << dendl;
 
   set<string> modules;
-  PyModules::list_modules(&modules);
+  PyModuleRegistry::list_modules(&modules);
 
   // Whether I think I am available (request MgrMonitor to set me
   // as available in the map)
   bool available = active_mgr != nullptr && active_mgr->is_initialized();
 
   auto addr = available ? active_mgr->get_server_addr() : entity_addr_t();
-  dout(10) << "sending beacon as gid " << monc.get_global_id()
-          << " modules " << modules << dendl;
+  dout(10) << "sending beacon as gid " << monc.get_global_id() << dendl;
 
   map<string,string> metadata;
   collect_sys_info(&metadata, g_ceph_context);
@@ -172,13 +172,17 @@ void MgrStandby::send_beacon()
                                 modules,
                                 std::move(metadata));
 
-  if (available && !available_in_map) {
-    // We are informing the mon that we are done initializing: inform
-    // it of our command set.  This has to happen after init() because
-    // it needs the python modules to have loaded.
-    m->set_command_descs(active_mgr->get_command_set());
-    dout(4) << "going active, including " << m->get_command_descs().size()
-            << " commands in beacon" << dendl;
+  if (available) {
+    if (!available_in_map) {
+      // We are informing the mon that we are done initializing: inform
+      // it of our command set.  This has to happen after init() because
+      // it needs the python modules to have loaded.
+      m->set_command_descs(active_mgr->get_command_set());
+      dout(4) << "going active, including " << m->get_command_descs().size()
+              << " commands in beacon" << dendl;
+    }
+
+    m->set_services(active_mgr->get_services());
   }
                                  
   monc.send_mon_message(m);
@@ -189,14 +193,14 @@ void MgrStandby::tick()
   dout(10) << __func__ << dendl;
   send_beacon();
 
-  if (active_mgr) {
+  if (active_mgr && active_mgr->is_initialized()) {
     active_mgr->tick();
   }
 
-  timer.add_event_after(g_conf->mgr_tick_period, new FunctionContext(
-        [this](int r){
+  timer.add_event_after(g_conf->get_val<int64_t>("mgr_tick_period"),
+      new FunctionContext([this](int r){
           tick();
-        }
+      }
   )); 
 }
 
@@ -213,6 +217,8 @@ void MgrStandby::shutdown()
   // Expect already to be locked as we're called from signal handler
   assert(lock.is_locked_by_me());
 
+  dout(4) << "Shutting down" << dendl;
+
   // stop sending beacon first, i use monc to talk with monitors
   timer.shutdown();
   // client uses monc and objecter
@@ -223,6 +229,9 @@ void MgrStandby::shutdown()
   if (active_mgr) {
     active_mgr->shutdown();
   }
+
+  py_module_registry.shutdown();
+
   // objecter is used by monc and active_mgr
   objecter.shutdown();
   // client_messenger is used by all of them, so stop it in the end
@@ -302,10 +311,24 @@ void MgrStandby::handle_mgr_map(MMgrMap* mmap)
   const bool active_in_map = map.active_gid == monc.get_global_id();
   dout(4) << "active in map: " << active_in_map
           << " active is " << map.active_gid << dendl;
+
+  if (!py_module_registry.is_initialized()) {
+    int r = py_module_registry.init(map);
+
+    // FIXME: error handling
+    assert(r == 0);
+  } else {
+    bool need_respawn = py_module_registry.handle_mgr_map(map);
+    if (need_respawn) {
+      respawn();
+    }
+  }
+
   if (active_in_map) {
     if (!active_mgr) {
       dout(1) << "Activating!" << dendl;
-      active_mgr.reset(new Mgr(&monc, map, client_messenger.get(), &objecter,
+      active_mgr.reset(new Mgr(&monc, map, &py_module_registry,
+                               client_messenger.get(), &objecter,
                               &client, clog, audit_clog));
       active_mgr->background_init(new FunctionContext(
             [this](int r){
@@ -327,10 +350,16 @@ void MgrStandby::handle_mgr_map(MMgrMap* mmap)
       dout(4) << "Map now says I am available" << dendl;
       available_in_map = true;
     }
+  } else if (active_mgr != nullptr) {
+    derr << "I was active but no longer am" << dendl;
+    respawn();
   } else {
-    if (active_mgr != nullptr) {
-      derr << "I was active but no longer am" << dendl;
-      respawn();
+    if (map.active_gid != 0 && map.active_name != g_conf->name.get_id()) {
+      // I am the standby and someone else is active, start modules
+      // in standby mode to do redirects if needed
+      if (!py_module_registry.is_standby_running()) {
+        py_module_registry.standby_start(&monc);
+      }
     }
   }
 
@@ -412,6 +441,12 @@ int MgrStandby::main(vector<const char *> args)
 
 std::string MgrStandby::state_str()
 {
-  return active_mgr == nullptr ? "standby" : "active";
+  if (active_mgr == nullptr) {
+    return "standby";
+  } else if (active_mgr->is_initialized()) {
+    return "active";
+  } else {
+    return "active (starting)";
+  }
 }
 
index e24f175cada4090845e42157e26d7313f88fa304..a64fd7e99958c4f164dd9fe3aee6e8d6fc30fb46 100644 (file)
@@ -23,6 +23,7 @@
 #include "client/Client.h"
 #include "mon/MonClient.h"
 #include "osdc/Objecter.h"
+#include "PyModuleRegistry.h"
 
 
 class MMgrMap;
@@ -48,6 +49,7 @@ protected:
   Mutex lock;
   SafeTimer timer;
 
+  PyModuleRegistry py_module_registry;
   std::shared_ptr<Mgr> active_mgr;
 
   int orig_argc;
diff --git a/ceph/src/mgr/PyModuleRegistry.cc b/ceph/src/mgr/PyModuleRegistry.cc
new file mode 100644 (file)
index 0000000..7004ae2
--- /dev/null
@@ -0,0 +1,450 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/backport14.h"
+
+#include "BaseMgrModule.h"
+#include "PyOSDMap.h"
+#include "BaseMgrStandbyModule.h"
+#include "Gil.h"
+
+#include "ActivePyModules.h"
+
+#include "PyModuleRegistry.h"
+
+// definition for non-const static member
+std::string PyModuleRegistry::config_prefix;
+
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr[py] "
+
+namespace {
+  PyObject* log_write(PyObject*, PyObject* args) {
+    char* m = nullptr;
+    if (PyArg_ParseTuple(args, "s", &m)) {
+      auto len = strlen(m);
+      if (len && m[len-1] == '\n') {
+       m[len-1] = '\0';
+      }
+      dout(4) << m << dendl;
+    }
+    Py_RETURN_NONE;
+  }
+
+  PyObject* log_flush(PyObject*, PyObject*){
+    Py_RETURN_NONE;
+  }
+
+  static PyMethodDef log_methods[] = {
+    {"write", log_write, METH_VARARGS, "write stdout and stderr"},
+    {"flush", log_flush, METH_VARARGS, "flush"},
+    {nullptr, nullptr, 0, nullptr}
+  };
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+
+
+std::string PyModule::get_site_packages()
+{
+  std::stringstream site_packages;
+
+  // CPython doesn't auto-add site-packages dirs to sys.path for us,
+  // but it does provide a module that we can ask for them.
+  auto site_module = PyImport_ImportModule("site");
+  assert(site_module);
+
+  auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages");
+  if (site_packages_fn != nullptr) {
+    auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr);
+    assert(site_packages_list);
+
+    auto n = PyList_Size(site_packages_list);
+    for (Py_ssize_t i = 0; i < n; ++i) {
+      if (i != 0) {
+        site_packages << ":";
+      }
+      site_packages << PyString_AsString(PyList_GetItem(site_packages_list, i));
+    }
+
+    Py_DECREF(site_packages_list);
+    Py_DECREF(site_packages_fn);
+  } else {
+    // Fall back to generating our own site-packages paths by imitating
+    // what the standard site.py does.  This is annoying but it lets us
+    // run inside virtualenvs :-/
+
+    auto site_packages_fn = PyObject_GetAttrString(site_module, "addsitepackages");
+    assert(site_packages_fn);
+
+    auto known_paths = PySet_New(nullptr);
+    auto pArgs = PyTuple_Pack(1, known_paths);
+    PyObject_CallObject(site_packages_fn, pArgs);
+    Py_DECREF(pArgs);
+    Py_DECREF(known_paths);
+    Py_DECREF(site_packages_fn);
+
+    auto sys_module = PyImport_ImportModule("sys");
+    assert(sys_module);
+    auto sys_path = PyObject_GetAttrString(sys_module, "path");
+    assert(sys_path);
+
+    dout(1) << "sys.path:" << dendl;
+    auto n = PyList_Size(sys_path);
+    bool first = true;
+    for (Py_ssize_t i = 0; i < n; ++i) {
+      dout(1) << "  " << PyString_AsString(PyList_GetItem(sys_path, i)) << dendl;
+      if (first) {
+        first = false;
+      } else {
+        site_packages << ":";
+      }
+      site_packages << PyString_AsString(PyList_GetItem(sys_path, i));
+    }
+
+    Py_DECREF(sys_path);
+    Py_DECREF(sys_module);
+  }
+
+  Py_DECREF(site_module);
+
+  return site_packages.str();
+}
+
+int PyModuleRegistry::init(const MgrMap &map)
+{
+  Mutex::Locker locker(lock);
+
+  // Don't try and init me if you don't really have a map
+  assert(map.epoch > 0);
+
+  mgr_map = map;
+
+  // namespace in config-key prefixed by "mgr/"
+  config_prefix = std::string(g_conf->name.get_type_str()) + "/";
+
+  // Set up global python interpreter
+  Py_SetProgramName(const_cast<char*>(PYTHON_EXECUTABLE));
+  Py_InitializeEx(0);
+
+  // Let CPython know that we will be calling it back from other
+  // threads in future.
+  if (! PyEval_ThreadsInitialized()) {
+    PyEval_InitThreads();
+  }
+
+  // Drop the GIL and remember the main thread state (current
+  // thread state becomes NULL)
+  pMainThreadState = PyEval_SaveThread();
+  assert(pMainThreadState != nullptr);
+
+  std::list<std::string> failed_modules;
+
+  // Load python code
+  for (const auto& module_name : mgr_map.modules) {
+    dout(1) << "Loading python module '" << module_name << "'" << dendl;
+    auto mod = ceph::make_unique<PyModule>(module_name);
+    int r = mod->load(pMainThreadState);
+    if (r != 0) {
+      // Don't use handle_pyerror() here; we don't have the GIL
+      // or the right thread state (this is deliberate).
+      derr << "Error loading module '" << module_name << "': "
+        << cpp_strerror(r) << dendl;
+      failed_modules.push_back(module_name);
+      // Don't drop out here, load the other modules
+    } else {
+      // Success!
+      modules[module_name] = std::move(mod);
+    }
+  }
+
+  if (!failed_modules.empty()) {
+    clog->error() << "Failed to load ceph-mgr modules: " << joinify(
+        failed_modules.begin(), failed_modules.end(), std::string(", "));
+  }
+
+  return 0;
+}
+
+
+int PyModule::load(PyThreadState *pMainThreadState)
+{
+  assert(pMainThreadState != nullptr);
+
+  // Configure sub-interpreter and construct C++-generated python classes
+  {
+    SafeThreadState sts(pMainThreadState);
+    Gil gil(sts);
+
+    auto thread_state = Py_NewInterpreter();
+    if (thread_state == nullptr) {
+      derr << "Failed to create python sub-interpreter for '" << module_name << "'" << dendl;
+      return -EINVAL;
+    } else {
+      pMyThreadState.set(thread_state);
+      // Some python modules do not cope with an unpopulated argv, so let's
+      // fake one.  This step also picks up site-packages into sys.path.
+      const char *argv[] = {"ceph-mgr"};
+      PySys_SetArgv(1, (char**)argv);
+
+      if (g_conf->daemonize) {
+        auto py_logger = Py_InitModule("ceph_logger", log_methods);
+#if PY_MAJOR_VERSION >= 3
+        PySys_SetObject("stderr", py_logger);
+        PySys_SetObject("stdout", py_logger);
+#else
+        PySys_SetObject(const_cast<char*>("stderr"), py_logger);
+        PySys_SetObject(const_cast<char*>("stdout"), py_logger);
+#endif
+      }
+
+      // Configure sys.path to include mgr_module_path
+      std::string sys_path = std::string(Py_GetPath()) + ":" + get_site_packages()
+                             + ":" + g_conf->get_val<std::string>("mgr_module_path");
+      dout(10) << "Computed sys.path '" << sys_path << "'" << dendl;
+
+      PySys_SetPath(const_cast<char*>(sys_path.c_str()));
+    }
+
+    PyMethodDef ModuleMethods[] = {
+      {nullptr}
+    };
+
+    // Initialize module
+    PyObject *ceph_module = Py_InitModule("ceph_module", ModuleMethods);
+    assert(ceph_module != nullptr);
+
+    auto load_class = [ceph_module](const char *name, PyTypeObject *type)
+    {
+      type->tp_new = PyType_GenericNew;
+      if (PyType_Ready(type) < 0) {
+          assert(0);
+      }
+      Py_INCREF(type);
+
+      PyModule_AddObject(ceph_module, name, (PyObject *)type);
+    };
+
+    load_class("BaseMgrModule", &BaseMgrModuleType);
+    load_class("BaseMgrStandbyModule", &BaseMgrStandbyModuleType);
+    load_class("BasePyOSDMap", &BasePyOSDMapType);
+    load_class("BasePyOSDMapIncremental", &BasePyOSDMapIncrementalType);
+    load_class("BasePyCRUSH", &BasePyCRUSHType);
+  }
+
+  // Environment is all good, import the external module
+  {
+    Gil gil(pMyThreadState);
+
+    // Load the module
+    PyObject *pName = PyString_FromString(module_name.c_str());
+    auto pModule = PyImport_Import(pName);
+    Py_DECREF(pName);
+    if (pModule == nullptr) {
+      derr << "Module not found: '" << module_name << "'" << dendl;
+      derr << handle_pyerror() << dendl;
+      return -ENOENT;
+    }
+
+    // Find the class
+    // TODO: let them call it what they want instead of just 'Module'
+    pClass = PyObject_GetAttrString(pModule, (const char*)"Module");
+    if (pClass == nullptr) {
+      derr << "Class not found in module '" << module_name << "'" << dendl;
+      derr << handle_pyerror() << dendl;
+      return -EINVAL;
+    }
+
+    pStandbyClass = PyObject_GetAttrString(pModule,
+                                           (const char*)"StandbyModule");
+    if (pStandbyClass) {
+      dout(4) << "Standby mode available in module '" << module_name
+              << "'" << dendl;
+    } else {
+      dout(4) << "Standby mode not provided by module '" << module_name
+              << "'" << dendl;
+      PyErr_Clear();
+    }
+
+    Py_DECREF(pModule);
+  }
+
+  return 0;
+} 
+
+PyModule::~PyModule()
+{
+  if (pMyThreadState.ts != nullptr) {
+    Gil gil(pMyThreadState, true);
+    Py_XDECREF(pClass);
+    Py_XDECREF(pStandbyClass);
+  }
+}
+
+void PyModuleRegistry::standby_start(MonClient *monc)
+{
+  Mutex::Locker l(lock);
+  assert(active_modules == nullptr);
+  assert(standby_modules == nullptr);
+  assert(is_initialized());
+
+  dout(4) << "Starting modules in standby mode" << dendl;
+
+  standby_modules.reset(new StandbyPyModules(monc, mgr_map));
+
+  std::set<std::string> failed_modules;
+  for (const auto &i : modules) {
+    if (i.second->pStandbyClass) {
+      dout(4) << "starting module " << i.second->get_name() << dendl;
+      int r = standby_modules->start_one(i.first,
+              i.second->pStandbyClass,
+              i.second->pMyThreadState);
+      if (r != 0) {
+        derr << "failed to start module '" << i.second->get_name()
+             << "'" << dendl;
+        failed_modules.insert(i.second->get_name());
+        // Continue trying to load any other modules
+      }
+    } else {
+      dout(4) << "skipping module '" << i.second->get_name() << "' because "
+                 "it does not implement a standby mode" << dendl;
+    }
+  }
+
+  if (!failed_modules.empty()) {
+    clog->error() << "Failed to execute ceph-mgr module(s) in standby mode: "
+        << joinify(failed_modules.begin(), failed_modules.end(),
+                   std::string(", "));
+  }
+}
+
+void PyModuleRegistry::active_start(
+            PyModuleConfig &config_,
+            DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+            LogChannelRef clog_, Objecter &objecter_, Client &client_,
+            Finisher &f)
+{
+  Mutex::Locker locker(lock);
+
+  dout(4) << "Starting modules in active mode" << dendl;
+
+  assert(active_modules == nullptr);
+  assert(is_initialized());
+
+  if (standby_modules != nullptr) {
+    standby_modules->shutdown();
+    standby_modules.reset();
+  }
+
+  active_modules.reset(new ActivePyModules(
+              config_, ds, cs, mc, clog_, objecter_, client_, f));
+
+  for (const auto &i : modules) {
+    dout(4) << "Starting " << i.first << dendl;
+    int r = active_modules->start_one(i.first,
+            i.second->pClass,
+            i.second->pMyThreadState);
+    if (r != 0) {
+      derr << "Failed to run module in active mode ('" << i.first << "')"
+           << dendl;
+    }
+  }
+}
+
+void PyModuleRegistry::active_shutdown()
+{
+  Mutex::Locker locker(lock);
+
+  if (active_modules != nullptr) {
+    active_modules->shutdown();
+    active_modules.reset();
+  }
+}
+
+void PyModuleRegistry::shutdown()
+{
+  Mutex::Locker locker(lock);
+
+  if (standby_modules != nullptr) {
+    standby_modules->shutdown();
+    standby_modules.reset();
+  }
+
+  // Ideally, now, we'd be able to do this for all modules:
+  //
+  //    Py_EndInterpreter(pMyThreadState);
+  //    PyThreadState_Swap(pMainThreadState);
+  //
+  // Unfortunately, if the module has any other *python* threads active
+  // at this point, Py_EndInterpreter() will abort with:
+  //
+  //    Fatal Python error: Py_EndInterpreter: not the last thread
+  //
+  // This can happen when using CherryPy in a module, because CherryPy
+  // runs an extra thread as a timeout monitor, which spends most of its
+  // life inside a time.sleep(60).  Unless you are very, very lucky with
+  // the timing when this shutdown runs, that thread will still be stuck
+  // in a sleep, and Py_EndInterpreter() will abort.
+  //
+  // This could of course also happen with a poorly written module which
+  // made no attempt to clean up any additional threads it created.
+  //
+  // The safest thing to do is just not call Py_EndInterpreter(), and
+  // let Py_Finalize() kill everything after all modules are shut down.
+
+  modules.clear();
+
+  PyEval_RestoreThread(pMainThreadState);
+  Py_Finalize();
+}
+
+static void _list_modules(
+  const std::string path,
+  std::set<std::string> *modules)
+{
+  DIR *dir = opendir(path.c_str());
+  if (!dir) {
+    return;
+  }
+  struct dirent *entry = NULL;
+  while ((entry = readdir(dir)) != NULL) {
+    string n(entry->d_name);
+    string fn = path + "/" + n;
+    struct stat st;
+    int r = ::stat(fn.c_str(), &st);
+    if (r == 0 && S_ISDIR(st.st_mode)) {
+      string initfn = fn + "/module.py";
+      r = ::stat(initfn.c_str(), &st);
+      if (r == 0) {
+       modules->insert(n);
+      }
+    }
+  }
+  closedir(dir);
+}
+
+void PyModuleRegistry::list_modules(std::set<std::string> *modules)
+{
+  _list_modules(g_conf->get_val<std::string>("mgr_module_path"), modules);
+}
+
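In other words, a module is any directory under mgr_module_path that contains a module.py. A usage sketch (the paths are invented for illustration):

    #include <set>
    #include <string>

    void list_modules_sketch()
    {
      // Suppose mgr_module_path is /usr/lib64/ceph/mgr and holds
      //   dashboard/module.py and status/module.py;
      // plain files and directories without module.py are skipped.
      std::set<std::string> mods;
      PyModuleRegistry::list_modules(&mods);
      // mods == { "dashboard", "status" }
    }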
diff --git a/ceph/src/mgr/PyModuleRegistry.h b/ceph/src/mgr/PyModuleRegistry.h
new file mode 100644 (file)
index 0000000..5564e7f
--- /dev/null
@@ -0,0 +1,173 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#pragma once
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "Python.h"
+
+#include <string>
+#include <map>
+#include <memory>
+
+#include "common/LogClient.h"
+
+#include "ActivePyModules.h"
+#include "StandbyPyModules.h"
+
+class PyModule
+{
+private:
+  const std::string module_name;
+  std::string get_site_packages();
+
+public:
+  SafeThreadState pMyThreadState;
+  PyObject *pClass = nullptr;
+  PyObject *pStandbyClass = nullptr;
+
+  PyModule(const std::string &module_name_)
+    : module_name(module_name_)
+  {
+  }
+
+  ~PyModule();
+
+  int load(PyThreadState *pMainThreadState);
+
+  std::string get_name() const {
+    return module_name;
+  }
+};
+
+/**
+ * This class is responsible for setting up the python runtime environment
+ * and importing the python modules.
+ *
+ * It is *not* responsible for constructing instances of their BaseMgrModule
+ * subclasses.
+ */
+class PyModuleRegistry
+{
+private:
+  mutable Mutex lock{"PyModuleRegistry::lock"};
+
+  LogChannelRef clog;
+
+  std::map<std::string, std::unique_ptr<PyModule>> modules;
+
+  std::unique_ptr<ActivePyModules> active_modules;
+  std::unique_ptr<StandbyPyModules> standby_modules;
+
+  PyThreadState *pMainThreadState;
+
+  // We have our own copy of MgrMap, because we are constructed
+  // before ClusterState exists.
+  MgrMap mgr_map;
+
+public:
+  static std::string config_prefix;
+
+  static void list_modules(std::set<std::string> *modules);
+
+  PyModuleRegistry(LogChannelRef clog_)
+    : clog(clog_)
+  {}
+
+  bool handle_mgr_map(const MgrMap &mgr_map_)
+  {
+    Mutex::Locker l(lock);
+
+    bool modules_changed = mgr_map_.modules != mgr_map.modules;
+    mgr_map = mgr_map_;
+
+    if (standby_modules != nullptr) {
+      standby_modules->handle_mgr_map(mgr_map_);
+    }
+
+    return modules_changed;
+  }
+
+  bool is_initialized() const
+  {
+    return mgr_map.epoch > 0;
+  }
+
+  int init(const MgrMap &map);
+
+  void active_start(
+                PyModuleConfig &config_,
+                DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+                LogChannelRef clog_, Objecter &objecter_, Client &client_,
+                Finisher &f);
+  void standby_start(
+      MonClient *monc);
+
+  bool is_standby_running() const
+  {
+    return standby_modules != nullptr;
+  }
+
+  void active_shutdown();
+  void shutdown();
+
+  template<typename Callback, typename...Args>
+  void with_active_modules(Callback&& cb, Args&&...args) const
+  {
+    Mutex::Locker l(lock);
+    assert(active_modules != nullptr);
+
+    std::forward<Callback>(cb)(*active_modules, std::forward<Args>(args)...);
+  }
+
+  // FIXME: breaking interface so that I don't have to go rewrite all
+  // the places that call into these (for now)
+  // >>>
+  void notify_all(const std::string &notify_type,
+                  const std::string &notify_id)
+  {
+    if (active_modules) {
+      active_modules->notify_all(notify_type, notify_id);
+    }
+  }
+
+  void notify_all(const LogEntry &log_entry)
+  {
+    if (active_modules) {
+      active_modules->notify_all(log_entry);
+    }
+  }
+
+  std::vector<MonCommand> get_commands() const
+  {
+    assert(active_modules);
+    return active_modules->get_commands();
+  }
+  std::vector<ModuleCommand> get_py_commands() const
+  {
+    assert(active_modules);
+    return active_modules->get_py_commands();
+  }
+  void get_health_checks(health_check_map_t *checks)
+  {
+    assert(active_modules);
+    active_modules->get_health_checks(checks);
+  }
+  std::map<std::string, std::string> get_services() const
+  {
+    assert(active_modules);
+    return active_modules->get_services();
+  }
+  // <<< (end of ActivePyModules cheeky call-throughs)
+};
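A short usage sketch for with_active_modules(); the caller code is hypothetical, and notify_all is taken from the call-throughs above:

    void notify_sketch(PyModuleRegistry &registry)
    {
      // The lambda runs under the registry lock with the live
      // ActivePyModules; this asserts if no active modules exist.
      registry.with_active_modules([](ActivePyModules &active) {
        active.notify_all("mon_map", "");
      });
    }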
diff --git a/ceph/src/mgr/PyModuleRunner.cc b/ceph/src/mgr/PyModuleRunner.cc
new file mode 100644 (file)
index 0000000..5e04e3d
--- /dev/null
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "Python.h"
+
+#include "common/debug.h"
+#include "mgr/Gil.h"
+
+#include "PyModuleRunner.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+
+std::string handle_pyerror();
+
+PyModuleRunner::~PyModuleRunner()
+{
+  Gil gil(pMyThreadState, true);
+
+  if (pClassInstance) {
+    Py_XDECREF(pClassInstance);
+    pClassInstance = nullptr;
+  }
+
+  Py_DECREF(pClass);
+  pClass = nullptr;
+}
+
+int PyModuleRunner::serve()
+{
+  assert(pClassInstance != nullptr);
+
+  // This method is called from a separate OS thread (i.e. a thread not
+  // created by Python), so tell Gil to wrap this in a new thread state.
+  Gil gil(pMyThreadState, true);
+
+  auto pValue = PyObject_CallMethod(pClassInstance,
+      const_cast<char*>("serve"), nullptr);
+
+  int r = 0;
+  if (pValue != NULL) {
+    Py_DECREF(pValue);
+  } else {
+    derr << module_name << ".serve:" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  }
+
+  return r;
+}
+
+void PyModuleRunner::shutdown()
+{
+  assert(pClassInstance != nullptr);
+
+  Gil gil(pMyThreadState, true);
+
+  auto pValue = PyObject_CallMethod(pClassInstance,
+      const_cast<char*>("shutdown"), nullptr);
+
+  if (pValue != NULL) {
+    Py_DECREF(pValue);
+  } else {
+    derr << "Failed to invoke shutdown() on " << module_name << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+}
+
+void PyModuleRunner::log(int level, const std::string &record)
+{
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr[" << module_name << "] "
+  dout(level) << record << dendl;
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+}
+
+void* PyModuleRunner::PyModuleRunnerThread::entry()
+{
+  // No need to acquire the GIL here; the module does it.
+  dout(4) << "Entering thread for " << mod->get_name() << dendl;
+  mod->serve();
+  return nullptr;
+}
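The thread member is started by whoever owns the runner. A sketch of the start/stop sequence, modelled on the PyModules::start()/shutdown() code this class replaces (see the deleted PyModules.cc below):

    #include <sstream>

    void run_sketch(PyModuleRunner *runner)
    {
      std::ostringstream thread_name;
      thread_name << "mgr." << runner->get_name();
      runner->thread.create(thread_name.str().c_str()); // entry() -> serve()

      // Later, on shutdown:
      runner->shutdown();     // ask the module to return from serve()
      runner->thread.join();  // then reap the OS thread
    }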
diff --git a/ceph/src/mgr/PyModuleRunner.h b/ceph/src/mgr/PyModuleRunner.h
new file mode 100644 (file)
index 0000000..cb51df3
--- /dev/null
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#pragma once
+
+#include "common/Thread.h"
+#include "mgr/Gil.h"
+
+/**
+ * Implement the pattern of calling serve() on a module in a thread,
+ * until shutdown() is called.
+ */
+class PyModuleRunner
+{
+protected:
+  const std::string module_name;
+
+  // Passed in by whoever loaded our python module and looked up
+  // the symbols in it.
+  PyObject *pClass = nullptr;
+
+  // Passed in by whoever created our subinterpreter for us
+  SafeThreadState pMyThreadState = nullptr;
+
+  // Populated when we construct our instance of pClass in load()
+  PyObject *pClassInstance = nullptr;
+
+  class PyModuleRunnerThread : public Thread
+  {
+    PyModuleRunner *mod;
+
+  public:
+    PyModuleRunnerThread(PyModuleRunner *mod_)
+      : mod(mod_) {}
+
+    void *entry() override;
+  };
+
+public:
+  int serve();
+  void shutdown();
+  void log(int level, const std::string &record);
+
+  PyModuleRunner(
+      const std::string &module_name_,
+      PyObject *pClass_,
+      const SafeThreadState &pMyThreadState_)
+    : 
+      module_name(module_name_),
+      pClass(pClass_), pMyThreadState(pMyThreadState_),
+      thread(this)
+  {
+    assert(pClass != nullptr);
+    assert(pMyThreadState.ts != nullptr);
+    assert(!module_name.empty());
+  }
+
+  ~PyModuleRunner();
+
+  PyModuleRunnerThread thread;
+
+  std::string const &get_name() const { return module_name; }
+};
+
+
diff --git a/ceph/src/mgr/PyModules.cc b/ceph/src/mgr/PyModules.cc
deleted file mode 100644 (file)
index d8b7b01..0000000
+++ /dev/null
@@ -1,827 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 John Spray <john.spray@inktank.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-// Include this first to get python headers earlier
-#include "PyState.h"
-#include "Gil.h"
-
-#include "common/errno.h"
-#include "include/stringify.h"
-
-#include "PyFormatter.h"
-
-#include "osd/OSDMap.h"
-#include "mon/MonMap.h"
-
-#include "mgr/MgrContext.h"
-
-#include "PyModules.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
-
-// definition for non-const static member
-std::string PyModules::config_prefix;
-
-// constructor/destructor implementations cannot be in .h,
-// because ServeThread is still an "incomplete" type there
-
-PyModules::PyModules(DaemonStateIndex &ds, ClusterState &cs,
-         MonClient &mc, LogChannelRef clog_, Objecter &objecter_,
-          Client &client_, Finisher &f)
-  : daemon_state(ds), cluster_state(cs), monc(mc), clog(clog_),
-    objecter(objecter_), client(client_), finisher(f),
-    lock("PyModules")
-{}
-
-PyModules::~PyModules() = default;
-
-void PyModules::dump_server(const std::string &hostname,
-                      const DaemonStateCollection &dmc,
-                      Formatter *f)
-{
-  f->dump_string("hostname", hostname);
-  f->open_array_section("services");
-  std::string ceph_version;
-
-  for (const auto &i : dmc) {
-    Mutex::Locker l(i.second->lock);
-    const auto &key = i.first;
-    const std::string &str_type = key.first;
-    const std::string &svc_name = key.second;
-
-    // TODO: pick the highest version, and make sure that
-    // somewhere else (during health reporting?) we are
-    // indicating to the user if we see mixed versions
-    auto ver_iter = i.second->metadata.find("ceph_version");
-    if (ver_iter != i.second->metadata.end()) {
-      ceph_version = i.second->metadata.at("ceph_version");
-    }
-
-    f->open_object_section("service");
-    f->dump_string("type", str_type);
-    f->dump_string("id", svc_name);
-    f->close_section();
-  }
-  f->close_section();
-
-  f->dump_string("ceph_version", ceph_version);
-}
-
-
-
-PyObject *PyModules::get_server_python(const std::string &hostname)
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-  dout(10) << " (" << hostname << ")" << dendl;
-
-  auto dmc = daemon_state.get_by_server(hostname);
-
-  PyFormatter f;
-  dump_server(hostname, dmc, &f);
-  return f.get();
-}
-
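The save-thread/lock/restore sequence at the top of get_server_python(), repeated in the accessors below, is a lock-ordering discipline: these entry points are called from Python with the GIL held, so taking PyModules::lock while still holding the GIL could deadlock against a thread that holds the lock and is waiting for the GIL. An annotated sketch of the same three lines, assuming that is the rationale:

    PyThreadState *tstate = PyEval_SaveThread(); // drop the GIL first...
    Mutex::Locker l(lock);                       // ...then take the internal lock
    PyEval_RestoreThread(tstate);                // re-take the GIL before building
                                                 // Python result objects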
-
-PyObject *PyModules::list_servers_python()
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-  dout(10) << " >" << dendl;
-
-  PyFormatter f(false, true);
-  const auto &all = daemon_state.get_all_servers();
-  for (const auto &i : all) {
-    const auto &hostname = i.first;
-
-    f.open_object_section("server");
-    dump_server(hostname, i.second, &f);
-    f.close_section();
-  }
-
-  return f.get();
-}
-
-PyObject *PyModules::get_metadata_python(
-  std::string const &handle,
-  const std::string &svc_name,
-  const std::string &svc_id)
-{
-  auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
-  Mutex::Locker l(metadata->lock);
-  PyFormatter f;
-  f.dump_string("hostname", metadata->hostname);
-  for (const auto &i : metadata->metadata) {
-    f.dump_string(i.first.c_str(), i.second);
-  }
-
-  return f.get();
-}
-
-PyObject *PyModules::get_daemon_status_python(
-  std::string const &handle,
-  const std::string &svc_name,
-  const std::string &svc_id)
-{
-  auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
-  Mutex::Locker l(metadata->lock);
-  PyFormatter f;
-  for (const auto &i : metadata->service_status) {
-    f.dump_string(i.first.c_str(), i.second);
-  }
-  return f.get();
-}
-
-PyObject *PyModules::get_python(const std::string &what)
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-
-  if (what == "fs_map") {
-    PyFormatter f;
-    cluster_state.with_fsmap([&f](const FSMap &fsmap) {
-      fsmap.dump(&f);
-    });
-    return f.get();
-  } else if (what == "osdmap_crush_map_text") {
-    bufferlist rdata;
-    cluster_state.with_osdmap([&rdata](const OSDMap &osd_map){
-       osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT);
-    });
-    std::string crush_text = rdata.to_str();
-    return PyString_FromString(crush_text.c_str());
-  } else if (what.substr(0, 7) == "osd_map") {
-    PyFormatter f;
-    cluster_state.with_osdmap([&f, &what](const OSDMap &osd_map){
-      if (what == "osd_map") {
-        osd_map.dump(&f);
-      } else if (what == "osd_map_tree") {
-        osd_map.print_tree(&f, nullptr);
-      } else if (what == "osd_map_crush") {
-        osd_map.crush->dump(&f);
-      }
-    });
-    return f.get();
-  } else if (what == "config") {
-    PyFormatter f;
-    g_conf->show_config(&f);
-    return f.get();
-  } else if (what == "mon_map") {
-    PyFormatter f;
-    cluster_state.with_monmap(
-      [&f](const MonMap &monmap) {
-        monmap.dump(&f);
-      }
-    );
-    return f.get();
-  } else if (what == "service_map") {
-    PyFormatter f;
-    cluster_state.with_servicemap(
-      [&f](const ServiceMap &service_map) {
-        service_map.dump(&f);
-      }
-    );
-    return f.get();
-  } else if (what == "osd_metadata") {
-    PyFormatter f;
-    auto dmc = daemon_state.get_by_service("osd");
-    for (const auto &i : dmc) {
-      Mutex::Locker l(i.second->lock);
-      f.open_object_section(i.first.second.c_str());
-      f.dump_string("hostname", i.second->hostname);
-      for (const auto &j : i.second->metadata) {
-        f.dump_string(j.first.c_str(), j.second);
-      }
-      f.close_section();
-    }
-    return f.get();
-  } else if (what == "pg_summary") {
-    PyFormatter f;
-    cluster_state.with_pgmap(
-        [&f](const PGMap &pg_map) {
-          std::map<std::string, std::map<std::string, uint32_t> > osds;
-          std::map<std::string, std::map<std::string, uint32_t> > pools;
-          std::map<std::string, uint32_t> all;
-          for (const auto &i : pg_map.pg_stat) {
-            const auto pool = i.first.m_pool;
-            const std::string state = pg_state_string(i.second.state);
-            // Insert to per-pool map
-            pools[stringify(pool)][state]++;
-            for (const auto &osd_id : i.second.acting) {
-              osds[stringify(osd_id)][state]++;
-            }
-            all[state]++;
-          }
-          f.open_object_section("by_osd");
-          for (const auto &i : osds) {
-            f.open_object_section(i.first.c_str());
-            for (const auto &j : i.second) {
-              f.dump_int(j.first.c_str(), j.second);
-            }
-            f.close_section();
-          }
-          f.close_section();
-          f.open_object_section("by_pool");
-          for (const auto &i : pools) {
-            f.open_object_section(i.first.c_str());
-            for (const auto &j : i.second) {
-              f.dump_int(j.first.c_str(), j.second);
-            }
-            f.close_section();
-          }
-          f.close_section();
-          f.open_object_section("all");
-          for (const auto &i : all) {
-            f.dump_int(i.first.c_str(), i.second);
-          }
-          f.close_section();
-        }
-    );
-    return f.get();
-
-  } else if (what == "df") {
-    PyFormatter f;
-
-    cluster_state.with_osdmap([this, &f](const OSDMap &osd_map){
-      cluster_state.with_pgmap(
-          [&osd_map, &f](const PGMap &pg_map) {
-        pg_map.dump_fs_stats(nullptr, &f, true);
-        pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true);
-      });
-    });
-    return f.get();
-  } else if (what == "osd_stats") {
-    PyFormatter f;
-    cluster_state.with_pgmap(
-        [&f](const PGMap &pg_map) {
-      pg_map.dump_osd_stats(&f);
-    });
-    return f.get();
-  } else if (what == "health" || what == "mon_status") {
-    PyFormatter f;
-    bufferlist json;
-    if (what == "health") {
-      json = cluster_state.get_health();
-    } else if (what == "mon_status") {
-      json = cluster_state.get_mon_status();
-    } else {
-      assert(false);
-    }
-    f.dump_string("json", json.to_str());
-    return f.get();
-  } else if (what == "mgr_map") {
-    PyFormatter f;
-    cluster_state.with_mgrmap([&f](const MgrMap &mgr_map) {
-      mgr_map.dump(&f);
-    });
-    return f.get();
-  } else {
-    derr << "Python module requested unknown data '" << what << "'" << dendl;
-    Py_RETURN_NONE;
-  }
-}
-
-std::string PyModules::get_site_packages()
-{
-  std::stringstream site_packages;
-
-  // CPython doesn't auto-add site-packages dirs to sys.path for us,
-  // but it does provide a module that we can ask for them.
-  auto site_module = PyImport_ImportModule("site");
-  assert(site_module);
-
-  auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages");
-  if (site_packages_fn != nullptr) {
-    auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr);
-    assert(site_packages_list);
-
-    auto n = PyList_Size(site_packages_list);
-    for (Py_ssize_t i = 0; i < n; ++i) {
-      if (i != 0) {
-        site_packages << ":";
-      }
-      site_packages << PyString_AsString(PyList_GetItem(site_packages_list, i));
-    }
-
-    Py_DECREF(site_packages_list);
-    Py_DECREF(site_packages_fn);
-  } else {
-    // Fall back to generating our own site-packages paths by imitating
-    // what the standard site.py does.  This is annoying but it lets us
-    // run inside virtualenvs :-/
-
-    auto site_packages_fn = PyObject_GetAttrString(site_module, "addsitepackages");
-    assert(site_packages_fn);
-
-    auto known_paths = PySet_New(nullptr);
-    auto pArgs = PyTuple_Pack(1, known_paths);
-    PyObject_CallObject(site_packages_fn, pArgs);
-    Py_DECREF(pArgs);
-    Py_DECREF(known_paths);
-    Py_DECREF(site_packages_fn);
-
-    auto sys_module = PyImport_ImportModule("sys");
-    assert(sys_module);
-    auto sys_path = PyObject_GetAttrString(sys_module, "path");
-    assert(sys_path);
-
-    dout(1) << "sys.path:" << dendl;
-    auto n = PyList_Size(sys_path);
-    bool first = true;
-    for (Py_ssize_t i = 0; i < n; ++i) {
-      dout(1) << "  " << PyString_AsString(PyList_GetItem(sys_path, i)) << dendl;
-      if (first) {
-        first = false;
-      } else {
-        site_packages << ":";
-      }
-      site_packages << PyString_AsString(PyList_GetItem(sys_path, i));
-    }
-
-    Py_DECREF(sys_path);
-    Py_DECREF(sys_module);
-  }
-
-  Py_DECREF(site_module);
-
-  return site_packages.str();
-}
-
-
-int PyModules::init()
-{
-  Mutex::Locker locker(lock);
-
-  global_handle = this;
-  // namespace in config-key prefixed by "mgr/"
-  config_prefix = std::string(g_conf->name.get_type_str()) + "/";
-
-  // Set up global python interpreter
-  Py_SetProgramName(const_cast<char*>(PYTHON_EXECUTABLE));
-  Py_InitializeEx(0);
-
-  // Let CPython know that we will be calling it back from other
-  // threads in future.
-  if (! PyEval_ThreadsInitialized()) {
-    PyEval_InitThreads();
-  }
-
-  // Configure sys.path to include mgr_module_path
-  std::string sys_path = std::string(Py_GetPath()) + ":" + get_site_packages()
-                         + ":" + g_conf->mgr_module_path;
-  dout(10) << "Computed sys.path '" << sys_path << "'" << dendl;
-
-  // Drop the GIL and remember the main thread state (current
-  // thread state becomes NULL)
-  pMainThreadState = PyEval_SaveThread();
-
-  std::list<std::string> failed_modules;
-
-  // Load python code
-  set<string> ls;
-  cluster_state.with_mgrmap([&](const MgrMap& m) {
-      ls = m.modules;
-    });
-  for (const auto& module_name : ls) {
-    dout(1) << "Loading python module '" << module_name << "'" << dendl;
-    auto mod = std::unique_ptr<MgrPyModule>(new MgrPyModule(module_name, sys_path, pMainThreadState));
-    int r = mod->load();
-    if (r != 0) {
-      // Don't use handle_pyerror() here; we don't have the GIL
-      // or the right thread state (this is deliberate).
-      derr << "Error loading module '" << module_name << "': "
-        << cpp_strerror(r) << dendl;
-      failed_modules.push_back(module_name);
-      // Don't drop out here, load the other modules
-    } else {
-      // Success!
-      modules[module_name] = std::move(mod);
-    }
-  }
-
-  if (!failed_modules.empty()) {
-    clog->error() << "Failed to load ceph-mgr modules: " << joinify(
-        failed_modules.begin(), failed_modules.end(), std::string(", "));
-  }
-
-  return 0;
-}
-
-class ServeThread : public Thread
-{
-  MgrPyModule *mod;
-
-public:
-  bool running;
-
-  ServeThread(MgrPyModule *mod_)
-    : mod(mod_) {}
-
-  void *entry() override
-  {
-    running = true;
-
-    // No need to acquire the GIL here; the module does it.
-    dout(4) << "Entering thread for " << mod->get_name() << dendl;
-    mod->serve();
-
-    running = false;
-    return nullptr;
-  }
-};
-
-void PyModules::start()
-{
-  Mutex::Locker l(lock);
-
-  dout(1) << "Creating threads for " << modules.size() << " modules" << dendl;
-  for (auto& i : modules) {
-    auto thread = new ServeThread(i.second.get());
-    serve_threads[i.first].reset(thread);
-  }
-
-  for (auto &i : serve_threads) {
-    std::ostringstream thread_name;
-    thread_name << "mgr." << i.first;
-    dout(4) << "Starting thread for " << i.first << dendl;
-    i.second->create(thread_name.str().c_str());
-  }
-}
-
-void PyModules::shutdown()
-{
-  Mutex::Locker locker(lock);
-  assert(global_handle);
-
-  // Signal modules to drop out of serve() and/or tear down resources
-  for (auto &i : modules) {
-    auto module = i.second.get();
-    const auto& name = i.first;
-    dout(10) << "waiting for module " << name << " to shutdown" << dendl;
-    lock.Unlock();
-    module->shutdown();
-    lock.Lock();
-    dout(10) << "module " << name << " shutdown" << dendl;
-  }
-
-  // For modules implementing serve(), finish the threads where we
-  // were running that.
-  for (auto &i : serve_threads) {
-    lock.Unlock();
-    i.second->join();
-    lock.Lock();
-  }
-  serve_threads.clear();
-
-  modules.clear();
-
-  PyEval_RestoreThread(pMainThreadState);
-  Py_Finalize();
-
-  // nobody needs me anymore.
-  global_handle = nullptr;
-}
-
-void PyModules::notify_all(const std::string &notify_type,
-                     const std::string &notify_id)
-{
-  Mutex::Locker l(lock);
-
-  dout(10) << __func__ << ": notify_all " << notify_type << dendl;
-  for (auto& i : modules) {
-    auto module = i.second.get();
-    if (!serve_threads[i.first]->running)
-      continue;
-    // Send all python calls down a Finisher to avoid blocking
-    // C++ code, and avoid any potential lock cycles.
-    finisher.queue(new FunctionContext([module, notify_type, notify_id](int r){
-      module->notify(notify_type, notify_id);
-    }));
-  }
-}
-
-void PyModules::notify_all(const LogEntry &log_entry)
-{
-  Mutex::Locker l(lock);
-
-  dout(10) << __func__ << ": notify_all (clog)" << dendl;
-  for (auto& i : modules) {
-    auto module = i.second.get();
-    if (!serve_threads[i.first]->running)
-      continue;
-    // Send all python calls down a Finisher to avoid blocking
-    // C++ code, and avoid any potential lock cycles.
-    //
-    // Note intentional use of non-reference lambda binding on
-    // log_entry: we take a copy because caller's instance is
-    // probably ephemeral.
-    finisher.queue(new FunctionContext([module, log_entry](int r){
-      module->notify_clog(log_entry);
-    }));
-  }
-}
-
-bool PyModules::get_config(const std::string &handle,
-    const std::string &key, std::string *val) const
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-
-  const std::string global_key = config_prefix + handle + "/" + key;
-
-  dout(4) << __func__ << " key: " << global_key << dendl;
-
-  if (config_cache.count(global_key)) {
-    *val = config_cache.at(global_key);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-PyObject *PyModules::get_config_prefix(const std::string &handle,
-    const std::string &prefix) const
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-
-  const std::string base_prefix = config_prefix + handle + "/";
-  const std::string global_prefix = base_prefix + prefix;
-  dout(4) << __func__ << " prefix: " << global_prefix << dendl;
-
-  PyFormatter f;
-  for (auto p = config_cache.lower_bound(global_prefix);
-       p != config_cache.end() && p->first.find(global_prefix) == 0;
-       ++p) {
-    f.dump_string(p->first.c_str() + base_prefix.size(), p->second);
-  }
-  return f.get();
-}
-
-void PyModules::set_config(const std::string &handle,
-    const std::string &key, const boost::optional<std::string>& val)
-{
-  const std::string global_key = config_prefix + handle + "/" + key;
-
-  Command set_cmd;
-  {
-    PyThreadState *tstate = PyEval_SaveThread();
-    Mutex::Locker l(lock);
-    PyEval_RestoreThread(tstate);
-    if (val) {
-      config_cache[global_key] = *val;
-    } else {
-      config_cache.erase(global_key);
-    }
-
-    std::ostringstream cmd_json;
-    JSONFormatter jf;
-    jf.open_object_section("cmd");
-    if (val) {
-      jf.dump_string("prefix", "config-key set");
-      jf.dump_string("key", global_key);
-      jf.dump_string("val", *val);
-    } else {
-      jf.dump_string("prefix", "config-key del");
-      jf.dump_string("key", global_key);
-    }
-    jf.close_section();
-    jf.flush(cmd_json);
-    set_cmd.run(&monc, cmd_json.str());
-  }
-  set_cmd.wait();
-
-  if (set_cmd.r != 0) {
-    // config-key set will fail if mgr's auth key has insufficient
-    // permission to set config keys
-    // FIXME: should this somehow raise an exception back into Python land?
-    dout(0) << "`config-key set " << global_key << " " << val << "` failed: "
-      << cpp_strerror(set_cmd.r) << dendl;
-    dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
-  }
-}
-
-std::vector<ModuleCommand> PyModules::get_py_commands() const
-{
-  Mutex::Locker l(lock);
-
-  std::vector<ModuleCommand> result;
-  for (const auto& i : modules) {
-    auto module = i.second.get();
-    auto mod_commands = module->get_commands();
-    for (auto j : mod_commands) {
-      result.push_back(j);
-    }
-  }
-
-  return result;
-}
-
-std::vector<MonCommand> PyModules::get_commands() const
-{
-  std::vector<ModuleCommand> commands = get_py_commands();
-  std::vector<MonCommand> result;
-  for (auto &pyc: commands) {
-    result.push_back({pyc.cmdstring, pyc.helpstring, "mgr",
-                        pyc.perm, "cli", MonCommand::FLAG_MGR});
-  }
-  return result;
-}
-
-void PyModules::insert_config(const std::map<std::string,
-                              std::string> &new_config)
-{
-  Mutex::Locker l(lock);
-
-  dout(4) << "Loaded " << new_config.size() << " config settings" << dendl;
-  config_cache = new_config;
-}
-
-void PyModules::log(const std::string &handle,
-    int level, const std::string &record)
-{
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr[" << handle << "] "
-  dout(level) << record << dendl;
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
-}
-
-PyObject* PyModules::get_counter_python(
-    const std::string &handle,
-    const std::string &svc_name,
-    const std::string &svc_id,
-    const std::string &path)
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-
-  PyFormatter f;
-  f.open_array_section(path.c_str());
-
-  auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
-
-  if (metadata) {
-    Mutex::Locker l2(metadata->lock);
-    if (metadata->perf_counters.instances.count(path)) {
-      auto counter_instance = metadata->perf_counters.instances.at(path);
-      const auto &data = counter_instance.get_data();
-      for (const auto &datapoint : data) {
-        f.open_array_section("datapoint");
-        f.dump_unsigned("t", datapoint.t.sec());
-        f.dump_unsigned("v", datapoint.v);
-        f.close_section();
-
-      }
-    } else {
-      dout(4) << "Missing counter: '" << path << "' ("
-              << svc_name << "." << svc_id << ")" << dendl;
-      dout(20) << "Paths are:" << dendl;
-      for (const auto &i : metadata->perf_counters.instances) {
-        dout(20) << i.first << dendl;
-      }
-    }
-  } else {
-    dout(4) << "No daemon state for "
-              << svc_name << "." << svc_id << dendl;
-  }
-  f.close_section();
-  return f.get();
-}
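A rough sketch of the structure returned to Python, assuming PyFormatter renders the nested array sections as lists (counter path and values are hypothetical):

// {"osd.op_w": [[1511862000, 42], [1511862005, 57], ...]}
// i.e. a list of (timestamp seconds, value) datapoints under the requested path.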
-
-PyObject* PyModules::get_perf_schema_python(
-    const std::string &handle,
-    const std::string &svc_type,
-    const std::string &svc_id)
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-
-  DaemonStateCollection states;
-
-  if (svc_type == "") {
-    states = daemon_state.get_all();
-  } else if (svc_id.empty()) {
-    states = daemon_state.get_by_service(svc_type);
-  } else {
-    auto key = DaemonKey(svc_type, svc_id);
-    // so that the below can be a loop in all cases
-    if (daemon_state.exists(key)) {
-      states[key] = daemon_state.get(key);
-    }
-  }
-
-  PyFormatter f;
-  f.open_object_section("perf_schema");
-
-  // FIXME: this is unsafe, I need to either be inside DaemonStateIndex's
-  // lock or put a lock on individual DaemonStates
-  if (!states.empty()) {
-    for (auto statepair : states) {
-      std::ostringstream daemon_name;
-      auto key = statepair.first;
-      auto state = statepair.second;
-      Mutex::Locker l(state->lock);
-      daemon_name << key.first << "." << key.second;
-      f.open_object_section(daemon_name.str().c_str());
-
-      for (auto typestr : state->perf_counters.declared_types) {
-       f.open_object_section(typestr.c_str());
-       auto type = state->perf_counters.types[typestr];
-       f.dump_string("description", type.description);
-       if (!type.nick.empty()) {
-         f.dump_string("nick", type.nick);
-       }
-       f.dump_unsigned("type", type.type);
-       f.close_section();
-      }
-      f.close_section();
-    }
-  } else {
-    dout(4) << __func__ << ": No daemon state found for "
-              << svc_type << "." << svc_id << dendl;
-  }
-  f.close_section();
-  return f.get();
-}
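A rough sketch of the schema handed back to Python (daemon and counter names are hypothetical):

// {"osd.0": {"osd.op_w": {"description": "...", "nick": "wr",
//                         "type": <PERFCOUNTER_* type bits>}, ...}, ...}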
-
-PyObject *PyModules::get_context()
-{
-  PyThreadState *tstate = PyEval_SaveThread();
-  Mutex::Locker l(lock);
-  PyEval_RestoreThread(tstate);
-
-  // Construct a capsule containing ceph context.
-  // Not incrementing/decrementing ref count on the context because
-  // it's the global one and it has process lifetime.
-  auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr);
-  return capsule;
-}
-
-static void _list_modules(
-  const std::string path,
-  std::set<std::string> *modules)
-{
-  DIR *dir = opendir(path.c_str());
-  if (!dir) {
-    return;
-  }
-  struct dirent *entry = NULL;
-  while ((entry = readdir(dir)) != NULL) {
-    string n(entry->d_name);
-    string fn = path + "/" + n;
-    struct stat st;
-    int r = ::stat(fn.c_str(), &st);
-    if (r == 0 && S_ISDIR(st.st_mode)) {
-      string initfn = fn + "/module.py";
-      r = ::stat(initfn.c_str(), &st);
-      if (r == 0) {
-       modules->insert(n);
-      }
-    }
-  }
-  closedir(dir);
-}
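For illustration, the directory layout the scan above accepts (module names are hypothetical):

// <mgr_module_path>/restful/module.py    -> "restful" is reported
// <mgr_module_path>/stray_file.py        -> ignored (not a directory)
// <mgr_module_path>/incomplete/          -> ignored (no module.py inside)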
-
-void PyModules::list_modules(std::set<std::string> *modules)
-{
-  _list_modules(g_conf->mgr_module_path, modules);
-}
-
-void PyModules::set_health_checks(const std::string& handle,
-                                 health_check_map_t&& checks)
-{
-  Mutex::Locker l(lock);
-  auto p = modules.find(handle);
-  if (p != modules.end()) {
-    p->second->set_health_checks(std::move(checks));
-  }
-}
-
-void PyModules::get_health_checks(health_check_map_t *checks)
-{
-  Mutex::Locker l(lock);
-  for (auto& p : modules) {
-    p.second->get_health_checks(checks);
-  }
-}
diff --git a/ceph/src/mgr/PyModules.h b/ceph/src/mgr/PyModules.h
deleted file mode 100644 (file)
index c7aad4e..0000000
+++ /dev/null
@@ -1,130 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 John Spray <john.spray@inktank.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-#ifndef PY_MODULES_H_
-#define PY_MODULES_H_
-
-#include "MgrPyModule.h"
-
-#include "common/Finisher.h"
-#include "common/Mutex.h"
-#include "common/Thread.h"
-
-#include "osdc/Objecter.h"
-#include "client/Client.h"
-#include "common/LogClient.h"
-#include "mon/MgrMap.h"
-#include "mon/MonCommand.h"
-
-#include "DaemonState.h"
-#include "ClusterState.h"
-
-class ServeThread;
-class health_check_map_t;
-
-class PyModules
-{
-  std::map<std::string, std::unique_ptr<MgrPyModule>> modules;
-  std::map<std::string, std::unique_ptr<ServeThread>> serve_threads;
-  DaemonStateIndex &daemon_state;
-  ClusterState &cluster_state;
-  MonClient &monc;
-  LogChannelRef clog;
-  Objecter &objecter;
-  Client   &client;
-  Finisher &finisher;
-
-  mutable Mutex lock{"PyModules::lock"};
-
-  std::string get_site_packages();
-
-  PyThreadState *pMainThreadState = nullptr;
-
-public:
-  static std::string config_prefix;
-
-  PyModules(DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
-            LogChannelRef clog_, Objecter &objecter_, Client &client_,
-            Finisher &f);
-
-  ~PyModules();
-
-  // FIXME: wrap for send_command?
-  MonClient &get_monc() {return monc;}
-  Objecter  &get_objecter() {return objecter;}
-  Client    &get_client() {return client;}
-
-  PyObject *get_python(const std::string &what);
-  PyObject *get_server_python(const std::string &hostname);
-  PyObject *list_servers_python();
-  PyObject *get_metadata_python(
-    std::string const &handle,
-    const std::string &svc_name, const std::string &svc_id);
-  PyObject *get_daemon_status_python(
-    std::string const &handle,
-    const std::string &svc_name, const std::string &svc_id);
-  PyObject *get_counter_python(
-    std::string const &handle,
-    const std::string &svc_name,
-    const std::string &svc_id,
-    const std::string &path);
-  PyObject *get_perf_schema_python(
-     const std::string &handle,
-     const std::string &svc_type,
-     const std::string &svc_id);
-  PyObject *get_context();
-
-  std::map<std::string, std::string> config_cache;
-
-  // Python command definitions, including callback
-  std::vector<ModuleCommand> get_py_commands() const;
-
-  // Monitor command definitions, suitable for CLI
-  std::vector<MonCommand> get_commands() const;
-
-  void insert_config(const std::map<std::string, std::string> &new_config);
-
-  // Public so that MonCommandCompletion can use it
-  // FIXME: for send_command completion notifications,
-  // send it to only the module that sent the command, not everyone
-  void notify_all(const std::string &notify_type,
-                  const std::string &notify_id);
-  void notify_all(const LogEntry &log_entry);
-
-  int init();
-  void start();
-  void shutdown();
-
-  void dump_server(const std::string &hostname,
-                   const DaemonStateCollection &dmc,
-                   Formatter *f);
-
-  bool get_config(const std::string &handle,
-      const std::string &key, std::string *val) const;
-  PyObject *get_config_prefix(const std::string &handle,
-                             const std::string &prefix) const;
-  void set_config(const std::string &handle,
-      const std::string &key, const boost::optional<std::string> &val);
-
-  void set_health_checks(const std::string& handle,
-                        health_check_map_t&& checks);
-  void get_health_checks(health_check_map_t *checks);
-
-  void log(const std::string &handle,
-           int level, const std::string &record);
-
-  static void list_modules(std::set<std::string> *modules);
-};
-
-#endif
-
diff --git a/ceph/src/mgr/PyOSDMap.cc b/ceph/src/mgr/PyOSDMap.cc
new file mode 100644 (file)
index 0000000..8bae2e4
--- /dev/null
@@ -0,0 +1,589 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Mgr.h"
+
+#include "osd/OSDMap.h"
+#include "common/errno.h"
+#include "common/version.h"
+#include "include/stringify.h"
+
+#include "PyOSDMap.h"
+#include "PyFormatter.h"
+#include "Gil.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+
+typedef struct {
+  PyObject_HEAD
+  OSDMap *osdmap;
+} BasePyOSDMap;
+
+typedef struct {
+  PyObject_HEAD
+  OSDMap::Incremental *inc;
+} BasePyOSDMapIncremental;
+
+typedef struct {
+  PyObject_HEAD
+  ceph::shared_ptr<CrushWrapper> crush;
+} BasePyCRUSH;
+
+// ----------
+
+static PyObject *osdmap_get_epoch(BasePyOSDMap *self, PyObject *obj)
+{
+  return PyInt_FromLong(self->osdmap->get_epoch());
+}
+
+static PyObject *osdmap_get_crush_version(BasePyOSDMap* self, PyObject *obj)
+{
+  return PyInt_FromLong(self->osdmap->get_crush_version());
+}
+
+static PyObject *osdmap_dump(BasePyOSDMap* self, PyObject *obj)
+{
+  PyFormatter f;
+  self->osdmap->dump(&f);
+  return f.get();
+}
+
+static PyObject *osdmap_new_incremental(BasePyOSDMap *self, PyObject *obj)
+{
+  OSDMap::Incremental *inc = new OSDMap::Incremental;
+
+  inc->fsid = self->osdmap->get_fsid();
+  inc->epoch = self->osdmap->get_epoch() + 1;
+  // always include latest crush map here... this is okay since we never
+  // actually use this map in the real world (and even if we did it would
+  // be a no-op).
+  self->osdmap->crush->encode(inc->crush, CEPH_FEATURES_ALL);
+  dout(10) << __func__ << " " << inc << dendl;
+
+  return construct_with_capsule("mgr_module", "OSDMapIncremental",
+                                (void*)(inc));
+}
+
+static PyObject *osdmap_apply_incremental(BasePyOSDMap *self,
+    BasePyOSDMapIncremental *incobj)
+{
+  if (!PyObject_TypeCheck(incobj, &BasePyOSDMapIncrementalType)) {
+    derr << "Wrong type in osdmap_apply_incremental!" << dendl;
+    return nullptr;
+  }
+
+  bufferlist bl;
+  self->osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+  OSDMap *next = new OSDMap;
+  next->decode(bl);
+  next->apply_incremental(*(incobj->inc));
+  dout(10) << __func__ << " map " << self->osdmap << " inc " << incobj->inc
+          << " next " << next << dendl;
+
+  return construct_with_capsule("mgr_module", "OSDMap", (void*)next);
+}
+
+static PyObject *osdmap_get_crush(BasePyOSDMap* self, PyObject *obj)
+{
+  return construct_with_capsule("mgr_module", "CRUSHMap",
+      (void*)(&(self->osdmap->crush)));
+}
+
+static PyObject *osdmap_get_pools_by_take(BasePyOSDMap* self, PyObject *args)
+{
+  int take;
+  if (!PyArg_ParseTuple(args, "i:get_pools_by_take",
+                       &take)) {
+    return nullptr;
+  }
+
+  PyFormatter f;
+  f.open_array_section("pools");
+  for (auto& p : self->osdmap->get_pools()) {
+    if (self->osdmap->crush->rule_has_take(p.second.crush_rule, take)) {
+      f.dump_int("pool", p.first);
+    }
+  }
+  f.close_section();
+  return f.get();
+}
+
+static PyObject *osdmap_calc_pg_upmaps(BasePyOSDMap* self, PyObject *args)
+{
+  PyObject *pool_list;
+  BasePyOSDMapIncremental *incobj;
+  double max_deviation = 0;
+  int max_iterations = 0;
+  if (!PyArg_ParseTuple(args, "OdiO:calc_pg_upmaps",
+                       &incobj, &max_deviation,
+                       &max_iterations, &pool_list)) {
+    return nullptr;
+  }
+
+  dout(10) << __func__ << " osdmap " << self->osdmap << " inc " << incobj->inc
+          << " max_deviation " << max_deviation
+          << " max_iterations " << max_iterations
+          << dendl;
+  set<int64_t> pools;
+  // FIXME: unpack pool_list and translate to pools set
+  int r = self->osdmap->calc_pg_upmaps(g_ceph_context,
+                                max_deviation,
+                                max_iterations,
+                                pools,
+                                incobj->inc);
+  dout(10) << __func__ << " r = " << r << dendl;
+  return PyInt_FromLong(r);
+}
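One possible sketch of the pool_list unpacking the FIXME above leaves out, following the Python 2 C-API conventions used elsewhere in this file (hypothetical, not part of the actual change); it would run before the calc_pg_upmaps() call:

// Translate the Python list of pool ids into the 'pools' set.
if (pool_list && PyList_Check(pool_list)) {
  for (Py_ssize_t i = 0; i < PyList_Size(pool_list); ++i) {
    PyObject *item = PyList_GET_ITEM(pool_list, i);  // borrowed reference
    if (PyInt_Check(item)) {
      pools.insert(PyInt_AsLong(item));
    }
  }
}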
+
+static PyObject *osdmap_map_pool_pgs_up(BasePyOSDMap* self, PyObject *args)
+{
+  int poolid;
+  if (!PyArg_ParseTuple(args, "i:map_pool_pgs_up",
+                       &poolid)) {
+    return nullptr;
+  }
+  auto pi = self->osdmap->get_pg_pool(poolid);
+  if (!pi)
+    return nullptr;
+  map<pg_t,vector<int>> pm;
+  for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
+    pg_t pgid(ps, poolid);
+    self->osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr);
+  }
+  PyFormatter f;
+  for (auto p : pm) {
+    string pg = stringify(p.first);
+    f.open_array_section(pg.c_str());
+    for (auto o : p.second) {
+      f.dump_int("osd", o);
+    }
+    f.close_section();
+  }
+  return f.get();
+}
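Illustrative result shape (pgids and OSD ids are hypothetical):

// {"1.0": [2, 5, 7], "1.1": [0, 3, 4], ...}
// one list of "up" OSD ids per PG in the requested pool.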
+
+static int
+BasePyOSDMap_init(BasePyOSDMap *self, PyObject *args, PyObject *kwds)
+{
+    PyObject *osdmap_capsule = nullptr;
+    static const char *kwlist[] = {"osdmap_capsule", NULL};
+
+    if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+                                      const_cast<char**>(kwlist),
+                                      &osdmap_capsule)) {
+      assert(0);
+      return -1;
+    }
+    assert(PyObject_TypeCheck(osdmap_capsule, &PyCapsule_Type));
+
+    self->osdmap = (OSDMap*)PyCapsule_GetPointer(
+        osdmap_capsule, nullptr);
+    assert(self->osdmap);
+
+    return 0;
+}
+
+
+static void
+BasePyOSDMap_dealloc(BasePyOSDMap *self)
+{
+  if (self->osdmap) {
+    delete self->osdmap;
+    self->osdmap = nullptr;
+  } else {
+    derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl;
+  }
+  Py_TYPE(self)->tp_free(self);
+}
+
+
+PyMethodDef BasePyOSDMap_methods[] = {
+  {"_get_epoch", (PyCFunction)osdmap_get_epoch, METH_NOARGS, "Get OSDMap epoch"},
+  {"_get_crush_version", (PyCFunction)osdmap_get_crush_version, METH_NOARGS,
+    "Get CRUSH version"},
+  {"_dump", (PyCFunction)osdmap_dump, METH_NOARGS, "Dump OSDMap::Incremental"},
+  {"_new_incremental", (PyCFunction)osdmap_new_incremental, METH_NOARGS,
+   "Create OSDMap::Incremental"},
+  {"_apply_incremental", (PyCFunction)osdmap_apply_incremental, METH_O,
+   "Apply OSDMap::Incremental and return the resulting OSDMap"},
+  {"_get_crush", (PyCFunction)osdmap_get_crush, METH_NOARGS, "Get CrushWrapper"},
+  {"_get_pools_by_take", (PyCFunction)osdmap_get_pools_by_take, METH_VARARGS,
+   "Get pools that have CRUSH rules that TAKE the given root"},
+  {"_calc_pg_upmaps", (PyCFunction)osdmap_calc_pg_upmaps, METH_VARARGS,
+   "Calculate new pg-upmap values"},
+  {"_map_pool_pgs_up", (PyCFunction)osdmap_map_pool_pgs_up, METH_VARARGS,
+   "Calculate up set mappings for all PGs in a pool"},
+  {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BasePyOSDMapType = {
+  PyVarObject_HEAD_INIT(NULL, 0)
+  "ceph_module.BasePyOSDMap", /* tp_name */
+  sizeof(BasePyOSDMap),     /* tp_basicsize */
+  0,                         /* tp_itemsize */
+  (destructor)BasePyOSDMap_dealloc,      /* tp_dealloc */
+  0,                         /* tp_print */
+  0,                         /* tp_getattr */
+  0,                         /* tp_setattr */
+  0,                         /* tp_compare */
+  0,                         /* tp_repr */
+  0,                         /* tp_as_number */
+  0,                         /* tp_as_sequence */
+  0,                         /* tp_as_mapping */
+  0,                         /* tp_hash */
+  0,                         /* tp_call */
+  0,                         /* tp_str */
+  0,                         /* tp_getattro */
+  0,                         /* tp_setattro */
+  0,                         /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,        /* tp_flags */
+  "Ceph OSDMap",             /* tp_doc */
+  0,                         /* tp_traverse */
+  0,                         /* tp_clear */
+  0,                         /* tp_richcompare */
+  0,                         /* tp_weaklistoffset */
+  0,                         /* tp_iter */
+  0,                         /* tp_iternext */
+  BasePyOSDMap_methods,     /* tp_methods */
+  0,                         /* tp_members */
+  0,                         /* tp_getset */
+  0,                         /* tp_base */
+  0,                         /* tp_dict */
+  0,                         /* tp_descr_get */
+  0,                         /* tp_descr_set */
+  0,                         /* tp_dictoffset */
+  (initproc)BasePyOSDMap_init,                         /* tp_init */
+  0,                         /* tp_alloc */
+  0,     /* tp_new */
+};
+
+// ----------
+
+
+static int
+BasePyOSDMapIncremental_init(BasePyOSDMapIncremental *self,
+    PyObject *args, PyObject *kwds)
+{
+    PyObject *inc_capsule = nullptr;
+    static const char *kwlist[] = {"inc_capsule", NULL};
+
+    if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+                                      const_cast<char**>(kwlist),
+                                      &inc_capsule)) {
+      assert(0);
+      return -1;
+    }
+    assert(PyObject_TypeCheck(inc_capsule, &PyCapsule_Type));
+
+    self->inc = (OSDMap::Incremental*)PyCapsule_GetPointer(
+        inc_capsule, nullptr);
+    assert(self->inc);
+
+    return 0;
+}
+
+static void
+BasePyOSDMapIncremental_dealloc(BasePyOSDMapIncremental *self)
+{
+  if (self->inc) {
+    delete self->inc;
+    self->inc = nullptr;
+  } else {
+    derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl;
+  }
+  Py_TYPE(self)->tp_free(self);
+}
+
+static PyObject *osdmap_inc_get_epoch(BasePyOSDMapIncremental *self,
+    PyObject *obj)
+{
+  return PyInt_FromLong(self->inc->epoch);
+}
+
+static PyObject *osdmap_inc_dump(BasePyOSDMapIncremental *self,
+    PyObject *obj)
+{
+  PyFormatter f;
+  self->inc->dump(&f);
+  return f.get();
+}
+
+static int get_int_float_map(PyObject *obj, map<int,double> *out)
+{
+  PyObject *ls = PyDict_Items(obj);
+  for (int j = 0; j < PyList_Size(ls); ++j) {
+    PyObject *pair = PyList_GET_ITEM(ls, j);
+    if (!PyTuple_Check(pair)) {
+      derr << __func__ << " item " << j << " not a tuple" << dendl;
+      Py_DECREF(ls);
+      return -1;
+    }
+    int k;
+    double v;
+    if (!PyArg_ParseTuple(pair, "id:pair", &k, &v)) {
+      derr << __func__ << " item " << j << " not a size 2 tuple" << dendl;
+      Py_DECREF(ls);
+      return -1;
+    }
+    (*out)[k] = v;
+  }
+
+  Py_DECREF(ls);
+  return 0;
+}
+
+static PyObject *osdmap_inc_set_osd_reweights(BasePyOSDMapIncremental *self,
+    PyObject *weightobj)
+{
+  map<int,double> wm;
+  if (get_int_float_map(weightobj, &wm) < 0) {
+    return nullptr;
+  }
+
+  for (auto i : wm) {
+    self->inc->new_weight[i.first] = std::max(0.0, std::min(1.0, i.second)) * 0x10000;
+  }
+  Py_RETURN_NONE;
+}
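Worked examples of the clamp-and-scale above; OSD weights are stored as 16.16 fixed point:

//   1.0  -> 0x10000 (65536)
//   0.5  ->  0x8000 (32768)
//   1.7  -> clamped to 1.0 -> 0x10000
//  -0.2  -> clamped to 0.0 -> 0x0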
+
+static PyObject *osdmap_inc_set_compat_weight_set_weights(
+  BasePyOSDMapIncremental *self, PyObject *weightobj)
+{
+  map<int,double> wm;
+  if (get_int_float_map(weightobj, &wm) < 0) {
+    return nullptr;
+  }
+
+  CrushWrapper crush;
+  assert(self->inc->crush.length());  // see new_incremental
+  auto p = self->inc->crush.begin();
+  ::decode(crush, p);
+  crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1);
+  for (auto i : wm) {
+    crush.choose_args_adjust_item_weightf(
+      g_ceph_context,
+      crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS),
+      i.first,
+      { i.second },
+      nullptr);
+  }
+  self->inc->crush.clear();
+  crush.encode(self->inc->crush, CEPH_FEATURES_ALL);
+  Py_RETURN_NONE;
+}
+
+PyMethodDef BasePyOSDMapIncremental_methods[] = {
+  {"_get_epoch", (PyCFunction)osdmap_inc_get_epoch, METH_NOARGS,
+    "Get OSDMap::Incremental epoch"},
+  {"_dump", (PyCFunction)osdmap_inc_dump, METH_NOARGS,
+    "Dump OSDMap::Incremental"},
+  {"_set_osd_reweights", (PyCFunction)osdmap_inc_set_osd_reweights,
+    METH_O, "Set osd reweight values"},
+  {"_set_crush_compat_weight_set_weights",
+   (PyCFunction)osdmap_inc_set_compat_weight_set_weights, METH_O,
+   "Set weight values in the pending CRUSH compat weight-set"},
+  {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BasePyOSDMapIncrementalType = {
+  PyVarObject_HEAD_INIT(NULL, 0)
+  "ceph_module.BasePyOSDMapIncremental", /* tp_name */
+  sizeof(BasePyOSDMapIncremental),     /* tp_basicsize */
+  0,                         /* tp_itemsize */
+  (destructor)BasePyOSDMapIncremental_dealloc,      /* tp_dealloc */
+  0,                         /* tp_print */
+  0,                         /* tp_getattr */
+  0,                         /* tp_setattr */
+  0,                         /* tp_compare */
+  0,                         /* tp_repr */
+  0,                         /* tp_as_number */
+  0,                         /* tp_as_sequence */
+  0,                         /* tp_as_mapping */
+  0,                         /* tp_hash */
+  0,                         /* tp_call */
+  0,                         /* tp_str */
+  0,                         /* tp_getattro */
+  0,                         /* tp_setattro */
+  0,                         /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,        /* tp_flags */
+  "Ceph OSDMapIncremental",  /* tp_doc */
+  0,                         /* tp_traverse */
+  0,                         /* tp_clear */
+  0,                         /* tp_richcompare */
+  0,                         /* tp_weaklistoffset */
+  0,                         /* tp_iter */
+  0,                         /* tp_iternext */
+  BasePyOSDMapIncremental_methods,     /* tp_methods */
+  0,                         /* tp_members */
+  0,                         /* tp_getset */
+  0,                         /* tp_base */
+  0,                         /* tp_dict */
+  0,                         /* tp_descr_get */
+  0,                         /* tp_descr_set */
+  0,                         /* tp_dictoffset */
+  (initproc)BasePyOSDMapIncremental_init,                         /* tp_init */
+  0,                         /* tp_alloc */
+  0,                         /* tp_new */
+};
+
+
+// ----------
+
+static int
+BasePyCRUSH_init(BasePyCRUSH *self,
+    PyObject *args, PyObject *kwds)
+{
+    PyObject *crush_capsule = nullptr;
+    static const char *kwlist[] = {"crush_capsule", NULL};
+
+    if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+                                      const_cast<char**>(kwlist),
+                                      &crush_capsule)) {
+      assert(0);
+      return -1;
+    }
+    assert(PyObject_TypeCheck(crush_capsule, &PyCapsule_Type));
+
+    auto ptr_ref = (ceph::shared_ptr<CrushWrapper>*)(
+        PyCapsule_GetPointer(crush_capsule, nullptr));
+
+    // The capsule carries a pointer to a shared_ptr -- awkward, but
+    // enough to get it into this constructor.  The assignment below is
+    // a real shared_ptr copy (bumping the refcount); the pointer to the
+    // shared_ptr itself is then discarded.
+    self->crush = *ptr_ref;
+    assert(self->crush);
+
+    return 0;
+}
+
+static void
+BasePyCRUSH_dealloc(BasePyCRUSH *self)
+{
+  self->crush.reset();
+  Py_TYPE(self)->tp_free(self);
+}
+
+static PyObject *crush_dump(BasePyCRUSH *self, PyObject *obj)
+{
+  PyFormatter f;
+  self->crush->dump(&f);
+  return f.get();
+}
+
+static PyObject *crush_get_item_name(BasePyCRUSH *self, PyObject *args)
+{
+  int item;
+  if (!PyArg_ParseTuple(args, "i:get_item_name", &item)) {
+    return nullptr;
+  }
+  if (!self->crush->item_exists(item)) {
+    Py_RETURN_NONE;
+  }
+  return PyString_FromString(self->crush->get_item_name(item));
+}
+
+static PyObject *crush_get_item_weight(BasePyCRUSH *self, PyObject *args)
+{
+  int item;
+  if (!PyArg_ParseTuple(args, "i:get_item_weight", &item)) {
+    return nullptr;
+  }
+  if (!self->crush->item_exists(item)) {
+    Py_RETURN_NONE;
+  }
+  return PyFloat_FromDouble(self->crush->get_item_weightf(item));
+}
+
+static PyObject *crush_find_takes(BasePyCRUSH *self, PyObject *obj)
+{
+  set<int> takes;
+  self->crush->find_takes(&takes);
+  PyFormatter f;
+  f.open_array_section("takes");
+  for (auto root : takes) {
+    f.dump_int("root", root);
+  }
+  f.close_section();
+  return f.get();
+}
+
+static PyObject *crush_get_take_weight_osd_map(BasePyCRUSH *self, PyObject *args)
+{
+  int root;
+  if (!PyArg_ParseTuple(args, "i:get_take_weight_osd_map",
+                       &root)) {
+    return nullptr;
+  }
+  map<int,float> wmap;
+
+  if (!self->crush->item_exists(root)) {
+    return nullptr;
+  }
+
+  self->crush->get_take_weight_osd_map(root, &wmap);
+  PyFormatter f;
+  f.open_object_section("weights");
+  for (auto& p : wmap) {
+    string n = stringify(p.first);     // ick
+    f.dump_float(n.c_str(), p.second);
+  }
+  f.close_section();
+  return f.get();
+}
+
+PyMethodDef BasePyCRUSH_methods[] = {
+  {"_dump", (PyCFunction)crush_dump, METH_NOARGS, "Dump map"},
+  {"_get_item_name", (PyCFunction)crush_get_item_name, METH_VARARGS,
+    "Get item name"},
+  {"_get_item_weight", (PyCFunction)crush_get_item_weight, METH_VARARGS,
+    "Get item weight"},
+  {"_find_takes", (PyCFunction)crush_find_takes, METH_NOARGS,
+    "Find distinct TAKE roots"},
+  {"_get_take_weight_osd_map", (PyCFunction)crush_get_take_weight_osd_map,
+    METH_VARARGS, "Get OSD weight map for a given TAKE root node"},
+  {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BasePyCRUSHType = {
+  PyVarObject_HEAD_INIT(NULL, 0)
+  "ceph_module.BasePyCRUSH", /* tp_name */
+  sizeof(BasePyCRUSH),     /* tp_basicsize */
+  0,                         /* tp_itemsize */
+  (destructor)BasePyCRUSH_dealloc,      /* tp_dealloc */
+  0,                         /* tp_print */
+  0,                         /* tp_getattr */
+  0,                         /* tp_setattr */
+  0,                         /* tp_compare */
+  0,                         /* tp_repr */
+  0,                         /* tp_as_number */
+  0,                         /* tp_as_sequence */
+  0,                         /* tp_as_mapping */
+  0,                         /* tp_hash */
+  0,                         /* tp_call */
+  0,                         /* tp_str */
+  0,                         /* tp_getattro */
+  0,                         /* tp_setattro */
+  0,                         /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,        /* tp_flags */
+  "Ceph OSDMapIncremental",  /* tp_doc */
+  0,                         /* tp_traverse */
+  0,                         /* tp_clear */
+  0,                         /* tp_richcompare */
+  0,                         /* tp_weaklistoffset */
+  0,                         /* tp_iter */
+  0,                         /* tp_iternext */
+  BasePyCRUSH_methods,     /* tp_methods */
+  0,                         /* tp_members */
+  0,                         /* tp_getset */
+  0,                         /* tp_base */
+  0,                         /* tp_dict */
+  0,                         /* tp_descr_get */
+  0,                         /* tp_descr_set */
+  0,                         /* tp_dictoffset */
+  (initproc)BasePyCRUSH_init,                         /* tp_init */
+  0,                         /* tp_alloc */
+  0,                         /* tp_new */
+};
diff --git a/ceph/src/mgr/PyOSDMap.h b/ceph/src/mgr/PyOSDMap.h
new file mode 100644 (file)
index 0000000..09e5b04
--- /dev/null
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+
+#include "Python.h"
+
+
+
+extern PyTypeObject BasePyOSDMapType;
+extern PyTypeObject BasePyOSDMapIncrementalType;
+extern PyTypeObject BasePyCRUSHType;
+
+PyObject *construct_with_capsule(
+    const std::string &module,
+    const std::string &clsname,
+    void *wrapped);
+
diff --git a/ceph/src/mgr/PyState.cc b/ceph/src/mgr/PyState.cc
deleted file mode 100644 (file)
index fb6b831..0000000
+++ /dev/null
@@ -1,490 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 John Spray <john.spray@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-/**
- * The interface we present to python code that runs within
- * ceph-mgr.
- */
-
-#include "Mgr.h"
-
-#include "mon/MonClient.h"
-#include "common/errno.h"
-#include "common/version.h"
-
-#include "PyState.h"
-#include "Gil.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-
-PyModules *global_handle = NULL;
-
-
-class MonCommandCompletion : public Context
-{
-  PyObject *python_completion;
-  const std::string tag;
-  PyThreadState *pThreadState;
-
-public:
-  std::string outs;
-  bufferlist outbl;
-
-  MonCommandCompletion(PyObject* ev, const std::string &tag_, PyThreadState *ts_)
-    : python_completion(ev), tag(tag_), pThreadState(ts_)
-  {
-    assert(python_completion != nullptr);
-    Py_INCREF(python_completion);
-  }
-
-  ~MonCommandCompletion() override
-  {
-    Py_DECREF(python_completion);
-  }
-
-  void finish(int r) override
-  {
-    dout(10) << "MonCommandCompletion::finish()" << dendl;
-    {
-      // Scoped so the Gil is released before calling notify_all()
-      // Create new thread state because this is called via the MonClient
-      // Finisher, not the PyModules finisher.
-      Gil gil(pThreadState, true);
-
-      auto set_fn = PyObject_GetAttrString(python_completion, "complete");
-      assert(set_fn != nullptr);
-
-      auto pyR = PyInt_FromLong(r);
-      auto pyOutBl = PyString_FromString(outbl.to_str().c_str());
-      auto pyOutS = PyString_FromString(outs.c_str());
-      auto args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS);
-      Py_DECREF(pyR);
-      Py_DECREF(pyOutBl);
-      Py_DECREF(pyOutS);
-
-      auto rtn = PyObject_CallObject(set_fn, args);
-      if (rtn != nullptr) {
-       Py_DECREF(rtn);
-      }
-      Py_DECREF(args);
-    }
-    global_handle->notify_all("command", tag);
-  }
-};
-
-
-static PyObject*
-ceph_send_command(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-
-  // Like mon, osd, mds
-  char *type = nullptr;
-
-  // Like "23" for an OSD or "myid" for an MDS
-  char *name = nullptr;
-
-  char *cmd_json = nullptr;
-  char *tag = nullptr;
-  PyObject *completion = nullptr;
-  if (!PyArg_ParseTuple(args, "sOssss:ceph_send_command",
-        &handle, &completion, &type, &name, &cmd_json, &tag)) {
-    return nullptr;
-  }
-
-  auto set_fn = PyObject_GetAttrString(completion, "complete");
-  if (set_fn == nullptr) {
-    ceph_abort();  // TODO raise python exception instead
-  } else {
-    assert(PyCallable_Check(set_fn));
-  }
-  Py_DECREF(set_fn);
-
-  auto c = new MonCommandCompletion(completion, tag, PyThreadState_Get());
-  if (std::string(type) == "mon") {
-    global_handle->get_monc().start_mon_command(
-        {cmd_json},
-        {},
-        &c->outbl,
-        &c->outs,
-        c);
-  } else if (std::string(type) == "osd") {
-    std::string err;
-    uint64_t osd_id = strict_strtoll(name, 10, &err);
-    if (!err.empty()) {
-      delete c;
-      string msg("invalid osd_id: ");
-      msg.append("\"").append(name).append("\"");
-      PyErr_SetString(PyExc_ValueError, msg.c_str());
-      return nullptr;
-    }
-
-    ceph_tid_t tid;
-    global_handle->get_objecter().osd_command(
-        osd_id,
-        {cmd_json},
-        {},
-        &tid,
-        &c->outbl,
-        &c->outs,
-        c);
-  } else if (std::string(type) == "mds") {
-    int r = global_handle->get_client().mds_command(
-        name,
-        {cmd_json},
-        {},
-        &c->outbl,
-        &c->outs,
-        c);
-    if (r != 0) {
-      string msg("failed to send command to mds: ");
-      msg.append(cpp_strerror(r));
-      PyErr_SetString(PyExc_RuntimeError, msg.c_str());
-      return nullptr;
-    }
-  } else if (std::string(type) == "pg") {
-    pg_t pgid;
-    if (!pgid.parse(name)) {
-      delete c;
-      string msg("invalid pgid: ");
-      msg.append("\"").append(name).append("\"");
-      PyErr_SetString(PyExc_ValueError, msg.c_str());
-      return nullptr;
-    }
-
-    ceph_tid_t tid;
-    global_handle->get_objecter().pg_command(
-        pgid,
-        {cmd_json},
-        {},
-        &tid,
-        &c->outbl,
-        &c->outs,
-        c);
-  } else {
-    delete c;
-    string msg("unknown service type: ");
-    msg.append(type);
-    PyErr_SetString(PyExc_ValueError, msg.c_str());
-    return nullptr;
-  }
-
-  Py_RETURN_NONE;
-}
-
-static PyObject*
-ceph_set_health_checks(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  PyObject *checks = NULL;
-  if (!PyArg_ParseTuple(args, "sO:ceph_set_health_checks", &handle, &checks)) {
-    return NULL;
-  }
-  if (!PyDict_Check(checks)) {
-    derr << __func__ << " arg not a dict" << dendl;
-    Py_RETURN_NONE;
-  }
-  PyObject *checksls = PyDict_Items(checks);
-  health_check_map_t out_checks;
-  for (int i = 0; i < PyList_Size(checksls); ++i) {
-    PyObject *kv = PyList_GET_ITEM(checksls, i);
-    char *check_name = nullptr;
-    PyObject *check_info = nullptr;
-    if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) {
-      derr << __func__ << " dict item " << i
-          << " not a size 2 tuple" << dendl;
-      continue;
-    }
-    if (!PyDict_Check(check_info)) {
-      derr << __func__ << " item " << i << " " << check_name
-          << " value not a dict" << dendl;
-      continue;
-    }
-    health_status_t severity = HEALTH_OK;
-    string summary;
-    list<string> detail;
-    PyObject *infols = PyDict_Items(check_info);
-    for (int j = 0; j < PyList_Size(infols); ++j) {
-      PyObject *pair = PyList_GET_ITEM(infols, j);
-      if (!PyTuple_Check(pair)) {
-       derr << __func__ << " item " << i << " pair " << j
-            << " not a tuple" << dendl;
-       continue;
-      }
-      char *k = nullptr;
-      PyObject *v = nullptr;
-      if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) {
-       derr << __func__ << " item " << i << " pair " << j
-            << " not a size 2 tuple" << dendl;
-       continue;
-      }
-      string ks(k);
-      if (ks == "severity") {
-       if (!PyString_Check(v)) {
-         derr << __func__ << " check " << check_name
-              << " severity value not string" << dendl;
-         continue;
-       }
-       string vs(PyString_AsString(v));
-       if (vs == "warning") {
-         severity = HEALTH_WARN;
-       } else if (vs == "error") {
-         severity = HEALTH_ERR;
-       }
-      } else if (ks == "summary") {
-       if (!PyString_Check(v)) {
-         derr << __func__ << " check " << check_name
-              << " summary value not string" << dendl;
-         continue;
-       }
-       summary = PyString_AsString(v);
-      } else if (ks == "detail") {
-       if (!PyList_Check(v)) {
-         derr << __func__ << " check " << check_name
-              << " detail value not list" << dendl;
-         continue;
-       }
-       for (int k = 0; k < PyList_Size(v); ++k) {
-         PyObject *di = PyList_GET_ITEM(v, k);
-         if (!PyString_Check(di)) {
-           derr << __func__ << " check " << check_name
-                << " detail item " << k << " not a string" << dendl;
-           continue;
-         }
-         detail.push_back(PyString_AsString(di));
-       }
-      } else {
-       derr << __func__ << " check " << check_name
-            << " unexpected key " << k << dendl;
-      }
-    }
-    auto& d = out_checks.add(check_name, severity, summary);
-    d.detail.swap(detail);
-  }
-
-  JSONFormatter jf(true);
-  dout(10) << "module " << handle << " health checks:\n";
-  out_checks.dump(&jf);
-  jf.flush(*_dout);
-  *_dout << dendl;
-
-  global_handle->set_health_checks(handle, std::move(out_checks));
-  
-  Py_RETURN_NONE;
-}
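For illustration, the dict shape the parser above expects from Python (check name and messages are hypothetical); unrecognized severities leave HEALTH_OK, and unknown keys are logged and skipped:

// {"MGR_EXAMPLE_WARN": {"severity": "warning",
//                       "summary": "example summary",
//                       "detail": ["first detail line", "second detail line"]}}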
-
-
-static PyObject*
-ceph_state_get(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *what = NULL;
-  if (!PyArg_ParseTuple(args, "ss:ceph_state_get", &handle, &what)) {
-    return NULL;
-  }
-
-  return global_handle->get_python(what);
-}
-
-
-static PyObject*
-ceph_get_server(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *hostname = NULL;
-  if (!PyArg_ParseTuple(args, "sz:ceph_get_server", &handle, &hostname)) {
-    return NULL;
-  }
-
-  if (hostname) {
-    return global_handle->get_server_python(hostname);
-  } else {
-    return global_handle->list_servers_python();
-  }
-}
-
-static PyObject*
-ceph_get_mgr_id(PyObject *self, PyObject *args)
-{
-  return PyString_FromString(g_conf->name.get_id().c_str());
-}
-
-static PyObject*
-ceph_config_get(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *what = nullptr;
-  if (!PyArg_ParseTuple(args, "ss:ceph_config_get", &handle, &what)) {
-    derr << "Invalid args!" << dendl;
-    return nullptr;
-  }
-
-  std::string value;
-  bool found = global_handle->get_config(handle, what, &value);
-  if (found) {
-    dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl;
-    return PyString_FromString(value.c_str());
-  } else {
-    dout(4) << "ceph_config_get " << what << " not found " << dendl;
-    Py_RETURN_NONE;
-  }
-}
-
-static PyObject*
-ceph_config_get_prefix(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *prefix = nullptr;
-  if (!PyArg_ParseTuple(args, "ss:ceph_config_get", &handle, &prefix)) {
-    derr << "Invalid args!" << dendl;
-    return nullptr;
-  }
-
-  return global_handle->get_config_prefix(handle, prefix);
-}
-
-static PyObject*
-ceph_config_set(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *key = nullptr;
-  char *value = nullptr;
-  if (!PyArg_ParseTuple(args, "ssz:ceph_config_set", &handle, &key, &value)) {
-    return nullptr;
-  }
-  boost::optional<string> val;
-  if (value) {
-    val = value;
-  }
-  global_handle->set_config(handle, key, val);
-
-  Py_RETURN_NONE;
-}
-
-static PyObject*
-get_metadata(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *svc_name = NULL;
-  char *svc_id = NULL;
-  if (!PyArg_ParseTuple(args, "sss:get_metadata", &handle, &svc_name, &svc_id)) {
-    return nullptr;
-  }
-  return global_handle->get_metadata_python(handle, svc_name, svc_id);
-}
-
-static PyObject*
-get_daemon_status(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *svc_name = NULL;
-  char *svc_id = NULL;
-  if (!PyArg_ParseTuple(args, "sss:get_daemon_status", &handle, &svc_name,
-                       &svc_id)) {
-    return nullptr;
-  }
-  return global_handle->get_daemon_status_python(handle, svc_name, svc_id);
-}
-
-static PyObject*
-ceph_log(PyObject *self, PyObject *args)
-{
-  int level = 0;
-  char *record = nullptr;
-  char *handle = nullptr;
-  if (!PyArg_ParseTuple(args, "sis:log", &handle, &level, &record)) {
-    return nullptr;
-  }
-
-  global_handle->log(handle, level, record);
-
-  Py_RETURN_NONE;
-}
-
-static PyObject *
-ceph_get_version(PyObject *self, PyObject *args)
-{
-  return PyString_FromString(pretty_version_to_str().c_str());
-}
-
-static PyObject *
-ceph_get_context(PyObject *self, PyObject *args)
-{
-  return global_handle->get_context();
-}
-
-static PyObject*
-get_counter(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *svc_name = nullptr;
-  char *svc_id = nullptr;
-  char *counter_path = nullptr;
-  if (!PyArg_ParseTuple(args, "ssss:get_counter", &handle, &svc_name,
-                                                  &svc_id, &counter_path)) {
-    return nullptr;
-  }
-  return global_handle->get_counter_python(
-      handle, svc_name, svc_id, counter_path);
-}
-
-static PyObject*
-get_perf_schema(PyObject *self, PyObject *args)
-{
-  char *handle = nullptr;
-  char *type_str = nullptr;
-  char *svc_id = nullptr;
-  if (!PyArg_ParseTuple(args, "sss:get_perf_schema", &handle, &type_str,
-                                                     &svc_id)) {
-    return nullptr;
-  }
-
-  return global_handle->get_perf_schema_python(handle, type_str, svc_id);
-}
-
-PyMethodDef CephStateMethods[] = {
-    {"get", ceph_state_get, METH_VARARGS,
-     "Get a cluster object"},
-    {"get_server", ceph_get_server, METH_VARARGS,
-     "Get a server object"},
-    {"get_metadata", get_metadata, METH_VARARGS,
-     "Get a service's metadata"},
-    {"get_daemon_status", get_daemon_status, METH_VARARGS,
-     "Get a service's status"},
-    {"send_command", ceph_send_command, METH_VARARGS,
-     "Send a mon command"},
-    {"set_health_checks", ceph_set_health_checks, METH_VARARGS,
-     "Set health checks for this module"},
-    {"get_mgr_id", ceph_get_mgr_id, METH_NOARGS,
-     "Get the mgr id"},
-    {"get_config", ceph_config_get, METH_VARARGS,
-     "Get a configuration value"},
-    {"get_config_prefix", ceph_config_get_prefix, METH_VARARGS,
-     "Get all configuration values with a given prefix"},
-    {"set_config", ceph_config_set, METH_VARARGS,
-     "Set a configuration value"},
-    {"get_counter", get_counter, METH_VARARGS,
-      "Get a performance counter"},
-    {"get_perf_schema", get_perf_schema, METH_VARARGS,
-      "Get the performance counter schema"},
-    {"log", ceph_log, METH_VARARGS,
-     "Emit a (local) log message"},
-    {"get_version", ceph_get_version, METH_VARARGS,
-     "Get the ceph version of this process"},
-    {"get_context", ceph_get_context, METH_NOARGS,
-      "Get a CephContext* in a python capsule"},
-    {NULL, NULL, 0, NULL}
-};
-
diff --git a/ceph/src/mgr/PyState.h b/ceph/src/mgr/PyState.h
deleted file mode 100644 (file)
index e53296b..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef PYSTATE_H_
-#define PYSTATE_H_
-
-#include "Python.h"
-
-class PyModules;
-
-extern PyModules *global_handle;
-extern PyMethodDef CephStateMethods[];
-
-#endif
-
diff --git a/ceph/src/mgr/StandbyPyModules.cc b/ceph/src/mgr/StandbyPyModules.cc
new file mode 100644 (file)
index 0000000..e567269
--- /dev/null
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "StandbyPyModules.h"
+
+#include "common/debug.h"
+
+#include "mgr/MgrContext.h"
+#include "mgr/Gil.h"
+
+
+#include <boost/python.hpp>
+#include "include/assert.h"  // boost clobbers this
+
+// For ::config_prefix
+#include "PyModuleRegistry.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+// Declaration fulfilled by ActivePyModules
+std::string handle_pyerror();
+
+
+StandbyPyModules::StandbyPyModules(MonClient *monc_, const MgrMap &mgr_map_)
+    : monc(monc_), load_config_thread(monc, &state)
+{
+  state.set_mgr_map(mgr_map_);
+}
+
+// FIXME: completely identical to ActivePyModules
+void StandbyPyModules::shutdown()
+{
+  Mutex::Locker locker(lock);
+
+  if (!state.is_config_loaded && load_config_thread.is_started()) {
+    // FIXME: handle cases where initial load races with shutdown
+    // this is actually not super rare: the config load blocks on mon
+    // commands, so an early shutdown can easily catch it still in flight
+    assert(0);
+    //load_config_thread.kill(SIGKILL);
+  }
+
+  // Signal modules to drop out of serve() and/or tear down resources
+  for (auto &i : modules) {
+    auto module = i.second.get();
+    const auto& name = i.first;
+    dout(10) << "waiting for module " << name << " to shutdown" << dendl;
+    lock.Unlock();
+    module->shutdown();
+    lock.Lock();
+    dout(10) << "module " << name << " shutdown" << dendl;
+  }
+
+  // For modules implementing serve(), finish the threads where we
+  // were running that.
+  for (auto &i : modules) {
+    lock.Unlock();
+    dout(10) << "joining thread for module " << i.first << dendl;
+    i.second->thread.join();
+    dout(10) << "joined thread for module " << i.first << dendl;
+    lock.Lock();
+  }
+
+  modules.clear();
+}
+
+int StandbyPyModules::start_one(std::string const &module_name,
+    PyObject *pClass, const SafeThreadState &pMyThreadState)
+{
+  Mutex::Locker l(lock);
+
+  assert(modules.count(module_name) == 0);
+
+  modules[module_name].reset(new StandbyPyModule(
+      state,
+      module_name, pClass,
+      pMyThreadState));
+
+  if (modules.size() == 1) {
+    load_config_thread.create("LoadConfig");
+  }
+
+  int r = modules[module_name]->load();
+  if (r != 0) {
+    modules.erase(module_name);
+    return r;
+  } else {
+    dout(4) << "Starting thread for " << module_name << dendl;
+    // The thread borrows the module's module_name member as its
+    // char* thread name, so the thread must not outlive the module instance.
+    modules[module_name]->thread.create(
+        modules[module_name]->get_name().c_str());
+    return 0;
+  }
+}
+
+int StandbyPyModule::load()
+{
+  Gil gil(pMyThreadState, true);
+
+  // We tell the module how we name it, so that it can be consistent
+  // with us in logging etc.
+  auto pThisPtr = PyCapsule_New(this, nullptr, nullptr);
+  assert(pThisPtr != nullptr);
+  auto pModuleName = PyString_FromString(module_name.c_str());
+  assert(pModuleName != nullptr);
+  auto pArgs = PyTuple_Pack(2, pModuleName, pThisPtr);
+  Py_DECREF(pThisPtr);
+  Py_DECREF(pModuleName);
+
+  pClassInstance = PyObject_CallObject(pClass, pArgs);
+  Py_DECREF(pArgs);
+  if (pClassInstance == nullptr) {
+    derr << "Failed to construct class in '" << module_name << "'" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  } else {
+    dout(1) << "Constructed class from module: " << module_name << dendl;
+    return 0;
+  }
+}
+
+void *StandbyPyModules::LoadConfigThread::entry()
+{
+  dout(10) << "listing keys" << dendl;
+  JSONCommand cmd;
+  cmd.run(monc, "{\"prefix\": \"config-key ls\"}");
+  cmd.wait();
+  assert(cmd.r == 0);
+
+  std::map<std::string, std::string> loaded;
+  
+  for (auto &key_str : cmd.json_result.get_array()) {
+    std::string const key = key_str.get_str();
+    dout(20) << "saw key '" << key << "'" << dendl;
+
+    const std::string config_prefix = PyModuleRegistry::config_prefix;
+
+    if (key.substr(0, config_prefix.size()) == config_prefix) {
+      dout(20) << "fetching '" << key << "'" << dendl;
+      Command get_cmd;
+      std::ostringstream cmd_json;
+      cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" << key << "\"}";
+      get_cmd.run(monc, cmd_json.str());
+      get_cmd.wait();
+      assert(get_cmd.r == 0);
+      loaded[key] = get_cmd.outbl.to_str();
+    }
+  }
+  state->loaded_config(loaded);
+
+  return nullptr;
+}
+
+bool StandbyPyModule::get_config(const std::string &key,
+                                 std::string *value) const
+{
+  PyThreadState *tstate = PyEval_SaveThread();
+  PyEval_RestoreThread(tstate);
+
+  const std::string global_key = PyModuleRegistry::config_prefix
+    + module_name + "/" + key;
+
+  dout(4) << __func__ << " key: " << global_key << dendl;
+
+  return state.with_config([global_key, value](const PyModuleConfig &config){
+    if (config.count(global_key)) {
+      *value = config.at(global_key);
+      return true;
+    } else {
+      return false;
+    }
+  });
+}
+
+std::string StandbyPyModule::get_active_uri() const
+{
+  std::string result;
+  state.with_mgr_map([&result, this](const MgrMap &mgr_map){
+    auto iter = mgr_map.services.find(module_name);
+    if (iter != mgr_map.services.end()) {
+      result = iter->second;
+    }
+  });
+
+  return result;
+}
+
diff --git a/ceph/src/mgr/StandbyPyModules.h b/ceph/src/mgr/StandbyPyModules.h
new file mode 100644 (file)
index 0000000..4f01146
--- /dev/null
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#pragma once
+
+#include "Python.h"
+
+#include <string>
+#include <map>
+
+#include "common/Thread.h"
+#include "common/Mutex.h"
+
+#include "mgr/Gil.h"
+#include "mon/MonClient.h"
+#include "mon/MgrMap.h"
+#include "mgr/PyModuleRunner.h"
+
+typedef std::map<std::string, std::string> PyModuleConfig;
+
+/**
+ * State that is read by all modules running in standby mode
+ */
+class StandbyPyModuleState
+{
+  mutable Mutex lock{"StandbyPyModuleState::lock"};
+
+  MgrMap mgr_map;
+  PyModuleConfig config_cache;
+
+  mutable Cond config_loaded;
+
+public:
+
+  bool is_config_loaded = false;
+
+  void set_mgr_map(const MgrMap &mgr_map_)
+  {
+    Mutex::Locker l(lock);
+
+    mgr_map = mgr_map_;
+  }
+
+  void loaded_config(const PyModuleConfig &config_)
+  {
+    Mutex::Locker l(lock);
+
+    config_cache = config_;
+    is_config_loaded = true;
+    config_loaded.Signal();
+  }
+
+  template<typename Callback, typename...Args>
+  void with_mgr_map(Callback&& cb, Args&&...args) const
+  {
+    Mutex::Locker l(lock);
+    std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...);
+  }
+
+  template<typename Callback, typename...Args>
+  auto with_config(Callback&& cb, Args&&... args) const ->
+    decltype(cb(config_cache, std::forward<Args>(args)...)) {
+    Mutex::Locker l(lock);
+
+    if (!is_config_loaded) {
+      config_loaded.Wait(lock);
+    }
+
+    return std::forward<Callback>(cb)(config_cache, std::forward<Args>(args)...);
+  }
+};
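A minimal usage sketch of the blocking accessor above (key name hypothetical; cf. StandbyPyModule::get_config in the .cc file), which waits for LoadConfigThread to call loaded_config() before running the callback under the lock:

std::string value;
bool found = state.with_config([&value](const PyModuleConfig &config) {
  auto it = config.find("mgr/example_module/example_key");
  if (it == config.end()) {
    return false;
  }
  value = it->second;
  return true;
});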
+
+
+class StandbyPyModule : public PyModuleRunner
+{
+  StandbyPyModuleState &state;
+
+  public:
+
+  StandbyPyModule(
+      StandbyPyModuleState &state_,
+      const std::string &module_name_,
+      PyObject *pClass_,
+      const SafeThreadState &pMyThreadState_)
+    :
+      PyModuleRunner(module_name_, pClass_, pMyThreadState_),
+      state(state_)
+  {
+  }
+
+  bool get_config(const std::string &key, std::string *value) const;
+  std::string get_active_uri() const;
+
+  int load();
+};
+
+class StandbyPyModules
+{
+private:
+  mutable Mutex lock{"StandbyPyModules::lock"};
+  std::map<std::string, std::unique_ptr<StandbyPyModule>> modules;
+
+  MonClient *monc;
+
+  StandbyPyModuleState state;
+
+  void load_config();
+  class LoadConfigThread : public Thread
+  {
+    protected:
+      MonClient *monc;
+      StandbyPyModuleState *state;
+    public:
+    LoadConfigThread(MonClient *monc_, StandbyPyModuleState *state_)
+      : monc(monc_), state(state_)
+    {}
+    void *entry() override;
+  };
+
+  LoadConfigThread load_config_thread;
+
+public:
+
+  StandbyPyModules(
+      MonClient *monc_,
+      const MgrMap &mgr_map_);
+
+  int start_one(std::string const &module_name,
+                PyObject *pClass,
+                const SafeThreadState &pMyThreadState);
+
+  void shutdown();
+
+  void handle_mgr_map(const MgrMap &mgr_map)
+  {
+    state.set_mgr_map(mgr_map);
+  }
+
+};
index c9c836dc5151d99bedb09bcd156ce822575c6665..1f13145a6d929678b9f7d44cae453586f9a8d8d7 100644 (file)
@@ -1551,11 +1551,15 @@ void AuthMonitor::upgrade_format()
       }
     }
 
-    // add bootstrap key
-    {
+    // add bootstrap key if it does not already exist
+    // (might have already been get-or-create'd by
+    //  ceph-create-keys)
+    EntityName bootstrap_mgr_name;
+    int r = bootstrap_mgr_name.from_str("client.bootstrap-mgr");
+    assert(r);
+    if (!mon->key_server.contains(bootstrap_mgr_name)) {
       KeyServerData::Incremental auth_inc;
-      bool r = auth_inc.name.from_str("client.bootstrap-mgr");
-      assert(r);
+      auth_inc.name = bootstrap_mgr_name;
       ::encode("allow profile bootstrap-mgr", auth_inc.auth.caps["mon"]);
       auth_inc.op = KeyServerData::AUTH_INC_ADD;
       // generate key
index b7fde85528deff25ee46b52e537e213fa055c245..f69bcf16d5a6395b4eed8550012099a28124a834 100644 (file)
@@ -159,11 +159,11 @@ void Elector::reset_timer(double plus)
    * as far as we know, we may even be dead); so, just propose ourselves as the
    * Leader.
    */
-  expire_event = new C_MonContext(mon, [this](int) {
-      expire();
-    });
-  mon->timer.add_event_after(g_conf->mon_election_timeout + plus,
-                            expire_event);
+  expire_event = mon->timer.add_event_after(
+    g_conf->mon_election_timeout + plus,
+    new C_MonContext(mon, [this](int) {
+       expire();
+      }));
 }
 
 
index c7c2b281b48ced94585fe2c98f6f86b579bcd97c..b52767f9bceb8563dda0a866a16c568e8b623653 100644 (file)
@@ -429,23 +429,24 @@ bool LogMonitor::preprocess_command(MonOpRequestRef op)
     };
 
     auto rp = summary.tail.rbegin();
-    while (num > 0 && rp != summary.tail.rend()) {
+    for (; num > 0 && rp != summary.tail.rend(); ++rp) {
       if (match(*rp)) {
         num--;
       }
-      ++rp;
+    }
+    if (rp == summary.tail.rend()) {
+      --rp;
     }
     ostringstream ss;
-    auto p = summary.tail.begin();
-    for ( ; p != summary.tail.end(); ++p) {
-      if (!match(*p)) {
+    for (; rp != summary.tail.rbegin(); --rp) {
+      if (!match(*rp)) {
         continue;
       }
 
       if (f) {
-       f->dump_object("entry", *p);
+       f->dump_object("entry", *rp);
       } else {
-       ss << *p << "\n";
+       ss << *rp << "\n";
       }
     }
     if (f) {
@@ -811,7 +812,7 @@ ceph::logging::Graylog::Ref LogMonitor::log_channel_info::get_graylog(
   if (graylogs.count(channel) == 0) {
     auto graylog(std::make_shared<ceph::logging::Graylog>("mon"));
 
-    graylog->set_fsid(g_conf->fsid);
+    graylog->set_fsid(g_conf->get_val<uuid_d>("fsid"));
     graylog->set_hostname(g_conf->host);
     graylog->set_destination(get_str_map_key(log_to_graylog_host, channel,
                                             &CLOG_CONFIG_DEFAULT_KEY),
index 32a80a7b8b16a84d3d783bd4eb3d83d9f0cc71ec..59ebaaf755e317c6accba5e5e297eddf831aef78 100644 (file)
@@ -49,6 +49,10 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
                << ").mds e" << fsmap.get_epoch() << " ";
 }
 
+static const string MDS_METADATA_PREFIX("mds_metadata");
+static const string MDS_HEALTH_PREFIX("mds_health");
+
+
 /*
  * Specialized implementation of cmd_getval to allow us to parse
  * out strongly-typedef'd types
@@ -71,9 +75,6 @@ template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
   return cmd_getval(cct, cmdmap, k, (int64_t&)val);
 }
 
-static const string MDS_METADATA_PREFIX("mds_metadata");
-
-
 // my methods
 
 void MDSMonitor::print_map(FSMap &m, int dbl)
@@ -89,6 +90,12 @@ void MDSMonitor::create_initial()
   dout(10) << "create_initial" << dendl;
 }
 
+void MDSMonitor::get_store_prefixes(std::set<string>& s)
+{
+  s.insert(service_name);
+  s.insert(MDS_METADATA_PREFIX);
+  s.insert(MDS_HEALTH_PREFIX);
+}
 
 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
 {
@@ -133,6 +140,11 @@ void MDSMonitor::create_pending()
   pending_fsmap = fsmap;
   pending_fsmap.epoch++;
 
+  if (mon->osdmon()->is_readable()) {
+    auto &osdmap = mon->osdmon()->osdmap;
+    pending_fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
+  }
+
   dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
 }
 
index c14c9603943cc25c1a3699f70516f5f6b6f8d6d2..3d84f92a811fc6ec393fd348773942e7d2168f9c 100644 (file)
@@ -34,14 +34,13 @@ class MMDSLoadTargets;
 class MMDSMap;
 class FileSystemCommandHandler;
 
-#define MDS_HEALTH_PREFIX "mds_health"
-
 class MDSMonitor : public PaxosService {
  public:
   MDSMonitor(Monitor *mn, Paxos *p, string service_name);
 
   // service methods
   void create_initial() override;
+  void get_store_prefixes(std::set<string>& s) override;
   void update_from_paxos(bool *need_bootstrap) override;
   void init() override;
   void create_pending() override; 
index 01ed2515c34a8a811a30feb8b8d5c9f9a90c12fa..1af3a0ee7667e7050cffdb5582fcbf93fdfb13bc 100644 (file)
@@ -77,6 +77,10 @@ public:
   std::set<std::string> modules;
   std::set<std::string> available_modules;
 
+  // Map of module name to URI, indicating services exposed by
+  // running modules on the active mgr daemon.
+  std::map<std::string, std::string> services;
+
   epoch_t get_epoch() const { return epoch; }
   entity_addr_t get_active_addr() const { return active_addr; }
   uint64_t get_active_gid() const { return active_gid; }
@@ -120,7 +124,7 @@ public:
 
   void encode(bufferlist& bl, uint64_t features) const
   {
-    ENCODE_START(2, 1, bl);
+    ENCODE_START(3, 1, bl);
     ::encode(epoch, bl);
     ::encode(active_addr, bl, features);
     ::encode(active_gid, bl);
@@ -129,6 +133,7 @@ public:
     ::encode(standbys, bl);
     ::encode(modules, bl);
     ::encode(available_modules, bl);
+    ::encode(services, bl);
     ENCODE_FINISH(bl);
   }
 
@@ -145,6 +150,9 @@ public:
       ::decode(modules, p);
       ::decode(available_modules, p);
     }
+    if (struct_v >= 3) {
+      ::decode(services, p);
+    }
     DECODE_FINISH(p);
   }
 
@@ -177,6 +185,12 @@ public:
       f->dump_string("module", j);
     }
     f->close_section();
+
+    f->open_object_section("services");
+    for (const auto &i : services) {
+      f->dump_string(i.first.c_str(), i.second);
+    }
+    f->close_section();
   }
 
   static void generate_test_instances(list<MgrMap*> &l) {
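
[editor's note] The ENCODE_START version bump from 2 to 3 is what keeps the new services map compatible in mixed clusters: fields are only appended, and the decoder reads them only when struct_v says the encoder wrote them. A stream-based sketch of the principle (Ceph's real bufferlist encoding is binary; keys and values here must be whitespace-free for operator>> to work):

    #include <cstdint>
    #include <map>
    #include <sstream>
    #include <string>

    struct ToyMgrMap {
      uint32_t epoch = 0;
      std::map<std::string, std::string> services;   // the v3 addition

      void encode(std::ostream& out) const {
        int struct_v = 3;                            // was 2 before services
        out << struct_v << ' ' << epoch << ' ' << services.size() << ' ';
        for (auto& s : services)
          out << s.first << ' ' << s.second << ' ';
      }
      void decode(std::istream& in) {
        int struct_v;
        in >> struct_v >> epoch;
        if (struct_v >= 3) {                         // old encoders stop here
          size_t n;
          in >> n;
          for (size_t i = 0; i < n; ++i) {
            std::string k, v;
            in >> k >> v;
            services[k] = v;
          }
        }
      }
    };
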
index 3840b642b956cd9269522f8d172f0749690bf229..a307dd4df5ce1cadd0128d01ba942e0723e0dbf7 100644 (file)
@@ -43,7 +43,9 @@ const static std::string command_descs_prefix = "mgr_command_descs";
 
 void MgrMonitor::create_initial()
 {
-  boost::tokenizer<> tok(g_conf->mgr_initial_modules);
+  // Take a local copy of initial_modules for tokenizer to iterate over.
+  auto initial_modules = g_conf->get_val<std::string>("mgr_initial_modules");
+  boost::tokenizer<> tok(initial_modules);
   for (auto& m : tok) {
     pending_map.modules.insert(m);
   }
@@ -53,6 +55,13 @@ void MgrMonitor::create_initial()
           << dendl;
 }
 
+void MgrMonitor::get_store_prefixes(std::set<string>& s)
+{
+  s.insert(service_name);
+  s.insert(command_descs_prefix);
+  s.insert(MGR_METADATA_PREFIX);
+}
+
 void MgrMonitor::update_from_paxos(bool *need_bootstrap)
 {
   version_t version = get_last_committed();
@@ -85,8 +94,9 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap)
     check_subs();
 
     if (version == 1
-       || (map.get_available()
-           && (!old_available || old_gid != map.get_active_gid()))) {
+        || command_descs.empty()
+        || (map.get_available()
+            && (!old_available || old_gid != map.get_active_gid()))) {
       dout(4) << "mkfs or daemon transitioned to available, loading commands"
              << dendl;
       bufferlist loaded_commands;
@@ -108,6 +118,16 @@ void MgrMonitor::create_pending()
 {
   pending_map = map;
   pending_map.epoch++;
+
+  if (map.get_epoch() == 1 &&
+      command_descs.empty() &&
+      pending_command_descs.empty()) {
+    // we've been through the initial map and we haven't populated the
+    // command_descs vector. This likely means we came from kraken, where
+    // we wouldn't populate the vector, nor would we write it to disk, on
+    // create_initial().
+    create_initial();
+  }
 }
 
 health_status_t MgrMonitor::should_warn_about_mgr_down()
@@ -120,10 +140,10 @@ health_status_t MgrMonitor::should_warn_about_mgr_down()
   // no OSDs are ever created.
   if (ever_had_active_mgr ||
       (mon->osdmon()->osdmap.get_num_osds() > 0 &&
-       now > mon->monmap->created + g_conf->mon_mgr_mkfs_grace)) {
+       now > mon->monmap->created + g_conf->get_val<int64_t>("mon_mgr_mkfs_grace"))) {
     health_status_t level = HEALTH_WARN;
     if (first_seen_inactive != utime_t() &&
-       now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) {
+       now - first_seen_inactive > g_conf->get_val<int64_t>("mon_mgr_inactive_grace")) {
       level = HEALTH_ERR;
     }
     return level;
@@ -293,6 +313,13 @@ bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
   bool updated = false;
 
   if (pending_map.active_gid == m->get_gid()) {
+    if (pending_map.services != m->get_services()) {
+      dout(4) << "updated services from mgr." << m->get_name()
+              << ": " << m->get_services() << dendl;
+      pending_map.services = m->get_services();
+      updated = true;
+    }
+
     // A beacon from the currently active daemon
     if (pending_map.active_addr != m->get_server_addr()) {
       dout(4) << "learned address " << m->get_server_addr()
@@ -453,10 +480,11 @@ void MgrMonitor::send_digests()
     sub->session->con->send_message(mdigest);
   }
 
-  digest_event = new C_MonContext(mon, [this](int){
+  digest_event = mon->timer.add_event_after(
+    g_conf->get_val<int64_t>("mon_mgr_digest_period"),
+    new C_MonContext(mon, [this](int) {
       send_digests();
-  });
-  mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_event);
+  }));
 }
 
 void MgrMonitor::cancel_timer()
@@ -496,7 +524,7 @@ void MgrMonitor::get_health(
     if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
       utime_t now = ceph_clock_now();
       if (first_seen_inactive != utime_t() &&
-         now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) {
+         now - first_seen_inactive > g_conf->get_val<int64_t>("mon_mgr_inactive_grace")) {
        level = HEALTH_ERR;
       }
     }
@@ -510,7 +538,28 @@ void MgrMonitor::tick()
     return;
 
   const auto now = ceph::coarse_mono_clock::now();
-  const auto cutoff = now - std::chrono::seconds(g_conf->mon_mgr_beacon_grace);
+
+  const auto mgr_beacon_grace = std::chrono::seconds(
+      g_conf->get_val<int64_t>("mon_mgr_beacon_grace"));
+
+  // Note that this is the mgr daemon's tick period, not ours (the
+  // beacon is sent with this period).
+  const auto mgr_tick_period = std::chrono::seconds(
+      g_conf->get_val<int64_t>("mgr_tick_period"));
+
+  if (last_tick != ceph::coarse_mono_clock::time_point::min()
+      && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
+    // This case handles either local slowness (calls being delayed
+    // for whatever reason) or cluster election slowness (a long gap
+    // between calls while an election happened)
+    dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
+            "(slow election?) of " << now - last_tick << " seconds" << dendl;
+    for (auto &i : last_beacon) {
+      i.second = now;
+    }
+  }
+
+  last_tick = now;
 
   // Populate any missing beacons (i.e. no beacon since MgrMonitor
   // instantiation) with the current time, so that they will
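
[editor's note] The last_tick bookkeeping added here protects mgr daemons from being failed for the monitor's own slowness. Restated as a small free function (names and clock choice are mine):

    #include <chrono>
    #include <cstdint>
    #include <map>

    using Clock = std::chrono::steady_clock;

    void maybe_reset_beacons(Clock::time_point& last_tick,
                             std::map<uint64_t, Clock::time_point>& last_beacon,
                             std::chrono::seconds beacon_grace,
                             std::chrono::seconds mgr_tick_period) {
      const auto now = Clock::now();
      if (last_tick != Clock::time_point::min() &&
          now - last_tick > beacon_grace - mgr_tick_period) {
        // The gap since our previous tick ate into the grace window, so we
        // cannot distinguish a dead mgr from one whose beacons we simply
        // failed to process: give every daemon a fresh deadline.
        for (auto& b : last_beacon)
          b.second = now;
      }
      last_tick = now;
    }
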
@@ -528,6 +577,7 @@ void MgrMonitor::tick()
   // Cull standbys first so that any remaining standbys
   // will be eligible to take over from the active if we cull him.
   std::list<uint64_t> dead_standbys;
+  const auto cutoff = now - mgr_beacon_grace;
   for (const auto &i : pending_map.standbys) {
     auto last_beacon_time = last_beacon.at(i.first);
     if (last_beacon_time < cutoff) {
@@ -563,7 +613,7 @@ void MgrMonitor::tick()
     if (promote_standby()) {
       dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
       mon->clog->info() << "Activating manager daemon "
-                        << pending_map.active_name;
+                      << pending_map.active_name;
       propose = true;
     }
   }
@@ -571,8 +621,9 @@ void MgrMonitor::tick()
   if (!pending_map.available &&
       !ever_had_active_mgr &&
       should_warn_about_mgr_down() != HEALTH_OK) {
-    dout(10) << " exceeded mon_mgr_mkfs_grace " << g_conf->mon_mgr_mkfs_grace
-            << " seconds" << dendl;
+    dout(10) << " exceeded mon_mgr_mkfs_grace "
+             << g_conf->get_val<int64_t>("mon_mgr_mkfs_grace")
+             << " seconds" << dendl;
     propose = true;
   }
 
@@ -585,6 +636,7 @@ void MgrMonitor::on_restart()
 {
   // Clear out the leader-specific state.
   last_beacon.clear();
+  last_tick = ceph::coarse_mono_clock::now();
 }
 
 
@@ -619,6 +671,7 @@ void MgrMonitor::drop_active()
   pending_map.active_gid = 0;
   pending_map.available = false;
   pending_map.active_addr = entity_addr_t();
+  pending_map.services.clear();
 
   // So that when new active mgr subscribes to mgrdigest, it will
   // get an immediate response instead of waiting for next timer
@@ -685,9 +738,27 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
     }
     f->flush(rdata);
   } else if (prefix == "mgr module ls") {
-    f->open_array_section("modules");
-    for (auto& p : map.modules) {
-      f->dump_string("module", p);
+    f->open_object_section("modules");
+    {
+      f->open_array_section("enabled_modules");
+      for (auto& p : map.modules) {
+        f->dump_string("module", p);
+      }
+      f->close_section();
+      f->open_array_section("disabled_modules");
+      for (auto& p : map.available_modules) {
+        if (map.modules.count(p) == 0) {
+          f->dump_string("module", p);
+        }
+      }
+      f->close_section();
+    }
+    f->close_section();
+    f->flush(rdata);
+  } else if (prefix == "mgr services") {
+    f->open_object_section("services");
+    for (const auto &i : map.services) {
+      f->dump_string(i.first.c_str(), i.second);
     }
     f->close_section();
     f->flush(rdata);
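
[editor's note] With this change `ceph mgr module ls` reports an object with two arrays instead of a flat list, and the new `ceph mgr services` command dumps the URI map published by the active daemon. Roughly (module names and the URI are illustrative, not fixed output):

    $ ceph mgr module ls
    {
        "enabled_modules": ["restful", "status"],
        "disabled_modules": ["dashboard", "influx"]
    }

    $ ceph mgr services
    {
        "dashboard": "http://mon-host:7000/"
    }
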
index 563ae7c5de8de937626fdf2be61823809a7e6fb9..82315d3550de574d3d7ebb627ffd7cb6b6370a4f 100644 (file)
@@ -79,6 +79,7 @@ public:
   bool in_use() const { return map.epoch > 0; }
 
   void create_initial() override;
+  void get_store_prefixes(std::set<string>& s) override;
   void update_from_paxos(bool *need_bootstrap) override;
   void create_pending() override;
   void encode_pending(MonitorDBStore::TransactionRef t) override;
@@ -117,6 +118,11 @@ public:
   void count_metadata(const string& field, std::map<string,int> *out);
 
   friend class C_Updated;
+
+  // When did the mon last call into our tick() method?  Used for detecting
+  // when the mon was not updating us for some period (e.g. during slow
+  // election) to reset last_beacon timeouts
+  ceph::coarse_mono_clock::time_point last_tick;
 };
 
 #endif
index 946a95756d88b8c98b39ac16c93aa2ee2cd88521..176ca4055f5e3f6aff2ac96e018e6583168159f8 100644 (file)
@@ -548,6 +548,9 @@ COMMAND("osd crush add " \
        "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", \
        "add or update crushmap position and weight for <name> with <weight> and location <args>", \
        "osd", "rw", "cli,rest")
+COMMAND("osd crush set-all-straw-buckets-to-straw2",
+        "convert all current CRUSH straw buckets to use the straw2 algorithm",
+       "osd", "rw", "cli,rest")
 COMMAND("osd crush set-device-class " \
         "name=class,type=CephString " \
        "name=ids,type=CephString,n=N", \
@@ -739,13 +742,15 @@ COMMAND("osd erasure-code-profile ls", \
        "list all erasure code profiles", \
        "osd", "r", "cli,rest")
 COMMAND("osd set " \
-       "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds", \
+       "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds " \
+       "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
        "set <key>", "osd", "rw", "cli,rest")
 COMMAND("osd unset " \
        "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent", \
        "unset <key>", "osd", "rw", "cli,rest")
 COMMAND("osd require-osd-release "\
-       "name=release,type=CephChoices,strings=luminous",
+       "name=release,type=CephChoices,strings=luminous " \
+       "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
        "set the minimum allowed OSD release to participate in the cluster",
        "osd", "rw", "cli,rest")
 COMMAND("osd cluster_snap", "take cluster snapshot (disabled)", \
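
[editor's note] The optional "sure" argument added to `osd set` and `osd require-osd-release` means invocations like the following now parse, and (per the OSDMonitor hunks later in this commit) bypass the no-OSDs-up and feature-bit checks:

    ceph osd set sortbitwise --yes-i-really-mean-it
    ceph osd require-osd-release luminous --yes-i-really-mean-it
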
@@ -1073,6 +1078,9 @@ COMMAND("mgr fail name=who,type=CephString", \
        "treat the named manager daemon as failed", "mgr", "rw", "cli,rest")
 COMMAND("mgr module ls",
        "list active mgr modules", "mgr", "r", "cli,rest")
+COMMAND("mgr services",
+       "list service endpoints provided by mgr modules",
+        "mgr", "r", "cli,rest")
 COMMAND("mgr module enable "                                           \
        "name=module,type=CephString "                                  \
        "name=force,type=CephChoices,strings=--force,req=false",
index 218df9aa843df7d5c838fd66645a00489f881ded..7a1b9420e77fdcdb8a2397f1d851cf20ce0145fd 100644 (file)
@@ -439,31 +439,34 @@ int MonMap::build_initial(CephContext *cct, ostream& errout)
 {
   const md_config_t *conf = cct->_conf;
   // file?
-  if (!conf->monmap.empty()) {
+  const auto monmap = conf->get_val<std::string>("monmap");
+  if (!monmap.empty()) {
     int r;
     try {
-      r = read(conf->monmap.c_str());
+      r = read(monmap.c_str());
     }
     catch (const buffer::error &e) {
       r = -EINVAL;
     }
     if (r >= 0)
       return 0;
-    errout << "unable to read/decode monmap from " << conf->monmap
+    errout << "unable to read/decode monmap from " << monmap
         << ": " << cpp_strerror(-r) << std::endl;
     return r;
   }
 
   // fsid from conf?
-  if (!cct->_conf->fsid.is_zero()) {
-    fsid = cct->_conf->fsid;
+  const auto new_fsid = conf->get_val<uuid_d>("fsid");
+  if (!new_fsid.is_zero()) {
+    fsid = new_fsid;
   }
 
   // -m foo?
-  if (!conf->mon_host.empty()) {
-    int r = build_from_host_list(conf->mon_host, "noname-");
+  const auto mon_host = conf->get_val<std::string>("mon_host");
+  if (!mon_host.empty()) {
+    int r = build_from_host_list(mon_host, "noname-");
     if (r < 0) {
-      errout << "unable to parse addrs in '" << conf->mon_host << "'"
+      errout << "unable to parse addrs in '" << mon_host << "'"
              << std::endl;
       return r;
     }
@@ -536,7 +539,7 @@ int MonMap::build_initial(CephContext *cct, ostream& errout)
 
   if (size() == 0) {
     // no info found from conf options lets try use DNS SRV records
-    string srv_name = conf->mon_dns_srv_name;
+    string srv_name = conf->get_val<std::string>("mon_dns_srv_name");
     string domain;
     // check if domain is also provided and extract it from srv_name
     size_t idx = srv_name.find("_");
index 4e5914aa7caa447b70d83488668398505b218c52..af3683f67e1e7a540a9c3abc76a1014c6baaf993 100644 (file)
@@ -571,14 +571,22 @@ int Monitor::preinit()
   assert(!logger);
   {
     PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last);
-    pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess");
-    pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions", "sadd");
-    pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions", "srm");
-    pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions");
-    pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in");
-    pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started");
-    pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won");
-    pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost");
+    pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess",
+        PerfCountersBuilder::PRIO_USEFUL);
+    pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions",
+        "sadd", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions",
+        "srm", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions",
+        "strm", PerfCountersBuilder::PRIO_USEFUL);
+    pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in",
+        "ecnt", PerfCountersBuilder::PRIO_USEFUL);
+    pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started",
+        "estt", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won",
+        "ewon", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost",
+        "elst", PerfCountersBuilder::PRIO_INTERESTING);
     logger = pcb.create_perf_counters();
     cct->get_perfcounters_collection()->add(logger);
   }
@@ -1242,10 +1250,11 @@ void Monitor::sync_reset_timeout()
   dout(10) << __func__ << dendl;
   if (sync_timeout_event)
     timer.cancel_event(sync_timeout_event);
-  sync_timeout_event = new C_MonContext(this, [this](int) {
-      sync_timeout();
-    });
-  timer.add_event_after(g_conf->mon_sync_timeout, sync_timeout_event);
+  sync_timeout_event = timer.add_event_after(
+    g_conf->mon_sync_timeout,
+    new C_MonContext(this, [this](int) {
+       sync_timeout();
+      }));
 }
 
 void Monitor::sync_finish(version_t last_committed)
@@ -1588,8 +1597,12 @@ void Monitor::reset_probe_timeout()
       probe_timeout(r);
     });
   double t = g_conf->mon_probe_timeout;
-  timer.add_event_after(t, probe_timeout_event);
-  dout(10) << "reset_probe_timeout " << probe_timeout_event << " after " << t << " seconds" << dendl;
+  if (timer.add_event_after(t, probe_timeout_event)) {
+    dout(10) << "reset_probe_timeout " << probe_timeout_event
+            << " after " << t << " seconds" << dendl;
+  } else {
+    probe_timeout_event = nullptr;
+  }
 }
 
 void Monitor::probe_timeout(int r)
@@ -2291,14 +2304,14 @@ void Monitor::health_tick_start()
   dout(15) << __func__ << dendl;
 
   health_tick_stop();
-  health_tick_event = new C_MonContext(this, [this](int r) {
-      if (r < 0)
-        return;
-      do_health_to_clog();
-      health_tick_start();
-    });
-  timer.add_event_after(cct->_conf->mon_health_to_clog_tick_interval,
-                        health_tick_event);
+  health_tick_event = timer.add_event_after(
+    cct->_conf->mon_health_to_clog_tick_interval,
+    new C_MonContext(this, [this](int r) {
+       if (r < 0)
+         return;
+       do_health_to_clog();
+       health_tick_start();
+      }));
 }
 
 void Monitor::health_tick_stop()
@@ -2345,7 +2358,9 @@ void Monitor::health_interval_start()
         return;
       do_health_to_clog_interval();
     });
-  timer.add_event_at(next, health_interval_event);
+  if (!timer.add_event_at(next, health_interval_event)) {
+    health_interval_event = nullptr;
+  }
 }
 
 void Monitor::health_interval_stop()
@@ -2495,21 +2510,26 @@ health_status_t Monitor::get_health_status(
     *plain += "\n";
   }
 
+  const std::string old_fields_message = "'ceph health' JSON format has "
+    "changed in luminous. If you see this your monitoring system is "
+    "scraping the wrong fields. Disable this with 'mon health preluminous "
+    "compat warning = false'";
+
   if (f && (compat || compat_warn)) {
     health_status_t cr = compat_warn ? min(HEALTH_WARN, r) : r;
+    f->open_array_section("summary");
+    if (compat_warn) {
+      f->open_object_section("item");
+      f->dump_stream("severity") << HEALTH_WARN;
+      f->dump_string("summary", old_fields_message);
+      f->close_section();
+    }
     if (compat) {
-      f->open_array_section("summary");
-      if (compat_warn) {
-       f->open_object_section("item");
-       f->dump_stream("severity") << HEALTH_WARN;
-       f->dump_string("summary", "'ceph health' JSON format has changed in luminous; update your health monitoring scripts");
-       f->close_section();
-      }
       for (auto& svc : paxos_service) {
-       svc->get_health_checks().dump_summary_compat(f);
+        svc->get_health_checks().dump_summary_compat(f);
       }
-      f->close_section();
     }
+    f->close_section();
     f->dump_stream("overall_status") << cr;
   }
 
@@ -2517,7 +2537,7 @@ health_status_t Monitor::get_health_status(
     if (f && (compat || compat_warn)) {
       f->open_array_section("detail");
       if (compat_warn) {
-       f->dump_string("item", "'ceph health' JSON format has changed in luminous. If you see this your monitoring system is scraping the wrong fields. Disable this with 'mon health preluminous compat warning = false'");
+       f->dump_string("item", old_fields_message);
       }
     }
 
@@ -3149,8 +3169,8 @@ void Monitor::handle_command(MonOpRequestRef op)
       osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
     const auto& hdr = m->get_header();
     uint64_t size = hdr.front_len + hdr.middle_len + hdr.data_len;
-    uint64_t max =
-      g_conf->mon_client_bytes * g_conf->mon_mgr_proxy_client_bytes_ratio;
+    uint64_t max = g_conf->get_val<uint64_t>("mon_client_bytes")
+                 * g_conf->get_val<double>("mon_mgr_proxy_client_bytes_ratio");
     if (mgr_proxy_bytes + size > max) {
       dout(10) << __func__ << " current mgr proxy bytes " << mgr_proxy_bytes
               << " + " << size << " > max " << max << dendl;
@@ -4578,10 +4598,11 @@ void Monitor::timecheck_reset_event()
            << " rounds_since_clean " << timecheck_rounds_since_clean
            << dendl;
 
-  timecheck_event = new C_MonContext(this, [this](int) {
-      timecheck_start_round();
-    });
-  timer.add_event_after(delay, timecheck_event);
+  timecheck_event = timer.add_event_after(
+    delay,
+    new C_MonContext(this, [this](int) {
+       timecheck_start_round();
+      }));
 }
 
 void Monitor::timecheck_check_skews()
@@ -5427,10 +5448,11 @@ void Monitor::scrub_event_start()
     return;
   }
 
-  scrub_event = new C_MonContext(this, [this](int) {
+  scrub_event = timer.add_event_after(
+    cct->_conf->mon_scrub_interval,
+    new C_MonContext(this, [this](int) {
       scrub_start();
-    });
-  timer.add_event_after(cct->_conf->mon_scrub_interval, scrub_event);
+      }));
 }
 
 void Monitor::scrub_event_cancel()
@@ -5454,11 +5476,11 @@ void Monitor::scrub_reset_timeout()
 {
   dout(15) << __func__ << " reset timeout event" << dendl;
   scrub_cancel_timeout();
-
-  scrub_timeout_event = new C_MonContext(this, [this](int) {
+  scrub_timeout_event = timer.add_event_after(
+    g_conf->mon_scrub_timeout,
+    new C_MonContext(this, [this](int) {
       scrub_timeout();
-    });
-  timer.add_event_after(g_conf->mon_scrub_timeout, scrub_timeout_event);
+    }));
 }
 
 /************ TICK ***************/
index 707d635af557a01a91c4a5f694797e9d9f7dd017..00e56a9d8fcdb8654e40132ff339611047d8e291 100644 (file)
@@ -624,6 +624,8 @@ class MonitorDBStore
       db->init(g_conf->mon_rocksdb_options);
     else
       db->init();
+
+
   }
 
   int open(ostream &out) {
@@ -640,6 +642,16 @@ class MonitorDBStore
     r = db->open(out);
     if (r < 0)
       return r;
+
+    // Monitors are few in number, so the resource cost of exposing
+    // very detailed stats is low: ramp up the priority of all the
+    // KV store's perf counters.  Do this after open, because the backend may
+    // not have constructed PerfCounters earlier.
+    if (db->get_perf_counters()) {
+      db->get_perf_counters()->set_prio_adjust(
+          PerfCountersBuilder::PRIO_USEFUL - PerfCountersBuilder::PRIO_DEBUGONLY);
+    }
+
     io_work.start();
     is_open = true;
     return 0;
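
[editor's note] set_prio_adjust() shifts the reporting priority of every counter the KV backend registered, rather than re-registering them one by one. A compact model of the intent (the PRIO_* values are assumptions; only their ordering matters here):

    enum Prio { PRIO_DEBUGONLY = 0, PRIO_USEFUL = 5, PRIO_INTERESTING = 8 };

    struct ToyCounter {
      int prio = PRIO_DEBUGONLY;   // rocksdb stats default to debug-only
    };

    // Equivalent in spirit to set_prio_adjust(PRIO_USEFUL - PRIO_DEBUGONLY):
    // every counter is reported as if registered that much higher.
    inline int effective_prio(const ToyCounter& c, int adjust) {
      return c.prio + adjust;      // DEBUGONLY counters now rate as USEFUL
    }
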
index 46f702f4023e47810a1d12ba7cbf123a14c92720..c9ff10b345f72ecb7c7c3353f45fd4a3718f23e8 100644 (file)
@@ -81,7 +81,8 @@
 #include <boost/algorithm/string/predicate.hpp>
 
 #define dout_subsys ceph_subsys_mon
-#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
+static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
+static const string OSD_METADATA_PREFIX("osd_metadata");
 
 namespace {
 
@@ -268,6 +269,7 @@ void OSDMonitor::get_store_prefixes(std::set<string>& s)
 {
   s.insert(service_name);
   s.insert(OSD_PG_CREATING_PREFIX);
+  s.insert(OSD_METADATA_PREFIX);
 }
 
 void OSDMonitor::update_from_paxos(bool *need_bootstrap)
@@ -566,23 +568,6 @@ void OSDMonitor::on_active()
 void OSDMonitor::on_restart()
 {
   last_osd_report.clear();
-
-  if (mon->is_leader()) {
-    // fix ruleset != ruleid
-    if (osdmap.crush->has_legacy_rulesets() &&
-       !osdmap.crush->has_multirule_rulesets()) {
-      CrushWrapper newcrush;
-      _get_pending_crush(newcrush);
-      int r = newcrush.renumber_rules_by_ruleset();
-      if (r >= 0) {
-       dout(1) << __func__ << " crush map has ruleset != rule id; fixing" << dendl;
-       pending_inc.crush.clear();
-       newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
-      } else {
-       dout(10) << __func__ << " unable to renumber rules by ruleset" << dendl;
-      }
-    }
-  }
 }
 
 void OSDMonitor::on_shutdown()
@@ -662,6 +647,40 @@ void OSDMonitor::create_pending()
              << pending_inc.new_nearfull_ratio << dendl;
     }
   }
+
+  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
+  // structure.
+  if (osdmap.crush->has_legacy_rule_ids()) {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    // First, for all pools, work out which rule they really used
+    // by resolving ruleset to rule.
+    for (const auto &i : osdmap.get_pools()) {
+      const auto pool_id = i.first;
+      const auto &pool = i.second;
+      int new_rule_id = newcrush.find_rule(pool.crush_rule,
+                                          pool.type, pool.size);
+
+      dout(1) << __func__ << " rewriting pool "
+             << osdmap.get_pool_name(pool_id) << " crush ruleset "
+             << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
+      if (pending_inc.new_pools.count(pool_id) == 0) {
+       pending_inc.new_pools[pool_id] = pool;
+      }
+      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
+    }
+
+    // Now, go ahead and renumber all the rules so that their
+    // rule_id field corresponds to their position in the array
+    auto old_to_new = newcrush.renumber_rules();
+    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
+    for (const auto &i : old_to_new) {
+      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
+    }
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+  }
 }
 
 creating_pgs_t
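
[editor's note] A toy model of the two-step fixup above (the data layout is mine, not CrushWrapper's): pools that still carry a legacy "ruleset" number are first repointed at the concrete rule that ruleset resolves to; only then is it safe to renumber the rules so each rule's id equals its array position.

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <vector>

    struct ToyRule { int ruleset; };     // legacy CRUSH: rule id != ruleset

    int main() {
      std::vector<ToyRule> rules = {{2}, {5}};              // ids 0 and 1
      std::map<int64_t, int> pool_rule = {{1, 2}, {7, 5}};  // pool -> ruleset

      // Step 1: resolve each pool's ruleset to a real rule id.
      for (auto& pr : pool_rule) {
        int& r = pr.second;
        for (std::size_t id = 0; id < rules.size(); ++id) {
          if (rules[id].ruleset == r) {
            r = int(id);                 // pool 1 -> rule 0, pool 7 -> rule 1
            break;
          }
        }
      }

      // Step 2: renumber so the stored ruleset matches the position.
      for (std::size_t id = 0; id < rules.size(); ++id)
        rules[id].ruleset = int(id);
    }
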
@@ -972,31 +991,190 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     tmp.apply_incremental(pending_inc);
 
     if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
-      // set or clear full/nearfull?
-      int full, backfill, nearfull;
-      tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
-      if (full > 0) {
-       if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
-         dout(10) << __func__ << " setting full flag" << dendl;
-         add_flag(CEPH_OSDMAP_FULL);
-         remove_flag(CEPH_OSDMAP_NEARFULL);
-       }
-      } else {
-       if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
-         dout(10) << __func__ << " clearing full flag" << dendl;
-         remove_flag(CEPH_OSDMAP_FULL);
-       }
-       if (nearfull > 0) {
-         if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
-           dout(10) << __func__ << " setting nearfull flag" << dendl;
-           add_flag(CEPH_OSDMAP_NEARFULL);
-         }
-       } else {
-         if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
-           dout(10) << __func__ << " clearing nearfull flag" << dendl;
-           remove_flag(CEPH_OSDMAP_NEARFULL);
-         }
-       }
+      // remove any legacy osdmap nearfull/full flags
+      {
+        if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
+          dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
+                   << dendl;
+          remove_flag(CEPH_OSDMAP_NEARFULL);
+          remove_flag(CEPH_OSDMAP_FULL);
+        }
+      }
+      // collect which pools are currently affected by
+      // the near/backfill/full osd(s),
+      // and set per-pool near/backfill/full flag instead
+      set<int64_t> full_pool_ids;
+      set<int64_t> backfillfull_pool_ids;
+      set<int64_t> nearfull_pool_ids;
+      tmp.get_full_pools(g_ceph_context,
+                         &full_pool_ids,
+                         &backfillfull_pool_ids,
+                         &nearfull_pool_ids);
+      if (full_pool_ids.empty() ||
+          backfillfull_pool_ids.empty() ||
+          nearfull_pool_ids.empty()) {
+        // normal case - no nearfull, backfillfull or full osds
+        // try cancel any improper nearfull/backfillfull/full pool
+        // flags first
+        for (auto &pool: tmp.get_pools()) {
+          auto p = pool.first;
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
+              nearfull_pool_ids.empty()) {
+            dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                     << "'s nearfull flag" << dendl;
+            if (pending_inc.new_pools.count(p) == 0) {
+              // load original pool info first!
+              pending_inc.new_pools[p] = pool.second;
+            }
+            pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+          }
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
+              backfillfull_pool_ids.empty()) {
+            dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                     << "'s backfillfull flag" << dendl;
+            if (pending_inc.new_pools.count(p) == 0) {
+              pending_inc.new_pools[p] = pool.second;
+            }
+            pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+          }
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
+              full_pool_ids.empty()) {
+            if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+              // set by EQUOTA, skipping
+              continue;
+            }
+            dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                     << "'s full flag" << dendl;
+            if (pending_inc.new_pools.count(p) == 0) {
+              pending_inc.new_pools[p] = pool.second;
+            }
+            pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+          }
+        }
+      }
+      if (!full_pool_ids.empty()) {
+        dout(10) << __func__ << " marking pool(s) " << full_pool_ids
+                 << " as full" << dendl;
+        for (auto &p: full_pool_ids) {
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
+            continue;
+          }
+          if (pending_inc.new_pools.count(p) == 0) {
+            pending_inc.new_pools[p] = tmp.pools[p];
+          }
+          pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
+          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+        }
+        // cancel FLAG_FULL for pools which are no longer full too
+        for (auto &pool: tmp.get_pools()) {
+          auto p = pool.first;
+          if (full_pool_ids.count(p)) {
+            // skip pools we have just marked as full above
+            continue;
+          }
+          if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
+               tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+            // don't touch if currently is not full
+            // or is running out of quota (and hence considered as full)
+            continue;
+          }
+          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                   << "'s full flag" << dendl;
+          if (pending_inc.new_pools.count(p) == 0) {
+            pending_inc.new_pools[p] = pool.second;
+          }
+          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+        }
+      }
+      if (!backfillfull_pool_ids.empty()) {
+        for (auto &p: backfillfull_pool_ids) {
+          if (full_pool_ids.count(p)) {
+            // skip pools we have already considered as full above
+            continue;
+          }
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+            // make sure FLAG_FULL is truly set, so we are safe not
+            // to set an extra (redundant) FLAG_BACKFILLFULL flag
+            assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+            continue;
+          }
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+            // don't bother if pool is already marked as backfillfull
+            continue;
+          }
+          dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+                   << "' as backfillfull" << dendl;
+          if (pending_inc.new_pools.count(p) == 0) {
+            pending_inc.new_pools[p] = tmp.pools[p];
+          }
+          pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
+          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+        }
+        // cancel FLAG_BACKFILLFULL for pools
+        // which are no longer backfillfull too
+        for (auto &pool: tmp.get_pools()) {
+          auto p = pool.first;
+          if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+            // skip pools we have just marked as backfillfull/full above
+            continue;
+          }
+          if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+            // and don't touch if currently is not backfillfull
+            continue;
+          }
+          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                   << "'s backfillfull flag" << dendl;
+          if (pending_inc.new_pools.count(p) == 0) {
+            pending_inc.new_pools[p] = pool.second;
+          }
+          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+        }
+      }
+      if (!nearfull_pool_ids.empty()) {
+        for (auto &p: nearfull_pool_ids) {
+          if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+            continue;
+          }
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+            // make sure FLAG_FULL is truly set, so we are safe not
+            // to set an extra (redundant) FLAG_NEARFULL flag
+            assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+            continue;
+          }
+          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+            // don't bother if pool is already marked as nearfull
+            continue;
+          }
+          dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+                   << "' as nearfull" << dendl;
+          if (pending_inc.new_pools.count(p) == 0) {
+            pending_inc.new_pools[p] = tmp.pools[p];
+          }
+          pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
+        }
+        // cancel FLAG_NEARFULL for pools
+        // which are no longer nearfull too
+        for (auto &pool: tmp.get_pools()) {
+          auto p = pool.first;
+          if (full_pool_ids.count(p) ||
+              backfillfull_pool_ids.count(p) ||
+              nearfull_pool_ids.count(p)) {
+            // skip pools we have just marked as
+            // nearfull/backfillfull/full above
+            continue;
+          }
+          if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+            // and don't touch if currently is not nearfull
+            continue;
+          }
+          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                   << "'s nearfull flag" << dendl;
+          if (pending_inc.new_pools.count(p) == 0) {
+            pending_inc.new_pools[p] = pool.second;
+          }
+          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+        }
       }
 
       // min_compat_client?
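
[editor's note] The long flag-reconciliation hunk above reduces to a few invariants per pool, condensed here (bit values are assumed; pg_pool_t's real ones differ): full beats backfillfull beats nearfull, a higher state clears the lower ones, and FLAG_FULL_NO_QUOTA (quota exhaustion) pins FLAG_FULL until the quota path itself clears it.

    #include <cstdint>

    enum : uint64_t {
      FLAG_FULL          = 1 << 0,
      FLAG_BACKFILLFULL  = 1 << 1,
      FLAG_NEARFULL      = 1 << 2,
      FLAG_FULL_NO_QUOTA = 1 << 3,
    };

    uint64_t apply_osd_fullness(uint64_t flags, bool full,
                                bool backfillfull, bool nearfull) {
      if (flags & FLAG_FULL_NO_QUOTA)
        return flags;                  // quota owns the FULL bit
      flags &= ~(FLAG_FULL | FLAG_BACKFILLFULL | FLAG_NEARFULL);
      if (full)
        flags |= FLAG_FULL;
      else if (backfillfull)
        flags |= FLAG_BACKFILLFULL;
      else if (nearfull)
        flags |= FLAG_NEARFULL;
      return flags;
    }
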
@@ -2668,6 +2846,10 @@ bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
     goto ignore;
   }
 
+  if (m->forced) {
+    return false;
+  }
+
   for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
     dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
@@ -3749,16 +3931,6 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
       }
     }
 
-    if (osdmap.crush->has_multirule_rulesets()) {
-      ostringstream ss;
-      ss << "CRUSH map contains multirule rulesets";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-      if (detail) {
-       ss << "; please manually fix the map";
-       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-      }
-    }
-
     // Not using 'sortbitwise' and should be?
     if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
         (osdmap.get_up_osd_features() &
@@ -5264,10 +5436,20 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
   return true;
 }
 
-void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
+void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
+{
+  pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
+    osdmap.get_pg_pool(pool_id));
+  assert(pool);
+  pool->set_flag(flags);
+}
+
+void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
 {
-  const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
-  pending_inc.get_new_pool(pool_id, pool)->flags = flags;
+  pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
+    osdmap.get_pg_pool(pool_id));
+  assert(pool);
+  pool->unset_flag(flags);
 }
 
 bool OSDMonitor::update_pools_status()
@@ -5290,14 +5472,16 @@ bool OSDMonitor::update_pools_status()
       (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
       (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
 
-    if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
+    if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
       if (pool_is_full)
         continue;
 
       mon->clog->info() << "pool '" << pool_name
-                       << "' no longer full; removing FULL flag";
-
-      update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
+                       << "' no longer out of quota; removing NO_QUOTA flag";
+      // below we cancel FLAG_FULL too, we'll set it again in
+      // OSDMonitor::encode_pending if it still fails the osd-full checking.
+      clear_pool_flags(it->first,
+                       pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
       ret = true;
     } else {
       if (!pool_is_full)
@@ -5315,7 +5499,14 @@ bool OSDMonitor::update_pools_status()
                          << " (reached quota's max_objects: "
                          << pool.quota_max_objects << ")";
       }
-      update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
+      // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
+      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
+      // since FLAG_FULL should always take precedence
+      set_pool_flags(it->first,
+                     pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
+      clear_pool_flags(it->first,
+                       pg_pool_t::FLAG_NEARFULL |
+                       pg_pool_t::FLAG_BACKFILLFULL);
       ret = true;
     }
   }
@@ -5625,9 +5816,22 @@ int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_pr
       user_map[*i] = string();
       (*erasure_code_profile_map)[*i] = string();
     } else {
-      const string key = i->substr(0, equal);
+      string key = i->substr(0, equal);
       equal++;
       const string value = i->substr(equal);
+      if (key.find("ruleset-") == 0) {
+       if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+           g_conf->get_val<bool>("mon_fixup_legacy_erasure_code_profiles")) {
+         mon->clog->warn() << "erasure code profile property '" << key
+                           << "' is no longer supported; try "
+                           << "'crush-" << key.substr(8) << "' instead";
+         key = string("crush-") + key.substr(8);
+       } else {
+         *ss << "property '" << key << "' is no longer supported; try "
+             << "'crush-" << key.substr(8) << "' instead";
+         return -EINVAL;
+       }
+      }
       user_map[key] = value;
       (*erasure_code_profile_map)[key] = value;
     }
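
[editor's note] Concretely, under this fixup an old-style profile setting such as ruleset-failure-domain=host is accepted and rewritten to crush-failure-domain=host (with a clog warning) when the cluster requires luminous and mon_fixup_legacy_erasure_code_profiles is true; otherwise the command is rejected with -EINVAL and the message suggests the crush-* replacement key.
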
@@ -5797,6 +6001,36 @@ int OSDMonitor::get_crush_rule(const string &rule_name,
   return 0;
 }
 
+int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
+{
+  auto max_pgs_per_osd = g_conf->get_val<uint64_t>("mon_max_pg_per_osd");
+  auto num_osds = std::max(osdmap.get_num_in_osds(), 3u);   // assume min cluster size 3
+  auto max_pgs = max_pgs_per_osd * num_osds;
+  uint64_t projected = 0;
+  if (pool < 0) {
+    projected += pg_num * size;
+  }
+  for (const auto& i : osdmap.get_pools()) {
+    if (i.first == pool) {
+      projected += pg_num * size;
+    } else {
+      projected += i.second.get_pg_num() * i.second.get_size();
+    }
+  }
+  if (projected > max_pgs) {
+    if (pool >= 0) {
+      *ss << "pool id " << pool;
+    }
+    *ss << " pg_num " << pg_num << " size " << size
+       << " would mean " << projected
+       << " total pgs, which exceeds max " << max_pgs
+       << " (mon_max_pg_per_osd " << max_pgs_per_osd
+       << " * num_in_osds " << num_osds << ")";
+    return -ERANGE;
+  }
+  return 0;
+}
+
 /**
  * @param name The name of the new pool
  * @param auid The auid of the pool owner. Can be -1
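
[editor's note] To make check_pg_num()'s projection concrete (assuming mon_max_pg_per_osd keeps its shipped default of 200):

    max_pgs   = 200 * max(num_in_osds, 3) = 600 on a 3-OSD cluster
    projected = sum over existing pools of pg_num * size
                + pg_num * size of the pool being created or resized

So if existing pools already account for 400 PG copies, creating a pool with pg_num=128 and size=3 projects 400 + 384 = 784 > 600, and the request is refused with -ERANGE.
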
@@ -5876,6 +6110,11 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
     dout(10) << " prepare_pool_size returns " << r << dendl;
     return r;
   }
+  r = check_pg_num(-1, pg_num, size, ss);
+  if (r) {
+    dout(10) << " prepare_pool_size returns " << r << dendl;
+    return r;
+  }
 
   if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
     return -EINVAL;
@@ -6052,6 +6291,10 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       ss << "pool size must be between 1 and 10";
       return -EINVAL;
     }
+    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
+    if (r < 0) {
+      return r;
+    }
     p.size = n;
     if (n < p.min_size)
       p.min_size = n;
@@ -6121,6 +6364,10 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
          << " (you may adjust 'mon max pool pg num' for higher values)";
       return -ERANGE;
     }
+    int r = check_pg_num(pool, n, p.get_size(), &ss);
+    if (r) {
+      return r;
+    }
     string force;
     cmd_getval(g_ceph_context,cmdmap, "force", force);
     if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
@@ -6250,7 +6497,15 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
     bloomp->set_fpp(f);
   } else if (var == "use_gmt_hitset") {
     if (val == "true" || (interr.empty() && n == 1)) {
-      if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
+      string force;
+      cmd_getval(g_ceph_context, cmdmap, "force", force);
+      if (!osdmap.get_num_up_osds() && force != "--yes-i-really-mean-it") {
+        ss << "Not advisable to continue since no OSDs are up. Pass "
+           << "--yes-i-really-mean-it if you really wish to continue.";
+        return -EPERM;
+      }
+      if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)
+          && force != "--yes-i-really-mean-it") {
        ss << "not all OSDs support GMT hit set.";
        return -EINVAL;
       }
@@ -7412,7 +7667,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       }
     }
 
-    if (crush.has_legacy_rulesets()) {
+    if (crush.has_legacy_rule_ids()) {
       err = -EINVAL;
       ss << "crush maps with ruleset != ruleid are no longer allowed";
       goto reply;
@@ -7422,16 +7677,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
 
-    const auto& osdmap_pools = osdmap.get_pools();
-    for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
-      const int64_t pool_id = pit->first;
-      const pg_pool_t &pool = pit->second;
-      int ruleno = pool.get_crush_rule();
-      if (!crush.rule_exists(ruleno)) {
-       ss << " the crush rule no "<< ruleno << " for pool id " << pool_id << " is in use";
-       err = -EINVAL;
-       goto reply;
-      }
+    err = osdmap.validate_crush_rules(&crush, &ss);
+    if (err < 0) {
+      goto reply;
     }
 
     if (g_conf->mon_osd_crush_smoke_test) {
@@ -7460,6 +7708,26 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     ss << osdmap.get_crush_version() + 1;
     goto update;
 
+  } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
+      int bid = -1 - b;
+      if (newcrush.bucket_exists(bid) &&
+         newcrush.get_bucket_alg(bid)) {
+       dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
+       newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
+      }
+    }
+    if (!validate_crush_against_features(&newcrush, ss)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+                                             get_last_committed() + 1));
+    return true;
   } else if (prefix == "osd crush set-device-class") {
     if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
       ss << "you must complete the upgrade and 'ceph osd require-osd-release "
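
[editor's note] The new bucket-conversion command takes no arguments:

    ceph osd crush set-all-straw-buckets-to-straw2

As the hunk above shows, the rewritten map is only proposed after validate_crush_against_features() confirms the connected clients and daemons can decode straw2 buckets; otherwise the command fails with -EINVAL.
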
@@ -8596,7 +8864,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       // FIXME: this is ok in some situations, but let's not bother with that
       // complexity now.
       int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
-      if (osdmap.crush_ruleset_in_use(ruleset)) {
+      if (osdmap.crush_rule_in_use(ruleset)) {
        ss << "crush ruleset " << name << " " << ruleset << " is in use";
        err = -EBUSY;
        goto reply;
@@ -8801,6 +9069,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
 
   } else if (prefix == "osd set") {
+    string sure;
+    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
     string key;
     cmd_getval(g_ceph_context, cmdmap, "key", key);
     if (key == "full")
@@ -8828,7 +9098,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     else if (key == "notieragent")
       return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
     else if (key == "sortbitwise") {
-      if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
+      if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+        ss << "Not advisable to continue since no OSDs are up. Pass "
+           << "--yes-i-really-mean-it if you really wish to continue.";
+        err = -EPERM;
+        goto reply;
+      }
+      if ((osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)
+          || sure == "--yes-i-really-mean-it") {
        return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
       } else {
        ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
@@ -8836,7 +9113,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
        goto reply;
       }
     } else if (key == "recovery_deletes") {
-      if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)) {
+      if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+        ss << "Not advisable to continue since no OSDs are up. Pass "
+           << "--yes-i-really-mean-it if you really wish to continue.";
+        err = -EPERM;
+        goto reply;
+      }
+      if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)
+          || sure == "--yes-i-really-mean-it") {
        return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
       } else {
        ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
@@ -8844,6 +9128,12 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
        goto reply;
       }
     } else if (key == "require_jewel_osds") {
+      if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+        ss << "Not advisable to continue since no OSDs are up. Pass "
+           << "--yes-i-really-mean-it if you really wish to continue.";
+        err = -EPERM;
+        goto reply;
+      }
       if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
        ss << "the sortbitwise flag must be set before require_jewel_osds";
        err = -EPERM;
@@ -8852,13 +9142,20 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
        ss << "require_osd_release is already >= jewel";
        err = 0;
        goto reply;
-      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)) {
+      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)
+                 || sure == "--yes-i-really-mean-it") {
        return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
       } else {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
        err = -EPERM;
       }
     } else if (key == "require_kraken_osds") {
+      if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+        ss << "Not advisable to continue since no OSDs are up. Pass "
+           << "--yes-i-really-mean-it if you really wish to continue.";
+        err = -EPERM;
+        goto reply;
+      }
       if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
        ss << "the sortbitwise flag must be set before require_kraken_osds";
        err = -EPERM;
@@ -8867,7 +9164,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
        ss << "require_osd_release is already >= kraken";
        err = 0;
        goto reply;
-      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)) {
+      } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)
+                 || sure == "--yes-i-really-mean-it") {
        bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
        // ensure JEWEL is also set
        pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
@@ -8916,6 +9214,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd require-osd-release") {
     string release;
     cmd_getval(g_ceph_context, cmdmap, "release", release);
+    string sure;
+    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
     if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
       ss << "the sortbitwise flag must be set first";
       err = -EPERM;
@@ -9032,7 +9332,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
             if (osdmap.is_up(osd)) {
               msg << ", while it was still marked up";
             } else {
-              msg << ", after it was down for " << int(down_pending_out[osd].sec())
+              auto period = ceph_clock_now() - down_pending_out[osd];
+              msg << ", after it was down for " << int(period.sec())
                   << " seconds";
             }
 
index baee6a894d1d92a58fe366454995a2de37786cfd..9767f10035780898e1f6ac141e08791b22082e50 100644 (file)
@@ -43,8 +43,6 @@ class MOSDMap;
 #include "erasure-code/ErasureCodeInterface.h"
 #include "mon/MonOpRequest.h"
 
-#define OSD_METADATA_PREFIX "osd_metadata"
-
 /// information about a particular peer's failure reports for one osd
 struct failure_reporter_t {
   utime_t failed_since;     ///< when they think it failed
@@ -346,6 +344,7 @@ private:
                                const string &erasure_code_profile,
                                unsigned *stripe_width,
                                ostream *ss);
+  int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
   int prepare_new_pool(string& name, uint64_t auid,
                       int crush_rule,
                       const string &crush_rule_name,
@@ -357,7 +356,8 @@ private:
                       ostream *ss);
   int prepare_new_pool(MonOpRequestRef op);
 
-  void update_pool_flags(int64_t pool_id, uint64_t flags);
+  void set_pool_flags(int64_t pool_id, uint64_t flags);
+  void clear_pool_flags(int64_t pool_id, uint64_t flags);
   bool update_pools_status();
 
   bool prepare_set_flag(MonOpRequestRef op, int flag);
index fde038fc5806a866c5310e1321b0c0d4cc69fd01..6fcc3c0721afba4ba79cfc83ea15d6e2cffaeeab 100644 (file)
@@ -824,8 +824,9 @@ void PGMapDigest::dump_object_stat_sum(
     curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
 
   float used = 0.0;
+  // note avail passed in is raw_avail, calc raw_used here.
   if (avail) {
-    used = sum.num_bytes * curr_object_copies_rate;
+    used = sum.num_bytes * raw_used_rate * curr_object_copies_rate;
     used /= used + avail;
   } else if (sum.num_bytes) {
     used = 1.0;
@@ -2921,27 +2922,28 @@ void PGMap::get_health_checks(
   }
 
   // TOO_FEW_PGS
-  int num_in = osdmap.get_num_in_osds();
-  int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
-  if (num_in &&
-      cct->_conf->mon_pg_warn_min_per_osd > 0 &&
-      osdmap.get_pools().size() > 0) {
-    int per = sum_pg_up / num_in;
-    if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
+  unsigned num_in = osdmap.get_num_in_osds();
+  auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
+  const auto min_pg_per_osd =
+    cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
+  if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
+    auto per = sum_pg_up / num_in;
+    if (per < min_pg_per_osd && per) {
       ostringstream ss;
       ss << "too few PGs per OSD (" << per
-        << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
+        << " < min " << min_pg_per_osd << ")";
       checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
     }
   }
 
   // TOO_MANY_PGS
-  if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
-    int per = sum_pg_up / num_in;
-    if (per > cct->_conf->mon_pg_warn_max_per_osd) {
+  auto max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
+  if (num_in && max_pg_per_osd > 0) {
+    auto per = sum_pg_up / num_in;
+    if (per > max_pg_per_osd) {
       ostringstream ss;
       ss << "too many PGs per OSD (" << per
-        << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
+        << " > max " << max_pg_per_osd << ")";
       checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
     }
   }
@@ -3326,7 +3328,7 @@ void PGMap::get_health(
       note["incomplete"] += p->second;
     if (p->first & PG_STATE_BACKFILL_WAIT)
       note["backfill_wait"] += p->second;
-    if (p->first & PG_STATE_BACKFILL)
+    if (p->first & PG_STATE_BACKFILLING)
       note["backfilling"] += p->second;
     if (p->first & PG_STATE_BACKFILL_TOOFULL)
       note["backfill_toofull"] += p->second;
@@ -3431,7 +3433,7 @@ void PGMap::get_health(
                                PG_STATE_RECOVERY_TOOFULL |
                                PG_STATE_INCOMPLETE |
                                PG_STATE_BACKFILL_WAIT |
-                               PG_STATE_BACKFILL |
+                               PG_STATE_BACKFILLING |
                                PG_STATE_BACKFILL_TOOFULL)) &&
            stuck_pgs.count(p->first) == 0) {
          if (max > 0) {
@@ -3588,27 +3590,31 @@ void PGMap::get_health(
   }
 
   // pg skew
-  int num_in = osdmap.get_num_in_osds();
-  int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
+  auto num_in = osdmap.get_num_in_osds();
+  auto sum_pg_up = MAX(static_cast<unsigned>(pg_sum.up), pg_stat.size());
   int sum_objects = pg_sum.stats.sum.num_objects;
   if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
     return;
   }
-  if (num_in && cct->_conf->mon_pg_warn_min_per_osd > 0) {
-    int per = sum_pg_up / num_in;
-    if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
+  const auto min_pg_per_osd =
+    cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
+  if (num_in && min_pg_per_osd > 0) {
+    auto per = sum_pg_up / num_in;
+    if (per < min_pg_per_osd && per) {
       ostringstream ss;
-      ss << "too few PGs per OSD (" << per << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
+      ss << "too few PGs per OSD (" << per << " < min " << min_pg_per_osd << ")";
       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
       if (detail)
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
     }
   }
-  if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
+  int64_t max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
+  if (num_in && max_pg_per_osd > 0) {
     int per = sum_pg_up / num_in;
-    if (per > cct->_conf->mon_pg_warn_max_per_osd) {
+    if (per > max_pg_per_osd) {
       ostringstream ss;
-      ss << "too many PGs per OSD (" << per << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
+      ss << "too many PGs per OSD (" << per << " > max "
+        << max_pg_per_osd << ")";
       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
       if (detail)
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
@@ -3882,13 +3888,13 @@ int process_pg_map_command(
         state = -1;
         break;
       } else {
-        int filter = pg_string_state(state_str);
-        if (filter < 0) {
+        auto filter = pg_string_state(state_str);
+        if (!filter) {
           *ss << "'" << state_str << "' is not a valid pg state,"
               << " available choices: " << pg_state_string(0xFFFFFFFF);
           return -EINVAL;
         }
-        state |= filter;
+        state |= *filter;
       }
 
       states.pop_back();
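
process_pg_map_command now treats pg_string_state() as returning an optional bitmask rather than a negative sentinel, so an unknown state is a disengaged optional instead of -1. A toy sketch of the pattern, assuming a two-entry state table (the real lookup covers every PG state):

    // Toy sketch of the sentinel-to-optional change; toy_pg_string_state and
    // its two states are illustrative stand-ins for the real function.
    #include <boost/optional.hpp>
    #include <cerrno>
    #include <cstdint>
    #include <string>

    static boost::optional<uint64_t> toy_pg_string_state(const std::string& s) {
      if (s == "active")      return UINT64_C(1) << 0;
      if (s == "backfilling") return UINT64_C(1) << 1;
      return boost::none;  // unknown state: no bit set, no -1 sentinel
    }

    int add_state_filter(uint64_t* state, const std::string& s) {
      auto filter = toy_pg_string_state(s);
      if (!filter)
        return -EINVAL;    // same error path as the patch
      *state |= *filter;
      return 0;
    }
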
index 3555d60dba5d728e45330f19ade22cef21b9ab78..e92438769f088a910be5e7c6097e8a19e31caad3 100644 (file)
@@ -86,6 +86,11 @@ void Paxos::init()
 void Paxos::init_logger()
 {
   PerfCountersBuilder pcb(g_ceph_context, "paxos", l_paxos_first, l_paxos_last);
+
+  // Because monitors are so few in number, the resource cost of capturing
+  // almost all their perf counters at USEFUL is trivial.
+  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
   pcb.add_u64_counter(l_paxos_start_leader, "start_leader", "Starts in leader role");
   pcb.add_u64_counter(l_paxos_start_peon, "start_peon", "Starts in peon role");
   pcb.add_u64_counter(l_paxos_restart, "restart", "Restarts");
@@ -195,14 +200,14 @@ void Paxos::collect(version_t oldpn)
   }
 
   // set timeout event
-  collect_timeout_event = new C_MonContext(mon, [this](int r) {
+  collect_timeout_event = mon->timer.add_event_after(
+    g_conf->mon_accept_timeout_factor *
+    g_conf->mon_lease,
+    new C_MonContext(mon, [this](int r) {
        if (r == -ECANCELED)
          return;
        collect_timeout();
-    });
-  mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
-                            g_conf->mon_lease,
-                            collect_timeout_event);
+    }));
 }
 
 
@@ -687,14 +692,13 @@ void Paxos::begin(bufferlist& v)
   }
 
   // set timeout event
-  accept_timeout_event = new C_MonContext(mon, [this](int r) {
-      if (r == -ECANCELED)
-       return;
-      accept_timeout();
-    });
-  mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
-                            g_conf->mon_lease,
-                            accept_timeout_event);
+  accept_timeout_event = mon->timer.add_event_after(
+    g_conf->mon_accept_timeout_factor * g_conf->mon_lease,
+    new C_MonContext(mon, [this](int r) {
+       if (r == -ECANCELED)
+         return;
+       accept_timeout();
+      }));
 }
 
 // peon
@@ -992,26 +996,25 @@ void Paxos::extend_lease()
   // set timeout event.
   //  if old timeout is still in place, leave it.
   if (!lease_ack_timeout_event) {
-    lease_ack_timeout_event = new C_MonContext(mon, [this](int r) {
-       if (r == -ECANCELED)
-         return;
-       lease_ack_timeout();
-      });
-    mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
-                              g_conf->mon_lease,
-                              lease_ack_timeout_event);
+    lease_ack_timeout_event = mon->timer.add_event_after(
+      g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease,
+      new C_MonContext(mon, [this](int r) {
+         if (r == -ECANCELED)
+           return;
+         lease_ack_timeout();
+       }));
   }
 
   // set renew event
-  lease_renew_event = new C_MonContext(mon, [this](int r) {
-      if (r == -ECANCELED)
-       return;
-      lease_renew_timeout();
-    });
   utime_t at = lease_expire;
   at -= g_conf->mon_lease;
   at += g_conf->mon_lease_renew_interval_factor * g_conf->mon_lease;
-  mon->timer.add_event_at(at, lease_renew_event);
+  lease_renew_event = mon->timer.add_event_at(
+    at, new C_MonContext(mon, [this](int r) {
+       if (r == -ECANCELED)
+         return;
+       lease_renew_timeout();
+    }));
 }
 
 void Paxos::warn_on_future_time(utime_t t, entity_name_t from)
@@ -1195,14 +1198,13 @@ void Paxos::reset_lease_timeout()
   dout(20) << "reset_lease_timeout - setting timeout event" << dendl;
   if (lease_timeout_event)
     mon->timer.cancel_event(lease_timeout_event);
-  lease_timeout_event = new C_MonContext(mon, [this](int r) {
-      if (r == -ECANCELED)
-       return;
-      lease_timeout();
-    });
-  mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
-                            g_conf->mon_lease,
-                            lease_timeout_event);
+  lease_timeout_event = mon->timer.add_event_after(
+    g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease,
+    new C_MonContext(mon, [this](int r) {
+       if (r == -ECANCELED)
+         return;
+       lease_timeout();
+      }));
 }
 
 void Paxos::lease_timeout()
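
Every timer hunk in this file follows the same shape: the scheduling call now returns the registered event, so the cancellation handle is assigned from the return value instead of being stored before the timer is armed. A toy illustration of the idiom; Timer and Event here are stand-ins, not the SafeTimer API:

    // Toy illustration of assigning the handle from the scheduling call.
    #include <cerrno>
    #include <functional>

    struct Event { std::function<void(int)> cb; };

    struct Timer {
      // Returns the registered event so the caller keeps a cancellation handle.
      Event* add_event_after(double /*seconds*/, Event* e) {
        /* ... schedule e ... */
        return e;
      }
    };

    struct Daemon {
      Timer timer;
      Event* timeout_event = nullptr;
      void arm(double delay) {
        // The handle is set only once the event is registered; nothing can
        // observe a stored-but-not-yet-armed pointer.
        timeout_event = timer.add_event_after(delay, new Event{[](int r) {
          if (r == -ECANCELED)
            return;
          /* fire the timeout */
        }});
      }
    };
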
index dcd83506cebbc8a569cee63604613bb39586b6e0..de732c322301956af95f9e7276495fb842f2467b 100644 (file)
@@ -117,7 +117,7 @@ bool PaxosService::dispatch(MonOpRequestRef op)
        * Callback class used to propose the pending value once the proposal_timer
        * fires up.
        */
-    proposal_timer = new C_MonContext(mon, [this](int r) {
+    auto do_propose = new C_MonContext(mon, [this](int r) {
         proposal_timer = 0;
         if (r >= 0) {
           propose_pending();
@@ -127,9 +127,9 @@ bool PaxosService::dispatch(MonOpRequestRef op)
           assert(0 == "bad return value for proposal_timer");
         }
     });
-    dout(10) << " setting proposal_timer " << proposal_timer
+    dout(10) << " setting proposal_timer " << do_propose
              << " with delay of " << delay << dendl;
-    mon->timer.add_event_after(delay, proposal_timer);
+    proposal_timer = mon->timer.add_event_after(delay, do_propose);
   } else {
     dout(10) << " proposal_timer already set" << dendl;
   }
index 7c1a0d1fad5febe9ee507215aa37910508f4c9bc..c6dbcc17694d4e4ea594b4612313f753885bda55 100644 (file)
@@ -31,6 +31,7 @@ using namespace std;
 
 #include <errno.h>
 #include <sstream>
+#include <signal.h>
 
 #define SOCKET_PRIORITY_MIN_DELAY 6
 
@@ -558,11 +559,53 @@ protected:
   /**
    * @} // Subclass Interfacing
    */
+public:
+#ifdef CEPH_USE_SIGPIPE_BLOCKER
+  /**
+   * We need to disable SIGPIPE on all platforms, and if they
+   * don't give us a better mechanism (read: are on Solaris) that
+   * means blocking the signal whenever we do a send or sendmsg...
+   * That means any implementations must invoke MSGR_SIGPIPE_STOPPER in-scope
+   * whenever doing so. On most systems that's blank, but on systems where
+   * it's needed we construct an RAII object to plug and un-plug the SIGPIPE.
+   * See http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html
+   */
+  struct sigpipe_stopper {
+    bool blocked;
+    sigset_t existing_mask;
+    sigset_t pipe_mask;
+    sigpipe_stopper() {
+      sigemptyset(&pipe_mask);
+      sigaddset(&pipe_mask, SIGPIPE);
+      sigset_t signals;
+      sigemptyset(&signals);
+      sigpending(&signals);
+      if (sigismember(&signals, SIGPIPE)) {
+       blocked = false;
+      } else {
+       blocked = true;
+       int r = pthread_sigmask(SIG_BLOCK, &pipe_mask, &existing_mask);
+       assert(r == 0);
+      }
+    }
+    ~sigpipe_stopper() {
+      if (blocked) {
+       struct timespec nowait{0};
+       int r = sigtimedwait(&pipe_mask, 0, &nowait);
+       assert(r == SIGPIPE || (r == -1 && errno == EAGAIN));
+       r = pthread_sigmask(SIG_SETMASK, &existing_mask, 0);
+       assert(r == 0);
+      }
+    }
+  };
+#  define MSGR_SIGPIPE_STOPPER Messenger::sigpipe_stopper stopper;
+#else
+#  define MSGR_SIGPIPE_STOPPER
+#endif
   /**
    * @defgroup Dispatcher Interfacing
    * @{
    */
-public:
   /**
    * Determine whether a message can be fast-dispatched. We will
    * query each Dispatcher in sequence to determine if they are
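
The intended call pattern for the new macro is one stopper per send scope, so any SIGPIPE the write raises is reaped before the scope exits; do_sendmsg() and tcp_write() below are the real users. A minimal sketch, assuming Messenger.h is included; do_send is a hypothetical helper:

    // On platforms with MSG_NOSIGNAL or SO_NOSIGPIPE the macro expands to
    // nothing; elsewhere it constructs the RAII stopper for this scope.
    #include <sys/socket.h>

    ssize_t do_send(int fd, const void* buf, size_t len) {
      MSGR_SIGPIPE_STOPPER;
      return ::send(fd, buf, len, 0);
    }
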
index 005b7c13ab29af3f1d618075c362902fa0d3fdf9..ab2ff2c4ab05054a8a2b579d0eed26c33a97f9ad 100644 (file)
@@ -19,7 +19,6 @@
 
 #include <atomic>
 #include <pthread.h>
-#include <signal.h>
 #include <climits>
 #include <list>
 #include <mutex>
index d0e6b5af0834789346bc510f4077773f9201cb43..5fb975ae906eaf89cb404794c2f5bddda6827535 100644 (file)
 
 #include "include/buffer.h"
 #include "include/str_list.h"
-#include "include/sock_compat.h"
 #include "common/errno.h"
 #include "common/strtol.h"
 #include "common/dout.h"
 #include "common/simple_spin.h"
+#include "msg/Messenger.h"
+#include "include/sock_compat.h"
 
 #define dout_subsys ceph_subsys_ms
 #undef dout_prefix
@@ -41,11 +42,6 @@ class PosixConnectedSocketImpl final : public ConnectedSocketImpl {
   int _fd;
   entity_addr_t sa;
   bool connected;
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
-  sigset_t sigpipe_mask;
-  bool sigpipe_pending;
-  bool sigpipe_unblock;
-#endif
 
  public:
   explicit PosixConnectedSocketImpl(NetHandler &h, const entity_addr_t &sa, int f, bool connected)
@@ -77,83 +73,15 @@ class PosixConnectedSocketImpl final : public ConnectedSocketImpl {
     return r;
   }
 
-  /*
-   SIGPIPE suppression - for platforms without SO_NOSIGPIPE or MSG_NOSIGNAL
-    http://krokisplace.blogspot.in/2010/02/suppressing-sigpipe-in-library.html 
-    http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html 
-  */
-  static void suppress_sigpipe()
-  {
-  #if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
-    /*
-      We want to ignore possible SIGPIPE that we can generate on write.
-      SIGPIPE is delivered *synchronously* and *only* to the thread
-      doing the write.  So if it is reported as already pending (which
-      means the thread blocks it), then we do nothing: if we generate
-      SIGPIPE, it will be merged with the pending one (there's no
-      queuing), and that suits us well.  If it is not pending, we block
-      it in this thread (and we avoid changing signal action, because it
-      is per-process).
-    */
-    sigset_t pending;
-    sigemptyset(&pending);
-    sigpending(&pending);
-    sigpipe_pending = sigismember(&pending, SIGPIPE);
-    if (!sigpipe_pending) {
-      sigset_t blocked;
-      sigemptyset(&blocked);
-      pthread_sigmask(SIG_BLOCK, &sigpipe_mask, &blocked);
-
-      /* Maybe is was blocked already?  */
-      sigpipe_unblock = ! sigismember(&blocked, SIGPIPE);
-    }
-  #endif /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
-  }
-
-  static void restore_sigpipe()
-  {
-  #if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
-    /*
-      If SIGPIPE was pending already we do nothing.  Otherwise, if it
-      become pending (i.e., we generated it), then we sigwait() it (thus
-      clearing pending status).  Then we unblock SIGPIPE, but only if it
-      were us who blocked it.
-    */
-    if (!sigpipe_pending) {
-      sigset_t pending;
-      sigemptyset(&pending);
-      sigpending(&pending);
-      if (sigismember(&pending, SIGPIPE)) {
-        /*
-          Protect ourselves from a situation when SIGPIPE was sent
-          by the user to the whole process, and was delivered to
-          other thread before we had a chance to wait for it.
-        */
-        static const struct timespec nowait = { 0, 0 };
-        TEMP_FAILURE_RETRY(sigtimedwait(&sigpipe_mask, NULL, &nowait));
-      }
-
-      if (sigpipe_unblock)
-        pthread_sigmask(SIG_UNBLOCK, &sigpipe_mask, NULL);
-    }
-  #endif  /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
-  }
-
   // return the sent length
   // < 0 means an error occurred
   static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more)
   {
-    suppress_sigpipe();
-
     size_t sent = 0;
     while (1) {
+      MSGR_SIGPIPE_STOPPER;
       ssize_t r;
-  #if defined(MSG_NOSIGNAL)
       r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
-  #else
-      r = ::sendmsg(fd, &msg, (more ? MSG_MORE : 0));
-  #endif /* defined(MSG_NOSIGNAL) */
-
       if (r < 0) {
         if (errno == EINTR) {
           continue;
@@ -179,7 +107,6 @@ class PosixConnectedSocketImpl final : public ConnectedSocketImpl {
         }
       }
     }
-    restore_sigpipe();
     return (ssize_t)sent;
   }
 
index 19adb2c83f2065ec5d0e19edaae7410756f99606..99ca1f32d9279462fe6b98ca36d484a3ca15b31d 100644 (file)
@@ -119,7 +119,7 @@ int NetHandler::set_socket_options(int sd, bool nodelay, int size)
   }
 
   // block ESIGPIPE
-#ifdef SO_NOSIGPIPE
+#ifdef CEPH_USE_SO_NOSIGPIPE
   int val = 1;
   r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
   if (r) {
index 4a7ab9acab7a0e6a2b8db601c1f9ac8cd008a38d..848efd45c0090f2381a930d11a6aae1014ed6572 100644 (file)
@@ -907,7 +907,7 @@ void Pipe::set_socket_options()
   }
 
   // block ESIGPIPE
-#if defined(SO_NOSIGPIPE)
+#ifdef CEPH_USE_SO_NOSIGPIPE
   int val = 1;
   int r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
   if (r) {
@@ -2257,91 +2257,21 @@ int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
   return ret;
 }
 
-/* 
- SIGPIPE suppression - for platforms without SO_NOSIGPIPE or MSG_NOSIGNAL
-  http://krokisplace.blogspot.in/2010/02/suppressing-sigpipe-in-library.html 
-  http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html 
-*/
-void Pipe::suppress_sigpipe()
-{
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
-  /*
-    We want to ignore possible SIGPIPE that we can generate on write.
-    SIGPIPE is delivered *synchronously* and *only* to the thread
-    doing the write.  So if it is reported as already pending (which
-    means the thread blocks it), then we do nothing: if we generate
-    SIGPIPE, it will be merged with the pending one (there's no
-    queuing), and that suits us well.  If it is not pending, we block
-    it in this thread (and we avoid changing signal action, because it
-    is per-process).
-  */
-  sigset_t pending;
-  sigemptyset(&pending);
-  sigpending(&pending);
-  sigpipe_pending = sigismember(&pending, SIGPIPE);
-  if (!sigpipe_pending) {
-    sigset_t blocked;
-    sigemptyset(&blocked);
-    pthread_sigmask(SIG_BLOCK, &sigpipe_mask, &blocked);
-
-    /* Maybe is was blocked already?  */
-    sigpipe_unblock = ! sigismember(&blocked, SIGPIPE);
-  }
-#endif  /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
-}
-
-
-void Pipe::restore_sigpipe()
-{
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
-  /*
-    If SIGPIPE was pending already we do nothing.  Otherwise, if it
-    become pending (i.e., we generated it), then we sigwait() it (thus
-    clearing pending status).  Then we unblock SIGPIPE, but only if it
-    were us who blocked it.
-  */
-  if (!sigpipe_pending) {
-    sigset_t pending;
-    sigemptyset(&pending);
-    sigpending(&pending);
-    if (sigismember(&pending, SIGPIPE)) {
-      /*
-        Protect ourselves from a situation when SIGPIPE was sent
-        by the user to the whole process, and was delivered to
-        other thread before we had a chance to wait for it.
-      */
-      static const struct timespec nowait = { 0, 0 };
-      TEMP_FAILURE_RETRY(sigtimedwait(&sigpipe_mask, NULL, &nowait));
-    }
-
-    if (sigpipe_unblock)
-      pthread_sigmask(SIG_UNBLOCK, &sigpipe_mask, NULL);
-  }
-#endif  /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
-}
-
-
 int Pipe::do_sendmsg(struct msghdr *msg, unsigned len, bool more)
 {
-  suppress_sigpipe();
+  MSGR_SIGPIPE_STOPPER;
   while (len > 0) {
     int r;
-#if defined(MSG_NOSIGNAL)
     r = ::sendmsg(sd, msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
-#else
-    r = ::sendmsg(sd, msg, (more ? MSG_MORE : 0));
-#endif
     if (r == 0) 
       ldout(msgr->cct,10) << "do_sendmsg hmm do_sendmsg got r==0!" << dendl;
     if (r < 0) {
       r = -errno; 
       ldout(msgr->cct,1) << "do_sendmsg error " << cpp_strerror(r) << dendl;
-      restore_sigpipe();
       return r;
     }
     if (state == STATE_CLOSED) {
       ldout(msgr->cct,10) << "do_sendmsg oh look, state == CLOSED, giving up" << dendl;
-      restore_sigpipe();
       return -EINTR; // close enough
     }
 
@@ -2366,7 +2296,6 @@ int Pipe::do_sendmsg(struct msghdr *msg, unsigned len, bool more)
       }
     }
   }
-  restore_sigpipe();
   return 0;
 }
 
@@ -2733,15 +2662,9 @@ int Pipe::tcp_write(const char *buf, unsigned len)
 
   //lgeneric_dout(cct, DBL) << "tcp_write writing " << len << dendl;
   assert(len > 0);
-  suppress_sigpipe();
-
   while (len > 0) {
-    int did;
-#if defined(MSG_NOSIGNAL)
-    did = ::send( sd, buf, len, MSG_NOSIGNAL );
-#else
-    did = ::send( sd, buf, len, 0);
-#endif
+    MSGR_SIGPIPE_STOPPER;
+    int did = ::send( sd, buf, len, MSG_NOSIGNAL );
     if (did < 0) {
       //lgeneric_dout(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
       //lgeneric_derr(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
@@ -2751,7 +2674,5 @@ int Pipe::tcp_write(const char *buf, unsigned len)
     buf += did;
     //lgeneric_dout(cct, DBL) << "tcp_write did " << did << ", " << len << " left" << dendl;
   }
-  restore_sigpipe();
-
   return 0;
 }
index 9dd00d1b48f345f0d4782798516c300246db14a0..d8d2a8e0831db13e0fdd672db8e4f0672ff60344 100644 (file)
@@ -115,11 +115,6 @@ static const int SM_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX);
   private:
     int sd;
     struct iovec msgvec[SM_IOV_MAX];
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
-    sigset_t sigpipe_mask;
-    bool sigpipe_pending;
-    bool sigpipe_unblock;
-#endif
 
   public:
     int port;
@@ -188,10 +183,6 @@ static const int SM_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX);
     int write_keepalive();
     int write_keepalive2(char tag, const utime_t &t);
 
-    void suppress_sigpipe();
-    void restore_sigpipe();
-
-
     void fault(bool reader=false);
 
     void was_session_reset();
index 67a5780ae8c283882173862d8f32beff7828de49..32b423c2861681ac41b179a527cd7d9c8c7fbbe9 100644 (file)
@@ -152,7 +152,7 @@ public:
     const SequencerPosition *spos=0   ///< [in] Sequencer
     ) { return 0; }
 
-  virtual int check(std::ostream &out, bool repair = false) { return 0; }
+  virtual int check(std::ostream &out, bool repair = false, bool force = false) { return 0; }
 
   virtual void compact() {}
 
index 97624c09fca651dea2d43d193dc4de986c41a27f..2daf2c64b91cf61fcc054513586822497f5ff44b 100644 (file)
@@ -1543,6 +1543,9 @@ public:
   virtual int fsck(bool deep) {
     return -EOPNOTSUPP;
   }
+  virtual int repair(bool deep) {
+    return -EOPNOTSUPP;
+  }
 
   virtual void set_cache_shards(unsigned num) { }
 
index 075d3eff6ef18451c6ba04cdd41463ad6cbc0ee6..2480945f41e807ef4cbe7286931f5d7a7ad09948 100644 (file)
@@ -58,9 +58,11 @@ BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
 {
 }
 
-int BitmapFreelistManager::create(uint64_t new_size, KeyValueDB::Transaction txn)
+int BitmapFreelistManager::create(uint64_t new_size, uint64_t min_alloc_size,
+                                 KeyValueDB::Transaction txn)
 {
-  bytes_per_block = cct->_conf->bdev_block_size;
+  bytes_per_block = std::max(cct->_conf->bdev_block_size,
+                            (int64_t)min_alloc_size);
   assert(ISP2(bytes_per_block));
   size = P2ALIGN(new_size, bytes_per_block);
   blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
@@ -105,7 +107,7 @@ int BitmapFreelistManager::create(uint64_t new_size, KeyValueDB::Transaction txn
   return 0;
 }
 
-int BitmapFreelistManager::init()
+int BitmapFreelistManager::init(uint64_t dev_size)
 {
   dout(1) << __func__ << dendl;
 
@@ -153,6 +155,49 @@ int BitmapFreelistManager::init()
           << " blocks_per_key 0x" << blocks_per_key
           << std::dec << dendl;
   _init_misc();
+
+  // check for http://tracker.ceph.com/issues/21089 inconsistency
+  {
+    uint64_t new_size = P2ALIGN(dev_size, bytes_per_block);
+    if (new_size != size) {
+      uint64_t bad_size = new_size & ~bytes_per_block;
+      if (size == bad_size) {
+       derr << __func__ << " size is 0x" << std::hex << size << " should be 0x"
+            << new_size << " and appears to be due to #21089" << std::dec
+            << dendl;
+
+       uint64_t new_blocks = new_size / bytes_per_block;
+       if (new_blocks / blocks_per_key * blocks_per_key != new_blocks) {
+         new_blocks = (new_blocks / blocks_per_key + 1) *
+           blocks_per_key;
+       }
+
+       KeyValueDB::Transaction t = kvdb->get_transaction();
+       {
+         bufferlist sizebl;
+         ::encode(new_size, sizebl);
+         t->set(meta_prefix, "size", sizebl);
+       }
+       if (new_blocks != blocks) {
+         derr << "blocks is 0x" << std::hex << blocks << " should be 0x"
+              << new_blocks << std::dec << dendl;
+         bufferlist bl;
+         ::encode(new_blocks, bl);
+         t->set(meta_prefix, "blocks", bl);
+         _xor(new_size, new_blocks * bytes_per_block - new_size, t);
+       } else {
+         derr << "blocks are ok" << dendl;
+         _xor(bad_size, bytes_per_block, t);
+       }
+       int r = kvdb->submit_transaction_sync(t);
+       assert(r == 0);
+       size = new_size;
+       blocks = new_blocks;
+       derr << __func__ << " fixed inconsistency, size now 0x" << std::hex
+            << size << " blocks 0x" << blocks << std::dec << dendl;
+      }
+    }
+  }
   return 0;
 }
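
The repair keys off the arithmetic of the original bug: the old size computation masked with ~bytes_per_block instead of ~(bytes_per_block - 1), so for a 4 KiB block it cleared bit 12 rather than the low 12 bits. A worked example with hypothetical sizes:

    // Worked example of the #21089 size arithmetic; sizes are hypothetical.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t bytes_per_block = 0x1000;                  // 4 KiB, power of two
      uint64_t dev_size = 0x2001b000;                     // bit 12 happens to be set
      uint64_t good = dev_size & ~(bytes_per_block - 1);  // P2ALIGN: 0x2001b000
      uint64_t bad  = dev_size & ~bytes_per_block;        // old bug:  0x2001a000
      assert(good == 0x2001b000);
      assert(bad  == 0x2001a000);  // one block short whenever bit 12 was set
      // init() detects size == bad, rewrites "size"/"blocks" in the kvdb, and
      // XORs the dropped range back into the bitmap.
    }
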
 
index 9ed39ff56534fd0658ae7305b996be133ddd6a42..10982545b73f035043194384576c702be40a744a 100644 (file)
@@ -51,9 +51,10 @@ public:
 
   static void setup_merge_operator(KeyValueDB *db, string prefix);
 
-  int create(uint64_t size, KeyValueDB::Transaction txn) override;
+  int create(uint64_t size, uint64_t min_alloc_size,
+            KeyValueDB::Transaction txn) override;
 
-  int init() override;
+  int init(uint64_t dev_size) override;
   void shutdown() override;
 
   void dump() override;
index 1d39d76358c4859e02defeaa0e1dbb0742d0ff85..4d07b48c20ed79241af6a6f914906fc175254882 100644 (file)
@@ -255,6 +255,16 @@ void BlueFS::dump_perf_counters(Formatter *f)
   f->close_section();
 }
 
+void BlueFS::dump_block_extents(ostream& out)
+{
+  for (unsigned i = 0; i < MAX_BDEV; ++i) {
+    if (!bdev[i]) {
+      continue;
+    }
+    out << i << " : size 0x" << std::hex << bdev[i]->get_size()
+       << " : own 0x" << block_all[i] << std::dec << "\n";
+  }
+}
 
 void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
 {
@@ -1191,6 +1201,14 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
   new_log = new File;
   new_log->fnode.ino = 0;   // so that _flush_range won't try to log the fnode
 
+  // 0. wait for any racing flushes to complete.  (We do not want to block
+  // in _flush_sync_log with jump_to set or else a racing thread might flush
+  // our entries and our jump_to update won't be correct.)
+  while (log_flushing) {
+    dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
+    log_cond.wait(l);
+  }
+
   // 1. allocate new log space and jump to it.
   old_log_jump_to = log_file->fnode.get_allocated();
   uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
@@ -1351,16 +1369,19 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
   while (log_flushing) {
     dout(10) << __func__ << " want_seq " << want_seq
             << " log is currently flushing, waiting" << dendl;
+    assert(!jump_to);
     log_cond.wait(l);
   }
   if (want_seq && want_seq <= log_seq_stable) {
     dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
             << log_seq_stable << ", done" << dendl;
+    assert(!jump_to);
     return 0;
   }
   if (log_t.empty() && dirty_files.empty()) {
     dout(10) << __func__ << " want_seq " << want_seq
             << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
+    assert(!jump_to);
     return 0;
   }
 
@@ -1505,6 +1526,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
       derr << __func__ << " allocated: 0x" << std::hex << allocated
            << " offset: 0x" << offset << " length: 0x" << length << std::dec
            << dendl;
+      assert(0 == "bluefs enospc");
       return r;
     }
     h->file->fnode.recalc_allocated();
index 1b38a6ab1d493840046c58a8bcdb0c7ff547f920..36b41b451bbe5a66168e35794ee0033dd5c498f4 100644 (file)
@@ -342,6 +342,8 @@ public:
   void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...]
   void dump_perf_counters(Formatter *f);
 
+  void dump_block_extents(ostream& out);
+
   /// get current extents that we own for given block device
   int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
 
index 3566005cd01a76132fdbf37d97e6bb10764e80bb..5fe5b98c5dd074980957be21127afc53bb07d029 100644 (file)
@@ -1615,6 +1615,12 @@ bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
   return false;
 }
 
+void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
+{
+  for (auto& i : onode_map) {
+    ldout(cct, lvl) << i.first << " : " << i.second << dendl;
+  }
+}
 
 // SharedBlob
 
@@ -1661,7 +1667,7 @@ void BlueStore::SharedBlob::put()
                             << " removing self from set " << get_parent()
                             << dendl;
     if (get_parent()) {
-      if (get_parent()->remove(this)) {
+      if (get_parent()->try_remove(this)) {
        delete this;
       } else {
        ldout(coll->store->cct, 20)
@@ -1692,6 +1698,19 @@ void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
   }
 }
 
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
+{
+  std::lock_guard<std::mutex> l(lock);
+  for (auto& i : sb_map) {
+    ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
+  }
+}
+
 // Blob
 
 #undef dout_prefix
@@ -1976,6 +1995,7 @@ void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
       unsigned n;
       // we need to encode inline_bl to measure encoded length
       bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
+      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
       assert(!never_happen);
       size_t len = inline_bl.length();
       dout(20) << __func__ << "  inline shard " << len << " bytes from " << n
@@ -3228,12 +3248,17 @@ BlueStore::OnodeRef BlueStore::Collection::get_onode(
     on->exists = true;
     bufferptr::iterator p = v.front().begin_deep();
     on->onode.decode(p);
+    for (auto& i : on->onode.attrs) {
+      i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+    }
 
     // initialize extent_map
     on->extent_map.decode_spanning_blobs(p);
     if (on->onode.extent_map_shards.empty()) {
       denc(on->extent_map.inline_bl, p);
       on->extent_map.decode_some(on->extent_map.inline_bl);
+      on->extent_map.inline_bl.reassign_to_mempool(
+       mempool::mempool_bluestore_cache_other);
     } else {
       on->extent_map.init_shards(false, false);
     }
@@ -3292,13 +3317,13 @@ void BlueStore::Collection::split_cache(
          continue;
        }
        ldout(store->cct, 20) << __func__ << "  moving " << *sb << dendl;
-       sb->coll = dest;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << "   moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
+       sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
@@ -3631,8 +3656,8 @@ void BlueStore::_set_compression()
     return;
   }
 
-  if (cct->_conf->bluestore_compression_max_blob_size) {
-    comp_min_blob_size = cct->_conf->bluestore_compression_max_blob_size;
+  if (cct->_conf->bluestore_compression_min_blob_size) {
+    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
   } else {
     assert(bdev);
     if (bdev->is_rotational()) {
@@ -3774,6 +3799,36 @@ int BlueStore::_set_cache_sizes()
   return 0;
 }
 
+int BlueStore::write_meta(const std::string& key, const std::string& value)
+{
+  bluestore_bdev_label_t label;
+  string p = path + "/block";
+  int r = _read_bdev_label(cct, p, &label);
+  if (r < 0) {
+    return ObjectStore::write_meta(key, value);
+  }
+  label.meta[key] = value;
+  r = _write_bdev_label(cct, p, label);
+  assert(r == 0);
+  return ObjectStore::write_meta(key, value);
+}
+
+int BlueStore::read_meta(const std::string& key, std::string *value)
+{
+  bluestore_bdev_label_t label;
+  string p = path + "/block";
+  int r = _read_bdev_label(cct, p, &label);
+  if (r < 0) {
+    return ObjectStore::read_meta(key, value);
+  }
+  auto i = label.meta.find(key);
+  if (i == label.meta.end()) {
+    return ObjectStore::read_meta(key, value);
+  }
+  *value = i->second;
+  return 0;
+}
+
 void BlueStore::_init_logger()
 {
   PerfCountersBuilder b(cct, "bluestore",
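
The write_meta()/read_meta() overrides added above keep OSD metadata in two places: the block device label is authoritative when present, and the plain file in the OSD data directory remains the fallback (and is still written for compatibility). A minimal sketch of the read-side lookup order; the label map and read_file_meta are hypothetical stand-ins:

    #include <cerrno>
    #include <map>
    #include <string>

    // Stand-in for ObjectStore::read_meta reading the legacy file.
    int read_file_meta(const std::string&, std::string*) { return -ENOENT; }

    int read_meta_sketch(const std::map<std::string, std::string>& label_meta,
                         const std::string& key, std::string* value) {
      auto i = label_meta.find(key);        // 1. bluestore_bdev_label_t::meta
      if (i == label_meta.end())
        return read_file_meta(key, value);  // 2. fall back to the plain file
      *value = i->second;
      return 0;
    }
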
@@ -3966,7 +4021,8 @@ void BlueStore::_close_path()
   path_fd = -1;
 }
 
-int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
+int BlueStore::_write_bdev_label(CephContext *cct,
+                                string path, bluestore_bdev_label_t label)
 {
   dout(10) << __func__ << " path " << path << " label " << label << dendl;
   bufferlist bl;
@@ -3990,6 +4046,11 @@ int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
     derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
   }
+  r = ::fsync(fd);
+  if (r < 0) {
+    derr << __func__ << " failed to fsync " << path
+        << ": " << cpp_strerror(r) << dendl;
+  }
   VOID_TEMP_FAILURE_RETRY(::close(fd));
   return r;
 }
@@ -4047,7 +4108,7 @@ int BlueStore::_check_or_set_bdev_label(
     label.size = size;
     label.btime = ceph_clock_now();
     label.description = desc;
-    int r = _write_bdev_label(path, label);
+    int r = _write_bdev_label(cct, path, label);
     if (r < 0)
       return r;
   } else {
@@ -4159,18 +4220,19 @@ int BlueStore::_open_fm(bool create)
       bl.append(freelist_type);
       t->set(PREFIX_SUPER, "freelist_type", bl);
     }
-    fm->create(bdev->get_size(), t);
+    fm->create(bdev->get_size(), min_alloc_size, t);
 
     // allocate superblock reserved space.  note that we do not mark
     // bluefs space as allocated in the freelist; we instead rely on
     // bluefs_extents.
-    fm->allocate(0, SUPER_RESERVED, t);
+    uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
+                                   min_alloc_size);
+    fm->allocate(0, reserved, t);
 
-    uint64_t reserved = 0;
     if (cct->_conf->bluestore_bluefs) {
       assert(bluefs_extents.num_intervals() == 1);
       interval_set<uint64_t>::iterator p = bluefs_extents.begin();
-      reserved = p.get_start() + p.get_len();
+      reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
       dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
               << " for bluefs" << dendl;
       bufferlist bl;
@@ -4178,8 +4240,6 @@ int BlueStore::_open_fm(bool create)
       t->set(PREFIX_SUPER, "bluefs_extents", bl);
       dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
               << std::dec << dendl;
-    } else {
-      reserved = SUPER_RESERVED;
     }
 
     if (cct->_conf->bluestore_debug_prefill > 0) {
@@ -4226,7 +4286,7 @@ int BlueStore::_open_fm(bool create)
     db->submit_transaction_sync(t);
   }
 
-  int r = fm->init();
+  int r = fm->init(bdev->get_size());
   if (r < 0) {
     derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
     delete fm;
@@ -4493,7 +4553,9 @@ int BlueStore::_open_db(bool create)
     string bfn;
     struct stat st;
 
-    bfn = path + "/block.db";
+    if (read_meta("path_block.db", &bfn) < 0) {
+      bfn = path + "/block.db";
+    }
     if (::stat(bfn.c_str(), &st) == 0) {
       r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
       if (r < 0) {
@@ -4532,7 +4594,9 @@ int BlueStore::_open_db(bool create)
     }
 
     // shared device
-    bfn = path + "/block";
+    if (read_meta("path_block", &bfn) < 0) {
+      bfn = path + "/block";
+    }
     r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
     if (r < 0) {
       derr << __func__ << " add block device(" << bfn << ") returned: " 
@@ -4545,6 +4609,13 @@ int BlueStore::_open_db(bool create)
        bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
                            cct->_conf->bluestore_bluefs_gift_ratio);
       initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
+      if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
+       derr << __func__ << " bluefs_alloc_size 0x" << std::hex
+            << cct->_conf->bluefs_alloc_size << " is not a multiple of "
+            << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+       r = -EINVAL;
+       goto free_bluefs;
+      }
       // align to bluefs's alloc_size
       initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
       // put bluefs in the middle of the device in case it is an HDD
@@ -4554,7 +4625,9 @@ int BlueStore::_open_db(bool create)
       bluefs_extents.insert(start, initial);
     }
 
-    bfn = path + "/block.wal";
+    if (read_meta("path_block.wal", &bfn) < 0) {
+      bfn = path + "/block.wal";
+    }
     if (::stat(bfn.c_str(), &st) == 0) {
       r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
       if (r < 0) {
@@ -4817,9 +4890,11 @@ int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
             << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
             << ", should reclaim " << pretty_si_t(reclaim) << dendl;
   }
+
+  // don't take over too much of the freespace
+  uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
   if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
-    cct->_conf->bluestore_bluefs_min <
-      (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
+      cct->_conf->bluestore_bluefs_min < free_cap) {
     uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
     dout(10) << __func__ << " bluefs_total " << bluefs_total
             << " < min " << cct->_conf->bluestore_bluefs_min
@@ -4828,6 +4903,17 @@ int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
       gift = g;
     reclaim = 0;
   }
+  uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
+  if (bluefs_free < min_free &&
+      min_free < free_cap) {
+    uint64_t g = min_free - bluefs_free;
+    dout(10) << __func__ << " bluefs_free " << bluefs_total
+            << " < min " << min_free
+            << ", should gift " << pretty_si_t(g) << dendl;
+    if (g > gift)
+      gift = g;
+    reclaim = 0;
+  }
 
   if (gift) {
     // round up to alloc size
@@ -5150,6 +5236,39 @@ int BlueStore::mkfs()
   if (r < 0)
     goto out_close_fsid;
 
+  {
+    string wal_path = cct->_conf->get_val<string>("bluestore_block_wal_path");
+    if (wal_path.size()) {
+      write_meta("path_block.wal", wal_path);
+    }
+    string db_path = cct->_conf->get_val<string>("bluestore_block_db_path");
+    if (db_path.size()) {
+      write_meta("path_block.db", db_path);
+    }
+  }
+
+  // choose min_alloc_size
+  if (cct->_conf->bluestore_min_alloc_size) {
+    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+  } else {
+    assert(bdev);
+    if (bdev->is_rotational()) {
+      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+    } else {
+      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+    }
+  }
+
+  // make sure min_alloc_size is power of 2 aligned.
+  if (!ISP2(min_alloc_size)) {
+    derr << __func__ << " min_alloc_size 0x"
+        << std::hex << min_alloc_size << std::dec
+        << " is not power of 2 aligned!"
+        << dendl;
+    r = -EINVAL;
+    goto out_close_bdev;
+  }
+
   r = _open_db(true);
   if (r < 0)
     goto out_close_bdev;
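
Hoisting the min_alloc_size selection ahead of _open_db() matters because _open_fm() and the new bluefs_alloc_size multiple-of check both consume it during mkfs. The guard itself is the usual single-bit test that ISP2 performs; a minimal sketch:

    // A nonzero value is a power of two iff it has exactly one bit set.
    #include <cassert>
    #include <cstdint>

    static bool is_p2(uint64_t v) { return v != 0 && (v & (v - 1)) == 0; }

    int main() {
      assert(is_p2(0x1000));   // 4 KiB: acceptable min_alloc_size
      assert(!is_p2(0x1800));  // 6 KiB: mkfs would fail with -EINVAL
    }
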
@@ -5167,28 +5286,6 @@ int BlueStore::mkfs()
       t->set(PREFIX_SUPER, "blobid_max", bl);
     }
 
-    // choose min_alloc_size
-    if (cct->_conf->bluestore_min_alloc_size) {
-      min_alloc_size = cct->_conf->bluestore_min_alloc_size;
-    } else {
-      assert(bdev);
-      if (bdev->is_rotational()) {
-       min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
-      } else {
-       min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
-      }
-    }
-
-    // make sure min_alloc_size is power of 2 aligned.
-    if (!ISP2(min_alloc_size)) {
-      derr << __func__ << " min_alloc_size 0x"
-           << std::hex << min_alloc_size << std::dec
-           << " is not power of 2 aligned!"
-           << dendl;
-      r = -EINVAL;
-      goto out_close_fm;
-    }
-
     {
       bufferlist bl;
       ::encode((uint64_t)min_alloc_size, bl);
@@ -5205,7 +5302,7 @@ int BlueStore::mkfs()
   if (r < 0)
     goto out_close_fm;
 
-  r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
+  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
   if (r < 0)
     goto out_close_fm;
 
@@ -5268,6 +5365,8 @@ int BlueStore::_mount(bool kv_only)
 {
   dout(1) << __func__ << " path " << path << dendl;
 
+  _kv_only = kv_only;
+
   {
     string type;
     int r = read_meta("type", &type);
@@ -5378,23 +5477,24 @@ int BlueStore::_mount(bool kv_only)
 
 int BlueStore::umount()
 {
-  assert(mounted);
+  assert(_kv_only || mounted);
   dout(1) << __func__ << dendl;
 
   _osr_drain_all();
   _osr_unregister_all();
 
-  mempool_thread.shutdown();
-
-  dout(20) << __func__ << " stopping kv thread" << dendl;
-  _kv_stop();
-  _reap_collections();
-  _flush_cache();
-  dout(20) << __func__ << " closing" << dendl;
-
   mounted = false;
-  _close_alloc();
-  _close_fm();
+  if (!_kv_only) {
+    mempool_thread.shutdown();
+    dout(20) << __func__ << " stopping kv thread" << dendl;
+    _kv_stop();
+    _reap_collections();
+    _flush_cache();
+    dout(20) << __func__ << " closing" << dendl;
+
+    _close_alloc();
+    _close_fm();
+  }
   _close_db();
   _close_bdev();
   _close_fsid();
@@ -5445,7 +5545,7 @@ int BlueStore::_fsck_check_extents(
     }
     bool already = false;
     apply(
-      e.offset, e.length, block_size, used_blocks, __func__,
+      e.offset, e.length, min_alloc_size, used_blocks, __func__,
       [&](uint64_t pos, mempool_dynamic_bitset &bs) {
        if (bs.test(pos))
          already = true;
@@ -5466,10 +5566,13 @@ int BlueStore::_fsck_check_extents(
   return errors;
 }
 
-int BlueStore::fsck(bool deep)
+int BlueStore::_fsck(bool deep, bool repair)
 {
-  dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
+  dout(1) << __func__
+         << (repair ? " fsck" : " repair")
+         << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
   int errors = 0;
+  int repaired = 0;
 
   typedef btree::btree_set<
     uint64_t,std::less<uint64_t>,
@@ -5547,9 +5650,10 @@ int BlueStore::fsck(bool deep)
   if (r < 0)
     goto out_scan;
 
-  used_blocks.resize(bdev->get_size() / block_size);
+  used_blocks.resize(bdev->get_size() / min_alloc_size);
   apply(
-    0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
+    0, MAX(min_alloc_size, SUPER_RESERVED), min_alloc_size, used_blocks,
+    "0~SUPER_RESERVED",
     [&](uint64_t pos, mempool_dynamic_bitset &bs) {
       bs.set(pos);
     }
@@ -5558,7 +5662,7 @@ int BlueStore::fsck(bool deep)
   if (bluefs) {
     for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
       apply(
-        e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
+        e.get_start(), e.get_len(), min_alloc_size, used_blocks, "bluefs",
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
           bs.set(pos);
         }
@@ -5593,7 +5697,7 @@ int BlueStore::fsck(bool deep)
       if (is_extent_shard_key(it->key())) {
        while (!expecting_shards.empty() &&
               expecting_shards.front() < it->key()) {
-         derr << __func__ << " error: missing shard key "
+         derr << "fsck error: missing shard key "
               << pretty_binary_string(expecting_shards.front())
               << dendl;
          ++errors;
@@ -5609,18 +5713,18 @@ int BlueStore::fsck(bool deep)
         uint32_t offset;
         string okey;
         get_key_extent_shard(it->key(), &okey, &offset);
-        derr << __func__ << " error: stray shard 0x" << std::hex << offset
+        derr << "fsck error: stray shard 0x" << std::hex << offset
             << std::dec << dendl;
         if (expecting_shards.empty()) {
-          derr << __func__ << " error: " << pretty_binary_string(it->key())
+          derr << "fsck error: " << pretty_binary_string(it->key())
                << " is unexpected" << dendl;
           ++errors;
           continue;
         }
        while (expecting_shards.front() > it->key()) {
-         derr << __func__ << " error:   saw " << pretty_binary_string(it->key())
+         derr << "fsck error:   saw " << pretty_binary_string(it->key())
               << dendl;
-         derr << __func__ << " error:   exp "
+         derr << "fsck error:   exp "
               << pretty_binary_string(expecting_shards.front()) << dendl;
          ++errors;
          expecting_shards.pop_front();
@@ -5634,7 +5738,7 @@ int BlueStore::fsck(bool deep)
       ghobject_t oid;
       int r = get_key_object(it->key(), &oid);
       if (r < 0) {
-        derr << __func__ << " error: bad object key "
+        derr << "fsck error: bad object key "
              << pretty_binary_string(it->key()) << dendl;
        ++errors;
        continue;
@@ -5654,7 +5758,7 @@ int BlueStore::fsck(bool deep)
          }
        }
        if (!c) {
-          derr << __func__ << " error: stray object " << oid
+          derr << "fsck error: stray object " << oid
                << " not owned by any collection" << dendl;
          ++errors;
          continue;
@@ -5665,7 +5769,7 @@ int BlueStore::fsck(bool deep)
 
       if (!expecting_shards.empty()) {
        for (auto &k : expecting_shards) {
-         derr << __func__ << " error: missing shard key "
+         derr << "fsck error: missing shard key "
               << pretty_binary_string(k) << dendl;
        }
        ++errors;
@@ -5677,12 +5781,12 @@ int BlueStore::fsck(bool deep)
       OnodeRef o = c->get_onode(oid, false);
       if (o->onode.nid) {
        if (o->onode.nid > nid_max) {
-         derr << __func__ << " error: " << oid << " nid " << o->onode.nid
+         derr << "fsck error: " << oid << " nid " << o->onode.nid
               << " > nid_max " << nid_max << dendl;
          ++errors;
        }
        if (used_nids.count(o->onode.nid)) {
-         derr << __func__ << " error: " << oid << " nid " << o->onode.nid
+         derr << "fsck error: " << oid << " nid " << o->onode.nid
               << " already in use" << dendl;
          ++errors;
          continue; // go for next object
@@ -5704,7 +5808,7 @@ int BlueStore::fsck(bool deep)
        get_extent_shard_key(o->key, s.shard_info->offset,
                             &expecting_shards.back());
        if (s.shard_info->offset >= o->onode.size) {
-         derr << __func__ << " error: " << oid << " shard 0x" << std::hex
+         derr << "fsck error: " << oid << " shard 0x" << std::hex
               << s.shard_info->offset << " past EOF at 0x" << o->onode.size
               << std::dec << dendl;
          ++errors;
@@ -5718,14 +5822,14 @@ int BlueStore::fsck(bool deep)
       for (auto& l : o->extent_map.extent_map) {
        dout(20) << __func__ << "    " << l << dendl;
        if (l.logical_offset < pos) {
-         derr << __func__ << " error: " << oid << " lextent at 0x"
+         derr << "fsck error: " << oid << " lextent at 0x"
               << std::hex << l.logical_offset
               << " overlaps with the previous, which ends at 0x" << pos
               << std::dec << dendl;
          ++errors;
        }
        if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
-         derr << __func__ << " error: " << oid << " lextent at 0x"
+         derr << "fsck error: " << oid << " lextent at 0x"
               << std::hex << l.logical_offset << "~" << l.length
               << " spans a shard boundary"
               << std::dec << dendl;
@@ -5771,7 +5875,7 @@ int BlueStore::fsck(bool deep)
                 << std::dec << " for " << *i.first << dendl;
        const bluestore_blob_t& blob = i.first->get_blob();
        if (i.second & blob.unused) {
-         derr << __func__ << " error: " << oid << " blob claims unused 0x"
+         derr << "fsck error: " << oid << " blob claims unused 0x"
               << std::hex << blob.unused
               << " but extents reference 0x" << i.second
               << " on blob " << *i.first << dendl;
@@ -5793,7 +5897,7 @@ int BlueStore::fsck(bool deep)
            if ((blob.unused & mask) == mask) {
              // this csum chunk region is marked unused
              if (blob.get_csum_item(p) != 0) {
-               derr << __func__ << " error: " << oid
+               derr << "fsck error: " << oid
                     << " blob claims csum chunk 0x" << std::hex << pos
                     << "~" << csum_chunk_size
                     << " is unused (mask 0x" << mask << " of unused 0x"
@@ -5811,7 +5915,7 @@ int BlueStore::fsck(bool deep)
        const bluestore_blob_t& blob = i.first->get_blob();
        bool equal = i.first->get_blob_use_tracker().equal(i.second);
        if (!equal) {
-         derr << __func__ << " error: " << oid << " blob " << *i.first
+         derr << "fsck error: " << oid << " blob " << *i.first
               << " doesn't match expected ref_map " << i.second << dendl;
          ++errors;
        }
@@ -5822,12 +5926,12 @@ int BlueStore::fsck(bool deep)
        }
        if (blob.is_shared()) {
          if (i.first->shared_blob->get_sbid() > blobid_max) {
-           derr << __func__ << " error: " << oid << " blob " << blob
+           derr << "fsck error: " << oid << " blob " << blob
                 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
                 << blobid_max << dendl;
            ++errors;
          } else if (i.first->shared_blob->get_sbid() == 0) {
-            derr << __func__ << " error: " << oid << " blob " << blob
+            derr << "fsck error: " << oid << " blob " << blob
                  << " marked as shared but has uninitialized sbid"
                  << dendl;
             ++errors;
@@ -5853,14 +5957,14 @@ int BlueStore::fsck(bool deep)
        int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
        if (r < 0) {
          ++errors;
-         derr << __func__ << " error: " << oid << " error during read: "
+         derr << "fsck error: " << oid << " error during read: "
               << cpp_strerror(r) << dendl;
        }
       }
       // omap
       if (o->onode.has_omap()) {
        if (used_omap_head.count(o->onode.nid)) {
-         derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
+         derr << "fsck error: " << oid << " omap_head " << o->onode.nid
               << " already in use" << dendl;
          ++errors;
        } else {
@@ -5876,14 +5980,14 @@ int BlueStore::fsck(bool deep)
       string key = it->key();
       uint64_t sbid;
       if (get_key_shared_blob(key, &sbid)) {
-       derr << __func__ << " error: bad key '" << key
+       derr << "fsck error: bad key '" << key
             << "' in shared blob namespace" << dendl;
        ++errors;
        continue;
       }
       auto p = sb_info.find(sbid);
       if (p == sb_info.end()) {
-       derr << __func__ << " error: found stray shared blob data for sbid 0x"
+       derr << "fsck error: found stray shared blob data for sbid 0x"
             << std::hex << sbid << std::dec << dendl;
        ++errors;
       } else {
@@ -5895,7 +5999,7 @@ int BlueStore::fsck(bool deep)
        ::decode(shared_blob, blp);
        dout(20) << __func__ << "  " << *sbi.sb << " " << shared_blob << dendl;
        if (shared_blob.ref_map != sbi.ref_map) {
-         derr << __func__ << " error: shared blob 0x" << std::hex << sbid
+         derr << "fsck error: shared blob 0x" << std::hex << sbid
               << std::dec << " ref_map " << shared_blob.ref_map
               << " != expected " << sbi.ref_map << dendl;
          ++errors;
@@ -5913,12 +6017,12 @@ int BlueStore::fsck(bool deep)
     }
   }
   for (auto &p : sb_info) {
-    derr << __func__ << " error: shared_blob 0x" << p.first
+    derr << "fsck error: shared_blob 0x" << p.first
         << " key is missing (" << *p.second.sb << ")" << dendl;
     ++errors;
   }
   if (!(actual_statfs == expected_statfs)) {
-    derr << __func__ << " error: actual " << actual_statfs
+    derr << "fsck error: actual " << actual_statfs
         << " != expected " << expected_statfs << dendl;
     ++errors;
   }
@@ -5930,7 +6034,7 @@ int BlueStore::fsck(bool deep)
       uint64_t omap_head;
       _key_decode_u64(it->key().c_str(), &omap_head);
       if (used_omap_head.count(omap_head) == 0) {
-       derr << __func__ << " error: found stray omap data on omap_head "
+       derr << "fsck error: found stray omap data on omap_head "
             << omap_head << dendl;
        ++errors;
       }
@@ -5947,7 +6051,7 @@ int BlueStore::fsck(bool deep)
       try {
        ::decode(wt, p);
       } catch (buffer::error& e) {
-       derr << __func__ << " error: failed to decode deferred txn "
+       derr << "fsck error: failed to decode deferred txn "
             << pretty_binary_string(it->key()) << dendl;
        r = -EIO;
         goto out_scan;
@@ -5957,7 +6061,7 @@ int BlueStore::fsck(bool deep)
               << " released 0x" << std::hex << wt.released << std::dec << dendl;
       for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
         apply(
-          e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
+          e.get_start(), e.get_len(), min_alloc_size, used_blocks, "deferred",
           [&](uint64_t pos, mempool_dynamic_bitset &bs) {
             bs.set(pos);
           }
@@ -5972,7 +6076,8 @@ int BlueStore::fsck(bool deep)
     // know they are allocated.
     for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
       apply(
-        e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
+        e.get_start(), e.get_len(), min_alloc_size, used_blocks,
+        "bluefs_extents",
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          bs.reset(pos);
         }
@@ -5983,7 +6088,7 @@ int BlueStore::fsck(bool deep)
     while (fm->enumerate_next(&offset, &length)) {
       bool intersects = false;
       apply(
-        offset, length, block_size, used_blocks, "free",
+        offset, length, min_alloc_size, used_blocks, "free",
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
           if (bs.test(pos)) {
             intersects = true;
@@ -5993,79 +6098,26 @@ int BlueStore::fsck(bool deep)
         }
       );
       if (intersects) {
-       derr << __func__ << " error: free extent 0x" << std::hex << offset
-            << "~" << length << std::dec
-            << " intersects allocated blocks" << dendl;
-       ++errors;
-      }
-    }
-    fm->enumerate_reset();
-    size_t count = used_blocks.count();
-    if (used_blocks.size() == count + 1) {
-      // this due to http://tracker.ceph.com/issues/21089
-      bufferlist fm_bpb_bl, fm_blocks_bl, fm_bpk_bl;
-      db->get(PREFIX_ALLOC, "bytes_per_block", &fm_bpb_bl);
-      db->get(PREFIX_ALLOC, "blocks", &fm_blocks_bl);
-      db->get(PREFIX_ALLOC, "blocks_per_key", &fm_bpk_bl);
-      uint64_t fm_blocks = 0;
-      uint64_t fm_bsize = 1;
-      uint64_t fm_blocks_per_key = 1;
-      try {
-       auto p = fm_blocks_bl.begin();
-       ::decode(fm_blocks, p);
-       auto q = fm_bpb_bl.begin();
-       ::decode(fm_bsize, q);
-       auto r = fm_bpk_bl.begin();
-       ::decode(fm_blocks_per_key, r);
-      } catch (buffer::error& e) {
-      }
-      uint64_t dev_bsize = bdev->get_block_size();
-      uint64_t bad_size = bdev->get_size() & ~fm_bsize;
-      if (used_blocks.test(bad_size / dev_bsize) == 0) {
-       // this is the last block of the device that we previously
-       // (incorrectly) truncated off of the effective device size.  this
-       // prevented BitmapFreelistManager from marking it as used along with
-       // the other "past-eof" blocks in the last key slot.  mark it used
-       // now.
-       derr << __func__ << " warning: fixing leaked block 0x" << std::hex
-            << bad_size << "~" << fm_bsize << std::dec << " due to old bug"
-            << dendl;
-       KeyValueDB::Transaction t = db->get_transaction();
-       // fix freelistmanager metadata (the internal 'blocks' count is
-       // rounded up to include the trailing key, past eof)
-       uint64_t new_blocks = bdev->get_size() / fm_bsize;
-       if (new_blocks / fm_blocks_per_key * fm_blocks_per_key != new_blocks) {
-         new_blocks = (new_blocks / fm_blocks_per_key + 1) *
-           fm_blocks_per_key;
-       }
-       if (new_blocks != fm_blocks) {
-         // the fm block count increased
-         derr << __func__ << "  freelist block and key count changed, fixing 0x"
-              << std::hex << bdev->get_size() << "~"
-              << ((new_blocks * fm_bsize) - bdev->get_size()) << std::dec
-              << dendl;
-         bufferlist bl;
-         ::encode(new_blocks, bl);
-         t->set(PREFIX_ALLOC, "blocks", bl);
-         fm->allocate(bdev->get_size(),
-                      (new_blocks * fm_bsize) - bdev->get_size(),
-                      t);
+       if (offset == SUPER_RESERVED &&
+           length == min_alloc_size - SUPER_RESERVED) {
+         // this is due to the change just after luminous to min_alloc_size
+         // granularity allocations, and our baked in assumption at the top
+         // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
+         // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)).  harmless,
+         // since we will never allocate this region below min_alloc_size.
+         dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+                  << " and min_alloc_size, 0x" << std::hex << offset << "~"
+                  << length << dendl;
        } else {
-         // block count is the same, but size changed; fix just the size
-         derr << __func__ << "  fixing just the stray block at 0x"
-              << std::hex << bad_size << "~" << fm_bsize << std::dec << dendl;
-         fm->allocate(bad_size, fm_bsize, t);
+         derr << "fsck error: free extent 0x" << std::hex << offset
+              << "~" << length << std::dec
+              << " intersects allocated blocks" << dendl;
+         ++errors;
        }
-       bufferlist sizebl;
-       ::encode(bdev->get_size(), sizebl);
-       t->set(PREFIX_ALLOC, "size", sizebl);
-       int r = db->submit_transaction_sync(t);
-       assert(r == 0);
-
-       used_blocks.set(bad_size / dev_bsize);
-       ++count;
       }
     }
+    fm->enumerate_reset();
+    size_t count = used_blocks.count();
     if (used_blocks.size() != count) {
       assert(used_blocks.size() > count);
       ++errors;
@@ -6076,9 +6128,9 @@ int BlueStore::fsck(bool deep)
        while (true) {
          size_t next = used_blocks.find_next(cur);
          if (next != cur + 1) {
-           derr << __func__ << " error: leaked extent 0x" << std::hex
-                << ((uint64_t)start * block_size) << "~"
-                << ((cur + 1 - start) * block_size) << std::dec
+           derr << "fsck error: leaked extent 0x" << std::hex
+                << ((uint64_t)start * min_alloc_size) << "~"
+                << ((cur + 1 - start) * min_alloc_size) << std::dec
                 << dendl;
            start = next;
            break;
@@ -6121,9 +6173,10 @@ int BlueStore::fsck(bool deep)
          << dendl;
 
   utime_t duration = ceph_clock_now() - start;
-  dout(1) << __func__ << " finish with " << errors << " errors in "
+  dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
+         << " repaired, " << (errors - repaired) << " remaining in "
          << duration << " seconds" << dendl;
-  return errors;
+  return errors - repaired;
 }
 
 void BlueStore::collect_metadata(map<string,string> *pm)
@@ -8772,6 +8825,14 @@ void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
   bdev->aio_submit(&b->ioc);
 }
 
+struct C_DeferredTrySubmit : public Context {
+  BlueStore *store;
+  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
+  void finish(int r) {
+    store->deferred_try_submit();
+  }
+};
+
 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
 {
   dout(10) << __func__ << " osr " << osr << dendl;
@@ -8788,9 +8849,7 @@ void BlueStore::_deferred_aio_finish(OpSequencer *osr)
       deferred_queue.erase(q);
     } else if (deferred_aggressive) {
       dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
-      deferred_finisher.queue(new FunctionContext([&](int) {
-           deferred_try_submit();
-         }));
+      deferred_finisher.queue(new C_DeferredTrySubmit(this));
     } else {
       dout(20) << __func__ << " leaving queued, more pending" << dendl;
     }
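
Replacing the ad-hoc FunctionContext lambda with the named C_DeferredTrySubmit type makes the callback's single dependency an explicit member rather than an opaque capture. A minimal stand-in for the pattern (Context here is a simplified model of Ceph's callback base, not the real class):

    #include <iostream>

    // Simplified model of Ceph's Context base: complete() runs finish()
    // once and then deletes the callback.
    struct Context {
      virtual ~Context() = default;
      void complete(int r) { finish(r); delete this; }
    protected:
      virtual void finish(int r) = 0;
    };

    struct Store {
      void deferred_try_submit() { std::cout << "resubmitting deferred IO\n"; }
    };

    // The callback carries exactly one pointer, held by value, so its
    // contents and lifetime are obvious however long the finisher queue
    // holds it.
    struct C_DeferredTrySubmit : public Context {
      Store* store;
      explicit C_DeferredTrySubmit(Store* s) : store(s) {}
    protected:
      void finish(int) override { store->deferred_try_submit(); }
    };

    int main() {
      Store store;
      Context* c = new C_DeferredTrySubmit(&store);
      c->complete(0);  // normally invoked by the finisher thread
    }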
@@ -8930,6 +8989,11 @@ int BlueStore::queue_transactions(
               << dendl;
       ++deferred_aggressive;
       deferred_try_submit();
+      {
+       // wake up any previously finished deferred events
+       std::lock_guard<std::mutex> l(kv_lock);
+       kv_cond.notify_one();
+      }
       throttle_deferred_bytes.get(txc->cost);
       --deferred_aggressive;
    }
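
The added block takes kv_lock just long enough to signal kv_cond, waking the kv sync thread so that, per the comment, deferred transactions whose IO has already finished get committed and release throttle budget. The lock-then-notify handshake in a self-contained sketch (simplified; not the actual kv sync loop):

    #include <condition_variable>
    #include <iostream>
    #include <mutex>
    #include <thread>

    std::mutex kv_lock;
    std::condition_variable kv_cond;
    bool work_ready = false;

    void kv_sync_thread() {
      std::unique_lock<std::mutex> l(kv_lock);
      kv_cond.wait(l, [] { return work_ready; });  // sleeps until signalled
      std::cout << "kv thread woke up\n";
    }

    int main() {
      std::thread t(kv_sync_thread);
      {
        // Holding the lock while setting the predicate and notifying
        // prevents a lost wakeup between the thread's check and its wait.
        std::lock_guard<std::mutex> l(kv_lock);
        work_ready = true;
        kv_cond.notify_one();
      }
      t.join();
    }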
@@ -9856,20 +9920,10 @@ int BlueStore::_do_alloc_write(
   dout(20) << __func__ << " txc " << txc
           << " " << wctx->writes.size() << " blobs"
           << dendl;
-
-  uint64_t need = 0;
-  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
-  for (auto &wi : wctx->writes) {
-    need += wi.blob_length;
-  }
-  int r = alloc->reserve(need);
-  if (r < 0) {
-    derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
-        << dendl;
-    return r;
+  if (wctx->writes.empty()) {
+    return 0;
   }
 
-  uint64_t hint = 0;
   CompressorRef c;
   double crr = 0;
   if (wctx->compress) {
@@ -9894,7 +9948,7 @@ int BlueStore::_do_alloc_write(
       cct->_conf->bluestore_compression_required_ratio,
       [&]() {
         double val;
-        if(coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
+        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
           return boost::optional<double>(val);
         }
         return boost::optional<double>();
@@ -9909,78 +9963,102 @@ int BlueStore::_do_alloc_write(
     csum,
     [&]() {
       int val;
-      if(coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
+      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
         return  boost::optional<int>(val);
       }
       return boost::optional<int>();
     }
   );
 
+  // compress (as needed) and calc needed space
+  uint64_t need = 0;
+  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
   for (auto& wi : wctx->writes) {
-    BlobRef b = wi.b;
-    bluestore_blob_t& dblob = b->dirty_blob();
-    uint64_t b_off = wi.b_off;
-    bufferlist *l = &wi.bl;
-    uint64_t final_length = wi.blob_length;
-    uint64_t csum_length = wi.blob_length;
-    unsigned csum_order = block_size_order;
-    bufferlist compressed_bl;
-    bool compressed = false;
-    if(c && wi.blob_length > min_alloc_size) {
-
+    if (c && wi.blob_length > min_alloc_size) {
       utime_t start = ceph_clock_now();
 
       // compress
-      assert(b_off == 0);
-      assert(wi.blob_length == l->length());
-      bluestore_compression_header_t chdr;
-      chdr.type = c->get_type();
+      assert(wi.b_off == 0);
+      assert(wi.blob_length == wi.bl.length());
+
       // FIXME: memory alignment here is bad
       bufferlist t;
-
-      r = c->compress(*l, t);
+      int r = c->compress(wi.bl, t);
       assert(r == 0);
 
+      bluestore_compression_header_t chdr;
+      chdr.type = c->get_type();
       chdr.length = t.length();
-      ::encode(chdr, compressed_bl);
-      compressed_bl.claim_append(t);
-      uint64_t rawlen = compressed_bl.length();
-      uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
-      uint64_t want_len_raw = final_length * crr;
+      ::encode(chdr, wi.compressed_bl);
+      wi.compressed_bl.claim_append(t);
+
+      wi.compressed_len = wi.compressed_bl.length();
+      uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
+      uint64_t want_len_raw = wi.blob_length * crr;
       uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
-      if (newlen <= want_len && newlen < final_length) {
-        // Cool. We compressed at least as much as we were hoping to.
-        // pad out to min_alloc_size
-       compressed_bl.append_zero(newlen - rawlen);
-       logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
+      if (newlen <= want_len && newlen < wi.blob_length) {
+       // Cool. We compressed at least as much as we were hoping to.
+       // pad out to min_alloc_size
+       wi.compressed_bl.append_zero(newlen - wi.compressed_len);
+       logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
        dout(20) << __func__ << std::hex << "  compressed 0x" << wi.blob_length
-                << " -> 0x" << rawlen << " => 0x" << newlen
+                << " -> 0x" << wi.compressed_len << " => 0x" << newlen
                 << " with " << c->get_type()
                 << std::dec << dendl;
-       txc->statfs_delta.compressed() += rawlen;
-       txc->statfs_delta.compressed_original() += l->length();
+       txc->statfs_delta.compressed() += wi.compressed_len;
+       txc->statfs_delta.compressed_original() += wi.blob_length;
        txc->statfs_delta.compressed_allocated() += newlen;
-       l = &compressed_bl;
-       final_length = newlen;
-       csum_length = newlen;
-       csum_order = ctz(newlen);
-       dblob.set_compressed(wi.blob_length, rawlen);
-       compressed = true;
-        logger->inc(l_bluestore_compress_success_count);
+       logger->inc(l_bluestore_compress_success_count);
+       wi.compressed = true;
+       need += newlen;
       } else {
-       dout(20) << __func__ << std::hex << "  0x" << l->length()
-                << " compressed to 0x" << rawlen << " -> 0x" << newlen
-                 << " with " << c->get_type()
-                 << ", which is more than required 0x" << want_len_raw
+       dout(20) << __func__ << std::hex << "  0x" << wi.blob_length
+                << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
+                << " with " << c->get_type()
+                << ", which is more than required 0x" << want_len_raw
                 << " -> 0x" << want_len
-                 << ", leaving uncompressed"
-                 << std::dec << dendl;
-        logger->inc(l_bluestore_compress_rejected_count);
+                << ", leaving uncompressed"
+                << std::dec << dendl;
+       logger->inc(l_bluestore_compress_rejected_count);
+       need += wi.blob_length;
       }
       logger->tinc(l_bluestore_compress_lat,
                   ceph_clock_now() - start);
+    } else {
+      need += wi.blob_length;
     }
-    if (!compressed && wi.new_blob) {
+  }
+  int r = alloc->reserve(need);
+  if (r < 0) {
+    derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
+        << dendl;
+    return r;
+  }
+  AllocExtentVector prealloc;
+  prealloc.reserve(2 * wctx->writes.size());
+  int prealloc_left = 0;
+  prealloc_left = alloc->allocate(
+    need, min_alloc_size, need,
+    0, &prealloc);
+  assert(prealloc_left == (int64_t)need);
+  dout(20) << __func__ << " prealloc " << prealloc << dendl;
+  auto prealloc_pos = prealloc.begin();
+
+  for (auto& wi : wctx->writes) {
+    BlobRef b = wi.b;
+    bluestore_blob_t& dblob = b->dirty_blob();
+    uint64_t b_off = wi.b_off;
+    bufferlist *l = &wi.bl;
+    uint64_t final_length = wi.blob_length;
+    uint64_t csum_length = wi.blob_length;
+    unsigned csum_order = block_size_order;
+    if (wi.compressed) {
+      final_length = wi.compressed_bl.length();
+      csum_length = final_length;
+      csum_order = ctz(csum_length);
+      l = &wi.compressed_bl;
+      dblob.set_compressed(wi.blob_length, wi.compressed_len);
+    } else if (wi.new_blob) {
       // initialize newly created blob only
       assert(dblob.is_mutable());
       if (l->length() != wi.blob_length) {
@@ -10015,17 +10093,27 @@ int BlueStore::_do_alloc_write(
     }
 
     AllocExtentVector extents;
-    extents.reserve(4);  // 4 should be (more than) enough for most allocations
-    int64_t got = alloc->allocate(final_length, min_alloc_size, 
-                                 max_alloc_size.load(),
-                                 hint, &extents);
-    assert(got == (int64_t)final_length);
-    need -= got;
-    txc->statfs_delta.allocated() += got;
+    int64_t left = final_length;
+    while (left > 0) {
+      assert(prealloc_left > 0);
+      if (prealloc_pos->length <= left) {
+       prealloc_left -= prealloc_pos->length;
+       left -= prealloc_pos->length;
+       txc->statfs_delta.allocated() += prealloc_pos->length;
+       extents.push_back(*prealloc_pos);
+       ++prealloc_pos;
+      } else {
+       extents.emplace_back(prealloc_pos->offset, left);
+       prealloc_pos->offset += left;
+       prealloc_pos->length -= left;
+       prealloc_left -= left;
+       txc->statfs_delta.allocated() += left;
+       left = 0;
+       break;
+      }
+    }
     for (auto& p : extents) {
-      bluestore_pextent_t e = bluestore_pextent_t(p);
-      txc->allocated.insert(e.offset, e.length);
-      hint = p.end();
+      txc->allocated.insert(p.offset, p.length);
     }
     dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
 
@@ -10079,9 +10167,8 @@ int BlueStore::_do_alloc_write(
       }
     }
   }
-  if (need > 0) {
-    alloc->unreserve(need);
-  }
+  assert(prealloc_pos == prealloc.end());
+  assert(prealloc_left == 0);
   return 0;
 }
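
The rewrite restructures _do_alloc_write into two phases: compress everything first so the exact total is known, reserve and allocate that total in a single alloc->allocate() call, then let each blob carve its final_length out of the shared prealloc vector, splitting an extent when only a prefix is needed. The carving loop in isolation (Extent and the function name are illustrative stand-ins for AllocExtent and the inline prealloc_pos loop):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Extent { uint64_t offset, length; };

    // Carve 'want' bytes for one blob out of the preallocated extent list,
    // consuming whole extents or splitting off a prefix, like the
    // prealloc_pos loop above.
    std::vector<Extent> carve(std::vector<Extent>& prealloc, size_t& pos,
                              int64_t& prealloc_left, uint64_t want) {
      std::vector<Extent> out;
      int64_t left = want;
      while (left > 0) {
        assert(prealloc_left > 0 && pos < prealloc.size());
        Extent& p = prealloc[pos];
        if ((int64_t)p.length <= left) {   // take the whole extent
          out.push_back(p);
          prealloc_left -= p.length;
          left -= p.length;
          ++pos;
        } else {                           // split off a prefix
          out.push_back({p.offset, (uint64_t)left});
          p.offset += left;
          p.length -= left;
          prealloc_left -= left;
          left = 0;
        }
      }
      return out;
    }

    int main() {
      std::vector<Extent> prealloc = {{0x0, 0x30000}, {0x100000, 0x10000}};
      int64_t prealloc_left = 0x40000;
      size_t pos = 0;
      for (uint64_t want : {0x20000ull, 0x20000ull}) {
        for (auto& e : carve(prealloc, pos, prealloc_left, want))
          std::printf("blob extent 0x%llx~0x%llx\n",
                      (unsigned long long)e.offset, (unsigned long long)e.length);
      }
      assert(prealloc_left == 0);  // everything reserved was handed out
    }

One allocator call per transaction instead of one per blob also removes the leftover-unreserve bookkeeping the old code needed at the end of the function.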
 
@@ -10607,10 +10694,14 @@ int BlueStore::_do_remove(
     if (b.is_shared() &&
        sb->loaded &&
        maybe_unshared_blobs.count(sb)) {
-      b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
-        expect[sb].get(off, len);
-       return 0;
-      });
+      if (b.is_compressed()) {
+       expect[sb].get(0, b.get_ondisk_length());
+      } else {
+       b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
+           expect[sb].get(off, len);
+           return 0;
+         });
+      }
     }
   }
 
@@ -10671,10 +10762,14 @@ int BlueStore::_setattr(TransContext *txc,
           << " " << name << " (" << val.length() << " bytes)"
           << dendl;
   int r = 0;
-  if (val.is_partial())
-    o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
-  else
-    o->onode.attrs[name.c_str()] = val;
+  if (val.is_partial()) {
+    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
+                                                      val.length());
+    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+  } else {
+    auto& b = o->onode.attrs[name.c_str()] = val;
+    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+  }
   txc->write_onode(o);
   dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
@@ -10693,11 +10788,14 @@ int BlueStore::_setattrs(TransContext *txc,
   int r = 0;
   for (map<string,bufferptr>::const_iterator p = aset.begin();
        p != aset.end(); ++p) {
-    if (p->second.is_partial())
-      o->onode.attrs[p->first.c_str()] =
+    if (p->second.is_partial()) {
+      auto& b = o->onode.attrs[p->first.c_str()] =
        bufferptr(p->second.c_str(), p->second.length());
-    else
-      o->onode.attrs[p->first.c_str()] = p->second;
+      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+    } else {
+      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
+      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+    }
   }
   txc->write_onode(o);
   dout(10) << __func__ << " " << c->cid << " " << o->oid
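
Both _setattr and _setattrs now follow every attribute store with reassign_to_mempool(), so attribute memory is charged to the bluestore cache pool rather than left in the anonymous buffer pool. A toy model of that accounting move (Ceph's mempools are per-type and sharded; this only shows the transfer):

    #include <cstddef>
    #include <cstdio>

    namespace mempool {
    size_t buffer_anon = 0;            // where fresh buffers land by default
    size_t bluestore_cache_other = 0;  // what the cache is sized against
    }

    // Toy buffer that knows which pool its bytes are charged to.
    struct Buffer {
      size_t len;
      size_t* pool;
      Buffer(size_t n, size_t* p) : len(n), pool(p) { *pool += len; }
      void reassign_to(size_t* p) { *pool -= len; pool = p; *pool += len; }
      ~Buffer() { *pool -= len; }
    };

    int main() {
      Buffer attr(64, &mempool::buffer_anon);
      attr.reassign_to(&mempool::bluestore_cache_other);
      std::printf("anon=%zu cache_other=%zu\n",
                  mempool::buffer_anon, mempool::bluestore_cache_other);  // 0 and 64
    }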
@@ -11551,6 +11649,14 @@ void BlueStore::_flush_cache()
     assert(i->empty());
   }
   for (auto& p : coll_map) {
+    if (!p.second->onode_map.empty()) {
+      derr << __func__ << " stray onodes on " << p.first << dendl;
+      p.second->onode_map.dump(cct, 0);
+    }
+    if (!p.second->shared_blob_set.empty()) {
+      derr << __func__ << " stray shared blobs on " << p.first << dendl;
+      p.second->shared_blob_set.dump(cct, 0);
+    }
     assert(p.second->onode_map.empty());
     assert(p.second->shared_blob_set.empty());
   }
index cf89f243895db7caf556c64670b03049feafc942..57a8688396fed015e1b700863897b61d8d420739 100644 (file)
@@ -442,7 +442,7 @@ public:
       sb->coll = coll;
     }
 
-    bool remove(SharedBlob *sb) {
+    bool try_remove(SharedBlob *sb) {
       std::lock_guard<std::mutex> l(lock);
       if (sb->nref == 0) {
        assert(sb->get_parent() == this);
@@ -452,10 +452,18 @@ public:
       return false;
     }
 
+    void remove(SharedBlob *sb) {
+      std::lock_guard<std::mutex> l(lock);
+      assert(sb->get_parent() == this);
+      sb_map.erase(sb->get_sbid());
+    }
+
     bool empty() {
       std::lock_guard<std::mutex> l(lock);
       return sb_map.empty();
     }
+
+    void dump(CephContext *cct, int lvl);
   };
 
 //#define CACHE_BLOB_BL  // not sure if this is a win yet or not... :/
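
Splitting removal into try_remove() (erase only if the last reference is gone, and report which way it went) and an unconditional remove() gives callers that already know the blob must go a path that cannot race with the refcount. The two paths in a condensed sketch (simplified types; no intrusive refcounting):

    #include <cassert>
    #include <map>
    #include <mutex>

    struct SharedBlob { uint64_t sbid; int nref; };

    struct SharedBlobSet {
      std::mutex lock;
      std::map<uint64_t, SharedBlob*> sb_map;

      // Erase only when the last reference is gone; tell the caller.
      bool try_remove(SharedBlob* sb) {
        std::lock_guard<std::mutex> l(lock);
        if (sb->nref == 0) {
          sb_map.erase(sb->sbid);
          return true;
        }
        return false;
      }

      // Unconditional: the caller has already decided the blob must go.
      void remove(SharedBlob* sb) {
        std::lock_guard<std::mutex> l(lock);
        sb_map.erase(sb->sbid);
      }
    };

    int main() {
      SharedBlobSet set;
      SharedBlob sb{42, 1};
      set.sb_map[sb.sbid] = &sb;
      assert(!set.try_remove(&sb));  // still referenced: left in place
      sb.nref = 0;
      assert(set.try_remove(&sb));   // last reference dropped: erased
    }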
@@ -1318,6 +1326,8 @@ public:
     void clear();
     bool empty();
 
+    void dump(CephContext *cct, int lvl);
+
     /// return true if f true for any item
     bool map_any(std::function<bool(OnodeRef)> f);
   };
@@ -1854,6 +1864,7 @@ private:
   KVSyncThread kv_sync_thread;
   std::mutex kv_lock;
   std::condition_variable kv_cond;
+  bool _kv_only = false;
   bool kv_sync_started = false;
   bool kv_stop = false;
   bool kv_finalize_started = false;
@@ -1978,8 +1989,9 @@ private:
   int _setup_block_symlink_or_file(string name, string path, uint64_t size,
                                   bool create);
 
-  int _write_bdev_label(string path, bluestore_bdev_label_t label);
 public:
+  static int _write_bdev_label(CephContext* cct,
+                              string path, bluestore_bdev_label_t label);
   static int _read_bdev_label(CephContext* cct, string path,
                              bluestore_bdev_label_t *label);
 private:
@@ -2036,7 +2048,9 @@ private:
 
   bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
   void _deferred_queue(TransContext *txc);
+public:
   void deferred_try_submit();
+private:
   void _deferred_submit_unlock(OpSequencer *osr);
   void _deferred_aio_finish(OpSequencer *osr);
   int _deferred_replay();
@@ -2144,7 +2158,17 @@ public:
     return 0;
   }
 
-  int fsck(bool deep) override;
+  int write_meta(const std::string& key, const std::string& value) override;
+  int read_meta(const std::string& key, std::string *value) override;
+
+
+  int fsck(bool deep) override {
+    return _fsck(deep, false);
+  }
+  int repair(bool deep) override {
+    return _fsck(deep, true);
+  }
+  int _fsck(bool deep, bool repair);
 
   void set_cache_shards(unsigned num) override;
 
@@ -2452,6 +2476,10 @@ private:
       bool mark_unused;
       bool new_blob; ///< whether new blob was created
 
+      bool compressed = false;
+      bufferlist compressed_bl;
+      size_t compressed_len = 0;
+
       write_item(
        uint64_t logical_offs,
         BlobRef b,
index 7f5ad4d79f9b7ab95540d60150ce96f3ff0e9cf6..b4418b2c7aae3355c8d83c38de9ec414aefa34a8 100644 (file)
@@ -24,9 +24,10 @@ public:
 
   static void setup_merge_operators(KeyValueDB *db);
 
-  virtual int create(uint64_t size, KeyValueDB::Transaction txn) = 0;
+  virtual int create(uint64_t size, uint64_t min_alloc_size,
+                    KeyValueDB::Transaction txn) = 0;
 
-  virtual int init() = 0;
+  virtual int init(uint64_t dev_size) = 0;
   virtual void shutdown() = 0;
 
   virtual void dump() = 0;
index 3ae5be1ea1e0847b63a882142ac3bba81e0a7e3e..420b59d55f936d83b72a67eeef757e684cc301f2 100644 (file)
@@ -130,6 +130,11 @@ int KernelDevice::open(const string& p)
   } else {
     size = st.st_size;
   }
+  if (cct->_conf->get_val<bool>("bdev_inject_bad_size")) {
+    derr << "injecting bad size; actual 0x" << std::hex << size
+        << " but using 0x" << (size & ~block_size) << std::dec << dendl;
+    size &= ~(block_size);
+  }
 
   {
     char partition[PATH_MAX], devname[PATH_MAX];
index c7c134051b6e3f183170c995de83c6815c50a41b..db55868692e4c68bb4cb8b669fd642715d89fda7 100644 (file)
@@ -14,6 +14,7 @@
 #include "common/ceph_argparse.h"
 #include "include/stringify.h"
 #include "common/errno.h"
+#include "common/safe_io.h"
 
 #include "os/bluestore/BlueFS.h"
 #include "os/bluestore/BlueStore.h"
@@ -65,24 +66,88 @@ void validate_path(CephContext *cct, const string& path, bool bluefs)
   }
 }
 
+BlueFS *open_bluefs(
+  CephContext *cct,
+  const string& path,
+  const vector<string>& devs)
+{
+  validate_path(cct, path, true);
+  BlueFS *fs = new BlueFS(cct);
+
+  string main;
+  set<int> got;
+  for (auto& i : devs) {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct, i, &label);
+    if (r < 0) {
+      cerr << "unable to read label for " << i << ": "
+          << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    int id = -1;
+    if (label.description == "main")
+      main = i;
+    else if (label.description == "bluefs db")
+      id = BlueFS::BDEV_DB;
+    else if (label.description == "bluefs wal")
+      id = BlueFS::BDEV_WAL;
+    if (id >= 0) {
+      got.insert(id);
+      cout << " slot " << id << " " << i << std::endl;
+      int r = fs->add_block_device(id, i);
+      if (r < 0) {
+       cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
+       exit(EXIT_FAILURE);
+      }
+    }
+  }
+  if (main.length()) {
+    int id = BlueFS::BDEV_DB;
+    if (got.count(BlueFS::BDEV_DB))
+      id = BlueFS::BDEV_SLOW;
+    cout << " slot " << id << " " << main << std::endl;
+    int r = fs->add_block_device(id, main);
+    if (r < 0) {
+      cerr << "unable to open " << main << ": " << cpp_strerror(r)
+          << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  int r = fs->mount();
+  if (r < 0) {
+    cerr << "unable to mount bluefs: " << cpp_strerror(r)
+        << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  return fs;
+}
+
 int main(int argc, char **argv)
 {
   string out_dir;
   vector<string> devs;
   string path;
   string action;
+  string log_file;
+  string key, value;
+  int log_level = 30;
   bool fsck_deep = false;
   po::options_description po_options("Options");
   po_options.add_options()
     ("help,h", "produce help message")
     ("path", po::value<string>(&path), "bluestore path")
     ("out-dir", po::value<string>(&out_dir), "output directory")
+    ("log-file,l", po::value<string>(&log_file), "log file")
+    ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
     ("dev", po::value<vector<string>>(&devs), "device(s)")
     ("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
+    ("key,k", po::value<string>(&key), "label metadata key name")
+    ("value,v", po::value<string>(&value), "label metadata value")
     ;
   po::options_description po_positional("Positional options");
   po_positional.add_options()
-    ("command", po::value<string>(&action), "fsck, bluefs-export, show-label")
+    ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, show-label, set-label-key, rm-label-key, prime-osd-dir")
     ;
   po::options_description po_all("All options");
   po_all.add(po_options).add(po_positional);
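
In the open_bluefs() helper above, devices are mapped to BlueFS slots from their on-disk label descriptions: dedicated "bluefs db" and "bluefs wal" devices claim BDEV_DB and BDEV_WAL, and the main device takes BDEV_DB only if no dedicated db device exists, otherwise BDEV_SLOW. That decision isolated into a sketch (enum values and names illustrative):

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    enum { BDEV_WAL = 0, BDEV_DB = 1, BDEV_SLOW = 2 };

    // Map (device path, label description) pairs to BlueFS slots.
    std::map<int, std::string> assign_slots(
        const std::vector<std::pair<std::string, std::string>>& devs) {
      std::map<int, std::string> slots;
      std::string main_dev;
      for (const auto& d : devs) {
        if (d.second == "main") main_dev = d.first;
        else if (d.second == "bluefs db") slots[BDEV_DB] = d.first;
        else if (d.second == "bluefs wal") slots[BDEV_WAL] = d.first;
      }
      if (!main_dev.empty())
        slots[slots.count(BDEV_DB) ? BDEV_SLOW : BDEV_DB] = main_dev;
      return slots;
    }

    int main() {
      auto slots = assign_slots({{"/dev/sdb", "main"},
                                 {"/dev/nvme0n1", "bluefs db"}});
      for (const auto& s : slots)
        std::cout << "slot " << s.first << " " << s.second << "\n";
      // slot 1 /dev/nvme0n1 (db), slot 2 /dev/sdb (slow)
    }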
@@ -112,23 +177,50 @@ int main(int argc, char **argv)
     exit(EXIT_FAILURE);
   }
 
-  if (action == "fsck") {
+  if (action == "fsck" || action == "repair") {
     if (path.empty()) {
       cerr << "must specify bluestore path" << std::endl;
       exit(EXIT_FAILURE);
     }
   }
+  if (action == "prime-osd-dir") {
+    if (devs.size() != 1) {
+      cerr << "must specify the main bluestore device" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (path.empty()) {
+      cerr << "must specify osd dir to prime" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (action == "set-label-key" ||
+      action == "rm-label-key") {
+    if (devs.size() != 1) {
+      cerr << "must specify the main bluestore device" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (key.size() == 0) {
+      cerr << "must specify a key name with -k" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (action == "set-label-key" && value.size() == 0) {
+      cerr << "must specify a value with -v" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
   if (action == "show-label") {
     if (devs.empty() && path.empty()) {
       cerr << "must specify bluestore path *or* raw device(s)" << std::endl;
       exit(EXIT_FAILURE);
     }
-    cout << "infering bluefs devices from bluestore path" << std::endl;
-    for (auto fn : {"block", "block.wal", "block.db"}) {
-      string p = path + "/" + fn;
-      struct stat st;
-      if (::stat(p.c_str(), &st) == 0) {
-       devs.push_back(p);
+    if (devs.empty()) {
+      cout << "infering bluefs devices from bluestore path" << std::endl;
+      for (auto fn : {"block", "block.wal", "block.db"}) {
+       string p = path + "/" + fn;
+       struct stat st;
+       if (::stat(p.c_str(), &st) == 0) {
+         devs.push_back(p);
+       }
       }
     }
   }
@@ -150,8 +242,35 @@ int main(int argc, char **argv)
       }
     }
   }
+  if (action == "bluefs-bdev-sizes" || action == "bluefs-bdev-expand") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    cout << "infering bluefs devices from bluestore path" << std::endl;
+    for (auto fn : {"block", "block.wal", "block.db"}) {
+      string p = path + "/" + fn;
+      struct stat st;
+      if (::stat(p.c_str(), &st) == 0) {
+        devs.push_back(p);
+      }
+    }
+  }
 
   vector<const char*> args;
+  if (log_file.size()) {
+    args.push_back("--log-file");
+    args.push_back(log_file.c_str());
+    static char ll[10];
+    snprintf(ll, sizeof(ll), "%d", log_level);
+    args.push_back("--debug-bluestore");
+    args.push_back(ll);
+    args.push_back("--debug-bluefs");
+    args.push_back(ll);
+  }
+  args.push_back("--no-log-to-stderr");
+  args.push_back("--err-to-stderr");
+
   for (auto& i : ceph_option_strings) {
     args.push_back(i.c_str());
   }
@@ -161,21 +280,106 @@ int main(int argc, char **argv)
                         CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(cct.get());
 
-  cout << "action " << action << std::endl;
-
   if (action == "fsck" ||
-      action == "fsck-deep") {
+      action == "repair") {
     validate_path(cct.get(), path, false);
     BlueStore bluestore(cct.get(), path);
-    int r = bluestore.fsck(fsck_deep);
+    int r;
+    if (action == "fsck") {
+      r = bluestore.fsck(fsck_deep);
+    } else {
+      r = bluestore.repair(fsck_deep);
+    }
     if (r < 0) {
       cerr << "error from fsck: " << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
     }
+    cout << action << " success" << std::endl;
+  }
+  else if (action == "prime-osd-dir") {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+    if (r < 0) {
+      cerr << "failed to read label for " << devs.front() << ": "
+          << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    // kludge some things into the map that we want to populate into
+    // target dir
+    label.meta["path_block"] = devs.front();
+    label.meta["type"] = "bluestore";
+    label.meta["fsid"] = stringify(label.osd_uuid);
+    
+    for (auto kk : {
+       "whoami",
+         "osd_key",
+         "path_block", "path_block.db", "path_block.wal",
+         "ceph_fsid",
+         "fsid",
+         "type",
+         "ready" }) {
+      string k = kk;
+      auto i = label.meta.find(k);
+      if (i == label.meta.end()) {
+       continue;
+      }
+      string p = path + "/" + k;
+      string v = i->second;
+      if (k == "osd_key") {
+       p = path + "/keyring";
+       v = "[osd.";
+       v += label.meta["whoami"];
+       v += "]\nkey = " + i->second;
+      }
+      if (k.find("path_") == 0) {
+       p = path + "/" + k.substr(5);
+       int r = ::symlink(v.c_str(), p.c_str());
+       if (r < 0 && errno == EEXIST) {
+         struct stat st;
+         r = ::stat(p.c_str(), &st);
+         if (r == 0 && S_ISLNK(st.st_mode)) {
+           char target[PATH_MAX];
+           r = ::readlink(p.c_str(), target, sizeof(target));
+           if (r > 0) {
+             if (v == target) {
+               r = 0;  // already matches our target
+             } else {
+               ::unlink(p.c_str());
+               r = ::symlink(v.c_str(), p.c_str());
+             }
+           } else {
+             cerr << "error reading existing link at " << p << ": " << cpp_strerror(errno)
+                  << std::endl;
+           }
+         }
+       }
+       if (r < 0) {
+         cerr << "error symlinking " << p << ": " << cpp_strerror(errno)
+              << std::endl;
+         exit(EXIT_FAILURE);
+       }
+      } else {
+       v += "\n";
+       int fd = ::open(p.c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0600);
+       if (fd < 0) {
+         cerr << "error writing " << p << ": " << cpp_strerror(errno)
+              << std::endl;
+         exit(EXIT_FAILURE);
+       }
+       int r = safe_write(fd, v.c_str(), v.size());
+       if (r < 0) {
+         cerr << "error writing to " << p << ": " << cpp_strerror(errno)
+              << std::endl;
+         exit(EXIT_FAILURE);
+       }
+       ::close(fd);
+      }
+    }
   }
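
The path_* label entries above are materialized as symlinks, and the EEXIST branch makes creation idempotent: an existing link that already points at the right target is accepted, anything else is unlinked and recreated. The same recovery logic as a standalone helper (POSIX calls only; error handling condensed):

    #include <cerrno>
    #include <cstdio>
    #include <cstring>
    #include <limits.h>
    #include <unistd.h>

    // Create 'path' -> 'target'; succeed if an identical link already exists,
    // replace a link that points elsewhere. Returns 0 or -errno.
    int symlink_idempotent(const char* target, const char* path) {
      if (::symlink(target, path) == 0)
        return 0;
      if (errno != EEXIST)
        return -errno;
      char buf[PATH_MAX];
      ssize_t n = ::readlink(path, buf, sizeof(buf) - 1);
      if (n < 0)
        return -errno;
      buf[n] = '\0';                      // readlink does not terminate
      if (std::strcmp(buf, target) == 0)
        return 0;                         // already points at our target
      ::unlink(path);
      return ::symlink(target, path) == 0 ? 0 : -errno;
    }

    int main() {
      int r = symlink_idempotent("/dev/sdb", "/tmp/osd-block-example");
      std::printf("symlink_idempotent -> %d\n", r);
    }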
   else if (action == "show-label") {
     JSONFormatter jf(true);
-    jf.open_array_section("devices");
+    jf.open_object_section("devices");
     for (auto& i : devs) {
       bluestore_bdev_label_t label;
       int r = BlueStore::_read_bdev_label(cct.get(), i, &label);
@@ -191,58 +395,69 @@ int main(int argc, char **argv)
     jf.close_section();
     jf.flush(cout);
   }
-  else if (action == "bluefs-export") {
-    validate_path(cct.get(), path, true);
-    BlueFS fs(&(*cct));
-    string main;
-    set<int> got;
-    for (auto& i : devs) {
-      bluestore_bdev_label_t label;
-      int r = BlueStore::_read_bdev_label(cct.get(), i, &label);
-      if (r < 0) {
-       cerr << "unable to read label for " << i << ": "
-            << cpp_strerror(r) << std::endl;
-       exit(EXIT_FAILURE);
-      }
-      int id = -1;
-      if (label.description == "main")
-       main = i;
-      else if (label.description == "bluefs db")
-       id = BlueFS::BDEV_DB;
-      else if (label.description == "bluefs wal")
-       id = BlueFS::BDEV_WAL;
-      if (id >= 0) {
-       got.insert(id);
-       cout << " slot " << id << " " << i << std::endl;
-       int r = fs.add_block_device(id, i);
-       if (r < 0) {
-         cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
-         exit(EXIT_FAILURE);
-       }
-      }
+  else if (action == "set-label-key") {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+    if (r < 0) {
+      cerr << "unable to read label for " << devs.front() << ": "
+          << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
     }
-    if (main.length()) {
-      int id = BlueFS::BDEV_DB;
-      if (got.count(BlueFS::BDEV_DB))
-       id = BlueFS::BDEV_SLOW;
-      cout << " slot " << id << " " << main << std::endl;
-      int r = fs.add_block_device(id, main);
-      if (r < 0) {
-       cerr << "unable to open " << main << ": " << cpp_strerror(r)
-            << std::endl;
-       exit(EXIT_FAILURE);
-      }
+    label.meta[key] = value;
+    r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+    if (r < 0) {
+      cerr << "unable to write label for " << devs.front() << ": "
+          << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
     }
-
-    int r = fs.mount();
+  }
+  else if (action == "rm-label-key") {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
     if (r < 0) {
-      cerr << "unable to mount bluefs: " << cpp_strerror(r)
-          << std::endl;
+      cerr << "unable to read label for " << devs.front() << ": "
+          << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (!label.meta.count(key)) {
+      cerr << "key '" << key << "' not present" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    label.meta.erase(key);
+    r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+    if (r < 0) {
+      cerr << "unable to write label for " << devs.front() << ": "
+          << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
     }
+  }
+  else if (action == "bluefs-bdev-sizes") {
+    BlueFS *fs = open_bluefs(cct.get(), path, devs);
+    fs->dump_block_extents(cout);
+    delete fs;
+  }
+  else if (action == "bluefs-bdev-expand") {
+    BlueFS *fs = open_bluefs(cct.get(), path, devs);
+    cout << "start:" << std::endl;
+    fs->dump_block_extents(cout);
+    for (int devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
+      interval_set<uint64_t> before;
+      fs->get_block_extents(devid, &before);
+      uint64_t end = before.range_end();
+      uint64_t size = fs->get_block_device_size(devid);
+      if (end < size) {
+       cout << "expanding dev " << devid << " from 0x" << std::hex
+            << end << " to 0x" << size << std::dec << std::endl;
+       fs->add_block_extent(devid, end, size-end);
+      }
+    }
+    delete fs;
+  }
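
bluefs-bdev-expand compares the end of the extents BlueFS already owns on each of the WAL and DB devices with the device's current size and grants the difference, which is how an OSD picks up a grown db/wal partition. Reduced to its essential comparison (values made up; the commented call is the API used above):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t owned_end = 0x40000000;  // range_end() of bluefs's extents
      uint64_t dev_size  = 0x80000000;  // get_block_device_size()
      if (owned_end < dev_size) {
        std::printf("expanding dev from 0x%llx to 0x%llx (adding 0x%llx)\n",
                    (unsigned long long)owned_end,
                    (unsigned long long)dev_size,
                    (unsigned long long)(dev_size - owned_end));
        // fs->add_block_extent(devid, owned_end, dev_size - owned_end);
      }
    }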
+  else if (action == "bluefs-export") {
+    BlueFS *fs = open_bluefs(cct.get(), path, devs);
 
     vector<string> dirs;
-    r = fs.readdir("", &dirs);
+    int r = fs->readdir("", &dirs);
     if (r < 0) {
       cerr << "readdir in root failed: " << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
@@ -252,7 +467,7 @@ int main(int argc, char **argv)
        continue;
       cout << dir << "/" << std::endl;
       vector<string> ls;
-      r = fs.readdir(dir, &ls);
+      r = fs->readdir(dir, &ls);
       if (r < 0) {
        cerr << "readdir " << dir << " failed: " << cpp_strerror(r) << std::endl;
        exit(EXIT_FAILURE);
@@ -270,7 +485,7 @@ int main(int argc, char **argv)
        cout << dir << "/" << file << std::endl;
        uint64_t size;
        utime_t mtime;
-       r = fs.stat(dir, file, &size, &mtime);
+       r = fs->stat(dir, file, &size, &mtime);
        if (r < 0) {
          cerr << "stat " << file << " failed: " << cpp_strerror(r) << std::endl;
          exit(EXIT_FAILURE);
@@ -285,7 +500,7 @@ int main(int argc, char **argv)
        assert(fd >= 0);
        if (size > 0) {
          BlueFS::FileReader *h;
-         r = fs.open_for_read(dir, file, &h, false);
+         r = fs->open_for_read(dir, file, &h, false);
          if (r < 0) {
            cerr << "open_for_read " << dir << "/" << file << " failed: "
                 << cpp_strerror(r) << std::endl;
@@ -295,7 +510,7 @@ int main(int argc, char **argv)
          int left = size;
          while (left) {
            bufferlist bl;
-           r = fs.read(h, &h->buf, pos, left, &bl, NULL);
+           r = fs->read(h, &h->buf, pos, left, &bl, NULL);
            if (r <= 0) {
              cerr << "read " << dir << "/" << file << " from " << pos
                   << " failed: " << cpp_strerror(r) << std::endl;
@@ -315,7 +530,8 @@ int main(int argc, char **argv)
        ::close(fd);
       }
     }
-    fs.umount();
+    fs->umount();
+    delete fs;
   } else {
     cerr << "unrecognized action " << action << std::endl;
     return 1;
index 9fb7ce84b626f00bcc28fc5e9074600eddebf8f2..06f64d21d4b34b318525f464af2fa29ddec1ef48 100644 (file)
@@ -47,22 +47,26 @@ void bluestore_bdev_label_t::encode(bufferlist& bl) const
   bl.append("bluestore block device\n");
   bl.append(stringify(osd_uuid));
   bl.append("\n");
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   ::encode(osd_uuid, bl);
   ::encode(size, bl);
   ::encode(btime, bl);
   ::encode(description, bl);
+  ::encode(meta, bl);
   ENCODE_FINISH(bl);
 }
 
 void bluestore_bdev_label_t::decode(bufferlist::iterator& p)
 {
   p.advance(60); // see above
-  DECODE_START(1, p);
+  DECODE_START(2, p);
   ::decode(osd_uuid, p);
   ::decode(size, p);
   ::decode(btime, p);
   ::decode(description, p);
+  if (struct_v >= 2) {
+    ::decode(meta, p);
+  }
   DECODE_FINISH(p);
 }
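
Bumping ENCODE_START(1, 1) to ENCODE_START(2, 1) while guarding the decode with struct_v >= 2 is the standard Ceph pattern for appending a field compatibly: old decoders still understand the prefix they know (compat stays 1), and new decoders simply skip the field when reading old data. A toy byte-level model of the guard, without the Ceph macros (single-byte fields for brevity):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Buf { std::vector<uint8_t> b; size_t pos = 0; };
    void put_u8(Buf& b, uint8_t v) { b.b.push_back(v); }
    uint8_t get_u8(Buf& b) { return b.b[b.pos++]; }

    struct Label {
      uint8_t size = 0;
      uint8_t meta = 0;  // new in v2

      void encode(Buf& b) const {
        put_u8(b, 2);      // struct_v
        put_u8(b, size);
        put_u8(b, meta);   // appended v2 field
      }
      void decode(Buf& b) {
        uint8_t struct_v = get_u8(b);
        size = get_u8(b);
        if (struct_v >= 2)
          meta = get_u8(b);  // older encodings simply lack this field
      }
    };

    int main() {
      Buf b;
      Label in;
      in.size = 7;
      in.meta = 9;
      in.encode(b);
      Label out;
      out.decode(b);
      assert(out.size == 7 && out.meta == 9);
    }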
 
@@ -72,6 +76,9 @@ void bluestore_bdev_label_t::dump(Formatter *f) const
   f->dump_unsigned("size", size);
   f->dump_stream("btime") << btime;
   f->dump_string("description", description);
+  for (auto& i : meta) {
+    f->dump_string(i.first.c_str(), i.second);
+  }
 }
 
 void bluestore_bdev_label_t::generate_test_instances(
@@ -82,14 +89,17 @@ void bluestore_bdev_label_t::generate_test_instances(
   o.back()->size = 123;
   o.back()->btime = utime_t(4, 5);
   o.back()->description = "fakey";
+  o.back()->meta["foo"] = "bar";
 }
 
 ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l)
 {
   return out << "bdev(osd_uuid " << l.osd_uuid
-            << " size 0x" << std::hex << l.size << std::dec
-            << " btime " << l.btime
-            << " desc " << l.description << ")";
+            << ", size 0x" << std::hex << l.size << std::dec
+            << ", btime " << l.btime
+            << ", desc " << l.description
+            << ", " << l.meta.size() << " meta"
+            << ")";
 }
 
 // cnode_t
index 8e2b77aeb4c48217872b819190e5b76bb2703938..f48f095a0961387b438ae488a595a3ece6f19143 100644 (file)
@@ -36,6 +36,8 @@ struct bluestore_bdev_label_t {
   utime_t btime;       ///< birth time
   string description;  ///< device description
 
+  map<string,string> meta; ///< {read,write}_meta() content from ObjectStore
+
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& p);
   void dump(Formatter *f) const;
@@ -558,6 +560,7 @@ public:
       int len;
       denc_varint(len, p);
       csum_data = p.get_ptr(len);
+      csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
     }
     if (has_unused()) {
       denc(unused, p);
@@ -823,6 +826,7 @@ public:
     csum_chunk_order = order;
     csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
     csum_data.zero();
+    csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
   }
 
   /// calculate csum for the buffer at the given b_off
index b0430c499e5857c381b95aab3c52579cdccb7c85..b36d9fe5509d0ad259b73dd9d99edfe21e76eeb5 100644 (file)
@@ -55,9 +55,9 @@ static void append_escaped(const string &in, string *out)
   }
 }
 
-int DBObjectMap::check(std::ostream &out, bool repair)
+int DBObjectMap::check(std::ostream &out, bool repair, bool force)
 {
-  int errors = 0;
+  int errors = 0, comp_errors = 0;
   bool repaired = false;
   map<uint64_t, uint64_t> parent_to_num_children;
   map<uint64_t, uint64_t> parent_to_actual_num_children;
@@ -71,34 +71,37 @@ int DBObjectMap::check(std::ostream &out, bool repair)
       if (header.seq != 0)
        parent_to_actual_num_children[header.seq] = header.num_children;
 
-      // Check complete table
-      bool complete_error = false;
-      boost::optional<string> prev;
-      KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
-      for (complete_iter->seek_to_first(); complete_iter->valid();
-           complete_iter->next()) {
-         if (prev && prev >= complete_iter->key()) {
-             out << "Bad complete for " << header.oid << std::endl;
-             complete_error = true;
-             break;
-         }
-         prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
-      }
-      if (complete_error) {
-        out << "Complete mapping for " << header.seq << " :" << std::endl;
-        for (complete_iter->seek_to_first(); complete_iter->valid();
-             complete_iter->next()) {
-          out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
-        }
-        if (repair) {
-          repaired = true;
-          KeyValueDB::Transaction t = db->get_transaction();
-          t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
-          db->submit_transaction(t);
-          out << "Cleared complete mapping to repair" << std::endl;
-        } else {
-          errors++;  // Only count when not repaired
-        }
+      if (state.v == 2 || force) {
+       // Check complete table
+       bool complete_error = false;
+       boost::optional<string> prev;
+       KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+       for (complete_iter->seek_to_first(); complete_iter->valid();
+            complete_iter->next()) {
+         if (prev && prev >= complete_iter->key()) {
+            out << "Bad complete for " << header.oid << std::endl;
+            complete_error = true;
+            break;
+         }
+         prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
+       }
+       if (complete_error) {
+         out << "Complete mapping for " << header.seq << " :" << std::endl;
+         for (complete_iter->seek_to_first(); complete_iter->valid();
+              complete_iter->next()) {
+           out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
+         }
+         if (repair) {
+           repaired = true;
+           KeyValueDB::Transaction t = db->get_transaction();
+           t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+           db->submit_transaction(t);
+           out << "Cleared complete mapping to repair" << std::endl;
+         } else {
+           errors++;  // Only count when not repaired
+           comp_errors++;  // Track errors here for version update
+         }
+       }
       }
 
       if (header.parent == 0)
@@ -137,6 +140,17 @@ int DBObjectMap::check(std::ostream &out, bool repair)
     }
     parent_to_actual_num_children.erase(i->first);
   }
+
+  // Only advance the version from 2 to 3 here
+  // Mark as legacy because there are still older structures
+  // we don't update.  The value of legacy is only used
+  // for internal assertions.
+  if (comp_errors == 0 && state.v == 2 && repair) {
+    state.v = 3;
+    state.legacy = true;
+    set_state();
+  }
+
   if (errors == 0 && repaired)
     return -1;
   return errors;
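
Gating the complete-table scan on state.v == 2 || force limits it to stores that can still carry legacy complete entries; the scan itself verifies the entries are strictly ordered. Treating each key/value pair as a range, per the prev-versus-key comparison in the loop above, the invariant looks like this (my simplified reading; the real keys are escaped strings):

    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    // Each complete-table entry encodes a range [key, end); the table is
    // sane only if every entry's end sorts strictly before the next key.
    struct Entry { std::string key, end; };

    bool complete_table_ok(const std::vector<Entry>& table) {
      std::optional<std::string> prev;
      for (const auto& e : table) {
        if (prev && *prev >= e.key)
          return false;  // overlapping or out-of-order: "Bad complete"
        prev = e.end;
      }
      return true;
    }

    int main() {
      std::cout << complete_table_ok({{"a", "b"}, {"c", "d"}}) << "\n";  // 1
      std::cout << complete_table_ok({{"a", "d"}, {"c", "e"}}) << "\n";  // 0
    }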
@@ -645,7 +659,7 @@ int DBObjectMap::rm_keys(const ghobject_t &oid,
     return db->submit_transaction(t);
   }
 
-  assert(state.v < 3);
+  assert(state.legacy);
 
   {
     // We only get here for legacy (v2) stores
@@ -852,7 +866,7 @@ int DBObjectMap::legacy_clone(const ghobject_t &oid,
                       const ghobject_t &target,
                       const SequencerPosition *spos)
 {
-  state.v = 2;
+  state.legacy = true;
 
   if (oid == target)
     return 0;
@@ -1021,15 +1035,22 @@ int DBObjectMap::upgrade_to_v2()
 
   state.v = 2;
 
+  set_state();
+  return 0;
+}
+
+void DBObjectMap::set_state()
+{
   Mutex::Locker l(header_lock);
   KeyValueDB::Transaction t = db->get_transaction();
   write_state(t);
-  db->submit_transaction_sync(t);
+  int ret = db->submit_transaction_sync(t);
+  assert(ret == 0);
   dout(1) << __func__ << " done" << dendl;
-  return 0;
+  return;
 }
 
-int DBObjectMap::init(bool do_upgrade)
+int DBObjectMap::get_state()
 {
   map<string, bufferlist> result;
   set<string> to_get;
@@ -1040,28 +1061,36 @@ int DBObjectMap::init(bool do_upgrade)
   if (!result.empty()) {
     bufferlist::iterator bliter = result.begin()->second.begin();
     state.decode(bliter);
-    if (state.v < 1) {
-      dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
-             << dendl;
-      return -ENOTSUP;
-    }
-    if (state.v < 2) { // Needs upgrade
-      if (!do_upgrade) {
-       dout(1) << "DOBjbectMap requires an upgrade,"
-               << " set filestore_update_to"
-               << dendl;
-       return -ENOTSUP;
-      } else {
-       r = upgrade_to_v2();
-       if (r < 0)
-         return r;
-      }
-    }
   } else {
     // New store
-    // Version 3 means that complete regions never used
-    state.v = 3;
+    state.v = State::CUR_VERSION;
     state.seq = 1;
+    state.legacy = false;
+  }
+  return 0;
+}
+
+int DBObjectMap::init(bool do_upgrade)
+{
+  int ret = get_state();
+  if (ret < 0)
+    return ret;
+  if (state.v < 1) {
+    dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+           << dendl;
+    return -ENOTSUP;
+  }
+  if (state.v < 2) { // Needs upgrade
+    if (!do_upgrade) {
+      dout(1) << "DOBjbectMap requires an upgrade,"
+             << " set filestore_update_to"
+             << dendl;
+      return -ENOTSUP;
+    } else {
+      int r = upgrade_to_v2();
+      if (r < 0)
+       return r;
+    }
   }
   ostringstream ss;
   int errors = check(ss, true);
@@ -1222,7 +1251,7 @@ void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
   dout(20) << "clear_header: clearing seq " << header->seq << dendl;
   t->rmkeys_by_prefix(user_prefix(header));
   t->rmkeys_by_prefix(sys_prefix(header));
-  if (state.v < 3)
+  if (state.legacy)
     t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0
   t->rmkeys_by_prefix(xattr_prefix(header));
   set<string> keys;
index 3f6798d2ee28f46de92bb3203df581ae0267e120..fb1653e489d0e4070a98f2c20a744873b1c1ab77 100644 (file)
@@ -219,13 +219,17 @@ public:
     );
 
   /// Read initial state from backing store
+  int get_state();
+  /// Write current state settings to DB
+  void set_state();
+  /// Read initial state and upgrade or initialize state
   int init(bool upgrade = false);
 
   /// Upgrade store to current version
   int upgrade_to_v2();
 
   /// Consistency check, debug, there must be no parallel writes
-  int check(std::ostream &out, bool repair = false) override;
+  int check(std::ostream &out, bool repair = false, bool force = false) override;
 
   /// Ensure that all previous operations are durable
   int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
@@ -261,30 +265,40 @@ public:
 
   /// persistent state for store @see generate_header
   struct State {
+    static const __u8 CUR_VERSION = 3;
     __u8 v;
     uint64_t seq;
-    State() : v(0), seq(1) {}
-    explicit State(uint64_t seq) : v(0), seq(seq) {}
+    // legacy is false when complete regions never used
+    bool legacy;
+    State() : v(0), seq(1), legacy(false) {}
+    explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
 
     void encode(bufferlist &bl) const {
-      ENCODE_START(2, 1, bl);
+      ENCODE_START(3, 1, bl);
       ::encode(v, bl);
       ::encode(seq, bl);
+      ::encode(legacy, bl);
       ENCODE_FINISH(bl);
     }
 
     void decode(bufferlist::iterator &bl) {
-      DECODE_START(2, bl);
+      DECODE_START(3, bl);
       if (struct_v >= 2)
        ::decode(v, bl);
       else
        v = 0;
       ::decode(seq, bl);
+      if (struct_v >= 3)
+       ::decode(legacy, bl);
+      else
+       legacy = false;
       DECODE_FINISH(bl);
     }
 
     void dump(Formatter *f) const {
+      f->dump_unsigned("v", v);
       f->dump_unsigned("seq", seq);
+      f->dump_bool("legacy", legacy);
     }
 
     static void generate_test_instances(list<State*> &o) {
index 0491559aee98b3c6312cee268c520bd648abcb3a..8dd75435dc6755f6b5506241dc4433cf254c2274 100644 (file)
@@ -1973,6 +1973,8 @@ bool FileJournal::read_entry(
         journaled_seq = seq;
       return true;
     }
+  } else {
+    derr << "do_read_entry(" << pos << "): " << ss.str() << dendl;
   }
 
   if (seq && seq < header.committed_up_to) {
@@ -1988,7 +1990,6 @@ bool FileJournal::read_entry(
     }
   }
 
-  dout(25) << ss.str() << dendl;
   dout(2) << "No further valid entries found, journal is most likely valid"
          << dendl;
   return false;
index b097a3c18a61ba812932f636680b0470ebae1109..caac76ec41408127d8155f84dbdd5be294c7ad95 100644 (file)
@@ -687,8 +687,7 @@ void FileStore::collect_metadata(map<string,string> *pm)
   (*pm)["filestore_f_type"] = ss.str();
 
   if (cct->_conf->filestore_collect_device_partition_information) {
-    rc = get_device_by_uuid(get_fsid(), "PARTUUID", partition_path,
-          dev_node);
+    rc = get_device_by_fd(fsid_fd, partition_path, dev_node, PATH_MAX);
   } else {
     rc = -EINVAL;
   }
index 426da2440089115065c5c2e191580b1137636ee0..926a3be7e03422c9a9bff9e4f3c5914385020110 100644 (file)
@@ -247,9 +247,9 @@ OSDService::OSDService(OSD *osd) :
   recovery_sleep_lock("OSDService::recovery_sleep_lock"),
   recovery_sleep_timer(cct, recovery_sleep_lock, false),
   reserver_finisher(cct),
-  local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
+  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
-  remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
+  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
   pg_temp_lock("OSDService::pg_temp_lock"),
   snap_sleep_lock("OSDService::snap_sleep_lock"),
@@ -258,7 +258,7 @@ OSDService::OSDService(OSD *osd) :
   scrub_sleep_lock("OSDService::scrub_sleep_lock"),
   scrub_sleep_timer(
     osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
-  snap_reserver(&reserver_finisher,
+  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
   recovery_lock("OSDService::recovery_lock"),
   recovery_ops_active(0),
@@ -1789,7 +1789,7 @@ int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
     waiter.wait();
   }
 
-  ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
+  ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
   if (ret) {
     derr << "OSD::mkfs: failed to write fsid file: error "
          << cpp_strerror(ret) << dendl;
@@ -1803,7 +1803,7 @@ free_store:
   return ret;
 }
 
-int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
+int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
 {
   char val[80];
   int r;
@@ -1823,6 +1823,14 @@ int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid,
   if (r < 0)
     return r;
 
+  string key = cct->_conf->get_val<string>("key");
+  lderr(cct) << "key " << key << dendl;
+  if (key.size()) {
+    r = store->write_meta("osd_key", key);
+    if (r < 0)
+      return r;
+  }
+
   r = store->write_meta("ready", "ready");
   if (r < 0)
     return r;
@@ -2956,6 +2964,9 @@ void OSD::create_logger()
   };
 
 
+  // All the basic OSD operation stats are to be considered useful
+  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
   osd_plb.add_u64(
     l_osd_op_wip, "op_wip",
     "Replication operations currently being processed (primary)");
@@ -3043,6 +3054,10 @@ void OSD::create_logger()
     l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
     "Latency of read-modify-write operations (excluding queue time and wait for finished)");
 
+  // Now we move on to some more obscure stats, revert to assuming things
+  // are low priority unless otherwise specified.
+  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+
   osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
     "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
   osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
@@ -3129,8 +3144,12 @@ void OSD::create_logger()
     l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
     "OSDMap buffer cache misses");
 
-  osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
-  osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
+  osd_plb.add_u64(
+    l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
+    PerfCountersBuilder::PRIO_USEFUL);
+  osd_plb.add_u64(
+    l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
+    PerfCountersBuilder::PRIO_USEFUL);
   osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
 
   osd_plb.add_u64_counter(
@@ -3250,11 +3269,14 @@ int OSD::shutdown()
   set_state(STATE_STOPPING);
 
   // Debugging
-  cct->_conf->set_val("debug_osd", "100");
-  cct->_conf->set_val("debug_journal", "100");
-  cct->_conf->set_val("debug_filestore", "100");
-  cct->_conf->set_val("debug_ms", "100");
-  cct->_conf->apply_changes(NULL);
+  if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
+    cct->_conf->set_val("debug_osd", "100");
+    cct->_conf->set_val("debug_journal", "100");
+    cct->_conf->set_val("debug_filestore", "100");
+    cct->_conf->set_val("debug_bluestore", "100");
+    cct->_conf->set_val("debug_ms", "100");
+    cct->_conf->apply_changes(NULL);
+  }
 
   // stop MgrClient earlier as it's more like an internal consumer of OSD
   mgrc.shutdown();
@@ -4063,6 +4085,11 @@ void OSD::build_past_intervals_parallel()
         ++i) {
       PG *pg = i->second;
 
+      // Ignore PGs only partially created (DNE)
+      if (pg->info.dne()) {
+       continue;
+      }
+
       auto rpib = pg->get_required_past_interval_bounds(
        pg->info,
        superblock.oldest_map);
@@ -4249,6 +4276,11 @@ int OSD::handle_pg_peering_evt(
       ceph_abort();
     }
 
+    const bool is_mon_create =
+      evt->get_event().dynamic_type() == PG::NullEvt::static_type();
+    if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
+      return -EAGAIN;
+    }
     // do we need to resurrect a deleting pg?
     spg_t resurrected;
     PGRef old_pg_state;
@@ -4389,6 +4421,88 @@ int OSD::handle_pg_peering_evt(
   }
 }
 
+bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
+{
+  const auto max_pgs_per_osd =
+    (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+     cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+  RWLock::RLocker pg_map_locker{pg_map_lock};
+  if (pg_map.size() < max_pgs_per_osd) {
+    return false;
+  }
+  lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+  if (is_mon_create) {
+    pending_creates_from_mon++;
+  } else {
+    pending_creates_from_osd.emplace(pgid.pgid);
+  }
+  dout(5) << __func__ << " withhold creation of pg " << pgid
+         << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
+  return true;
+}
+
+// To re-trigger peering we have to twiddle the pg mapping a little bit,
+// see PG::should_restart_peering().  OSDMap::pg_to_up_acting_osds() falls
+// back to the up set if pg_temp is empty, so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+  if (acting.size() > 1) {
+    return {acting[0]};
+  } else {
+    vector<int32_t> twiddled(acting.begin(), acting.end());
+    twiddled.push_back(-1);
+    return twiddled;
+  }
+}
+
+void OSD::resume_creating_pg()
+{
+  bool do_sub_pg_creates = false;
+  MOSDPGTemp *pgtemp = nullptr;
+  {
+    const auto max_pgs_per_osd =
+      (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+       cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+    RWLock::RLocker l(pg_map_lock);
+    if (max_pgs_per_osd <= pg_map.size()) {
+      // this could happen if admin decreases this setting before a PG is removed
+      return;
+    }
+    unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
+    lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+    if (pending_creates_from_mon > 0) {
+      do_sub_pg_creates = true;
+      if (pending_creates_from_mon >= spare_pgs) {
+       spare_pgs = pending_creates_from_mon = 0;
+      } else {
+       spare_pgs -= pending_creates_from_mon;
+       pending_creates_from_mon = 0;
+      }
+    }
+    auto pg = pending_creates_from_osd.cbegin();
+    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+      if (!pgtemp) {
+       pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
+      }
+      vector<int> acting;
+      osdmap->pg_to_up_acting_osds(*pg, nullptr, nullptr, &acting, nullptr);
+      pgtemp->pg_temp[*pg] = twiddle(acting);
+      pg = pending_creates_from_osd.erase(pg);
+      spare_pgs--;
+    }
+  }
+  if (do_sub_pg_creates) {
+    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+      dout(4) << __func__ << ": resolicit pg creates from mon since "
+             << last_pg_create_epoch << dendl;
+      monc->renew_subs();
+    }
+  }
+  if (pgtemp) {
+    pgtemp->forced = true;
+    monc->send_mon_message(pgtemp);
+  }
+}
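
Together, maybe_wait_for_max_pg() and resume_creating_pg() implement back-pressure on PG instantiation: past the hard cap (mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio) a creation is parked rather than performed, and the periodic tick resumes parked creations once pg_map shrinks, re-soliciting mon-driven creates and re-triggering OSD-driven ones via a forced pg_temp. A schematic of just the bookkeeping (none of the OSD's locking, maps, or messaging):

    #include <iostream>
    #include <set>

    struct PgThrottle {
      size_t max_pgs;       // hard cap derived from the two config values
      size_t current = 0;   // stand-in for pg_map.size()
      std::set<int> pending;

      // Returns true if the creation was withheld (caller backs off).
      bool maybe_wait(int pgid) {
        if (current < max_pgs)
          return false;
        pending.insert(pgid);
        return true;
      }

      // Called periodically: resume parked creations while room remains.
      size_t resume() {
        size_t resumed = 0;
        while (current < max_pgs && !pending.empty()) {
          pending.erase(pending.begin());
          ++current;
          ++resumed;
        }
        return resumed;
      }
    };

    int main() {
      PgThrottle t{2, 2};                     // cap 2, already at 2 PGs
      std::cout << t.maybe_wait(7) << "\n";   // 1: withheld
      t.current = 1;                          // a PG was removed
      std::cout << t.resume() << "\n";        // 1: parked PG resumed
    }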
 
 void OSD::build_initial_pg_history(
   spg_t pgid,
@@ -5194,6 +5308,7 @@ void OSD::tick_without_osd_lock()
       sched_scrub();
     }
     service.promote_throttle_recalibrate();
+    resume_creating_pg();
     bool need_send_beacon = false;
     const auto now = ceph::coarse_mono_clock::now();
     {
@@ -8171,6 +8286,15 @@ void OSD::consume_map()
   assert(osd_lock.is_locked());
   dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
 
+  /** make sure the cluster is speaking in SORTBITWISE, because we don't
+   *  speak the older sorting version any more. Be careful not to force
+   *  a shutdown if we are merely processing old maps, though.
+   */
+  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
+    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
+    ceph_abort();
+  }
+
   int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
   list<PGRef> to_remove;
 
@@ -8198,6 +8322,16 @@ void OSD::consume_map()
 
       pg->unlock();
     }
+
+    lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+    for (auto pg = pending_creates_from_osd.cbegin();
+        pg != pending_creates_from_osd.cend();) {
+      if (osdmap->get_pg_acting_rank(*pg, whoami) < 0) {
+       pg = pending_creates_from_osd.erase(pg);
+      } else {
+       ++pg;
+      }
+    }
   }
 
   for (list<PGRef>::iterator i = to_remove.begin();
@@ -8252,11 +8386,6 @@ void OSD::activate_map()
 
   dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
 
-  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
-    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
-    ceph_abort();
-  }
-
   if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
     dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
     osdmap_subscribe(osdmap->get_epoch() + 1, false);
@@ -8531,7 +8660,6 @@ void OSD::handle_pg_create(OpRequestRef op)
               << dendl;
       continue;
     }
-
     if (handle_pg_peering_evt(
           pgid,
           history,
@@ -8546,8 +8674,13 @@ void OSD::handle_pg_create(OpRequestRef op)
       service.send_pg_created(pgid.pgid);
     }
   }
-  last_pg_create_epoch = m->epoch;
 
+  {
+    lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+    if (pending_creates_from_mon == 0) {
+      last_pg_create_epoch = m->epoch;
+    }
+  }
   maybe_update_heartbeat_peers();
 }
 
@@ -8667,7 +8800,7 @@ void OSD::do_notifies(
       continue;
     }
     service.share_map_peer(it->first, con.get(), curmap);
-    dout(7) << __func__ << " osd " << it->first
+    dout(7) << __func__ << " osd." << it->first
            << " on " << it->second.size() << " PGs" << dendl;
     MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
                                       it->second);
@@ -8923,6 +9056,8 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op)
        m->query_epoch,
        PG::RemoteBackfillReserved()));
   } else if (m->type == MBackfillReserve::REJECT) {
+    // NOTE: this is replica -> primary "i reject your request"
+    //      and also primary -> replica "cancel my previously-granted request"
     evt = PG::CephPeeringEvtRef(
       new PG::CephPeeringEvt(
        m->query_epoch,
@@ -9229,7 +9364,6 @@ void OSD::_remove_pg(PG *pg)
   pg->put("PGMap"); // since we've taken it out of map
 }
 
-
 // =========================================================
 // RECOVERY
 
@@ -9315,7 +9449,7 @@ void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
       i->lock();
       int pgstate = i->get_state();
       if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
-           ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
+           ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
         i->_change_recovery_force_mode(newstate, false);
       i->unlock();
     }
@@ -9400,18 +9534,18 @@ void OSD::do_recovery(
       pg->discover_all_missing(*rctx.query_map);
       if (rctx.query_map->empty()) {
        string action;
-        if (pg->state_test(PG_STATE_BACKFILL)) {
+        if (pg->state_test(PG_STATE_BACKFILLING)) {
          auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
            queued,
            queued,
-           PG::CancelBackfill()));
+           PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
          pg->queue_peering_event(evt);
          action = "in backfill";
         } else if (pg->state_test(PG_STATE_RECOVERING)) {
          auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
            queued,
            queued,
-           PG::CancelRecovery()));
+           PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
          pg->queue_peering_event(evt);
          action = "in recovery";
        } else {
index 42a152b3c9adc6aca9fc27ef89cab5ee20edde84..e34bd518e7b3b5a8056085549a6831e49fdf92ec 100644 (file)
@@ -1946,6 +1946,10 @@ protected:
   RWLock pg_map_lock; // this lock orders *above* individual PG _locks
   ceph::unordered_map<spg_t, PG*> pg_map; // protected by pg_map lock
 
+  std::mutex pending_creates_lock;
+  std::set<pg_t> pending_creates_from_osd;
+  unsigned pending_creates_from_mon = 0;
+
   map<spg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
   PGRecoveryStats pg_recovery_stats;
 
@@ -1995,7 +1999,9 @@ protected:
     const PastIntervals& pi,
     epoch_t epoch,
     PG::CephPeeringEvtRef evt);
-  
+  bool maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create);
+  void resume_creating_pg();
+
   void load_pgs();
   void build_past_intervals_parallel();
 
@@ -2407,7 +2413,8 @@ private:
   int update_crush_device_class();
   int update_crush_location();
 
-  static int write_meta(ObjectStore *store,
+  static int write_meta(CephContext *cct,
+                       ObjectStore *store,
                        uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);
 
   void handle_pg_scrub(struct MOSDScrub *m, PG* pg);
index 5d7eb423880c6958ca9791268dd85d9c7f299405..3143bbccf0cc9a67dbe89a6756dbb07241c114d5 100644 (file)
@@ -20,6 +20,7 @@
 #include "OSDMap.h"
 #include <algorithm>
 #include "common/config.h"
+#include "common/errno.h"
 #include "common/Formatter.h"
 #include "common/TextTable.h"
 #include "include/ceph_features.h"
@@ -1145,21 +1146,41 @@ int OSDMap::calc_num_osds()
   return num_osd;
 }
 
-void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
+void OSDMap::get_full_pools(CephContext *cct,
+                            set<int64_t> *full,
+                            set<int64_t> *backfillfull,
+                            set<int64_t> *nearfull) const
 {
-  *full = 0;
-  *backfill = 0;
-  *nearfull = 0;
+  assert(full);
+  assert(backfillfull);
+  assert(nearfull);
+  full->clear();
+  backfillfull->clear();
+  nearfull->clear();
+
+  vector<int> full_osds;
+  vector<int> backfillfull_osds;
+  vector<int> nearfull_osds;
   for (int i = 0; i < max_osd; ++i) {
     if (exists(i) && is_up(i) && is_in(i)) {
       if (osd_state[i] & CEPH_OSD_FULL)
-       ++(*full);
+        full_osds.push_back(i);
       else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
-       ++(*backfill);
+       backfillfull_osds.push_back(i);
       else if (osd_state[i] & CEPH_OSD_NEARFULL)
-       ++(*nearfull);
+       nearfull_osds.push_back(i);
     }
   }
+
+  for (auto i: full_osds) {
+    get_pool_ids_by_osd(cct, i, full);
+  }
+  for (auto i: backfillfull_osds) {
+    get_pool_ids_by_osd(cct, i, backfillfull);
+  }
+  for (auto i: nearfull_osds) {
+    get_pool_ids_by_osd(cct, i, nearfull);
+  }
 }
 
 static bool get_osd_utilization(
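A hedged usage sketch of the new interface, assuming an OSDMap osdmap and a CephContext *cct in scope: the three output sets receive the ids of every pool whose CRUSH rule can place data on an OSD in the corresponding fullness state.

    std::set<int64_t> full, backfillfull, nearfull;
    osdmap.get_full_pools(cct, &full, &backfillfull, &nearfull);
    for (auto pool_id : full) {
      // e.g. flag this pool full / raise a POOL_FULL-style warning
    }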
@@ -1430,6 +1451,8 @@ void OSDMap::_calc_up_osd_features()
     if (!is_up(osd))
       continue;
     const osd_xinfo_t &xi = get_xinfo(osd);
+    if (xi.features == 0)
+      continue;  // bogus xinfo, maybe #20751 or similar, skipping
     if (first) {
       cached_up_osd_features = xi.features;
       first = false;
@@ -3263,15 +3286,46 @@ void OSDMap::print_oneline_summary(ostream& out) const
     out << " nearfull";
 }
 
-bool OSDMap::crush_ruleset_in_use(int ruleset) const
+bool OSDMap::crush_rule_in_use(int rule_id) const
 {
   for (const auto &pool : pools) {
-    if (pool.second.crush_rule == ruleset)
+    if (pool.second.crush_rule == rule_id)
       return true;
   }
   return false;
 }
 
+int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
+                                ostream *ss) const
+{
+  for (auto& i : pools) {
+    auto& pool = i.second;
+    int ruleno = pool.get_crush_rule();
+    if (!newcrush->rule_exists(ruleno)) {
+      *ss << "pool " << i.first << " references crush_rule " << ruleno
+         << " but it is not present";
+      return -EINVAL;
+    }
+    if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
+      *ss << "rule " << ruleno << " mask ruleset does not match rule id";
+      return -EINVAL;
+    }
+    if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
+      *ss << "pool " << i.first << " type does not match rule " << ruleno;
+      return -EINVAL;
+    }
+    if (pool.get_size() < (int)newcrush->get_rule_mask_min_size(ruleno) ||
+       pool.get_size() > (int)newcrush->get_rule_mask_max_size(ruleno)) {
+      *ss << "pool " << i.first << " size " << pool.get_size() << " does not"
+         << " fall within rule " << ruleno
+         << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
+         << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
 int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
                                  int nosd, int pg_bits, int pgp_bits,
                                  bool default_pool)
@@ -3817,8 +3871,9 @@ int OSDMap::calc_pg_upmaps(
       tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
       ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
       for (auto p : pmap) {
-       osd_weight[p.first] += p.second;
-       osd_weight_total += p.second;
+       auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
+       osd_weight[p.first] += adjusted_weight;
+       osd_weight_total += adjusted_weight;
       }
     }
     for (auto& i : osd_weight) {
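Worked through, the adjusted weight fixes a real skew: an OSD with CRUSH weight 1.0 that an operator has reweighted to 0.5 previously contributed 1.0 to osd_weight, overstating its capacity; it now contributes get_weightf() * 1.0 = 0.5, so the upmap optimizer targets the PG share CRUSH will actually place there once the reweight is applied.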
@@ -3973,6 +4028,31 @@ int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
   return crush->get_leaves(name, osds);
 }
 
+// get pools whose crush rules might reference the given osd
+void OSDMap::get_pool_ids_by_osd(CephContext *cct,
+                                int osd,
+                                set<int64_t> *pool_ids) const
+{
+  assert(pool_ids);
+  set<int> raw_rules;
+  int r = crush->get_rules_by_osd(osd, &raw_rules);
+  if (r < 0) {
+    lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
+               << dendl;
+    assert(r >= 0);
+  }
+  set<int> rules;
+  for (auto &i: raw_rules) {
+    // exclude any dead rule
+    if (crush_rule_in_use(i)) {
+      rules.insert(i);
+    }
+  }
+  for (auto &r: rules) {
+    get_pool_ids_by_rule(r, pool_ids);
+  }
+}
+
 template <typename F>
 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
 public:
@@ -4528,6 +4608,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
   {
     // warn about flags
     uint64_t warn_flags =
+      CEPH_OSDMAP_NEARFULL |
       CEPH_OSDMAP_FULL |
       CEPH_OSDMAP_PAUSERD |
       CEPH_OSDMAP_PAUSEWR |
@@ -4634,23 +4715,49 @@ void OSDMap::check_health(health_check_map_t *checks) const
   // OSD_UPGRADE_FINISHED
   // none of these (yet) since we don't run until luminous upgrade is done.
 
-  // POOL_FULL
+  // POOL_NEARFULL/BACKFILLFULL/FULL
   {
-    list<string> detail;
+    list<string> full_detail, backfillfull_detail, nearfull_detail;
     for (auto it : get_pools()) {
       const pg_pool_t &pool = it.second;
+      const string& pool_name = get_pool_name(it.first);
       if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
-       const string& pool_name = get_pool_name(it.first);
        stringstream ss;
-       ss << "pool '" << pool_name << "' is full";
-       detail.push_back(ss.str());
+        if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+          // may run out of space too,
+          // but we want EQUOTA taking precedence
+          ss << "pool '" << pool_name << "' is full (no quota)";
+        } else {
+          ss << "pool '" << pool_name << "' is full (no space)";
+        }
+       full_detail.push_back(ss.str());
+      } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+        stringstream ss;
+        ss << "pool '" << pool_name << "' is backfillfull";
+        backfillfull_detail.push_back(ss.str());
+      } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
+        stringstream ss;
+        ss << "pool '" << pool_name << "' is nearfull";
+        nearfull_detail.push_back(ss.str());
       }
     }
-    if (!detail.empty()) {
+    if (!full_detail.empty()) {
       ostringstream ss;
-      ss << detail.size() << " pool(s) full";
+      ss << full_detail.size() << " pool(s) full";
       auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
-      d.detail.swap(detail);
+      d.detail.swap(full_detail);
+    }
+    if (!backfillfull_detail.empty()) {
+      ostringstream ss;
+      ss << backfillfull_detail.size() << " pool(s) backfillfull";
+      auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
+      d.detail.swap(backfillfull_detail);
+    }
+    if (!nearfull_detail.empty()) {
+      ostringstream ss;
+      ss << nearfull_detail.size() << " pool(s) nearfull";
+      auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
+      d.detail.swap(nearfull_detail);
     }
   }
 }
index e676bff0a72c7bc752188e5ba6b1790d814002ac..6ba56511823d1390ad4aa5a4bfcd7b8cfcaabe40 100644 (file)
@@ -644,13 +644,15 @@ public:
   float get_nearfull_ratio() const {
     return nearfull_ratio;
   }
-  void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const;
   void get_full_osd_util(
     const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
     map<int, float> *full,
     map<int, float> *backfill,
     map<int, float> *nearfull) const;
-
+  void get_full_pools(CephContext *cct,
+                      set<int64_t> *full,
+                      set<int64_t> *backfillfull,
+                      set<int64_t> *nearfull) const;
   void get_full_osd_counts(set<int> *full, set<int> *backfill,
                           set<int> *nearfull) const;
 
@@ -1171,6 +1173,17 @@ public:
   mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
     return pools;
   }
+  void get_pool_ids_by_rule(int rule_id, set<int64_t> *pool_ids) const {
+    assert(pool_ids);
+    for (auto &p: pools) {
+      if ((int)p.second.get_crush_rule() == rule_id) {
+        pool_ids->insert(p.first);
+      }
+    }
+  }
+  void get_pool_ids_by_osd(CephContext *cct,
+                           int osd,
+                           set<int64_t> *pool_ids) const;
   const string& get_pool_name(int64_t p) const {
     auto i = pool_name.find(p);
     assert(i != pool_name.end());
@@ -1329,7 +1342,9 @@ public:
     const string& root,
     ostream *ss);
 
-  bool crush_ruleset_in_use(int ruleset) const;
+  bool crush_rule_in_use(int rule_id) const;
+
+  int validate_crush_rules(CrushWrapper *crush, ostream *ss) const;
 
   void clear_temp() {
     pg_temp->clear();
index da4da001e0404e2b8daca753cdafc320359e303b..139a9ac6d17941d835bfecbea14b2e2577d3f072 100644 (file)
@@ -3878,21 +3878,21 @@ void PG::reject_reservation()
     get_osdmap()->get_epoch());
 }
 
-void PG::schedule_backfill_full_retry()
+void PG::schedule_backfill_retry(float delay)
 {
   Mutex::Locker lock(osd->recovery_request_lock);
   osd->recovery_request_timer.add_event_after(
-    cct->_conf->osd_backfill_retry_interval,
+    delay,
     new QueuePeeringEvt<RequestBackfill>(
       this, get_osdmap()->get_epoch(),
       RequestBackfill()));
 }
 
-void PG::schedule_recovery_full_retry()
+void PG::schedule_recovery_retry(float delay)
 {
   Mutex::Locker lock(osd->recovery_request_lock);
   osd->recovery_request_timer.add_event_after(
-    cct->_conf->osd_recovery_retry_interval,
+    delay,
     new QueuePeeringEvt<DoRecovery>(
       this, get_osdmap()->get_epoch(),
       DoRecovery()));
@@ -5530,8 +5530,6 @@ void PG::on_new_interval()
     upacting_features &= osdmap->get_xinfo(*p).features;
   }
 
-  assert(osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE));
-
   _on_new_interval();
 }
 
@@ -6395,18 +6393,19 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
   pg->queue_recovery();
   pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
   pg->state_clear(PG_STATE_BACKFILL_WAIT);
-  pg->state_set(PG_STATE_BACKFILL);
+  pg->state_set(PG_STATE_BACKFILLING);
   pg->publish_stats_to_osd();
 }
 
 boost::statechart::result
-PG::RecoveryState::Backfilling::react(const CancelBackfill &)
+PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
 {
   PG *pg = context< RecoveryMachine >().pg;
+  ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-  // XXX: Add a new pg state so user can see why backfill isn't proceeding
-  // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
-  //pg->state_set(PG_STATE_BACKFILL_STALLED????);
+
+  pg->state_set(PG_STATE_BACKFILL_WAIT);
+  pg->state_clear(PG_STATE_BACKFILLING);
 
   for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
        it != pg->backfill_targets.end();
@@ -6424,9 +6423,13 @@ PG::RecoveryState::Backfilling::react(const CancelBackfill &)
     }
   }
 
-  pg->waiting_on_backfill.clear();
 
-  pg->schedule_backfill_full_retry();
+  if (!pg->waiting_on_backfill.empty()) {
+    pg->waiting_on_backfill.clear();
+    pg->finish_recovery_op(hobject_t::get_max());
+  }
+
+  pg->schedule_backfill_retry(c.delay);
   return transit<NotBackfilling>();
 }
 
@@ -6453,10 +6456,12 @@ PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
     }
   }
 
-  pg->waiting_on_backfill.clear();
-  pg->finish_recovery_op(hobject_t::get_max());
+  if (!pg->waiting_on_backfill.empty()) {
+    pg->waiting_on_backfill.clear();
+    pg->finish_recovery_op(hobject_t::get_max());
+  }
 
-  pg->schedule_backfill_full_retry();
+  pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
   return transit<NotBackfilling>();
 }
 
@@ -6466,7 +6471,7 @@ void PG::RecoveryState::Backfilling::exit()
   PG *pg = context< RecoveryMachine >().pg;
   pg->backfill_reserved = false;
   pg->backfill_reserving = false;
-  pg->state_clear(PG_STATE_BACKFILL);
+  pg->state_clear(PG_STATE_BACKFILLING);
   pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
   utime_t dur = ceph_clock_now() - enter_time;
   pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
@@ -6550,7 +6555,7 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationReje
   pg->state_set(PG_STATE_BACKFILL_TOOFULL);
   pg->publish_stats_to_osd();
 
-  pg->schedule_backfill_full_retry();
+  pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
 
   return transit<NotBackfilling>();
 }
@@ -6568,7 +6573,10 @@ PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_conte
     new QueuePeeringEvt<LocalBackfillReserved>(
       pg, pg->get_osdmap()->get_epoch(),
       LocalBackfillReserved()),
-    pg->get_backfill_priority());
+    pg->get_backfill_priority(),
+    new QueuePeeringEvt<DeferBackfill>(
+      pg, pg->get_osdmap()->get_epoch(),
+      DeferBackfill(0.0)));
   pg->publish_stats_to_osd();
 }
 
@@ -6636,6 +6644,15 @@ PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
   context< RecoveryMachine >().log_enter(state_name);
 }
 
+boost::statechart::result
+PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->reject_reservation();
+  post_event(RemoteReservationRejected());
+  return discard_event();
+}
+
 void PG::RecoveryState::RepNotRecovering::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
@@ -6674,6 +6691,15 @@ PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &
   return transit<RepRecovering>();
 }
 
+boost::statechart::result
+PG::RecoveryState::RepWaitRecoveryReserved::react(
+  const RemoteReservationCanceled &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+  return transit<RepNotRecovering>();
+}
+
 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
@@ -6700,12 +6726,12 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
       (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
     ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
                       << dendl;
-    post_event(RemoteReservationRejected());
+    post_event(RejectRemoteReservation());
   } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
       pg->osd->check_backfill_full(ss)) {
     ldout(pg->cct, 10) << "backfill reservation rejected: "
                       << ss.str() << dendl;
-    post_event(RemoteReservationRejected());
+    post_event(RejectRemoteReservation());
   } else {
     pg->osd->remote_reserver.request_reservation(
       pg->info.pgid,
@@ -6734,15 +6760,13 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &
       (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
     ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
                       << "failure injection" << dendl;
-    pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
-    post_event(RemoteReservationRejected());
+    post_event(RejectRemoteReservation());
     return discard_event();
   } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
             pg->osd->check_backfill_full(ss)) {
     ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
                       << ss.str() << dendl;
-    pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
-    post_event(RemoteReservationRejected());
+    post_event(RejectRemoteReservation());
     return discard_event();
   } else {
     pg->osd->send_message_osd_cluster(
@@ -6757,10 +6781,30 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &
 }
 
 boost::statechart::result
-PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
+PG::RecoveryState::RepWaitBackfillReserved::react(
+  const RejectRemoteReservation &evt)
 {
   PG *pg = context< RecoveryMachine >().pg;
   pg->reject_reservation();
+  post_event(RemoteReservationRejected());
+  return discard_event();
+}
+
+boost::statechart::result
+PG::RecoveryState::RepWaitBackfillReserved::react(
+  const RemoteReservationRejected &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+  return transit<RepNotRecovering>();
+}
+
+boost::statechart::result
+PG::RecoveryState::RepWaitBackfillReserved::react(
+  const RemoteReservationCanceled &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
   return transit<RepNotRecovering>();
 }
 
@@ -6826,7 +6870,10 @@ PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_conte
     new QueuePeeringEvt<LocalRecoveryReserved>(
       pg, pg->get_osdmap()->get_epoch(),
       LocalRecoveryReserved()),
-    pg->get_recovery_priority());
+    pg->get_recovery_priority(),
+    new QueuePeeringEvt<DeferRecovery>(
+      pg, pg->get_osdmap()->get_epoch(),
+      DeferRecovery(0.0)));
   pg->publish_stats_to_osd();
 }
 
@@ -6835,7 +6882,7 @@ PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
 {
   PG *pg = context< RecoveryMachine >().pg;
   pg->state_set(PG_STATE_RECOVERY_TOOFULL);
-  pg->schedule_recovery_full_retry();
+  pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
   return transit<NotRecovering>();
 }
 
@@ -6933,6 +6980,7 @@ PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
   pg->state_clear(PG_STATE_RECOVERING);
   pg->state_clear(PG_STATE_FORCED_RECOVERY);
   release_reservations();
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
   return transit<Recovered>();
 }
 
@@ -6943,17 +6991,20 @@ PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
   pg->state_clear(PG_STATE_RECOVERING);
   pg->state_clear(PG_STATE_FORCED_RECOVERY);
   release_reservations();
-  return transit<WaitRemoteBackfillReserved>();
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  return transit<WaitLocalBackfillReserved>();
 }
 
 boost::statechart::result
-PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
+PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
 {
   PG *pg = context< RecoveryMachine >().pg;
+  ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
   pg->state_clear(PG_STATE_RECOVERING);
+  pg->state_set(PG_STATE_RECOVERY_WAIT);
   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
   release_reservations(true);
-  pg->schedule_recovery_full_retry();
+  pg->schedule_recovery_retry(evt.delay);
   return transit<NotRecovering>();
 }
 
@@ -6974,7 +7025,6 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
   context< RecoveryMachine >().log_enter(state_name);
 
   PG *pg = context< RecoveryMachine >().pg;
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
 
   assert(!pg->needs_recovery());
 
index 80b75dc5eb604c82ad4e2de1682e74f2427b574e..fd5f36aec0662f9590ce99138268e3718dd6079b 100644 (file)
@@ -1410,8 +1410,8 @@ public:
   void handle_scrub_reserve_release(OpRequestRef op);
 
   void reject_reservation();
-  void schedule_backfill_full_retry();
-  void schedule_recovery_full_retry();
+  void schedule_backfill_retry(float retry);
+  void schedule_recovery_retry(float retry);
 
   // -- recovery state --
 
@@ -1563,6 +1563,21 @@ public:
       *out << #T;                                                 \
     }                                                             \
   };
+  struct DeferBackfill : boost::statechart::event<DeferBackfill> {
+    float delay;
+    explicit DeferBackfill(float delay) : delay(delay) {}
+    void print(std::ostream *out) const {
+      *out << "DeferBackfill: delay " << delay;
+    }
+  };
+  struct DeferRecovery : boost::statechart::event<DeferRecovery> {
+    float delay;
+    explicit DeferRecovery(float delay) : delay(delay) {}
+    void print(std::ostream *out) const {
+      *out << "DeferRecovery: delay " << delay;
+    }
+  };
+
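Constructing one of these mirrors the do_recovery() change earlier in this diff: wrap the event in a CephPeeringEvt at the epoch the work was queued (queued and cct below come from that context) and hand it to the PG.

    auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
      queued, queued,
      PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
    pg->queue_peering_event(evt);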
   TrivialEvent(Initialize)
   TrivialEvent(Load)
   TrivialEvent(GotInfo)
@@ -1572,14 +1587,20 @@ public:
   TrivialEvent(Backfilled)
   TrivialEvent(LocalBackfillReserved)
   TrivialEvent(RemoteBackfillReserved)
+  TrivialEvent(RejectRemoteReservation)
   TrivialEvent(RemoteReservationRejected)
-  TrivialEvent(CancelBackfill)
+  TrivialEvent(RemoteReservationCanceled)
   TrivialEvent(RequestBackfill)
   TrivialEvent(RequestRecovery)
   TrivialEvent(RecoveryDone)
   TrivialEvent(BackfillTooFull)
   TrivialEvent(RecoveryTooFull)
-  TrivialEvent(CancelRecovery)
+
+  TrivialEvent(MakePrimary)
+  TrivialEvent(MakeStray)
+  TrivialEvent(NeedActingChange)
+  TrivialEvent(IsIncomplete)
+  TrivialEvent(IsDown)
 
   TrivialEvent(AllReplicasRecovered)
   TrivialEvent(DoRecovery)
@@ -1746,12 +1767,6 @@ public:
       }
     };
 
-    struct MakePrimary : boost::statechart::event< MakePrimary > {
-      MakePrimary() : boost::statechart::event< MakePrimary >() {}
-    };
-    struct MakeStray : boost::statechart::event< MakeStray > {
-      MakeStray() : boost::statechart::event< MakeStray >() {}
-    };
     struct Primary;
     struct Stray;
 
@@ -1767,17 +1782,8 @@ public:
 
     struct Peering;
     struct WaitActingChange;
-    struct NeedActingChange : boost::statechart::event< NeedActingChange > {
-      NeedActingChange() : boost::statechart::event< NeedActingChange >() {}
-    };
     struct Incomplete;
-    struct IsIncomplete : boost::statechart::event< IsIncomplete > {
-      IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
-    };
     struct Down;
-    struct IsDown : boost::statechart::event< IsDown > {
-      IsDown() : boost::statechart::event< IsDown >() {}
-    };
 
     struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
       explicit Primary(my_context ctx);
@@ -1847,7 +1853,9 @@ public:
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< Backfilled >,
-       boost::statechart::custom_reaction< AllReplicasActivated >
+       boost::statechart::custom_reaction< AllReplicasActivated >,
+       boost::statechart::custom_reaction< DeferRecovery >,
+       boost::statechart::custom_reaction< DeferBackfill >
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const ActMap&);
@@ -1859,6 +1867,12 @@ public:
        return discard_event();
       }
       boost::statechart::result react(const AllReplicasActivated&);
+      boost::statechart::result react(const DeferRecovery& evt) {
+       return discard_event();
+      }
+      boost::statechart::result react(const DeferBackfill& evt) {
+       return discard_event();
+      }
     };
 
     struct Clean : boost::statechart::state< Clean, Active >, NamedState {
@@ -1886,12 +1900,12 @@ public:
     struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
       typedef boost::mpl::list<
        boost::statechart::transition< Backfilled, Recovered >,
-       boost::statechart::custom_reaction< CancelBackfill >,
+       boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< RemoteReservationRejected >
        > reactions;
       explicit Backfilling(my_context ctx);
       boost::statechart::result react(const RemoteReservationRejected& evt);
-      boost::statechart::result react(const CancelBackfill& evt);
+      boost::statechart::result react(const DeferBackfill& evt);
       void exit();
     };
 
@@ -1931,10 +1945,10 @@ public:
     struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
       typedef boost::mpl::list<
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
-       boost::statechart::custom_reaction< CancelRecovery >
+       boost::statechart::custom_reaction< DeferRecovery >
        > reactions;
       explicit NotRecovering(my_context ctx);
-      boost::statechart::result react(const CancelRecovery& evt) {
+      boost::statechart::result react(const DeferRecovery& evt) {
        /* no-op */
        return discard_event();
       }
@@ -1952,7 +1966,9 @@ public:
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MLogRec >,
-       boost::statechart::custom_reaction< Activate >
+       boost::statechart::custom_reaction< Activate >,
+       boost::statechart::custom_reaction< DeferRecovery >,
+       boost::statechart::custom_reaction< DeferBackfill >
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const MInfoRec& infoevt);
@@ -1960,12 +1976,20 @@ public:
       boost::statechart::result react(const ActMap&);
       boost::statechart::result react(const MQuery&);
       boost::statechart::result react(const Activate&);
+      boost::statechart::result react(const DeferRecovery& evt) {
+       return discard_event();
+      }
+      boost::statechart::result react(const DeferBackfill& evt) {
+       return discard_event();
+      }
     };
 
     struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
       typedef boost::mpl::list<
        boost::statechart::transition< RecoveryDone, RepNotRecovering >,
+       // for compat with old peers
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
+       boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::custom_reaction< BackfillTooFull >
        > reactions;
       explicit RepRecovering(my_context ctx);
@@ -1976,45 +2000,62 @@ public:
     struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
       typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
-       boost::statechart::custom_reaction< RemoteReservationRejected >
+       boost::statechart::custom_reaction< RejectRemoteReservation >,
+       boost::statechart::custom_reaction< RemoteReservationRejected >,
+       boost::statechart::custom_reaction< RemoteReservationCanceled >
        > reactions;
       explicit RepWaitBackfillReserved(my_context ctx);
       void exit();
       boost::statechart::result react(const RemoteBackfillReserved &evt);
+      boost::statechart::result react(const RejectRemoteReservation &evt);
       boost::statechart::result react(const RemoteReservationRejected &evt);
+      boost::statechart::result react(const RemoteReservationCanceled &evt);
     };
 
     struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
       typedef boost::mpl::list<
-       boost::statechart::custom_reaction< RemoteRecoveryReserved >
+       boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+       // for compat with old peers
+       boost::statechart::custom_reaction< RemoteReservationRejected >,
+       boost::statechart::custom_reaction< RemoteReservationCanceled >
        > reactions;
       explicit RepWaitRecoveryReserved(my_context ctx);
       void exit();
       boost::statechart::result react(const RemoteRecoveryReserved &evt);
+      boost::statechart::result react(const RemoteReservationRejected &evt) {
+       // for compat with old peers
+       post_event(RemoteReservationCanceled());
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteReservationCanceled &evt);
     };
 
     struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
       typedef boost::mpl::list<
        boost::statechart::custom_reaction< RequestBackfillPrio >,
         boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
+       boost::statechart::custom_reaction< RejectRemoteReservation >,
+       boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
+       boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
        > reactions;
       explicit RepNotRecovering(my_context ctx);
       boost::statechart::result react(const RequestBackfillPrio &evt);
+      boost::statechart::result react(const RejectRemoteReservation &evt);
       void exit();
     };
 
     struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
       typedef boost::mpl::list <
        boost::statechart::custom_reaction< AllReplicasRecovered >,
-       boost::statechart::custom_reaction< CancelRecovery >,
+       boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< RequestBackfill >
        > reactions;
       explicit Recovering(my_context ctx);
       void exit();
       void release_reservations(bool cancel = false);
       boost::statechart::result react(const AllReplicasRecovered &evt);
-      boost::statechart::result react(const CancelRecovery& evt);
+      boost::statechart::result react(const DeferRecovery& evt);
       boost::statechart::result react(const RequestBackfill &evt);
     };
 
index 2877c28d67a618da2577b9b744aee684a599c3b9..3dd4eff09893b04f12545410058f70eac0261b7d 100644 (file)
@@ -717,7 +717,7 @@ void PrimaryLogPG::maybe_force_recovery()
   if (!is_degraded() &&
       !state_test(PG_STATE_RECOVERING |
                   PG_STATE_RECOVERY_WAIT |
-                 PG_STATE_BACKFILL |
+                 PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
     return;
@@ -1553,7 +1553,7 @@ void PrimaryLogPG::calc_trim_to()
   if (is_degraded() ||
       state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
-                PG_STATE_BACKFILL |
+                PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
     target = cct->_conf->osd_max_pg_log_entries;
@@ -2041,6 +2041,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
   if (write_ordered && is_degraded_or_backfilling_object(head)) {
     if (can_backoff && g_conf->osd_backoff_on_degraded) {
       add_backoff(session, head, head);
+      maybe_kick_recovery(head);
     } else {
       wait_for_degraded_object(head, op);
     }
@@ -2394,7 +2395,6 @@ void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
   dout(20) << __func__ << " r=" << r << dendl;
   assert(op->may_write());
   const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
-  ObjectContextRef obc;
   mempool::osd_pglog::list<pg_log_entry_t> entries;
   entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
                                   get_next_version(), eversion_t(), 0,
@@ -4715,13 +4715,28 @@ int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
   dout(20) << __func__ << dendl;
   ceph_osd_op& op = osd_op.op;
 
-  if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
+  auto& oi = ctx->new_obs.oi;
+  uint64_t size = oi.size;
+  if ((oi.truncate_seq < op.extent.truncate_seq) &&
+      (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
+    size = op.extent.truncate_size;
+  }
+
+  if (op.extent.offset >= size) {
+    op.extent.length = 0;
+  } else if (op.extent.offset + op.extent.length > size) {
+    op.extent.length = size - op.extent.offset;
+  }
+
+  if (op.extent.length == 0) {
+    dout(20) << __func__ << " zero length extent" << dendl;
+    return finish_extent_cmp(osd_op, bufferlist{});
+  } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
     dout(20) << __func__ << " object DNE" << dendl;
     return finish_extent_cmp(osd_op, {});
   } else if (pool.info.require_rollback()) {
     // If there is a data digest and it is possible we are reading
     // entire object, pass the digest.
-    auto& oi = ctx->new_obs.oi;
     boost::optional<uint32_t> maybe_crc;
     if (oi.is_data_digest() && op.checksum.offset == 0 &&
         op.checksum.length >= oi.size) {
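Worked through: when a newer truncate is in flight, the effective size drops to op.extent.truncate_size; against an effective size of 100 bytes, a compare at offset 80 for 40 bytes is clamped to 20 bytes, and a compare at offset 120 is clamped to length 0 and short-circuits against an empty buffer instead of reading past the object.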
@@ -6865,7 +6880,7 @@ inline int PrimaryLogPG::_delete_oid(
       }
     }
   } else {
-    legacy = false;
+    legacy = true;
   }
   dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
           << " no_whiteout=" << (int)no_whiteout
@@ -11313,7 +11328,7 @@ bool PrimaryLogPG::start_recovery_ops(
   assert(is_primary());
 
   if (!state_test(PG_STATE_RECOVERING) &&
-      !state_test(PG_STATE_BACKFILL)) {
+      !state_test(PG_STATE_BACKFILLING)) {
     /* TODO: I think this case is broken and will make do_recovery()
      * unhappy since we're returning false */
     dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
@@ -11348,7 +11363,7 @@ bool PrimaryLogPG::start_recovery_ops(
 
   bool deferred_backfill = false;
   if (recovering.empty() &&
-      state_test(PG_STATE_BACKFILL) &&
+      state_test(PG_STATE_BACKFILLING) &&
       !backfill_targets.empty() && started < max &&
       missing.num_missing() == 0 &&
       waiting_on_backfill.empty()) {
@@ -11417,6 +11432,9 @@ bool PrimaryLogPG::start_recovery_ops(
   if (state_test(PG_STATE_RECOVERING)) {
     state_clear(PG_STATE_RECOVERING);
     state_clear(PG_STATE_FORCED_RECOVERY);
+    if (get_osdmap()->get_pg_size(info.pgid.pgid) <= acting.size()) {
+      state_clear(PG_STATE_DEGRADED);
+    }
     if (needs_backfill()) {
       dout(10) << "recovery done, queuing backfill" << dendl;
       queue_peering_event(
@@ -11437,7 +11455,7 @@ bool PrimaryLogPG::start_recovery_ops(
             AllReplicasRecovered())));
     }
   } else { // backfilling
-    state_clear(PG_STATE_BACKFILL);
+    state_clear(PG_STATE_BACKFILLING);
     state_clear(PG_STATE_FORCED_BACKFILL);
     state_clear(PG_STATE_FORCED_RECOVERY);
     dout(10) << "recovery done, backfill done" << dendl;
index a4d34d17141d29b52d4d458e3f5ad78afbc579c6..df2a45f5877bbff4625124f7e0a5bbbba3f09e89 100644 (file)
@@ -1551,10 +1551,10 @@ private:
       };
       auto *pg = context< SnapTrimmer >().pg;
       if (pg->cct->_conf->osd_snap_trim_sleep > 0) {
-       wakeup = new OnTimer{pg, pg->get_osdmap()->get_epoch()};
        Mutex::Locker l(pg->osd->snap_sleep_lock);
-       pg->osd->snap_sleep_timer.add_event_after(
-         pg->cct->_conf->osd_snap_trim_sleep, wakeup);
+       wakeup = pg->osd->snap_sleep_timer.add_event_after(
+         pg->cct->_conf->osd_snap_trim_sleep,
+         new OnTimer{pg, pg->get_osdmap()->get_epoch()});
       } else {
        post_event(SnapTrimTimerReady());
       }
index 52048eb8f018b3e2deefc35c1480e2ffb6476259..7cb1df40c63a6f84b70ff586ee73437adc3ad332 100644 (file)
@@ -58,7 +58,6 @@ public:
 
   void check_recovery_sources(const OSDMapRef& osdmap) override;
 
-  /// @see PGBackend::delay_message_until_active
   bool can_handle_while_inactive(OpRequestRef op) override;
 
   /// @see PGBackend::handle_message
index df92bb7712c451e9718877722309fd0b631fbada..7ff9f99b2bfab5929a52b7409279d20cccb5d09d 100644 (file)
@@ -124,9 +124,9 @@ void Notify::register_cb()
   {
     osd->watch_lock.Lock();
     cb = new NotifyTimeoutCB(self.lock());
-    osd->watch_timer.add_event_after(
-      timeout,
-      cb);
+    if (!osd->watch_timer.add_event_after(timeout, cb)) {
+      cb = nullptr;
+    }
     osd->watch_lock.Unlock();
   }
 }
@@ -333,9 +333,9 @@ void Watch::register_cb()
     dout(15) << "registering callback, timeout: " << timeout << dendl;
   }
   cb = new HandleWatchTimeout(self.lock());
-  osd->watch_timer.add_event_after(
-    timeout,
-    cb);
+  if (!osd->watch_timer.add_event_after(timeout, cb)) {
+    cb = nullptr;
+  }
 }
 
 void Watch::unregister_cb()
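Both register_cb() sites now apply the same defensive pattern: add_event_after() is assumed to report failure (for example when the timer is shutting down) and to dispose of the rejected event itself, so clearing cb keeps unregister_cb(), which tests the pointer before cancelling, from touching an event that was never scheduled.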
index 393cd7097d23fdb9483949fd2571e506fc31f629..b22001af6f003d62a0d4cdecf30b6ac55f47b382 100644 (file)
@@ -827,10 +827,9 @@ std::string pg_state_string(int state)
     oss << "peering+";
   if (state & PG_STATE_REPAIR)
     oss << "repair+";
-  if ((state & PG_STATE_BACKFILL_WAIT) &&
-      !(state &PG_STATE_BACKFILL))
+  if (state & PG_STATE_BACKFILL_WAIT)
     oss << "backfill_wait+";
-  if (state & PG_STATE_BACKFILL)
+  if (state & PG_STATE_BACKFILLING)
     oss << "backfilling+";
   if (state & PG_STATE_FORCED_BACKFILL)
     oss << "forced_backfill+";
@@ -854,9 +853,9 @@ std::string pg_state_string(int state)
   return ret;
 }
 
-int pg_string_state(const std::string& state)
+boost::optional<uint64_t> pg_string_state(const std::string& state)
 {
-  int type;
+  boost::optional<uint64_t> type;
   if (state == "active")
     type = PG_STATE_ACTIVE;
   else if (state == "clean")
@@ -887,8 +886,8 @@ int pg_string_state(const std::string& state)
     type = PG_STATE_REMAPPED;
   else if (state == "deep_scrub")
     type = PG_STATE_DEEP_SCRUB;
-  else if (state == "backfill")
-    type = PG_STATE_BACKFILL;
+  else if (state == "backfilling")
+    type = PG_STATE_BACKFILLING;
   else if (state == "forced_backfill")
     type = PG_STATE_FORCED_BACKFILL;
   else if (state == "backfill_toofull")
@@ -910,7 +909,7 @@ int pg_string_state(const std::string& state)
   else if (state == "snaptrim_error")
     type = PG_STATE_SNAPTRIM_ERROR;
   else
-    type = -1;
+    type = boost::none;
   return type;
 }
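A hedged caller-side sketch of the new return type, assuming a string name and a uint64_t mask in scope: an unrecognized state name can no longer be mistaken for a valid bit, where previously -1 could silently flow into a bitmask.

    if (auto state = pg_string_state(name)) {
      mask |= *state;  // known state: set its bit
    } else {
      // unknown state name: report an error instead of masking with -1
    }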
 
index 43d9a98e73753ea8bb14c45ba32f01ea19a24d0b..a820c8f6bfa70e8e63881310e5b038ca3fee868b 100644 (file)
@@ -986,7 +986,7 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
 #define PG_STATE_STALE        (1<<17) // our state for this pg is stale, unknown.
 #define PG_STATE_REMAPPED     (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
 #define PG_STATE_DEEP_SCRUB   (1<<19) // deep scrub: check CRC32 on files
-#define PG_STATE_BACKFILL  (1<<20) // [active] backfilling pg content
+#define PG_STATE_BACKFILLING  (1<<20) // [active] backfilling pg content
 #define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
 #define PG_STATE_RECOVERY_WAIT (1<<22) // waiting for recovery reservations
 #define PG_STATE_UNDERSIZED    (1<<23) // pg acting < pool size
@@ -1001,7 +1001,7 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
 
 std::string pg_state_string(int state);
 std::string pg_vector_string(const vector<int32_t> &a);
-int pg_string_state(const std::string& state);
+boost::optional<uint64_t> pg_string_state(const std::string& state);
 
 
 /*
@@ -1147,6 +1147,9 @@ struct pg_pool_t {
     FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
     FLAG_NOSCRUB = 1<<8, // block periodic scrub
     FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
+    FLAG_FULL_NO_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
+    FLAG_NEARFULL = 1<<11, // pool is nearfull
+    FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
   };
 
   static const char *get_flag_name(int f) {
@@ -1161,6 +1164,9 @@ struct pg_pool_t {
     case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
     case FLAG_NOSCRUB: return "noscrub";
     case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
+    case FLAG_FULL_NO_QUOTA: return "full_no_quota";
+    case FLAG_NEARFULL: return "nearfull";
+    case FLAG_BACKFILLFULL: return "backfillfull";
     default: return "???";
     }
   }
@@ -1199,6 +1205,12 @@ struct pg_pool_t {
       return FLAG_NOSCRUB;
     if (name == "nodeep-scrub")
       return FLAG_NODEEP_SCRUB;
+    if (name == "full_no_quota")
+      return FLAG_FULL_NO_QUOTA;
+    if (name == "nearfull")
+      return FLAG_NEARFULL;
+    if (name == "backfillfull")
+      return FLAG_BACKFILLFULL;
     return 0;
   }
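A hedged round-trip sketch of the new flags through these helpers (treating get_flag_by_name as a static helper like get_flag_name):

    uint64_t f = pg_pool_t::get_flag_by_name("backfillfull");    // FLAG_BACKFILLFULL
    const char *n =
      pg_pool_t::get_flag_name(pg_pool_t::FLAG_FULL_NO_QUOTA);   // "full_no_quota"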
 
index a77d6b318834796c945fbed137d7e262f0f8f5f0..4afd1de9b6fc16e31f8dc7981a2d7093d0b7aeb5 100644 (file)
@@ -12,6 +12,7 @@
 #include "include/assert.h"
 
 #define MAX_FLUSH_UNDER_LOCK 20  ///< max bh's we start writeback on
+#define BUFFER_MEMORY_WEIGHT 12   // approx. memory overhead per BufferHead, as a power of two (1<<n bytes)
 
 using std::chrono::seconds;
                                 /// while holding the lock
@@ -625,7 +626,8 @@ ObjectCacher::ObjectCacher(CephContext *cct_, string name,
     flush_set_callback_arg(flush_callback_arg),
     last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct),
     stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0),
-    stat_missing(0), stat_error(0), stat_dirty_waiting(0), reads_outstanding(0)
+    stat_missing(0), stat_error(0), stat_dirty_waiting(0),
+    stat_nr_dirty_waiters(0), reads_outstanding(0)
 {
   perf_start();
   finisher.start();
@@ -1255,7 +1257,11 @@ void ObjectCacher::trim()
                 << get_stat_clean() << ", objects: max " << max_objects
                 << " current " << ob_lru.lru_get_size() << dendl;
 
-  while (get_stat_clean() > 0 && (uint64_t) get_stat_clean() > max_size) {
+  uint64_t max_clean_bh = max_size >> BUFFER_MEMORY_WEIGHT;
+  uint64_t nr_clean_bh = bh_lru_rest.lru_get_size() - bh_lru_rest.lru_get_num_pinned();
+  while (get_stat_clean() > 0 &&
+        ((uint64_t)get_stat_clean() > max_size ||
+         nr_clean_bh > max_clean_bh)) {
     BufferHead *bh = static_cast<BufferHead*>(bh_lru_rest.lru_expire());
     if (!bh)
       break;
@@ -1267,6 +1273,8 @@ void ObjectCacher::trim()
     bh_remove(ob, bh);
     delete bh;
 
+    --nr_clean_bh;
+
     if (ob->complete) {
       ldout(cct, 10) << "trim clearing complete on " << *ob << dendl;
       ob->complete = false;
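The arithmetic behind the new bound: BUFFER_MEMORY_WEIGHT of 12 charges each BufferHead roughly 1<<12 = 4096 bytes of overhead, so a 64 MiB cache (max_size = 67108864) allows max_clean_bh = 67108864 >> 12 = 16384 unpinned clean BufferHeads, and trim() now evicts when either the clean-byte total or the clean-BufferHead count exceeds its limit.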
@@ -1782,9 +1790,14 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len,
   //  - do not wait for bytes other waiters are waiting on.  this means that
   //    threads do not wait for each other.  this effectively allows the cache
   //    size to balloon proportional to the data that is in flight.
+
+  uint64_t max_dirty_bh = max_dirty >> BUFFER_MEMORY_WEIGHT;
   while (get_stat_dirty() + get_stat_tx() > 0 &&
-        (uint64_t) (get_stat_dirty() + get_stat_tx()) >=
-        max_dirty + get_stat_dirty_waiting()) {
+        (((uint64_t)(get_stat_dirty() + get_stat_tx()) >=
+         max_dirty + get_stat_dirty_waiting()) ||
+        (dirty_or_tx_bh.size() >=
+         max_dirty_bh + get_stat_nr_dirty_waiters()))) {
+
     if (blocked == 0) {
       trace->event("start wait for writeback");
     }
@@ -1794,8 +1807,10 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len,
                   << get_stat_dirty_waiting() << dendl;
     flusher_cond.Signal();
     stat_dirty_waiting += len;
+    ++stat_nr_dirty_waiters;
     stat_cond.Wait(lock);
     stat_dirty_waiting -= len;
+    --stat_nr_dirty_waiters;
     ++blocked;
     ldout(cct, 10) << __func__ << " woke up" << dendl;
   }
index 31201a7235422df27ed375b6c26776721e43d310..58b3e7aafeea373f6412216c410f2eaeefbefdd8 100644 (file)
@@ -461,6 +461,8 @@ class ObjectCacher {
   loff_t stat_error;
   loff_t stat_dirty_waiting;   // bytes that writers are waiting on to write
 
+  size_t stat_nr_dirty_waiters;
+
   void verify_stats() const;
 
   void bh_stat_add(BufferHead *bh);
@@ -468,9 +470,10 @@ class ObjectCacher {
   loff_t get_stat_tx() const { return stat_tx; }
   loff_t get_stat_rx() const { return stat_rx; }
   loff_t get_stat_dirty() const { return stat_dirty; }
-  loff_t get_stat_dirty_waiting() const { return stat_dirty_waiting; }
   loff_t get_stat_clean() const { return stat_clean; }
   loff_t get_stat_zero() const { return stat_zero; }
+  loff_t get_stat_dirty_waiting() const { return stat_dirty_waiting; }
+  size_t get_stat_nr_dirty_waiters() const { return stat_nr_dirty_waiters; }
 
   void touch_bh(BufferHead *bh) {
     if (bh->is_dirty())
index 89722ec95e737470427dedf1546fc58e0c65b832..ac01807b69362a3b4c927e1d021b165ecf6b529b 100644 (file)
@@ -31,7 +31,6 @@ class RadosError(Exception):
 
 
 RADOS_TIMEOUT = 10
-SNAP_DIR = ".snap"
 
 log = logging.getLogger(__name__)
 
@@ -204,6 +203,7 @@ CEPHFSVOLUMECLIENT_VERSION_HISTORY = """
     CephFSVolumeClient Version History:
 
     * 1 - Initial version
+    * 2 - Added get_object, put_object, delete_object methods to CephFSVolumeClient
 
 """
 
@@ -228,9 +228,7 @@ class CephFSVolumeClient(object):
     """
 
     # Current version
-    version = 1
-    # Earliest compatible version
-    compat_version = 1
+    version = 2
 
     # Where shall we create our volumes?
     POOL_PREFIX = "fsvolume_"
@@ -529,7 +527,7 @@ class CephFSVolumeClient(object):
         # We can't query the actual cluster config remotely, but since this is
         # just a heuristic we'll assume that the ceph.conf we have locally reflects
         # that in use in the rest of the cluster.
-        pg_warn_max_per_osd = int(self.rados.conf_get('mon_pg_warn_max_per_osd'))
+        pg_warn_max_per_osd = int(self.rados.conf_get('mon_max_pg_per_osd'))
 
         other_pgs = 0
         for pool in osd_map['pools']:
@@ -851,7 +849,7 @@ class CephFSVolumeClient(object):
         that encoded the metadata.
         """
         data['compat_version'] = 1
-        data['version'] = 1
+        data['version'] = self.version
         return self._metadata_set(self._auth_metadata_path(auth_id), data)
 
     def _volume_metadata_path(self, volume_path):
@@ -904,7 +902,7 @@ class CephFSVolumeClient(object):
         that encoded the metadata.
         """
         data['compat_version'] = 1
-        data['version'] = 1
+        data['version'] = self.version
         return self._metadata_set(self._volume_metadata_path(volume_path), data)
 
     def authorize(self, volume_path, auth_id, readonly=False, tenant_id=None):
@@ -1067,6 +1065,9 @@ class CephFSVolumeClient(object):
                 # occurrence of wanted auth caps and no occurrence of
                 # conflicting auth caps.
 
+                if not orig:
+                    return want
+
                 cap_tokens = set(orig.split(","))
 
                 cap_tokens.discard(unwanted)
@@ -1296,7 +1297,7 @@ class CephFSVolumeClient(object):
 
     def _snapshot_path(self, dir_path, snapshot_name):
         return os.path.join(
-            dir_path, SNAP_DIR, snapshot_name
+            dir_path, self.rados.conf_get('client_snapdir'), snapshot_name
         )
 
     def _snapshot_create(self, dir_path, snapshot_name):
@@ -1341,3 +1342,58 @@ class CephFSVolumeClient(object):
         src_snapshot_path = self._snapshot_path(self._get_path(src_volume_path), src_snapshot_name)
 
         self._cp_r(src_snapshot_path, dest_fs_path)
+
+    def put_object(self, pool_name, object_name, data):
+        """
+        Synchronously write data to an object.
+
+        :param pool_name: name of the pool
+        :type pool_name: str
+        :param object_name: name of the object
+        :type object_name: str
+        :param data: data to write
+        :type data: bytes
+        """
+        ioctx = self.rados.open_ioctx(pool_name)
+        max_size = int(self.rados.conf_get('osd_max_write_size')) * 1024 * 1024
+        if len(data) > max_size:
+            msg = ("Data to be written to object '{0}' exceeds "
+                   "{1} bytes".format(object_name, max_size))
+            log.error(msg)
+            raise CephFSVolumeClientError(msg)
+        try:
+            ioctx.write_full(object_name, data)
+        finally:
+            ioctx.close()
+
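For scale: osd_max_write_size is configured in MB, so with a default of 90 (an assumption about the deployed configuration) the guard computes max_size = 90 * 1024 * 1024 = 94371840 bytes, and larger payloads are rejected before any write is attempted.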
+    def get_object(self, pool_name, object_name):
+        """
+        Synchronously read data from object.
+
+        :param pool_name: name of the pool
+        :type pool_name: str
+        :param object_name: name of the object
+        :type object_name: str
+
+        :returns: bytes - data read from object
+        """
+        ioctx = self.rados.open_ioctx(pool_name)
+        max_size = int(self.rados.conf_get('osd_max_write_size')) * 1024 * 1024
+        try:
+            bytes_read = ioctx.read(object_name, max_size)
+            if ((len(bytes_read) == max_size) and
+                    (ioctx.read(object_name, 1, offset=max_size))):
+                log.warning("Size of object {0} exceeds '{1}' bytes "
+                            "read".format(object_name, max_size))
+        finally:
+            ioctx.close()
+        return bytes_read
+
+    def delete_object(self, pool_name, object_name):
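+        """
+        Synchronously delete an object from a pool.
+
+        :param pool_name: name of the pool
+        :type pool_name: str
+        :param object_name: name of the object
+        :type object_name: str
+        """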
+        ioctx = self.rados.open_ioctx(pool_name)
+        try:
+            ioctx.remove_object(object_name)
+        except rados.ObjectNotFound:
+            log.warn("Object '{0}' was already removed".format(object_name))
+        finally:
+            ioctx.close()
diff --git a/ceph/src/pybind/mgr/balancer/__init__.py b/ceph/src/pybind/mgr/balancer/__init__.py
new file mode 100644 (file)
index 0000000..79f5b86
--- /dev/null
@@ -0,0 +1,2 @@
+
+from module import *  # NOQA
diff --git a/ceph/src/pybind/mgr/balancer/module.py b/ceph/src/pybind/mgr/balancer/module.py
new file mode 100644 (file)
index 0000000..b8cc087
--- /dev/null
@@ -0,0 +1,933 @@
+
+"""
+Balance PG distribution across OSDs.
+"""
+
+import copy
+import errno
+import json
+import math
+import random
+import time
+from mgr_module import MgrModule, CommandResult
+from threading import Event
+
+# available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight'
+default_mode = 'none'
+default_sleep_interval = 60   # seconds
+default_max_misplaced = .05    # max ratio of pgs misplaced at a time
+
+TIME_FORMAT = '%Y-%m-%d_%H:%M:%S'
+
+
+class MappingState:
+    def __init__(self, osdmap, pg_dump, desc=''):
+        self.desc = desc
+        self.osdmap = osdmap
+        self.osdmap_dump = self.osdmap.dump()
+        self.crush = osdmap.get_crush()
+        self.crush_dump = self.crush.dump()
+        self.pg_dump = pg_dump
+        self.pg_stat = {
+            i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', [])
+        }
+        self.poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
+        self.pg_up = {}
+        self.pg_up_by_poolid = {}
+        for poolid in self.poolids:
+            self.pg_up_by_poolid[poolid] = osdmap.map_pool_pgs_up(poolid)
+            for a,b in self.pg_up_by_poolid[poolid].iteritems():
+                self.pg_up[a] = b
+
+    def calc_misplaced_from(self, other_ms):
+        num = len(other_ms.pg_up)
+        misplaced = 0
+        for pgid, before in other_ms.pg_up.iteritems():
+            if before != self.pg_up.get(pgid, []):
+                misplaced += 1
+        if num > 0:
+            return float(misplaced) / float(num)
+        return 0.0
+
+class Plan:
+    def __init__(self, name, ms):
+        self.mode = 'unknown'
+        self.name = name
+        self.initial = ms
+
+        self.osd_weights = {}
+        self.compat_ws = {}
+        self.inc = ms.osdmap.new_incremental()
+
+    def final_state(self):
+        self.inc.set_osd_reweights(self.osd_weights)
+        self.inc.set_crush_compat_weight_set_weights(self.compat_ws)
+        return MappingState(self.initial.osdmap.apply_incremental(self.inc),
+                            self.initial.pg_dump,
+                            'plan %s final' % self.name)
+
+    def dump(self):
+        return json.dumps(self.inc.dump(), indent=4)
+
+    def show(self):
+        ls = []
+        ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch())
+        ls.append('# starting crush version %d' %
+                  self.initial.osdmap.get_crush_version())
+        ls.append('# mode %s' % self.mode)
+        if len(self.compat_ws) and \
+           '-1' not in self.initial.crush_dump.get('choose_args', {}):
+            ls.append('ceph osd crush weight-set create-compat')
+        for osd, weight in self.compat_ws.iteritems():
+            ls.append('ceph osd crush weight-set reweight-compat %s %f' %
+                      (osd, weight))
+        for osd, weight in self.osd_weights.iteritems():
+            ls.append('ceph osd reweight osd.%d %f' % (osd, weight))
+        incdump = self.inc.dump()
+        for pgid in incdump.get('old_pg_upmap_items', []):
+            ls.append('ceph osd rm-pg-upmap-items %s' % pgid)
+        for item in incdump.get('new_pg_upmap_items', []):
+            osdlist = []
+            for m in item['mappings']:
+                osdlist += [m['from'], m['to']]
+            ls.append('ceph osd pg-upmap-items %s %s' %
+                      (item['pgid'], ' '.join([str(a) for a in osdlist])))
+        return '\n'.join(ls)
+
+
+class Eval:
+    root_ids = {}        # root name -> id
+    pool_name = {}       # pool id -> pool name
+    pool_id = {}         # pool name -> id
+    pool_roots = {}      # pool name -> root name
+    root_pools = {}      # root name -> pools
+    target_by_root = {}  # root name -> target weight map
+    count_by_pool = {}
+    count_by_root = {}
+    actual_by_pool = {}  # pool -> by_* -> actual weight map
+    actual_by_root = {}  # pool -> by_* -> actual weight map
+    total_by_pool = {}   # pool -> by_* -> total
+    total_by_root = {}   # root -> by_* -> total
+    stats_by_pool = {}   # pool -> by_* -> stddev or avg -> value
+    stats_by_root = {}   # root -> by_* -> stddev or avg -> value
+
+    score_by_pool = {}
+    score_by_root = {}
+
+    score = 0.0
+
+    def __init__(self, ms):
+        self.ms = ms
+
+    def show(self, verbose=False):
+        if verbose:
+            r = self.ms.desc + '\n'
+            r += 'target_by_root %s\n' % self.target_by_root
+            r += 'actual_by_pool %s\n' % self.actual_by_pool
+            r += 'actual_by_root %s\n' % self.actual_by_root
+            r += 'count_by_pool %s\n' % self.count_by_pool
+            r += 'count_by_root %s\n' % self.count_by_root
+            r += 'total_by_pool %s\n' % self.total_by_pool
+            r += 'total_by_root %s\n' % self.total_by_root
+            r += 'stats_by_root %s\n' % self.stats_by_root
+            r += 'score_by_pool %s\n' % self.score_by_pool
+            r += 'score_by_root %s\n' % self.score_by_root
+        else:
+            r = self.ms.desc + ' '
+        r += 'score %f (lower is better)\n' % self.score
+        return r
+
+    def calc_stats(self, count, target, total):
+        num = max(len(target), 1)
+        r = {}
+        for t in ('pgs', 'objects', 'bytes'):
+            avg = float(total[t]) / float(num)
+            dev = 0.0
+
+            # score is a measure of how uneven the data distribution is;
+            # it lies in [0, 1), where 0 means a perfect distribution.
+            score = 0.0
+            sum_weight = 0.0
+
+            for k, v in count[t].iteritems():
+                # adjust/normalize by weight
+                if target[k]:
+                    adjusted = float(v) / target[k] / float(num)
+                else:
+                    adjusted = 0.0
+
+                # Overweighted devices and their weights are factors in the
+                # reweight_urgency calculation: one 10% underfilled device with
+                # five 2% overfilled devices is arguably a better situation
+                # than one 10% overfilled device with five 2% underfilled ones.
+                if adjusted > avg:
+                    '''
+                    F(x) = 2*phi(x) - 1, where phi(x) is the cdf of the
+                    standard normal distribution and x = (adjusted - avg)/avg.
+                    Since only over-weighted devices are considered, x >= 0,
+                    so phi(x) lies in [0.5, 1); scaling by 2 and shifting by
+                    -1 brings the range of F(x) into [0, 1).
+
+                    In general we want a function F(x), with x = (adjusted - avg)/avg, that:
+                    1. is bounded between 0 and 1, so that reweight_urgency is also bounded;
+                    2. grows with x, so a larger deviation implies more urgency to reweight;
+                    3. flattens for large x, so differences between F(x) values there are small;
+                    4. approaches 1 (highest urgency to reweight) steeply.
+
+                    F(x) = 1 - e^(-x) would also satisfy these properties, but it
+                    converges to 1 more slowly than the erf-based form used here.
+
+                    cdf of standard normal distribution: https://stackoverflow.com/a/29273201
+                    '''
+                    score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0)))
+                    sum_weight += target[k]
+                dev += (avg - adjusted) * (avg - adjusted)
+            stddev = math.sqrt(dev / float(max(num - 1, 1)))
+            score = score / max(sum_weight, 1)
+            r[t] = {
+                'avg': avg,
+                'stddev': stddev,
+                'sum_weight': sum_weight,
+                'score': score,
+            }
+        return r
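
As a hedged illustration of calc_stats' erf-based score, here is the same arithmetic on a made-up two-OSD root where one OSD holds 60 of 100 PG replicas (targets are normalized CRUSH weights):

    import math

    target = {0: 0.5, 1: 0.5}   # hypothetical normalized weights
    count = {0: 60, 1: 40}      # hypothetical PG counts
    total = 100

    num = max(len(target), 1)
    avg = float(total) / num                      # 50.0
    score, sum_weight = 0.0, 0.0
    for osd, v in count.items():
        adjusted = float(v) / target[osd] / num   # 60.0 and 40.0
        if adjusted > avg:                        # only overfull OSDs add urgency
            x = (adjusted - avg) / avg
            score += target[osd] * math.erf(x / math.sqrt(2.0))
            sum_weight += target[osd]
    print(score / max(sum_weight, 1))             # ~0.079; 0.0 would be perfect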
+
+class Module(MgrModule):
+    COMMANDS = [
+        {
+            "cmd": "balancer status",
+            "desc": "Show balancer status",
+            "perm": "r",
+        },
+        {
+            "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap",
+            "desc": "Set balancer mode",
+            "perm": "rw",
+        },
+        {
+            "cmd": "balancer on",
+            "desc": "Enable automatic balancing",
+            "perm": "rw",
+        },
+        {
+            "cmd": "balancer off",
+            "desc": "Disable automatic balancing",
+            "perm": "rw",
+        },
+        {
+            "cmd": "balancer eval name=plan,type=CephString,req=false",
+            "desc": "Evaluate data distribution for the current cluster or specific plan",
+            "perm": "r",
+        },
+        {
+            "cmd": "balancer eval-verbose name=plan,type=CephString,req=false",
+            "desc": "Evaluate data distribution for the current cluster or specific plan (verbosely)",
+            "perm": "r",
+        },
+        {
+            "cmd": "balancer optimize name=plan,type=CephString",
+            "desc": "Run optimizer to create a new plan",
+            "perm": "rw",
+        },
+        {
+            "cmd": "balancer show name=plan,type=CephString",
+            "desc": "Show details of an optimization plan",
+            "perm": "r",
+        },
+        {
+            "cmd": "balancer rm name=plan,type=CephString",
+            "desc": "Discard an optimization plan",
+            "perm": "rw",
+        },
+        {
+            "cmd": "balancer reset",
+            "desc": "Discard all optimization plans",
+            "perm": "rw",
+        },
+        {
+            "cmd": "balancer dump name=plan,type=CephString",
+            "desc": "Show an optimization plan",
+            "perm": "r",
+        },
+        {
+            "cmd": "balancer execute name=plan,type=CephString",
+            "desc": "Execute an optimization plan",
+            "perm": "r",
+        },
+    ]
+    active = False
+    run = True
+    plans = {}
+    mode = ''
+
+    def __init__(self, *args, **kwargs):
+        super(Module, self).__init__(*args, **kwargs)
+        self.event = Event()
+
+    def handle_command(self, command):
+        self.log.warn("Handling command: '%s'" % str(command))
+        if command['prefix'] == 'balancer status':
+            s = {
+                'plans': self.plans.keys(),
+                'active': self.active,
+                'mode': self.get_config('mode', default_mode),
+            }
+            return (0, json.dumps(s, indent=4), '')
+        elif command['prefix'] == 'balancer mode':
+            self.set_config('mode', command['mode'])
+            return (0, '', '')
+        elif command['prefix'] == 'balancer on':
+            if not self.active:
+                self.set_config('active', '1')
+                self.active = True
+            self.event.set()
+            return (0, '', '')
+        elif command['prefix'] == 'balancer off':
+            if self.active:
+                self.set_config('active', '')
+                self.active = False
+            self.event.set()
+            return (0, '', '')
+        elif command['prefix'] == 'balancer eval' or command['prefix'] == 'balancer eval-verbose':
+            verbose = command['prefix'] == 'balancer eval-verbose'
+            if 'plan' in command:
+                plan = self.plans.get(command['plan'])
+                if not plan:
+                    return (-errno.ENOENT, '', 'plan %s not found' %
+                            command['plan'])
+                ms = plan.final_state()
+            else:
+                ms = MappingState(self.get_osdmap(),
+                                  self.get("pg_dump"),
+                                  'current cluster')
+            return (0, self.evaluate(ms, verbose=verbose), '')
+        elif command['prefix'] == 'balancer optimize':
+            plan = self.plan_create(command['plan'])
+            self.optimize(plan)
+            return (0, '', '')
+        elif command['prefix'] == 'balancer rm':
+            self.plan_rm(command['plan'])
+            return (0, '', '')
+        elif command['prefix'] == 'balancer reset':
+            self.plans = {}
+            return (0, '', '')
+        elif command['prefix'] == 'balancer dump':
+            plan = self.plans.get(command['plan'])
+            if not plan:
+                return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
+            return (0, plan.dump(), '')
+        elif command['prefix'] == 'balancer show':
+            plan = self.plans.get(command['plan'])
+            if not plan:
+                return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
+            return (0, plan.show(), '')
+        elif command['prefix'] == 'balancer execute':
+            plan = self.plans.get(command['plan'])
+            if not plan:
+                return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
+            self.execute(plan)
+            self.plan_rm(plan.name)
+            return (0, '', '')
+        else:
+            return (-errno.EINVAL, '',
+                    "Command not found '{0}'".format(command['prefix']))
+
+    def shutdown(self):
+        self.log.info('Stopping')
+        self.run = False
+        self.event.set()
+
+    def time_in_interval(self, tod, begin, end):
+        if begin <= end:
+            return tod >= begin and tod < end
+        else:
+            return tod >= begin or tod < end
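
The wrap-around branch lets the window span midnight; a standalone sketch of the same rule (times are HHMM strings, as in serve below, so lexicographic comparison works):

    def time_in_interval(tod, begin, end):
        # the window [begin, end) may wrap past midnight
        if begin <= end:
            return begin <= tod < end
        return tod >= begin or tod < end

    print(time_in_interval('0130', '2300', '0600'))  # True: inside the overnight window
    print(time_in_interval('1200', '2300', '0600'))  # False: outside it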
+
+    def serve(self):
+        self.log.info('Starting')
+        while self.run:
+            self.active = self.get_config('active', '') != ''
+            begin_time = self.get_config('begin_time') or '0000'
+            end_time = self.get_config('end_time') or '2400'
+            timeofday = time.strftime('%H%M', time.localtime())
+            self.log.debug('Waking up [%s, scheduled for %s-%s, now %s]',
+                           "active" if self.active else "inactive",
+                           begin_time, end_time, timeofday)
+            sleep_interval = float(self.get_config('sleep_interval',
+                                                   default_sleep_interval))
+            if self.active and self.time_in_interval(timeofday, begin_time, end_time):
+                self.log.debug('Running')
+                name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime())
+                plan = self.plan_create(name)
+                if self.optimize(plan):
+                    self.execute(plan)
+                self.plan_rm(name)
+            self.log.debug('Sleeping for %d', sleep_interval)
+            self.event.wait(sleep_interval)
+            self.event.clear()
+
+    def plan_create(self, name):
+        plan = Plan(name, MappingState(self.get_osdmap(),
+                                       self.get("pg_dump"),
+                                       'plan %s initial' % name))
+        self.plans[name] = plan
+        return plan
+
+    def plan_rm(self, name):
+        if name in self.plans:
+            del self.plans[name]
+
+    def calc_eval(self, ms):
+        pe = Eval(ms)
+        pool_rule = {}
+        pool_info = {}
+        for p in ms.osdmap_dump.get('pools',[]):
+            pe.pool_name[p['pool']] = p['pool_name']
+            pe.pool_id[p['pool_name']] = p['pool']
+            pool_rule[p['pool_name']] = p['crush_rule']
+            pe.pool_roots[p['pool_name']] = []
+            pool_info[p['pool_name']] = p
+        pools = pe.pool_id.keys()
+        if len(pools) == 0:
+            return pe
+        self.log.debug('pool_name %s' % pe.pool_name)
+        self.log.debug('pool_id %s' % pe.pool_id)
+        self.log.debug('pools %s' % pools)
+        self.log.debug('pool_rule %s' % pool_rule)
+
+        osd_weight = { a['osd']: a['weight']
+                       for a in ms.osdmap_dump.get('osds',[]) }
+
+        # get expected distributions by root
+        actual_by_root = {}
+        rootids = ms.crush.find_takes()
+        roots = []
+        for rootid in rootids:
+            root = ms.crush.get_item_name(rootid)
+            pe.root_ids[root] = rootid
+            roots.append(root)
+            ls = ms.osdmap.get_pools_by_take(rootid)
+            pe.root_pools[root] = []
+            for poolid in ls:
+                pe.pool_roots[pe.pool_name[poolid]].append(root)
+                pe.root_pools[root].append(pe.pool_name[poolid])
+            weight_map = ms.crush.get_take_weight_osd_map(rootid)
+            adjusted_map = {
+                osd: cw * osd_weight.get(osd, 1.0)
+                for osd,cw in weight_map.iteritems()
+            }
+            sum_w = sum(adjusted_map.values()) or 1.0
+            pe.target_by_root[root] = { osd: w / sum_w
+                                        for osd,w in adjusted_map.iteritems() }
+            actual_by_root[root] = {
+                'pgs': {},
+                'objects': {},
+                'bytes': {},
+            }
+            for osd in pe.target_by_root[root].iterkeys():
+                actual_by_root[root]['pgs'][osd] = 0
+                actual_by_root[root]['objects'][osd] = 0
+                actual_by_root[root]['bytes'][osd] = 0
+            pe.total_by_root[root] = {
+                'pgs': 0,
+                'objects': 0,
+                'bytes': 0,
+            }
+        self.log.debug('pool_roots %s' % pe.pool_roots)
+        self.log.debug('root_pools %s' % pe.root_pools)
+        self.log.debug('target_by_root %s' % pe.target_by_root)
+
+        # pool and root actual
+        for pool, pi in pool_info.iteritems():
+            poolid = pi['pool']
+            pm = ms.pg_up_by_poolid[poolid]
+            pgs = 0
+            objects = 0
+            bytes = 0
+            pgs_by_osd = {}
+            objects_by_osd = {}
+            bytes_by_osd = {}
+            for root in pe.pool_roots[pool]:
+                for osd in pe.target_by_root[root].iterkeys():
+                    pgs_by_osd[osd] = 0
+                    objects_by_osd[osd] = 0
+                    bytes_by_osd[osd] = 0
+            for pgid, up in pm.iteritems():
+                for osd in [int(osd) for osd in up]:
+                    pgs_by_osd[osd] += 1
+                    objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects']
+                    bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes']
+                    # pick a root to associate this pg instance with.
+                    # note that this is imprecise if the roots have
+                    # overlapping children.
+                    # FIXME: divide bytes by k for EC pools.
+                    for root in pe.pool_roots[pool]:
+                        if osd in pe.target_by_root[root]:
+                            actual_by_root[root]['pgs'][osd] += 1
+                            actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects']
+                            actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes']
+                            pgs += 1
+                            objects += ms.pg_stat[pgid]['num_objects']
+                            bytes += ms.pg_stat[pgid]['num_bytes']
+                            pe.total_by_root[root]['pgs'] += 1
+                            pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects']
+                            pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes']
+                            break
+            pe.count_by_pool[pool] = {
+                'pgs': {
+                    k: v
+                    for k, v in pgs_by_osd.iteritems()
+                },
+                'objects': {
+                    k: v
+                    for k, v in objects_by_osd.iteritems()
+                },
+                'bytes': {
+                    k: v
+                    for k, v in bytes_by_osd.iteritems()
+                },
+            }
+            pe.actual_by_pool[pool] = {
+                'pgs': {
+                    k: float(v) / float(max(pgs, 1))
+                    for k, v in pgs_by_osd.iteritems()
+                },
+                'objects': {
+                    k: float(v) / float(max(objects, 1))
+                    for k, v in objects_by_osd.iteritems()
+                },
+                'bytes': {
+                    k: float(v) / float(max(bytes, 1))
+                    for k, v in bytes_by_osd.iteritems()
+                },
+            }
+            pe.total_by_pool[pool] = {
+                'pgs': pgs,
+                'objects': objects,
+                'bytes': bytes,
+            }
+        for root, m in pe.total_by_root.iteritems():
+            pe.count_by_root[root] = {
+                'pgs': {
+                    k: float(v)
+                    for k, v in actual_by_root[root]['pgs'].iteritems()
+                },
+                'objects': {
+                    k: float(v)
+                    for k, v in actual_by_root[root]['objects'].iteritems()
+                },
+                'bytes': {
+                    k: float(v)
+                    for k, v in actual_by_root[root]['bytes'].iteritems()
+                },
+            }
+            pe.actual_by_root[root] = {
+                'pgs': {
+                    k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1))
+                    for k, v in actual_by_root[root]['pgs'].iteritems()
+                },
+                'objects': {
+                    k: float(v) / float(max(pe.total_by_root[root]['objects'], 1))
+                    for k, v in actual_by_root[root]['objects'].iteritems()
+                },
+                'bytes': {
+                    k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1))
+                    for k, v in actual_by_root[root]['bytes'].iteritems()
+                },
+            }
+        self.log.debug('actual_by_pool %s' % pe.actual_by_pool)
+        self.log.debug('actual_by_root %s' % pe.actual_by_root)
+
+        # average and stddev and score
+        pe.stats_by_root = {
+            a: pe.calc_stats(
+                b,
+                pe.target_by_root[a],
+                pe.total_by_root[a]
+            ) for a, b in pe.count_by_root.iteritems()
+        }
+
+        # the scores are already normalized
+        pe.score_by_root = {
+            r: {
+                'pgs': pe.stats_by_root[r]['pgs']['score'],
+                'objects': pe.stats_by_root[r]['objects']['score'],
+                'bytes': pe.stats_by_root[r]['bytes']['score'],
+            } for r in pe.total_by_root.keys()
+        }
+
+        # total score is just the average of the normalized per-root scores
+        pe.score = 0.0
+        for r, vs in pe.score_by_root.iteritems():
+            for k, v in vs.iteritems():
+                pe.score += v
+        pe.score /= 3 * len(roots)
+        return pe
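
The final score is simply the mean over every root and the three metrics; for example, with a single root and hypothetical per-metric scores:

    score_by_root = {'default': {'pgs': 0.079, 'objects': 0.081, 'bytes': 0.102}}
    score = sum(v for vs in score_by_root.values() for v in vs.values())
    score /= 3 * len(score_by_root)   # 3 metrics per root
    print(score)                      # ~0.0873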
+
+    def evaluate(self, ms, verbose=False):
+        pe = self.calc_eval(ms)
+        return pe.show(verbose=verbose)
+
+    def optimize(self, plan):
+        self.log.info('Optimize plan %s' % plan.name)
+        plan.mode = self.get_config('mode', default_mode)
+        max_misplaced = float(self.get_config('max_misplaced',
+                                              default_max_misplaced))
+        self.log.info('Mode %s, max misplaced %f' %
+                      (plan.mode, max_misplaced))
+
+        info = self.get('pg_status')
+        unknown = info.get('unknown_pgs_ratio', 0.0)
+        degraded = info.get('degraded_ratio', 0.0)
+        inactive = info.get('inactive_pgs_ratio', 0.0)
+        misplaced = info.get('misplaced_ratio', 0.0)
+        self.log.debug('unknown %f degraded %f inactive %f misplaced %g',
+                       unknown, degraded, inactive, misplaced)
+        if unknown > 0.0:
+            self.log.info('Some PGs (%f) are unknown; waiting', unknown)
+        elif degraded > 0.0:
+            self.log.info('Some objects (%f) are degraded; waiting', degraded)
+        elif inactive > 0.0:
+            self.log.info('Some PGs (%f) are inactive; waiting', inactive)
+        elif misplaced >= max_misplaced:
+            self.log.info('Too many objects (%f > %f) are misplaced; waiting',
+                          misplaced, max_misplaced)
+        else:
+            if plan.mode == 'upmap':
+                return self.do_upmap(plan)
+            elif plan.mode == 'crush-compat':
+                return self.do_crush_compat(plan)
+            elif plan.mode == 'none':
+                self.log.info('Idle')
+            else:
+                self.log.info('Unrecognized mode %s' % plan.mode)
+        return False
+
+    def do_upmap(self, plan):
+        self.log.info('do_upmap')
+        # config-key values come back as strings, so cast before use
+        max_iterations = int(self.get_config('upmap_max_iterations', 10))
+        max_deviation = float(self.get_config('upmap_max_deviation', .01))
+
+        ms = plan.initial
+        pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])]
+        if len(pools) == 0:
+            self.log.info('no pools, nothing to do')
+            return False
+        # shuffle pool list so they all get equal (in)attention
+        random.shuffle(pools)
+        self.log.info('pools %s' % pools)
+
+        inc = plan.inc
+        total_did = 0
+        left = max_iterations
+        for pool in pools:
+            did = ms.osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool])
+            total_did += did
+            left -= did
+            if left <= 0:
+                break
+        self.log.info('prepared %d/%d changes' % (total_did, max_iterations))
+        return True
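
The iteration budget is shared across pools: each pool consumes part of it, and the loop stops once it is exhausted. A toy run of the same bookkeeping, with calc_pg_upmaps stood in by a fake:

    import random

    def fake_calc_pg_upmaps(budget):
        # stand-in for osdmap.calc_pg_upmaps(); returns how many upmaps it prepared
        return min(budget, 4)

    pools, left, total_did = ['a', 'b', 'c'], 10, 0
    random.shuffle(pools)             # equal (in)attention, as in do_upmap
    for pool in pools:
        did = fake_calc_pg_upmaps(left)
        total_did += did
        left -= did
        if left <= 0:
            break
    print('prepared %d/%d changes' % (total_did, 10))   # prepared 10/10 changes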
+
+    def do_crush_compat(self, plan):
+        self.log.info('do_crush_compat')
+        max_iterations = int(self.get_config('crush_compat_max_iterations', 25))
+        if max_iterations < 1:
+            return False
+        step = float(self.get_config('crush_compat_step', .5))
+        if step <= 0 or step >= 1.0:
+            return False
+        max_misplaced = float(self.get_config('max_misplaced',
+                                              default_max_misplaced))
+        min_pg_per_osd = 2
+
+        ms = plan.initial
+        osdmap = ms.osdmap
+        crush = osdmap.get_crush()
+        pe = self.calc_eval(ms)
+        if pe.score == 0:
+            self.log.info('Distribution is already perfect')
+            return False
+
+        # get current osd reweights
+        orig_osd_weight = { a['osd']: a['weight']
+                            for a in ms.osdmap_dump.get('osds',[]) }
+        reweighted_osds = [ a for a,b in orig_osd_weight.iteritems()
+                            if b < 1.0 and b > 0.0 ]
+
+        # get current compat weight-set weights; bail out if we could not
+        # fetch them (get_compat_weight_set_weights returns None on error)
+        orig_ws = self.get_compat_weight_set_weights()
+        if orig_ws is None:
+            return False
+        orig_ws = { a: b for a, b in orig_ws.iteritems() if a >= 0 }
+
+        # Make sure roots don't overlap their devices.  If so, we
+        # can't proceed.
+        roots = pe.target_by_root.keys()
+        self.log.debug('roots %s', roots)
+        visited = {}
+        overlap = {}
+        root_ids = {}
+        for root, wm in pe.target_by_root.iteritems():
+            for osd in wm.iterkeys():
+                if osd in visited:
+                    overlap[osd] = 1
+                visited[osd] = 1
+        if len(overlap) > 0:
+            self.log.error('error: some osds belong to multiple subtrees: %s' %
+                           overlap)
+            return False
+
+        key = 'pgs'  # pgs objects or bytes
+
+        # go
+        best_ws = copy.deepcopy(orig_ws)
+        best_ow = copy.deepcopy(orig_osd_weight)
+        best_pe = pe
+        left = max_iterations
+        bad_steps = 0
+        next_ws = copy.deepcopy(best_ws)
+        next_ow = copy.deepcopy(best_ow)
+        while left > 0:
+            # adjust
+            self.log.debug('best_ws %s' % best_ws)
+            random.shuffle(roots)
+            for root in roots:
+                pools = best_pe.root_pools[root]
+                osds = len(best_pe.target_by_root[root])
+                min_pgs = osds * min_pg_per_osd
+                if best_pe.total_by_root[root]['pgs'] < min_pgs:
+                    self.log.info('Skipping root %s (pools %s), total pgs %d '
+                                  '< minimum %d (%d per osd)',
+                                  root, pools,
+                                  best_pe.total_by_root[root]['pgs'],
+                                  min_pgs, min_pg_per_osd)
+                    continue
+                self.log.info('Balancing root %s (pools %s) by %s' %
+                              (root, pools, key))
+                target = best_pe.target_by_root[root]
+                actual = best_pe.actual_by_root[root][key]
+                queue = sorted(actual.keys(),
+                               key=lambda osd: -abs(target[osd] - actual[osd]))
+                for osd in queue:
+                    if orig_osd_weight[osd] == 0:
+                        self.log.debug('skipping out osd.%d', osd)
+                    else:
+                        deviation = target[osd] - actual[osd]
+                        if deviation == 0:
+                            break
+                        self.log.debug('osd.%d deviation %f', osd, deviation)
+                        weight = best_ws[osd]
+                        ow = orig_osd_weight[osd]
+                        if actual[osd] > 0:
+                            calc_weight = target[osd] / actual[osd] * weight * ow
+                        else:
+                            # not enough to go on here... keep orig weight
+                            calc_weight = weight / orig_osd_weight[osd]
+                        new_weight = weight * (1.0 - step) + calc_weight * step
+                        self.log.debug('Reweight osd.%d %f -> %f', osd, weight,
+                                       new_weight)
+                        next_ws[osd] = new_weight
+                        if ow < 1.0:
+                            new_ow = min(1.0, max(step + (1.0 - step) * ow,
+                                                  ow + .005))
+                            self.log.debug('Reweight osd.%d reweight %f -> %f',
+                                           osd, ow, new_ow)
+                            next_ow[osd] = new_ow
+
+                # normalize weights under this root
+                root_weight = crush.get_item_weight(pe.root_ids[root])
+                root_sum = sum(b for a,b in next_ws.iteritems()
+                               if a in target.keys())
+                if root_sum > 0 and root_weight > 0:
+                    factor = root_sum / root_weight
+                    self.log.debug('normalizing root %s %d, weight %f, '
+                                   'ws sum %f, factor %f',
+                                   root, pe.root_ids[root], root_weight,
+                                   root_sum, factor)
+                    for osd in actual.keys():
+                        next_ws[osd] = next_ws[osd] / factor
+
+            # recalc
+            plan.compat_ws = copy.deepcopy(next_ws)
+            next_ms = plan.final_state()
+            next_pe = self.calc_eval(next_ms)
+            next_misplaced = next_ms.calc_misplaced_from(ms)
+            self.log.debug('Step result score %f -> %f, misplacing %f',
+                           best_pe.score, next_pe.score, next_misplaced)
+
+            if next_misplaced > max_misplaced:
+                if best_pe.score < pe.score:
+                    self.log.debug('Step misplaced %f > max %f, stopping',
+                                   next_misplaced, max_misplaced)
+                    break
+                step /= 2.0
+                next_ws = copy.deepcopy(best_ws)
+                next_ow = copy.deepcopy(best_ow)
+                self.log.debug('Step misplaced %f > max %f, reducing step to %f',
+                               next_misplaced, max_misplaced, step)
+            else:
+                if next_pe.score > best_pe.score * 1.0001:
+                    if bad_steps < 5 and random.randint(0, 100) < 70:
+                        self.log.debug('Score got worse, taking another step')
+                    else:
+                        step /= 2.0
+                        next_ws = copy.deepcopy(best_ws)
+                        next_ow = copy.deepcopy(best_ow)
+                        self.log.debug('Score got worse, trying smaller step %f',
+                                       step)
+                else:
+                    bad_steps = 0
+                    best_pe = next_pe
+                    best_ws = next_ws
+                    best_ow = next_ow
+                    if best_pe.score == 0:
+                        break
+            left -= 1
+
+        # allow a small regression if we are phasing out osd weights
+        fudge = 0
+        if next_ow != orig_osd_weight:
+            fudge = .001
+
+        if best_pe.score < pe.score + fudge:
+            self.log.info('Success, score %f -> %f', pe.score, best_pe.score)
+            plan.compat_ws = best_ws
+            for osd, w in best_ow.iteritems():
+                if w != orig_osd_weight[osd]:
+                    self.log.debug('osd.%d reweight %f', osd, w)
+                    plan.osd_weights[osd] = w
+            return True
+        else:
+            self.log.info('Failed to find further optimization, score %f',
+                          pe.score)
+            return False
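
One step of the damped adjustment above, on invented numbers (target share 0.25, actual share 0.30, compat weight 1.0, reweight 1.0, step 0.5): the weight moves halfway toward the value that would exactly hit the target.

    target, actual = 0.25, 0.30    # desired vs. observed share for one OSD
    weight, ow, step = 1.0, 1.0, 0.5

    calc_weight = target / actual * weight * ow          # ~0.8333
    new_weight = weight * (1.0 - step) + calc_weight * step
    print(new_weight)                                    # ~0.9167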
+
+    def get_compat_weight_set_weights(self):
+        # enable compat weight-set
+        self.log.debug('ceph osd crush weight-set create-compat')
+        result = CommandResult('')
+        self.send_command(result, 'mon', '', json.dumps({
+            'prefix': 'osd crush weight-set create-compat',
+            'format': 'json',
+        }), '')
+        r, outb, outs = result.wait()
+        if r != 0:
+            self.log.error('Error creating compat weight-set')
+            return
+
+        result = CommandResult('')
+        self.send_command(result, 'mon', '', json.dumps({
+            'prefix': 'osd crush dump',
+            'format': 'json',
+        }), '')
+        r, outb, outs = result.wait()
+        if r != 0:
+            self.log.error('Error dumping crush map')
+            return
+        try:
+            crushmap = json.loads(outb)
+        except ValueError:
+            raise RuntimeError('unable to parse crush map')
+
+        raw = crushmap.get('choose_args',{}).get('-1', [])
+        weight_set = {}
+        for b in raw:
+            bucket = None
+            for t in crushmap['buckets']:
+                if t['id'] == b['bucket_id']:
+                    bucket = t
+                    break
+            if not bucket:
+                raise RuntimeError('could not find bucket %s' % b['bucket_id'])
+            self.log.debug('bucket items %s' % bucket['items'])
+            self.log.debug('weight set %s' % b['weight_set'][0])
+            if len(bucket['items']) != len(b['weight_set'][0]):
+                raise RuntimeError('weight-set size does not match bucket items')
+            for pos in range(len(bucket['items'])):
+                weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos]
+
+        self.log.debug('weight_set weights %s' % weight_set)
+        return weight_set
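
The '-1' choose_args entry is matched positionally against each bucket's item list. A toy crush dump (bucket ids and weights invented) makes the pairing explicit:

    crushmap = {
        'buckets': [
            {'id': -2, 'items': [{'id': 0}, {'id': 1}]},
        ],
        'choose_args': {
            '-1': [
                {'bucket_id': -2, 'weight_set': [[1.0, 0.85]]},
            ],
        },
    }

    weight_set = {}
    for b in crushmap['choose_args']['-1']:
        bucket = next(t for t in crushmap['buckets'] if t['id'] == b['bucket_id'])
        for pos, item in enumerate(bucket['items']):
            weight_set[item['id']] = b['weight_set'][0][pos]
    print(weight_set)   # {0: 1.0, 1: 0.85}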
+
+    def do_crush(self):
+        self.log.info('do_crush (not yet implemented)')
+
+    def do_osd_weight(self):
+        self.log.info('do_osd_weight (not yet implemented)')
+
+    def execute(self, plan):
+        self.log.info('Executing plan %s' % plan.name)
+
+        commands = []
+
+        # compat weight-set
+        if len(plan.compat_ws) and \
+           '-1' not in plan.initial.crush_dump.get('choose_args', {}):
+            self.log.debug('ceph osd crush weight-set create-compat')
+            result = CommandResult('')
+            self.send_command(result, 'mon', '', json.dumps({
+                'prefix': 'osd crush weight-set create-compat',
+                'format': 'json',
+            }), '')
+            r, outb, outs = result.wait()
+            if r != 0:
+                self.log.error('Error creating compat weight-set')
+                return
+
+        for osd, weight in plan.compat_ws.iteritems():
+            self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f',
+                          osd, weight)
+            result = CommandResult('foo')
+            self.send_command(result, 'mon', '', json.dumps({
+                'prefix': 'osd crush weight-set reweight-compat',
+                'format': 'json',
+                'item': 'osd.%d' % osd,
+                'weight': [weight],
+            }), 'foo')
+            commands.append(result)
+
+        # new_weight
+        reweightn = {}
+        for osd, weight in plan.osd_weights.iteritems():
+            reweightn[str(osd)] = str(int(weight * float(0x10000)))
+        if len(reweightn):
+            self.log.info('ceph osd reweightn %s', reweightn)
+            result = CommandResult('foo')
+            self.send_command(result, 'mon', '', json.dumps({
+                'prefix': 'osd reweightn',
+                'format': 'json',
+                'weights': json.dumps(reweightn),
+            }), 'foo')
+            commands.append(result)
+
+        # upmap
+        incdump = plan.inc.dump()
+        for pgid in incdump.get('old_pg_upmap_items', []):
+            self.log.info('ceph osd rm-pg-upmap-items %s', pgid)
+            result = CommandResult('foo')
+            self.send_command(result, 'mon', '', json.dumps({
+                'prefix': 'osd rm-pg-upmap-items',
+                'format': 'json',
+                'pgid': pgid,
+            }), 'foo')
+            commands.append(result)
+
+        for item in incdump.get('new_pg_upmap_items', []):
+            self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'],
+                          item['mappings'])
+            osdlist = []
+            for m in item['mappings']:
+                osdlist += [m['from'], m['to']]
+            result = CommandResult('foo')
+            self.send_command(result, 'mon', '', json.dumps({
+                'prefix': 'osd pg-upmap-items',
+                'format': 'json',
+                'pgid': item['pgid'],
+                'id': osdlist,
+            }), 'foo')
+            commands.append(result)
+
+        # wait for commands
+        self.log.debug('commands %s' % commands)
+        for result in commands:
+            r, outb, outs = result.wait()
+            if r != 0:
+                self.log.error('Error on command')
+                return
+        self.log.debug('done')
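
Every mon command above is a JSON payload carrying the CLI prefix plus its arguments; for instance, one new_pg_upmap_items entry (pgid and osd ids hypothetical) flattens and serializes like this:

    import json

    item = {'pgid': '1.7', 'mappings': [{'from': 3, 'to': 5}]}
    osdlist = []
    for m in item['mappings']:
        osdlist += [m['from'], m['to']]     # [3, 5]
    payload = json.dumps({
        'prefix': 'osd pg-upmap-items',
        'format': 'json',
        'pgid': item['pgid'],
        'id': osdlist,
    })
    print(payload)   # what send_command() hands to the mon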
index 146bd92d0c807a2d7aaa7fd06b60ac42288b3e02..03deb8e20f74252bc22961ee37c3a3c1c38fa22d 100644 (file)
@@ -9,27 +9,27 @@
     <meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no"
           name="viewport">
     <link rel="stylesheet"
-          href="/static/AdminLTE-2.3.7/bootstrap/css/bootstrap.min.css">
+          href="{{ url_prefix }}/static/AdminLTE-2.3.7/bootstrap/css/bootstrap.min.css">
     <link rel="stylesheet"
           href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.5.0/css/font-awesome.min.css">
     <link rel="stylesheet"
           href="https://cdnjs.cloudflare.com/ajax/libs/ionicons/2.0.1/css/ionicons.min.css">
     <link rel="stylesheet"
-          href="/static/AdminLTE-2.3.7/dist/css/AdminLTE.min.css">
+          href="{{ url_prefix }}/static/AdminLTE-2.3.7/dist/css/AdminLTE.min.css">
     <link rel="stylesheet"
-          href="/static/AdminLTE-2.3.7/dist/css/skins/skin-blue.min.css">
+          href="{{ url_prefix }}/static/AdminLTE-2.3.7/dist/css/skins/skin-blue.min.css">
     <link rel="stylesheet"
-          href="/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.css">
+          href="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.css">
 
-    <script src="/static/AdminLTE-2.3.7/plugins/jQuery/jquery-2.2.3.min.js"></script>
-    <script src="/static/AdminLTE-2.3.7/plugins/sparkline/jquery.sparkline.min.js"></script>
+    <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/jQuery/jquery-2.2.3.min.js"></script>
+    <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/sparkline/jquery.sparkline.min.js"></script>
 
-    <script src="/static/rivets.bundled.min.js"></script>
-    <script src="/static/underscore-min.js"></script>
+    <script src="{{ url_prefix }}/static/rivets.bundled.min.js"></script>
+    <script src="{{ url_prefix }}/static/underscore-min.js"></script>
 
-    <script src="/static/AdminLTE-2.3.7/bootstrap/js/bootstrap.min.js"></script>
-    <script src="/static/AdminLTE-2.3.7/dist/js/app.min.js"></script>
-    <script src="/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.min.js"></script>
+    <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/bootstrap/js/bootstrap.min.js"></script>
+    <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/dist/js/app.min.js"></script>
+    <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.min.js"></script>
 
     <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.17.1/moment.min.js"></script>
     <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.4.0/Chart.min.js"></script>
@@ -42,7 +42,7 @@
             var refresh_interval = 5000;
 
             var refresh = function() {
-                $.get("/toplevel_data", function(data) {
+                $.get("{{ url_prefix }}/toplevel_data", function(data) {
                     _.extend(toplevel_data, data);
                     setTimeout(refresh, refresh_interval);
                 });
                 var width=4;
                 var unit = 0;
 
+                if (n == null) {
+                    // People shouldn't really be passing null, but let's
+                    // do something sensible instead of barfing.
+                    return "-";
+                }
+
                 while (Math.floor(n / (divisor**unit)).toString().length > width - 1) {
                     unit = unit + 1;
                 }
         });
       </script>
 
-      <link rel="shortcut icon" href="http://ceph.com/wp-content/themes/ceph/favicon.ico">
-      <link rel="shortcut icon" href="/static/favicon.ico">
+      <link rel="shortcut icon" href="https://ceph.com/wp-content/themes/cephTheme/Resources/Favicons/favicon-96x96.png">
+      <link rel="shortcut icon" href="{{ url_prefix }}/static/favicon.ico">
 
     <style>
         div.box {
     <!-- Main Header -->
     <header class="main-header">
         <!-- Logo -->
-        <a href="/" class="logo">
+        <a href="{{ url_prefix }}/" class="logo">
       <span class="logo-lg">
-        <img src="/static/Ceph_Logo_Standard_RGB_White_120411_fa.png"
+        <img src="{{ url_prefix }}/static/Ceph_Logo_Standard_RGB_White_120411_fa.png"
              width="123px" height="34px"/>
           </span>
       <span class="logo-mini">
-        <img src="/static/logo-mini.png"
+        <img src="{{ url_prefix }}/static/logo-mini.png"
              width="34px" height="34px"/>
           </span>
         </a>
       <ul class="sidebar-menu">
         <!-- Optionally, you can add icons to the links -->
         <li class="{%if path_info=='/' or path_info.startswith('/health')%}active{%endif%}">
-            <a href="/health">
+            <a href="{{ url_prefix }}/health">
             <i class="fa fa-heartbeat" rv-style="health_status | health_color"></i>
             <span>Cluster health</span></a>
         </li>
           </a>
           <ul class="treeview-menu menu-open">
             <li>
-                <a href="/servers">Servers</a>
+                <a href="{{ url_prefix }}/servers">Servers</a>
             </li>
             <li>
-                <a href="/osd">OSDs</a>
+                <a href="{{ url_prefix }}/osd">OSDs</a>
             </li>
           </ul>
         </li>
           </a>
           <ul class="treeview-menu menu-open">
             <li>
-              <a href="/rbd_mirroring">
+              <a href="{{ url_prefix }}/rbd_mirroring">
                 <i class="fa fa-exchange"></i> Mirroring
                 <span class="pull-right-container">
                     <small rv-hide="rbd_mirroring.warnings | hide_count_box" class="label pull-right bg-yellow">{rbd_mirroring.warnings}</small>
               </a>
             </li>
             <li>
-              <a href="/rbd_iscsi">
+              <a href="{{ url_prefix }}/rbd_iscsi">
                 <i class="fa fa-upload"></i> iSCSI
                 <span class="pull-right-container" />
               </a>
     <strong>Copyright &copy; 2016 by Ceph Contributors.</strong> Free software (LGPL 2.1)
   </footer>
 
-  <!-- Control Sidebar -->
-  <aside class="control-sidebar control-sidebar-dark">
-    <!-- Create the tabs -->
-    <ul class="nav nav-tabs nav-justified control-sidebar-tabs">
-      <li class="active"><a href="#control-sidebar-home-tab" data-toggle="tab"><i class="fa fa-home"></i></a></li>
-      <li><a href="#control-sidebar-settings-tab" data-toggle="tab"><i class="fa fa-gears"></i></a></li>
-    </ul>
-    <!-- Tab panes -->
-    <div class="tab-content">
-      <!-- Home tab content -->
-      <div class="tab-pane active" id="control-sidebar-home-tab">
-        <h3 class="control-sidebar-heading">Recent Activity</h3>
-        <ul class="control-sidebar-menu">
-          <li>
-            <a href="javascript::;">
-              <i class="menu-icon fa fa-birthday-cake bg-red"></i>
-
-              <div class="menu-info">
-                <h4 class="control-sidebar-subheading">Langdon's Birthday</h4>
-
-                <p>Will be 23 on April 24th</p>
-              </div>
-            </a>
-          </li>
-        </ul>
-        <!-- /.control-sidebar-menu -->
-
-        <h3 class="control-sidebar-heading">Tasks Progress</h3>
-        <ul class="control-sidebar-menu">
-          <li>
-            <a href="javascript::;">
-              <h4 class="control-sidebar-subheading">
-                Custom Template Design
-                <span class="pull-right-container">
-                  <span class="label label-danger pull-right">70%</span>
-                </span>
-              </h4>
-
-              <div class="progress progress-xxs">
-                <div class="progress-bar progress-bar-danger" style="width: 70%"></div>
-              </div>
-            </a>
-          </li>
-        </ul>
-        <!-- /.control-sidebar-menu -->
-
-      </div>
-      <!-- /.tab-pane -->
-      <!-- Stats tab content -->
-      <div class="tab-pane" id="control-sidebar-stats-tab">Stats Tab Content</div>
-      <!-- /.tab-pane -->
-      <!-- Settings tab content -->
-      <div class="tab-pane" id="control-sidebar-settings-tab">
-        <form method="post">
-          <h3 class="control-sidebar-heading">General Settings</h3>
-
-          <div class="form-group">
-            <label class="control-sidebar-subheading">
-              Report panel usage
-              <input type="checkbox" class="pull-right" checked>
-            </label>
-
-            <p>
-              Some information about this general settings option
-            </p>
-          </div>
-          <!-- /.form-group -->
-        </form>
-      </div>
-      <!-- /.tab-pane -->
-    </div>
-  </aside>
-  <!-- /.control-sidebar -->
-  <!-- Add the sidebar's background. This div must be placed
-       immediately after the control sidebar -->
-  <div class="control-sidebar-bg"></div>
-
 </div>
 
 <!--
index bd5a557210bca21fc92c69806a03bec57445aaa0..ab99bf764e768037bd40ebcdabf4e17dded8bee1 100644 (file)
@@ -8,7 +8,7 @@
             var content_data = {{ content_data }};
 
             var refresh = function() {
-                $.get("/clients_data/" + content_data.fscid  + "/", function(data) {
+                $.get("{{ url_prefix }}/clients_data/" + content_data.fscid  + "/", function(data) {
                     content_data.clients = data;
                     setTimeout(refresh, 5000);
                 });
index c58f1e18fd57c967444e5b84e6a65188d9d49b67..60a97a007d7628967174261be927d8637a77cc2e 100644 (file)
@@ -8,7 +8,7 @@
             var content_data = {{ content_data }};
 
             var refresh = function() {
-                $.get("/filesystem_data/" + content_data.fs_status.filesystem.id  + "/", function(data) {
+                $.get("{{ url_prefix }}/filesystem_data/" + content_data.fs_status.filesystem.id  + "/", function(data) {
                     _.extend(content_data.fs_status, data);
                     setTimeout(refresh, 5000);
                 });
@@ -71,7 +71,7 @@
             var rhs_transform = delta_timeseries;
 
             var draw_chart = function() {
-                $.get("/mds_counters/" + content_data.fs_status.filesystem.id  + "/", function(data) {
+                $.get("{{ url_prefix }}/mds_counters/" + content_data.fs_status.filesystem.id  + "/", function(data) {
                     var top_chart = true;
 
                     // Cull any chart elements that correspond to MDSs no
         <div class="box-body">
             <table>
                 <thead>
-                <tr>
+                <tr rv-show="standbys | length">
                     <th>Daemon</th>
                 </tr>
                 </thead>
                 <tr rv-each-standby="standbys">
                     <td>{standby.name}</td>
                 </tr>
+                <tr class="ceph-none-found" rv-hide="standbys | length">
+                    <td>None found</td>
+                </tr>
                 </tbody>
             </table>
         </div>
index 48cea82f1097c69476fc43923ae49e5db7e82957..e076d4bb0ad528232176c6b4fec19d7d3c4ce95c 100644 (file)
             };
 
             rivets.formatters.pg_status_style = function(pg_status) {
-                var unhealthy = false;
-                var scrubbing = false;
-                $.each(pg_status, function(state, count) {
-                    if (state == "active+clean") {
-
-                    } else if (state == "active+clean+scrubbing"
-                    || state == "active+clean+scrubbing+deep") {
-                        scrubbing = true;
-                    } else {
-                        unhealthy = true;
-                    }
-                });
-
-                if (unhealthy) {
-                    return "color: #FFC200";
-                } else if (scrubbing) {
-                    return "color: #0000bb";
-                } else {
-                    return "color: #00bb00";
-                }
+                var style = "color: #00bb00";
+                $.each(pg_status, function(state, count) {
+                    if (state != "active+clean"
+                     && state != "active+clean+scrubbing"
+                     && state != "active+clean+scrubbing+deep") {
+                        style = "color: #FFC200";
+                        // returning false stops $.each at the first
+                        // unhealthy state
+                        return false;
+                    }
+                });
+                return style;
             };
 
             rivets.formatters.pg_status = function(pg_status) {
             rivets.bind($("#content"), content_data);
 
             var refresh = function() {
-                $.get("/health_data", function(data) {
+                $.get("{{ url_prefix }}/health_data", function(data) {
                     _.extend(content_data, data);
                     draw_usage_charts();
                     setTimeout(refresh, 5000);
index 10b5c37edb30794d3ef363b31e80485b90dfd50e..074103a5a4b059b3fdea104469d73130b517147e 100644 (file)
@@ -22,11 +22,12 @@ import json
 import sys
 import time
 import threading
+import socket
 
 import cherrypy
 import jinja2
 
-from mgr_module import MgrModule, CommandResult
+from mgr_module import MgrModule, MgrStandbyModule, CommandResult
 
 from types import OsdMap, NotFound, Config, FsMap, MonMap, \
     PgSummary, Health, MonStatus
@@ -45,7 +46,7 @@ log = logging.getLogger("dashboard")
 LOG_BUFFER_SIZE = 30
 
 # cherrypy likes to sys.exit on error.  don't let it take us down too!
-def os_exit_noop():
+def os_exit_noop(*args, **kwargs):
     pass
 
 os._exit = os_exit_noop
@@ -61,6 +62,54 @@ def recurse_refs(root, path):
 
     log.info("%s %d (%s)" % (path, sys.getrefcount(root), root.__class__))
 
+def get_prefixed_url(url):
+    return global_instance().url_prefix + url
+
+
+class StandbyModule(MgrStandbyModule):
+    def serve(self):
+        server_addr = self.get_localized_config('server_addr', '::')
+        server_port = self.get_localized_config('server_port', '7000')
+        if server_addr is None:
+            raise RuntimeError('no server_addr configured; try "ceph config-key set mgr/dashboard/server_addr <ip>"')
+        log.info("server_addr: %s server_port: %s" % (server_addr, server_port))
+        cherrypy.config.update({
+            'server.socket_host': server_addr,
+            'server.socket_port': int(server_port),
+            'engine.autoreload.on': False
+        })
+
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        jinja_loader = jinja2.FileSystemLoader(current_dir)
+        env = jinja2.Environment(loader=jinja_loader)
+
+        module = self
+
+        class Root(object):
+            @cherrypy.expose
+            def index(self):
+                active_uri = module.get_active_uri()
+                if active_uri:
+                    log.info("Redirecting to active '{0}'".format(active_uri))
+                    raise cherrypy.HTTPRedirect(active_uri)
+                else:
+                    template = env.get_template("standby.html")
+                    return template.render(delay=5)
+
+        cherrypy.tree.mount(Root(), "/", {})
+        log.info("Starting engine...")
+        cherrypy.engine.start()
+        log.info("Waiting for engine...")
+        cherrypy.engine.wait(state=cherrypy.engine.states.STOPPED)
+        log.info("Engine done.")
+
+    def shutdown(self):
+        log.info("Stopping server...")
+        cherrypy.engine.wait(state=cherrypy.engine.states.STARTED)
+        cherrypy.engine.stop()
+        log.info("Stopped server")
+
 
 class Module(MgrModule):
     def __init__(self, *args, **kwargs):
@@ -98,6 +147,9 @@ class Module(MgrModule):
         self.pool_stats = defaultdict(lambda: defaultdict(
             lambda: collections.deque(maxlen=10)))
 
+        # A prefix for all URLs to use the dashboard with a reverse http proxy
+        self.url_prefix = ''
+
     @property
     def rados(self):
         """
@@ -107,8 +159,7 @@ class Module(MgrModule):
         if self._rados:
             return self._rados
 
-        from mgr_module import ceph_state
-        ctx_capsule = ceph_state.get_context()
+        ctx_capsule = self.get_context()
         self._rados = rados.Rados(context=ctx_capsule)
         self._rados.connect()
 
@@ -374,7 +425,7 @@ class Module(MgrModule):
                 "id": fs_id,
                 "name": mdsmap['fs_name'],
                 "client_count": client_count,
-                "clients_url": "/clients/{0}/".format(fs_id),
+                "clients_url": get_prefixed_url("/clients/{0}/".format(fs_id)),
                 "ranks": rank_table,
                 "pools": pools_table
             },
@@ -440,7 +491,7 @@ class Module(MgrModule):
                 rbd_pools = sorted([
                     {
                         "name": name,
-                        "url": "/rbd_pool/{0}/".format(name)
+                        "url": get_prefixed_url("/rbd_pool/{0}/".format(name))
                     }
                     for name in data
                 ], key=lambda k: k['name'])
@@ -455,7 +506,7 @@ class Module(MgrModule):
                     {
                         "id": f['id'],
                         "name": f['mdsmap']['fs_name'],
-                        "url": "/filesystem/{0}/".format(f['id'])
+                        "url": get_prefixed_url("/filesystem/{0}/".format(f['id']))
                     }
                     for f in fsmap.data['filesystems']
                 ]
@@ -479,6 +530,7 @@ class Module(MgrModule):
                 }
 
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info=cherrypy.request.path_info,
                     toplevel_data=json.dumps(toplevel_data, indent=2),
@@ -542,11 +594,12 @@ class Module(MgrModule):
                     "clients": clients,
                     "fs_name": fs_name,
                     "fscid": fscid,
-                    "fs_url": "/filesystem/" + fscid_str + "/"
+                    "fs_url": get_prefixed_url("/filesystem/" + fscid_str + "/")
                 }
 
                 template = env.get_template("clients.html")
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info=cherrypy.request.path_info,
                     toplevel_data=json.dumps(self._toplevel_data(), indent=2),
@@ -591,6 +644,7 @@ class Module(MgrModule):
                 }
 
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info=cherrypy.request.path_info,
                     toplevel_data=json.dumps(toplevel_data, indent=2),
@@ -617,6 +671,7 @@ class Module(MgrModule):
                 content_data = self._rbd_mirroring()
 
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info=cherrypy.request.path_info,
                     toplevel_data=json.dumps(toplevel_data, indent=2),
@@ -643,6 +698,7 @@ class Module(MgrModule):
                 content_data = self._rbd_iscsi()
 
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info=cherrypy.request.path_info,
                     toplevel_data=json.dumps(toplevel_data, indent=2),
@@ -658,6 +714,7 @@ class Module(MgrModule):
             def health(self):
                 template = env.get_template("health.html")
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info=cherrypy.request.path_info,
                     toplevel_data=json.dumps(self._toplevel_data(), indent=2),
@@ -668,6 +725,7 @@ class Module(MgrModule):
             def servers(self):
                 template = env.get_template("servers.html")
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info=cherrypy.request.path_info,
                     toplevel_data=json.dumps(self._toplevel_data(), indent=2),
@@ -821,6 +879,17 @@ class Module(MgrModule):
                         ret[k1][k2] = sorted_dict
                 return ret
 
+        # normalize the reverse-proxy prefix to '' or '/prefix'
+        # (leading slash, no trailing slash)
+        url_prefix = self.get_config('url_prefix')
+        if url_prefix is None:
+            url_prefix = ''
+        elif len(url_prefix) != 0:
+            if url_prefix[0] != '/':
+                url_prefix = '/' + url_prefix
+            if url_prefix[-1] == '/':
+                url_prefix = url_prefix[:-1]
+        self.url_prefix = url_prefix
+
         server_addr = self.get_localized_config('server_addr', '::')
         server_port = self.get_localized_config('server_port', '7000')
         if server_addr is None:
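
Exercised on its own (inputs hypothetical), the normalization above always yields either an empty prefix or one with a leading slash and no trailing slash:

    def normalize_prefix(url_prefix):
        # mirrors the dashboard's url_prefix handling
        if url_prefix is None:
            return ''
        if len(url_prefix) != 0:
            if url_prefix[0] != '/':
                url_prefix = '/' + url_prefix
            if url_prefix[-1] == '/':
                url_prefix = url_prefix[:-1]
        return url_prefix

    for p in (None, '', 'dashboard', '/dashboard/', 'a/b/'):
        print(repr(normalize_prefix(p)))   # '', '', '/dashboard', '/dashboard', '/a/b'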
@@ -832,6 +901,16 @@ class Module(MgrModule):
             'engine.autoreload.on': False
         })
 
+        osdmap = self.get_osdmap()
+        log.info("latest osdmap is %d" % osdmap.get_epoch())
+
+        # Publish the URI that others may use to access the service we're
+        # about to start serving
+        self.set_uri("http://{0}:{1}/".format(
+            socket.getfqdn() if server_addr == "::" else server_addr,
+            server_port
+        ))
+
         static_dir = os.path.join(current_dir, 'static')
         conf = {
             "/static": {
@@ -882,6 +961,7 @@ class Module(MgrModule):
                 toplevel_data = self._toplevel_data()
 
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info='/osd' + cherrypy.request.path_info,
                     toplevel_data=json.dumps(toplevel_data, indent=2),
@@ -923,7 +1003,7 @@ class Module(MgrModule):
                 result['up'] = osd_info['up']
                 result['in'] = osd_info['in']
 
-                result['url'] = "/osd/perf/{0}".format(osd_id)
+                result['url'] = get_prefixed_url("/osd/perf/{0}".format(osd_id))
 
                 return result
 
@@ -936,7 +1016,6 @@ class Module(MgrModule):
                 for server in servers:
                     hostname = server['hostname']
                     services = server['services']
-                    first = True
                     for s in services:
                         if s["type"] == "osd":
                             osd_id = int(s["id"])
@@ -948,18 +1027,18 @@ class Module(MgrModule):
                             summary = self._osd_summary(osd_id,
                                                         osd_map.osds_by_id[osd_id])
 
-                            if first:
-                                # A little helper for rendering
-                                summary['first'] = True
-                                first = False
                             result[hostname].append(summary)
 
+                    result[hostname].sort(key=lambda a: a['id'])
+                    if len(result[hostname]):
+                        result[hostname][0]['first'] = True
+
                 global_instance().log.warn("result.size {0} servers.size {1}".format(
                     len(result), len(servers)
                 ))
 
                 # Return list form for convenience of rendering
-                return result.items()
+                return sorted(result.items(), key=lambda a: a[0])
 
             @cherrypy.expose
             def index(self):
@@ -976,16 +1055,18 @@ class Module(MgrModule):
                 }
 
                 return template.render(
+                    url_prefix = global_instance().url_prefix,
                     ceph_version=global_instance().version,
                     path_info='/osd' + cherrypy.request.path_info,
                     toplevel_data=json.dumps(toplevel_data, indent=2),
                     content_data=json.dumps(content_data, indent=2)
                 )
 
-        cherrypy.tree.mount(Root(), "/", conf)
-        cherrypy.tree.mount(OSDEndpoint(), "/osd", conf)
+        cherrypy.tree.mount(Root(), get_prefixed_url("/"), conf)
+        cherrypy.tree.mount(OSDEndpoint(), get_prefixed_url("/osd"), conf)
 
-        log.info("Starting engine...")
+        log.info("Starting engine on {0}:{1}...".format(
+            server_addr, server_port))
         cherrypy.engine.start()
         log.info("Waiting for engine...")
         cherrypy.engine.block()
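
The get_prefixed_url helper used throughout the hunks above is defined outside
this excerpt; given the url_prefix normalization, it plausibly reduces to a
sketch like this (name taken from the call sites, body assumed):

    def get_prefixed_url(url):
        # url_prefix is already normalized to '' or '/prefix', so plain
        # concatenation yields e.g. '/osd' or '/prefix/osd'
        return global_instance().url_prefix + url
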
index 7ab958effb56d3bb6b0fd06dcfbc4e9324af2786..b13ad17e1fd7ba0791ca12afa59f8ce1d11b940a 100644 (file)
@@ -96,7 +96,7 @@
             post_load();
 
             var refresh = function() {
-                $.get("/osd/perf_data/" + content_data.osd.osd  + "/", function(data) {
+                $.get("{{ url_prefix }}/osd/perf_data/" + content_data.osd.osd  + "/", function(data) {
                     _.extend(content_data.osd_histogram, data.osd_histogram);
                     _.extend(content_data.osd, data.osd);
                     _.extend(content_data.osd_metadata, data.osd_metadata);
index ddcb8577d9526ba5f72fcdd40c5675dca8342447..b55b3df594c281acd72a41ae1452c9487775c1b3 100644 (file)
@@ -8,7 +8,7 @@
             var content_data = {{ content_data }};
 
             var refresh = function() {
-                $.get("/osd/list_data/", function(data) {
+                $.get("{{ url_prefix }}/osd/list_data/", function(data) {
                     content_data.osds_by_server = data;
                     $('.inlinesparkline').sparkline();
                     setTimeout(refresh, 5000);
index 105f5dec8aa5600878f5c47edf77af9e4d324647..b8e47fdb23488b4316c41c1debc982a857de5ac2 100644 (file)
@@ -8,7 +8,7 @@
             var content_data = {{ content_data }};
 
             var refresh = function() {
-                $.get("/rbd_iscsi_data", function(data) {
+                $.get("{{ url_prefix }}/rbd_iscsi_data", function(data) {
                     _.extend(content_data, data);
                     setTimeout(refresh, 30000);
                 });
index 6588766a7dad18962e8de2d5dff28f84b093bc6e..87315a91bb41a11482e3de84d17c1ce5eaf6cc46 100644 (file)
@@ -6,8 +6,8 @@ from remote_view_cache import RemoteViewCache
 
 class RbdPoolLs(RemoteViewCache):
     def _get(self):
-        from mgr_module import ceph_state
-        ctx_capsule = ceph_state.get_context()
+        ctx_capsule = self._module.get_context()
+
         osd_map = self._module.get_sync_object(OsdMap).data
         osd_pools = [pool['pool_name'] for pool in osd_map['pools']]
index b83aadd08f40b1ff7c7d842da3e9e66450f25d63..2720685474a04f85f946d11cce6ea3615dc0d287 100644 (file)
@@ -8,7 +8,7 @@
             var content_data = {{ content_data }};
 
             var refresh = function() {
-                $.get("/rbd_mirroring_data", function(data) {
+                $.get("{{ url_prefix }}/rbd_mirroring_data", function(data) {
                     _.extend(content_data, data);
                     setTimeout(refresh, 30000);
                 });
index 973bc3717d89e74ea290b3811a61194e6a0d8db5..0d0e54fdc6f3153f5eb012a3144e5102bfea917d 100644 (file)
@@ -8,7 +8,7 @@
             var content_data = {{ content_data }};
 
             var refresh = function() {
-                $.get("/rbd_pool_data/" + content_data.pool_name  + "/", function(data) {
+                $.get("{{ url_prefix }}/rbd_pool_data/" + content_data.pool_name  + "/", function(data) {
                     content_data.images = data;
                     setTimeout(refresh, 10000);
                 });
index e6c8b3cf603336ea9d55181b6209520ab463433d..421d3389c3585edb09d8845208d6bda5f7ce2380 100644 (file)
@@ -8,7 +8,7 @@
             var content_data = {{ content_data }};
 
             var refresh = function() {
-                $.get("/servers_data", function(data) {
+                $.get("{{ url_prefix }}/servers_data", function(data) {
                     _.extend(content_data, data);
                     setTimeout(refresh, 5000);
                 });
diff --git a/ceph/src/pybind/mgr/dashboard/standby.html b/ceph/src/pybind/mgr/dashboard/standby.html
new file mode 100644 (file)
index 0000000..ec706a1
--- /dev/null
@@ -0,0 +1,15 @@
+
+<html>
+    <!-- Note: this is only displayed when the standby
+         does not know an active URI to redirect to, otherwise
+         a simple redirect is returned instead -->
+    <head>
+        <title>Ceph</title>
+        <meta http-equiv="refresh" content="{{delay}}">
+    </head>
+    <body>
+        No active ceph-mgr instance is currently running
+        the dashboard.  A failover may be in progress.
+        Retrying in {{delay}} seconds...
+    </body>
+</html>
diff --git a/ceph/src/pybind/mgr/influx/__init__.py b/ceph/src/pybind/mgr/influx/__init__.py
new file mode 100644 (file)
index 0000000..0440e07
--- /dev/null
@@ -0,0 +1 @@
+from module import *  # NOQA
diff --git a/ceph/src/pybind/mgr/influx/module.py b/ceph/src/pybind/mgr/influx/module.py
new file mode 100644 (file)
index 0000000..adeb452
--- /dev/null
@@ -0,0 +1,162 @@
+
+from datetime import datetime
+from threading import Event
+import json
+import errno
+
+from mgr_module import MgrModule
+
+try:
+    from influxdb import InfluxDBClient
+    from influxdb.exceptions import InfluxDBClientError
+except ImportError:
+    InfluxDBClient = None
+
+class Module(MgrModule):
+    COMMANDS = [
+        {
+            "cmd": "influx self-test",
+            "desc": "debug the module",
+            "perm": "rw"  
+        },
+    ]
+
+
+    def __init__(self, *args, **kwargs):
+        super(Module, self).__init__(*args, **kwargs)
+        self.event = Event()
+        self.run = True
+
+
+    def get_latest(self, daemon_type, daemon_name, stat):
+        data = self.get_counter(daemon_type, daemon_name, stat)[stat]
+        if data:
+            return data[-1][1]
+        else:
+            return 0
+
+
+    def get_df_stats(self):
+        df = self.get("df")
+        data = []
+
+        df_types = [
+            'bytes_used',
+            'dirty',
+            'rd_bytes',
+            'raw_bytes_used',
+            'wr_bytes',
+            'objects',
+            'max_avail'
+        ]
+
+        for df_type in df_types:
+            for pool in df['pools']:
+                point = {
+                    "measurement": "ceph_pool_stats",
+                    "tags": {
+                        "pool_name" : pool['name'],
+                        "pool_id" : pool['id'],
+                        "type_instance" : df_type,
+                        "mgr_id" : self.get_mgr_id(),
+                    },
+                        "time" : datetime.utcnow().isoformat() + 'Z',
+                        "fields": {
+                            "value" : pool['stats'][df_type],
+                        }
+                }
+                data.append(point)
+        return data
+
+    def get_daemon_stats(self):
+        data = []
+
+        for daemon, counters in self.get_all_perf_counters().iteritems():
+            svc_type, svc_id = daemon.split(".")
+            metadata = self.get_metadata(svc_type, svc_id)
+
+            for path, counter_info in counters.items():
+                if counter_info['type'] & self.PERFCOUNTER_HISTOGRAM:
+                    continue
+
+                value = counter_info['value']
+
+                data.append({
+                    "measurement": "ceph_daemon_stats",
+                    "tags": {
+                        "ceph_daemon": daemon,
+                        "type_instance": path,
+                        "host": metadata['hostname']
+                    },
+                    "time": datetime.utcnow().isoformat() + 'Z',
+                    "fields": {
+                        "value": value
+                    }
+                })
+
+        return data
+
+    def send_to_influx(self):
+        host = self.get_config("hostname")
+        if not host:
+            self.log.error("No InfluxDB server configured, please set"
+                           "`hostname` configuration key.")
+            return
+
+        port = int(self.get_config("port", default="8086"))
+        database = self.get_config("database", default="ceph")
+
+        # If influx server has authentication turned off then
+        # missing username/password is valid.
+        username = self.get_config("username", default="")
+        password = self.get_config("password", default="")
+
+        client = InfluxDBClient(host, port, username, password, database)
+
+        # get_list_database on the influx client requires admin privs, so
+        # instead we catch the "not found" error and tell the user if the
+        # database can't be created
+        try:
+            client.write_points(self.get_df_stats(), 'ms')
+            client.write_points(self.get_daemon_stats(), 'ms')
+        except InfluxDBClientError as e:
+            if e.code == 404:
+                self.log.info("Database '{0}' not found, trying to create (requires admin privs).  You can also create manually and grant write privs to user '{1}'".format(database,username))
+                client.create_database(database)
+            else:
+                raise
+
+    def shutdown(self):
+        self.log.info('Stopping influx module')
+        self.run = False
+        self.event.set()
+
+    def handle_command(self, cmd):
+        if cmd['prefix'] == 'influx self-test':
+            daemon_stats = self.get_daemon_stats()
+            assert len(daemon_stats)
+            df_stats = self.get_df_stats()
+            result = {
+                'daemon_stats': daemon_stats,
+                'df_stats': df_stats
+            }
+            return 0, json.dumps(result, indent=2), 'Self-test OK'
+        else:
+            return (-errno.EINVAL, '',
+                    "Command not found '{0}'".format(cmd['prefix']))
+
+    def serve(self):
+        if InfluxDBClient is None:
+            self.log.error("Cannot transmit statistics: influxdb python "
+                           "module not found.  Did you install it?")
+            return
+
+        self.log.info('Starting influx module')
+        self.run = True
+        while self.run:
+            self.send_to_influx()
+            self.log.debug("Running interval loop")
+            interval = self.get_config("interval")
+            if interval is None:
+                interval = 5
+            self.log.debug("sleeping for %d seconds",interval)
+            self.event.wait(interval)
+            
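
For reference, each element returned by get_df_stats above is a plain dict in
the shape that the influxdb client's write_points expects; one illustrative
point (pool name, id, timestamp and value are made up):

    point = {
        "measurement": "ceph_pool_stats",
        "tags": {
            "pool_name": "rbd",            # illustrative
            "pool_id": 1,
            "type_instance": "bytes_used",
            "mgr_id": "x",
        },
        "time": "2017-11-28T08:02:46Z",    # utcnow().isoformat() + 'Z'
        "fields": {"value": 1024},
    }
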
diff --git a/ceph/src/pybind/mgr/localpool/__init__.py b/ceph/src/pybind/mgr/localpool/__init__.py
new file mode 100644 (file)
index 0000000..79f5b86
--- /dev/null
@@ -0,0 +1,2 @@
+
+from module import *  # NOQA
diff --git a/ceph/src/pybind/mgr/localpool/module.py b/ceph/src/pybind/mgr/localpool/module.py
new file mode 100644 (file)
index 0000000..0abdbfb
--- /dev/null
@@ -0,0 +1,92 @@
+from mgr_module import MgrModule, CommandResult
+import json
+import threading
+
+class Module(MgrModule):
+    def __init__(self, *args, **kwargs):
+        super(Module, self).__init__(*args, **kwargs)
+        self.serve_event = threading.Event()
+
+    def notify(self, notify_type, notify_id):
+        if notify_type == 'osd_map':
+            self.handle_osd_map()
+
+    def handle_osd_map(self):
+        """
+        Check pools on each OSDMap change
+        """
+        subtree_type = self.get_config('subtree') or 'rack'
+        failure_domain = self.get_config('failure_domain') or 'host'
+        pg_num = self.get_config('pg_num') or '128'
+        num_rep = self.get_config('num_rep') or '3'
+        min_size = self.get_config('min_size')
+        prefix = self.get_config('prefix') or 'by-' + subtree_type + '-'
+
+        osdmap = self.get("osd_map")
+        lpools = []
+        for pool in osdmap['pools']:
+            if pool['pool_name'].find(prefix) == 0:
+                lpools.append(pool['pool_name'])
+
+        self.log.debug('localized pools = %s', lpools)
+        subtrees = []
+        tree = self.get('osd_map_tree')
+        for node in tree['nodes']:
+            if node['type'] == subtree_type:
+                subtrees.append(node['name'])
+                pool_name = prefix + node['name']
+                if pool_name not in lpools:
+                    self.log.info('Creating localized pool %s', pool_name)
+                    result = CommandResult("")
+                    self.send_command(result, "mon", "", json.dumps({
+                        "prefix": "osd crush rule create-replicated",
+                        "format": "json",
+                        "name": pool_name,
+                        "root": node['name'],
+                        "type": failure_domain,
+                    }), "")
+                    r, outb, outs = result.wait()
+
+                    result = CommandResult("")
+                    self.send_command(result, "mon", "", json.dumps({
+                        "prefix": "osd pool create",
+                        "format": "json",
+                        "pool": pool_name,
+                        'rule': pool_name,
+                        'erasure_code_profile': pool_name,
+                        "pool_type": 'replicated',
+                        'pg_num': str(pg_num),
+                    }), "")
+                    r, outb, outs = result.wait()
+
+                    result = CommandResult("")
+                    self.send_command(result, "mon", "", json.dumps({
+                        "prefix": "osd pool set",
+                        "format": "json",
+                        "pool": pool_name,
+                        'var': 'size',
+                        "val": str(num_rep),
+                    }), "")
+                    r, outb, outs = result.wait()
+
+                    if min_size:
+                        result = CommandResult("")
+                        self.send_command(result, "mon", "", json.dumps({
+                            "prefix": "osd pool set",
+                            "format": "json",
+                            "pool": pool_name,
+                            'var': 'min_size',
+                            "val": str(min_size),
+                        }), "")
+                        r, outb, outs = result.wait()
+
+        # TODO remove pools for hosts that don't exist?
+
+    def serve(self):
+        self.handle_osd_map()
+        self.serve_event.wait()
+        self.serve_event.clear()
+
+    def shutdown(self):
+        self.serve_event.set()
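
A short sketch of the naming scheme handle_osd_map implements with its
defaults (subtree 'rack', prefix 'by-rack-'); the rack names are illustrative:

    subtree_type = 'rack'
    prefix = 'by-' + subtree_type + '-'
    racks = ['a1', 'a2']                    # hypothetical CRUSH rack names
    pools = [prefix + r for r in racks]     # ['by-rack-a1', 'by-rack-a2']
    # one replicated pool per rack, each with a CRUSH rule rooted there
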
index 2463bafe75d8a11e73e6d9f012cfbfc277b2cc01..1abbcc5cd4fbe02c4fc635c3586ff6ed393b5972 100644 (file)
@@ -1,10 +1,54 @@
 
-import ceph_state  #noqa
+import ceph_module  # noqa
+#import ceph_osdmap  #noqa
+#import ceph_osdmap_incremental  #noqa
+#import ceph_crushmap  #noqa
+
 import json
 import logging
 import threading
+from collections import defaultdict
+
+
+class CPlusPlusHandler(logging.Handler):
+    def __init__(self, module_inst):
+        super(CPlusPlusHandler, self).__init__()
+        self._module = module_inst
+
+    def emit(self, record):
+        if record.levelno <= logging.DEBUG:
+            ceph_level = 20
+        elif record.levelno <= logging.INFO:
+            ceph_level = 4
+        elif record.levelno <= logging.WARNING:
+            ceph_level = 1
+        else:
+            ceph_level = 0
+
+        self._module._ceph_log(ceph_level, self.format(record))
+
+
+def configure_logger(module_inst, name):
+    logger = logging.getLogger(name)
+
+    # Don't filter any logs at the python level, leave it to C++
+    logger.setLevel(logging.DEBUG)
+
+    # FIXME: we should learn the log level from C++ land, and then
+    # avoid calling the C++ level log when we know a message is of
+    # an insufficient level to be ultimately output
+    logger.addHandler(CPlusPlusHandler(module_inst))
+
+    return logger
 
 
+def unconfigure_logger(module_inst, name):
+    logger = logging.getLogger(name)
+    rm_handlers = [h for h in logger.handlers if isinstance(h, CPlusPlusHandler)]
+    for h in rm_handlers:
+        logger.removeHandler(h)
+
 class CommandResult(object):
     """
     Use with MgrModule.send_command
@@ -30,36 +74,168 @@ class CommandResult(object):
         return self.r, self.outb, self.outs
 
 
-class MgrModule(object):
+class OSDMap(ceph_module.BasePyOSDMap):
+    def get_epoch(self):
+        return self._get_epoch()
+
+    def get_crush_version(self):
+        return self._get_crush_version()
+
+    def dump(self):
+        return self._dump()
+
+    def new_incremental(self):
+        return self._new_incremental()
+
+    def apply_incremental(self, inc):
+        return self._apply_incremental(inc)
+
+    def get_crush(self):
+        return self._get_crush()
+
+    def get_pools_by_take(self, take):
+        return self._get_pools_by_take(take).get('pools', [])
+
+    def calc_pg_upmaps(self, inc,
+                       max_deviation=.01, max_iterations=10, pools=[]):
+        return self._calc_pg_upmaps(
+            inc,
+            max_deviation, max_iterations, pools)
+
+    def map_pool_pgs_up(self, poolid):
+        return self._map_pool_pgs_up(poolid)
+
+class OSDMapIncremental(ceph_module.BasePyOSDMapIncremental):
+    def get_epoch(self):
+        return self._get_epoch()
+
+    def dump(self):
+        return self._dump()
+
+    def set_osd_reweights(self, weightmap):
+        """
+        weightmap is a dict, int to float.  e.g. { 0: .9, 1: 1.0, 3: .997 }
+        """
+        return self._set_osd_reweights(weightmap)
+
+    def set_crush_compat_weight_set_weights(self, weightmap):
+        """
+        weightmap is a dict, int to float.  devices only.  e.g.,
+        { 0: 3.4, 1: 3.3, 2: 3.334 }
+        """
+        return self._set_crush_compat_weight_set_weights(weightmap)
+
+class CRUSHMap(ceph_module.BasePyCRUSH):
+    def dump(self):
+        return self._dump()
+
+    def get_item_weight(self, item):
+        return self._get_item_weight(item)
+
+    def get_item_name(self, item):
+        return self._get_item_name(item)
+
+    def find_takes(self):
+        return self._find_takes().get('takes', [])
+
+    def get_take_weight_osd_map(self, root):
+        uglymap = self._get_take_weight_osd_map(root)
+        return { int(k): v for k, v in uglymap.get('weights', {}).iteritems() }
+
+class MgrStandbyModule(ceph_module.BaseMgrStandbyModule):
+    """
+    Standby modules only implement a serve and shutdown method, they
+    are not permitted to implement commands and they do not receive
+    any notifications.
+
+    They only have access to the mgrmap (for accessing service URI info
+    from their active peer), and to configuration settings (read only).
+    """
+
+    def __init__(self, module_name, capsule):
+        super(MgrStandbyModule, self).__init__(capsule)
+        self.module_name = module_name
+        self._logger = configure_logger(self, module_name)
+
+    def __del__(self):
+        unconfigure_logger(self, self.module_name)
+
+    @property
+    def log(self):
+        return self._logger
+
+    def serve(self):
+        """
+        The serve method is mandatory for standby modules.
+        :return:
+        """
+        raise NotImplementedError()
+
+    def get_mgr_id(self):
+        return self._ceph_get_mgr_id()
+
+    def get_config(self, key):
+        return self._ceph_get_config(key)
+
+    def get_active_uri(self):
+        return self._ceph_get_active_uri()
+
+    def get_localized_config(self, key, default=None):
+        r = self.get_config(self.get_mgr_id() + '/' + key)
+        if r is None:
+            r = self.get_config(key)
+
+        if r is None:
+            r = default
+        return r
+
+class MgrModule(ceph_module.BaseMgrModule):
     COMMANDS = []
 
-    def __init__(self, handle):
-        self._handle = handle
-        self._logger = logging.getLogger(handle)
+    # Priority definitions for perf counters
+    PRIO_CRITICAL = 10
+    PRIO_INTERESTING = 8
+    PRIO_USEFUL = 5
+    PRIO_UNINTERESTING = 2
+    PRIO_DEBUGONLY = 0
+
+    # counter value types
+    PERFCOUNTER_TIME = 1
+    PERFCOUNTER_U64 = 2
+
+    # counter types
+    PERFCOUNTER_LONGRUNAVG = 4
+    PERFCOUNTER_COUNTER = 8
+    PERFCOUNTER_HISTOGRAM = 0x10
+    PERFCOUNTER_TYPE_MASK = ~2
 
-        # Don't filter any logs at the python level, leave it to C++
-        self._logger.setLevel(logging.DEBUG)
+    def __init__(self, module_name, py_modules_ptr, this_ptr):
+        self.module_name = module_name
 
-        # FIXME: we should learn the log level from C++ land, and then
-        # avoid calling ceph_state.log when we know a message is of
-        # an insufficient level to be ultimately output
+        # If we're taking over from a standby module, let's make sure
+        # its logger was unconfigured before we hook ours up
+        unconfigure_logger(self, self.module_name)
+        self._logger = configure_logger(self, module_name)
 
-        class CPlusPlusHandler(logging.Handler):
-            def emit(self, record):
-                if record.levelno <= logging.DEBUG:
-                    ceph_level = 20
-                elif record.levelno <= logging.INFO:
-                    ceph_level = 4
-                elif record.levelno <= logging.WARNING:
-                    ceph_level = 1
-                else:
-                    ceph_level = 0
+        super(MgrModule, self).__init__(py_modules_ptr, this_ptr)
 
-                ceph_state.log(handle, ceph_level, self.format(record))
+        self._version = self._ceph_get_version()
 
-        self._logger.addHandler(CPlusPlusHandler())
+        self._perf_schema_cache = None
 
-        self._version = ceph_state.get_version()
+    def __del__(self):
+        unconfigure_logger(self, self.module_name)
+
+    def update_perf_schema(self, daemon_type, daemon_name):
+        """
+        For plugins that use get_all_perf_counters, call this when
+        receiving a notification of type 'perf_schema_update', to
+        prompt MgrModule to update its cache of counter schemas.
+
+        :param daemon_type:
+        :param daemon_name:
+        :return:
+        """
 
     @property
     def log(self):
@@ -69,6 +245,12 @@ class MgrModule(object):
     def version(self):
         return self._version
 
+    def get_context(self):
+        """
+        :return: a Python capsule containing a C++ CephContext pointer
+        """
+        return self._ceph_get_context()
+
     def notify(self, notify_type, notify_id):
         """
         Called by the ceph-mgr service to notify the Python plugin
@@ -100,7 +282,7 @@ class MgrModule(object):
         """
         Called by the plugin to load some cluster state from ceph-mgr
         """
-        return ceph_state.get(self._handle, data_name)
+        return self._ceph_get(data_name)
 
     def get_server(self, hostname):
         """
@@ -109,7 +291,7 @@ class MgrModule(object):
 
         :param hostname: a hostame
         """
-        return ceph_state.get_server(self._handle, hostname)
+        return self._ceph_get_server(hostname)
 
     def get_perf_schema(self, svc_type, svc_name):
         """
@@ -121,7 +303,7 @@ class MgrModule(object):
         :param svc_name:
         :return: list of dicts describing the counters requested
         """
-        return ceph_state.get_perf_schema(self._handle, svc_type, svc_name)
+        return self._ceph_get_perf_schema(svc_type, svc_name)
 
     def get_counter(self, svc_type, svc_name, path):
         """
@@ -133,14 +315,14 @@ class MgrModule(object):
         :param path:
         :return: A list of two-element lists containing time and value
         """
-        return ceph_state.get_counter(self._handle, svc_type, svc_name, path)
+        return self._ceph_get_counter(svc_type, svc_name, path)
 
     def list_servers(self):
         """
         Like ``get_server``, but instead of returning information
         about just one node, return all the nodes in an array.
         """
-        return ceph_state.get_server(self._handle, None)
+        return self._ceph_get_server(None)
 
     def get_metadata(self, svc_type, svc_id):
         """
@@ -150,7 +332,7 @@ class MgrModule(object):
         :param svc_id: string
         :return: dict
         """
-        return ceph_state.get_metadata(self._handle, svc_type, svc_id)
+        return self._ceph_get_metadata(svc_type, svc_id)
 
     def get_daemon_status(self, svc_type, svc_id):
         """
@@ -160,14 +342,14 @@ class MgrModule(object):
         :param svc_id: string
         :return: dict
         """
-        return ceph_state.get_daemon_status(self._handle, svc_type, svc_id)
+        return self._ceph_get_daemon_status(svc_type, svc_id)
 
     def send_command(self, *args, **kwargs):
         """
         Called by the plugin to send a command to the mon
         cluster.
         """
-        ceph_state.send_command(self._handle, *args, **kwargs)
+        self._ceph_send_command(*args, **kwargs)
 
     def set_health_checks(self, checks):
         """
@@ -191,7 +373,7 @@ class MgrModule(object):
 
         :param list: dict of health check dicts
         """
-        ceph_state.set_health_checks(self._handle, checks)
+        self._ceph_set_health_checks(checks)
 
     def handle_command(self, cmd):
         """
@@ -217,16 +399,20 @@ class MgrModule(object):
 
         :return: str
         """
-        return ceph_state.get_mgr_id()
+        return self._ceph_get_mgr_id()
 
-    def get_config(self, key):
+    def get_config(self, key, default=None):
         """
         Retrieve the value of a persistent configuration setting
 
         :param key: str
         :return: str
         """
-        return ceph_state.get_config(self._handle, key)
+        r = self._ceph_get_config(key)
+        if r is None:
+            return default
+        else:
+            return r
 
     def get_config_prefix(self, key_prefix):
         """
@@ -235,7 +421,7 @@ class MgrModule(object):
         :param key_prefix: str
         :return: str
         """
-        return ceph_state.get_config_prefix(self._handle, key_prefix)
+        return self._ceph_get_config_prefix(key_prefix)
 
     def get_localized_config(self, key, default=None):
         """
@@ -259,7 +445,7 @@ class MgrModule(object):
         :param key: str
         :param val: str
         """
-        ceph_state.set_config(self._handle, key, val)
+        self._ceph_set_config(key, val)
 
     def set_localized_config(self, key, val):
         """
@@ -268,7 +454,7 @@ class MgrModule(object):
         :param default: str
         :return: str
         """
-        return self.set_config(self.get_mgr_id() + '/' + key, val)
+        return self._ceph_set_config(self.get_mgr_id() + '/' + key, val)
 
     def set_config_json(self, key, val):
         """
@@ -277,7 +463,7 @@ class MgrModule(object):
         :param key: str
         :param val: json-serializable object
         """
-        self.set_config(key, json.dumps(val))
+        self._ceph_set_config(key, json.dumps(val))
 
     def get_config_json(self, key):
         """
@@ -299,3 +485,77 @@ class MgrModule(object):
         :return: bool
         """
         pass
+
+    def get_osdmap(self):
+        """
+        Get a handle to the latest OSDMap.
+        :return: OSDMap
+        """
+        return self._ceph_get_osdmap()
+
+    def get_all_perf_counters(self, prio_limit=PRIO_USEFUL):
+        """
+        Return the perf counters currently known to this ceph-mgr
+        instance, filtered by priority equal to or greater than `prio_limit`.
+
+        The result is a map of string to dict, associating services
+        (like "osd.123") with their counters.  The counter
+        dict for each service maps counter paths to a counter
+        info structure, which is the information from
+        the schema, plus an additional "value" member with the latest
+        value.
+        """
+
+        result = defaultdict(dict)
+
+        # TODO: improve C++->Python interface to return just
+        # the latest if that's all we want.
+        def get_latest(daemon_type, daemon_name, counter):
+            data = self.get_counter(daemon_type, daemon_name, counter)[counter]
+            if data:
+                return data[-1][1]
+            else:
+                return 0
+
+        for server in self.list_servers():
+            for service in server['services']:
+                if service['type'] not in ("mds", "osd", "mon"):
+                    continue
+
+                schema = self.get_perf_schema(service['type'], service['id'])
+                if not schema:
+                    self.log.warn("No perf counter schema for {0}.{1}".format(
+                        service['type'], service['id']
+                    ))
+                    continue
+
+                # Value is returned in a potentially-multi-service format,
+                # get just the service we're asking about
+                svc_full_name = "{0}.{1}".format(service['type'], service['id'])
+                schema = schema[svc_full_name]
+
+                # Populate latest values
+                for counter_path, counter_schema in schema.items():
+                    # self.log.debug("{0}: {1}".format(
+                    #     counter_path, json.dumps(counter_schema)
+                    # ))
+                    if counter_schema['priority'] < prio_limit:
+                        continue
+
+                    counter_info = counter_schema
+                    counter_info['value'] = get_latest(service['type'], service['id'], counter_path)
+                    result[svc_full_name][counter_path] = counter_info
+
+        self.log.debug("returning {0} counter".format(len(result)))
+
+        return result
+
+    def set_uri(self, uri):
+        """
+        If the module exposes a service, then call this to publish the
+        address once it is available.
+
+        :return: a string
+        """
+        return self._ceph_set_uri(uri)
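
Taken together, the ceph_module-backed base class means a minimal plugin now
needs only the hooks shown in this file; a sketch (the constructor arguments
are supplied by ceph-mgr, and the body is illustrative):

    from mgr_module import MgrModule

    class Module(MgrModule):
        def serve(self):
            # get_all_perf_counters() defaults to prio_limit=PRIO_USEFUL
            for daemon, counters in self.get_all_perf_counters().iteritems():
                self.log.info("%s exposes %d counters", daemon, len(counters))

        def shutdown(self):
            pass
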
index f98f1f7e6735ef442354da62994e6c603cd6092b..842517f208f37e7a101864d875b295263caec083 100644 (file)
@@ -1,7 +1,8 @@
 import cherrypy
+import json
+import errno
 import math
 import os
-import time
 from collections import OrderedDict
 from mgr_module import MgrModule
 
@@ -14,7 +15,7 @@ DEFAULT_PORT = 9283
 
 
 # cherrypy likes to sys.exit on error.  don't let it take us down too!
-def os_exit_noop():
+def os_exit_noop(*args, **kwargs):
     pass
 
 
@@ -32,31 +33,32 @@ def global_instance():
     return _global_instance['plugin']
 
 
-# counter value types
-PERFCOUNTER_TIME = 1
-PERFCOUNTER_U64 = 2
+def health_status_to_number(status):
 
-# counter types
-PERFCOUNTER_LONGRUNAVG = 4
-PERFCOUNTER_COUNTER = 8
-PERFCOUNTER_HISTOGRAM = 0x10
-PERFCOUNTER_TYPE_MASK = ~2
+    if status == 'HEALTH_OK':
+        return 0
+    elif status == 'HEALTH_WARN':
+        return 1
+    elif status == 'HEALTH_ERR':
+        return 2
 
+PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'degraded',
+        'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
+        'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
+        'incomplete', 'stale', 'remapped', 'undersized', 'peered']
 
-def stattype_to_str(stattype):
+DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']
 
-    typeonly = stattype & PERFCOUNTER_TYPE_MASK
-    if typeonly == 0:
-        return 'gauge'
-    if typeonly == PERFCOUNTER_LONGRUNAVG:
-        # this lie matches the DaemonState decoding: only val, no counts
-        return 'counter'
-    if typeonly == PERFCOUNTER_COUNTER:
-        return 'counter'
-    if typeonly == PERFCOUNTER_HISTOGRAM:
-        return 'histogram'
+DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
+           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']
 
-    return ''
+OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')
+
+OSD_STATUS = ['weight', 'up', 'in']
+
+POOL_METADATA = ('pool_id', 'name')
+
+DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
 
 
 class Metric(object):
@@ -76,7 +78,16 @@ class Metric(object):
 
         def promethize(path):
             ''' replace illegal metric name characters '''
-            return path.replace('.', '_').replace('-', '_')
+            result = path.replace('.', '_').replace('+', '_plus').replace('::', '_')
+
+            # Hyphens usually turn into underscores, unless they are
+            # trailing
+            if result.endswith("-"):
+                result = result[0:-1] + "_minus"
+            else:
+                result = result.replace("-", "_")
+
+            return "ceph_{0}".format(result)
 
         def floatstr(value):
             ''' represent as Go-compatible float '''
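
Hand-traced examples of the new promethize rules (counter names are
illustrative, not taken from a real schema):

    # promethize('osd.op_w')      -> 'ceph_osd_op_w'          ('.' -> '_', 'ceph_' prefix)
    # promethize('cache_evict-')  -> 'ceph_cache_evict_minus' (trailing '-' -> '_minus')
    # promethize('op-w')          -> 'ceph_op_w'              (inner '-' -> '_')
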
@@ -116,98 +127,233 @@ class Metric(object):
 
 
 class Module(MgrModule):
+    COMMANDS = [
+        {
+            "cmd": "prometheus self-test",
+            "desc": "Run a self test on the prometheus module",
+            "perm": "rw"
+        },
+    ]
 
     def __init__(self, *args, **kwargs):
         super(Module, self).__init__(*args, **kwargs)
         self.notified = False
         self.serving = False
-        self.metrics = dict()
+        self.metrics = self._setup_static_metrics()
         self.schema = OrderedDict()
         _global_instance['plugin'] = self
 
-    def _get_ordered_schema(self, **kwargs):
-
-        '''
-        fetch an ordered-by-key performance counter schema
-        ['perf_schema'][daemontype.id][countername] with keys
-        'nick' (if present)
-        'description'
-        'type' (counter type....counter/gauge/avg/histogram/etc.)
-        '''
-
-        daemon_type = kwargs.get('daemon_type', '')
-        daemon_id = kwargs.get('daemon_id', '')
-
-        schema = self.get_perf_schema(daemon_type, daemon_id)
-        if not schema:
-            self.log.warning('_get_ordered_schema: no data')
-            return
-
-        new_schema = dict()
-        for k1 in schema.keys():    # 'perf_schema', but assume only one
-            for k2 in sorted(schema[k1].keys()):
-                sorted_dict = OrderedDict(
-                    sorted(schema[k1][k2].items(), key=lambda i: i[0])
-                )
-                new_schema[k2] = sorted_dict
-        for k in sorted(new_schema.keys()):
-            self.log.debug("updating schema for %s" % k)
-            self.schema[k] = new_schema[k]
-
-    def shutdown(self):
-        self.serving = False
-        pass
-
-    # XXX duplicated from dashboard; factor out?
-    def get_latest(self, daemon_type, daemon_name, stat):
-        data = self.get_counter(daemon_type, daemon_name, stat)[stat]
-        if data:
-            return data[-1][1]
-        else:
-            return 0
-
-    def get_stat(self, daemon, path):
+    def _stattype_to_str(self, stattype):
+
+        typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
+        if typeonly == 0:
+            return 'gauge'
+        if typeonly == self.PERFCOUNTER_LONGRUNAVG:
+            # this lie matches the DaemonState decoding: only val, no counts
+            return 'counter'
+        if typeonly == self.PERFCOUNTER_COUNTER:
+            return 'counter'
+        if typeonly == self.PERFCOUNTER_HISTOGRAM:
+            return 'histogram'
+
+        return ''
+
+    def _setup_static_metrics(self):
+        metrics = {}
+        metrics['health_status'] = Metric(
+            'untyped',
+            'health_status',
+            'Cluster health status'
+        )
+        metrics['mon_quorum_count'] = Metric(
+            'gauge',
+            'mon_quorum_count',
+            'Monitors in quorum'
+        )
+        metrics['osd_metadata'] = Metric(
+            'untyped',
+            'osd_metadata',
+            'OSD Metadata',
+            OSD_METADATA
+        )
 
-        perfcounter = self.schema[daemon][path]
-        stattype = stattype_to_str(perfcounter['type'])
-        # XXX simplify first effort: no histograms
-        # averages are already collapsed to one value for us
-        if not stattype or stattype == 'histogram':
-            self.log.debug('ignoring %s, type %s' % (path, stattype))
-            return
+        # The reason for having this separate to OSD_METADATA is
+        # so that we can stably use the same tag names that
+        # the Prometheus node_exporter does
+        metrics['disk_occupation'] = Metric(
+            'undef',
+            'disk_occupation',
+            'Associate Ceph daemon with disk used',
+            DISK_OCCUPATION
+        )
 
-        if path not in self.metrics:
-            self.metrics[path] = Metric(
-                stattype,
+        metrics['pool_metadata'] = Metric(
+            'untyped',
+            'pool_metadata',
+            'POOL Metadata',
+            POOL_METADATA
+        )
+        for state in OSD_STATUS:
+            path = 'osd_{}'.format(state)
+            self.log.debug("init: creating {}".format(path))
+            metrics[path] = Metric(
+                'untyped',
                 path,
-                perfcounter['description'],
-                ('daemon',),
+                'OSD status {}'.format(state),
+                ('ceph_daemon',)
             )
+        for state in PG_STATES:
+            path = 'pg_{}'.format(state)
+            self.log.debug("init: creating {}".format(path))
+            metrics[path] = Metric(
+                'gauge',
+                path,
+                'PG {}'.format(state),
+            )
+        for state in DF_CLUSTER:
+            path = 'cluster_{}'.format(state)
+            self.log.debug("init: creating {}".format(path))
+            metrics[path] = Metric(
+                'gauge',
+                path,
+                'DF {}'.format(state),
+            )
+        for state in DF_POOL:
+            path = 'pool_{}'.format(state)
+            self.log.debug("init: creating {}".format(path))
+            metrics[path] = Metric(
+                'gauge',
+                path,
+                'DF pool {}'.format(state),
+                ('pool_id',)
+            )
+
+        return metrics
 
-        daemon_type, daemon_id = daemon.split('.')
+    def shutdown(self):
+        self.serving = False
+        pass
 
-        self.metrics[path].set(
-            self.get_latest(daemon_type, daemon_id, path),
-            (daemon,)
+    def get_health(self):
+        health = json.loads(self.get('health')['json'])
+        self.metrics['health_status'].set(
+            health_status_to_number(health['status'])
         )
 
+    def get_df(self):
+        # maybe get the to-be-exported metrics from a config?
+        df = self.get('df')
+        for stat in DF_CLUSTER:
+            path = 'cluster_{}'.format(stat)
+            self.metrics[path].set(df['stats'][stat])
+
+        for pool in df['pools']:
+            for stat in DF_POOL:
+                path = 'pool_{}'.format(stat)
+                self.metrics[path].set(pool['stats'][stat], (pool['id'],))
+
+    def get_quorum_status(self):
+        mon_status = json.loads(self.get('mon_status')['json'])
+        self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))
+
+    def get_pg_status(self):
+        # TODO add per pool status?
+        pg_s = self.get('pg_summary')['all']
+        reported_pg_s = [(s, v) for key, v in pg_s.items()
+                         for s in key.split('+')]
+        for state, value in reported_pg_s:
+            path = 'pg_{}'.format(state)
+            self.metrics[path].set(value)
+        reported_states = [s[0] for s in reported_pg_s]
+        for state in PG_STATES:
+            path = 'pg_{}'.format(state)
+            if state not in reported_states:
+                self.metrics[path].set(0)
+
+    def get_metadata_and_osd_status(self):
+        osd_map = self.get('osd_map')
+        osd_devices = self.get('osd_map_crush')['devices']
+        for osd in osd_map['osds']:
+            id_ = osd['osd']
+            p_addr = osd['public_addr'].split(':')[0]
+            c_addr = osd['cluster_addr'].split(':')[0]
+            dev_class = next((osd for osd in osd_devices if osd['id'] == id_))
+            self.metrics['osd_metadata'].set(0, (
+                c_addr,
+                dev_class['class'],
+                id_,
+                p_addr
+            ))
+            for state in OSD_STATUS:
+                status = osd[state]
+                self.metrics['osd_{}'.format(state)].set(
+                    status,
+                    ('osd.{}'.format(id_),))
+
+            osd_metadata = self.get_metadata("osd", str(id_))
+            dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
+            osd_dev_node = None
+            for dev_key in dev_keys:
+                val = osd_metadata.get(dev_key, None)
+                if val and val != "unknown":
+                    osd_dev_node = val
+                    break
+            osd_hostname = osd_metadata.get('hostname', None)
+            if osd_dev_node and osd_hostname:
+                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
+                    id_, osd_hostname, osd_dev_node))
+                self.metrics['disk_occupation'].set(0, (
+                    osd_hostname,
+                    osd_dev_node,
+                    "osd.{0}".format(id_)
+                ))
+            else:
+                self.log.info("Missing dev node metadata for osd {0}, skipping "
+                               "occupation record for this osd".format(id_))
+
+        for pool in osd_map['pools']:
+            id_ = pool['pool']
+            name = pool['pool_name']
+            self.metrics['pool_metadata'].set(0, (id_, name))
+
     def collect(self):
-        for daemon in self.schema.keys():
-            for path in self.schema[daemon].keys():
-                self.get_stat(daemon, path)
+        self.get_health()
+        self.get_df()
+        self.get_quorum_status()
+        self.get_metadata_and_osd_status()
+        self.get_pg_status()
+
+        for daemon, counters in self.get_all_perf_counters().iteritems():
+            for path, counter_info in counters.items():
+                stattype = self._stattype_to_str(counter_info['type'])
+                # XXX simplify first effort: no histograms
+                # averages are already collapsed to one value for us
+                if not stattype or stattype == 'histogram':
+                    self.log.debug('ignoring %s, type %s' % (path, stattype))
+                    continue
+
+                if path not in self.metrics:
+                    self.metrics[path] = Metric(
+                        stattype,
+                        path,
+                        counter_info['description'],
+                        ("ceph_daemon",),
+                    )
+
+                self.metrics[path].set(
+                    counter_info['value'],
+                    (daemon,)
+                )
+
         return self.metrics
 
-    def notify(self, ntype, nid):
-        ''' Just try to sync and not run until we're notified once '''
-        if not self.notified:
-            self.serving = True
-            self.notified = True
-        if ntype == 'perf_schema_update':
-            daemon_type, daemon_id = nid.split('.')
-            self._get_ordered_schema(
-                daemon_type=daemon_type,
-                daemon_id=daemon_id
-            )
+    def handle_command(self, cmd):
+        if cmd['prefix'] == 'prometheus self-test':
+            self.collect()
+            return 0, '', 'Self-test OK'
+        else:
+            return (-errno.EINVAL, '',
+                    "Command not found '{0}'".format(cmd['prefix']))
 
     def serve(self):
 
@@ -226,6 +372,17 @@ class Module(MgrModule):
 
             @cherrypy.expose
             def index(self):
+                return '''<!DOCTYPE html>
+<html>
+       <head><title>Ceph Exporter</title></head>
+       <body>
+               <h1>Ceph Exporter</h1>
+               <p><a href='/metrics'>Metrics</a></p>
+       </body>
+</html>'''
+
+            @cherrypy.expose
+            def metrics(self):
                 metrics = global_instance().collect()
                 cherrypy.response.headers['Content-Type'] = 'text/plain'
                 if metrics:
@@ -237,13 +394,10 @@ class Module(MgrModule):
             "server_addr: %s server_port: %s" %
             (server_addr, server_port)
         )
-        # wait for first notification (of any kind) to start up
-        while not self.serving:
-            time.sleep(1)
 
         cherrypy.config.update({
             'server.socket_host': server_addr,
-            'server.socket_port': server_port,
+            'server.socket_port': int(server_port),
             'engine.autoreload.on': False
         })
         cherrypy.tree.mount(Root(), "/")
index 63fcae72a5a21c384166ae91c026ddcc770b5acb..6ce610b881f4d06d98ae0e25fca3e84721dfeb47 100644 (file)
@@ -10,6 +10,7 @@ import inspect
 import tempfile
 import threading
 import traceback
+import socket
 
 import common
 
@@ -26,6 +27,9 @@ from mgr_module import MgrModule, CommandResult
 instance = None
 
 
+class CannotServe(Exception):
+    pass
+
 
 class CommandsRequest(object):
     """
@@ -247,6 +251,8 @@ class Module(MgrModule):
             try:
                 self._serve()
                 self.server.socket.close()
+            except CannotServe as cs:
+                self.log.warn("server not running: {0}".format(cs.message))
             except:
                 self.log.error(str(traceback.format_exc()))
 
@@ -272,7 +278,8 @@ class Module(MgrModule):
 
         server_addr = self.get_localized_config('server_addr', '::')
         if server_addr is None:
-            raise RuntimeError('no server_addr configured; try "ceph config-key set mgr/restful/server_addr <ip>"')
+            raise CannotServe('no server_addr configured; try "ceph config-key set mgr/restful/server_addr <ip>"')
+
         server_port = int(self.get_localized_config('server_port', '8003'))
         self.log.info('server_addr: %s server_port: %d',
                       server_addr, server_port)
@@ -296,11 +303,18 @@ class Module(MgrModule):
             pkey_fname = self.get_localized_config('key_file')
 
         if not cert_fname or not pkey_fname:
-            raise RuntimeError('no certificate configured')
+            raise CannotServe('no certificate configured')
         if not os.path.isfile(cert_fname):
-            raise RuntimeError('certificate %s does not exist' % cert_fname)
+            raise CannotServe('certificate %s does not exist' % cert_fname)
         if not os.path.isfile(pkey_fname):
-            raise RuntimeError('private key %s does not exist' % pkey_fname)
+            raise CannotServe('private key %s does not exist' % pkey_fname)
+
+        # Publish the URI that others may use to access the service we're
+        # about to start serving
+        self.set_uri("https://{0}:{1}/".format(
+            socket.gethostname() if server_addr == "::" else server_addr,
+            server_port
+        ))
 
         # Create the HTTPS werkzeug server serving pecan app
         self.server = make_server(
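
The CannotServe change turns misconfiguration from an unhandled RuntimeError
into a logged, non-fatal condition; the pattern in isolation (excerpted from
the hunks above, not standalone code):

    try:
        self._serve()            # raises CannotServe on missing addr/cert
    except CannotServe as cs:
        # expected misconfiguration: warn and idle instead of letting the
        # generic handler dump a full traceback
        self.log.warn("server not running: {0}".format(cs.message))
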
diff --git a/ceph/src/pybind/mgr/selftest/__init__.py b/ceph/src/pybind/mgr/selftest/__init__.py
new file mode 100644 (file)
index 0000000..622a611
--- /dev/null
@@ -0,0 +1,3 @@
+
+from module import *
+
diff --git a/ceph/src/pybind/mgr/selftest/module.py b/ceph/src/pybind/mgr/selftest/module.py
new file mode 100644 (file)
index 0000000..e289aee
--- /dev/null
@@ -0,0 +1,217 @@
+
+from mgr_module import MgrModule, CommandResult
+import threading
+import random
+import json
+import errno
+
+
+class Module(MgrModule):
+    """
+    This module is for testing the ceph-mgr python interface from within
+    a running ceph-mgr daemon.
+
+    It implements a synchronous self-test command for calling the functions
+    in the MgrModule interface one by one, and a background "workload"
+    command for causing the module to perform some thrashing-type
+    activities in its serve() thread.
+    """
+
+    WORKLOAD_COMMAND_SPAM = "command_spam"
+    SHUTDOWN = "shutdown"
+
+    WORKLOADS = (WORKLOAD_COMMAND_SPAM, )
+
+    COMMANDS = [
+            {
+                "cmd": "mgr self-test run",
+                "desc": "Run mgr python interface tests",
+                "perm": "r"
+            },
+            {
+                "cmd": "mgr self-test background start name=workload,type=CephString",
+                "desc": "Activate a background workload (one of {0})".format(
+                    ", ".join(WORKLOADS)),
+                "perm": "r"
+            },
+            {
+                "cmd": "mgr self-test background stop",
+                "desc": "Stop background workload if any is running",
+                "perm": "r"
+            },
+            ]
+
+
+
+    def __init__(self, *args, **kwargs):
+        super(Module, self).__init__(*args, **kwargs)
+        self._event = threading.Event()
+        self._workload = None
+
+    def handle_command(self, command):
+        if command['prefix'] == 'mgr self-test run':
+            self._self_test()
+            return 0, '', 'Self-test succeeded'
+
+        elif command['prefix'] == 'mgr self-test background start':
+            if command['workload'] not in self.WORKLOADS:
+                return (-errno.EINVAL, '',
+                        "Workload not found '{0}'".format(command['workload']))
+            self._workload = command['workload']
+            self._event.set()
+            return 0, '', 'Running `{0}` in background'.format(self._workload)
+
+        elif command['prefix'] == 'mgr self-test background stop':
+            if self._workload:
+                was_running = self._workload
+                self._workload = None
+                self._event.set()
+                return 0, '', 'Stopping background workload `{0}`'.format(
+                        was_running)
+            else:
+                return 0, '', 'No background workload was running'
+
+        else:
+            return (-errno.EINVAL, '',
+                    "Command not found '{0}'".format(command['prefix']))
+
+    def _self_test(self):
+        self.log.info("Running self-test procedure...")
+
+        self._self_test_osdmap()
+        self._self_test_getters()
+        self._self_test_config()
+        self._self_test_misc()
+        self._self_test_perf_counters()
+
+        self.log.info("Finished self-test procedure.")
+
+    def _self_test_getters(self):
+        self.version
+        self.get_context()
+        self.get_mgr_id()
+
+        # In this function, we will assume that the system is in a steady
+        # state, i.e. if a server/service appears in one call, it will
+        # not have gone by the time we call another function referring to it
+
+        objects = [
+                "fs_map",
+                "osdmap_crush_map_text",
+                "osd_map",
+                "config",
+                "mon_map",
+                "service_map",
+                "osd_metadata",
+                "pg_summary",
+                "pg_status",
+                "pg_dump",
+                "df",
+                "osd_stats",
+                "health",
+                "mon_status",
+                "mgr_map"
+                ]
+        for obj in objects:
+            self.get(obj)
+
+        servers = self.list_servers()
+        for server in servers:
+            self.get_server(server['hostname'])
+
+        osdmap = self.get('osd_map')
+        for o in osdmap['osds']:
+            osd_id = o['osd']
+            self.get_metadata("osd", str(osd_id))
+
+        self.get_daemon_status("osd", "0")
+        #send_command
+
+    def _self_test_config(self):
+        # This is not a strong test (can't tell if values really
+        # persisted), it's just for the python interface bit.
+
+        self.set_config("testkey", "testvalue")
+        assert self.get_config("testkey") == "testvalue"
+
+        self.set_localized_config("testkey", "testvalue")
+        assert self.get_localized_config("testkey") == "testvalue"
+
+        self.set_config_json("testjsonkey", {"testblob": 2})
+        assert self.get_config_json("testjsonkey") == {"testblob": 2}
+
+        assert sorted(self.get_config_prefix("test").keys()) == sorted(
+                ["testkey", "testjsonkey"])
+
+    def _self_test_perf_counters(self):
+        self.get_perf_schema("osd", "0")
+        self.get_counter("osd", "0", "osd.op")
+        #get_counter
+        #get_all_perf_counters
+
+    def _self_test_misc(self):
+        self.set_uri("http://this.is.a.test.com")
+        self.set_health_checks({})
+
+    def _self_test_osdmap(self):
+        osdmap = self.get_osdmap()
+        osdmap.get_epoch()
+        osdmap.get_crush_version()
+        osdmap.dump()
+
+        inc = osdmap.new_incremental()
+        osdmap.apply_incremental(inc)
+        inc.get_epoch()
+        inc.dump()
+
+        crush = osdmap.get_crush()
+        crush.dump()
+        crush.get_item_name(-1)
+        crush.get_item_weight(-1)
+        crush.find_takes()
+        crush.get_take_weight_osd_map(-1)
+
+        #osdmap.get_pools_by_take()
+        #osdmap.calc_pg_upmaps()
+        #osdmap.map_pools_pgs_up()
+
+        #inc.set_osd_reweights
+        #inc.set_crush_compat_weight_set_weights
+
+    def shutdown(self):
+        self._workload = self.SHUTDOWN
+        self._event.set()
+
+    def _command_spam(self):
+        self.log.info("Starting command_spam workload...")
+        while not self._event.is_set():
+            osdmap = self.get_osdmap()
+            dump = osdmap.dump()
+            count = len(dump['osds'])
+            i = int(random.random() * count)
+            w = random.random()
+
+            result = CommandResult('')
+            self.send_command(result, 'mon', '', json.dumps({
+                'prefix': 'osd reweight',
+                'id': i,
+                'weight': w
+                }), '')
+
+            crush = osdmap.get_crush().dump()
+            r, outb, outs = result.wait()
+
+        self._event.clear()
+        self.log.info("Ended command_spam workload...")
+
+    def serve(self):
+        while True:
+            if self._workload == self.WORKLOAD_COMMAND_SPAM:
+                self._command_spam()
+            elif self._workload == self.SHUTDOWN:
+                self.log.info("Shutting down...")
+                break
+            else:
+                self.log.info("Waiting for workload request...")
+                self._event.wait()
+                self._event.clear()
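
Given the COMMANDS table above, the module is driven from the CLI via
"ceph mgr self-test run", "ceph mgr self-test background start command_spam"
and "ceph mgr self-test background stop"; dispatch-wise that amounts to a
call like the following (hypothetical direct invocation -- the module
instance only exists inside a running ceph-mgr):

    rc, outb, outs = module.handle_command({
        'prefix': 'mgr self-test background start',
        'workload': 'command_spam',
    })
    assert rc == 0  # outs == 'Running `command_spam` in background'
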
index 35b9dbb66b8170fd04463c3fe4debdf8b0708f14..606fb93e638c281761d8581358c50a4d9ff6e16e 100644 (file)
@@ -58,18 +58,22 @@ class Module(MgrModule):
         """
         return self.BOLD_SEQ + msg + self.RESET_SEQ
 
-    def format_dimless(self, n, width, colored=True):
+    def format_units(self, n, width, colored, decimal):
         """
         Format a number without units, so as to fit into `width` characters, substituting
         an appropriate unit suffix.
+
+        Use decimal for dimensionless things, use base 2 (decimal=False)
+        for byte sizes/rates.
+        """
+
+        factor = 1000 if decimal else 1024
         units = [' ', 'k', 'M', 'G', 'T', 'P']
         unit = 0
-        while len("%s" % (int(n) // (1000**unit))) > width - 1:
+        while len("%s" % (int(n) // (factor**unit))) > width - 1:
             unit += 1
 
         if unit > 0:
-            truncated_float = ("%f" % (n / (1000.0 ** unit)))[0:width - 1]
+            truncated_float = ("%f" % (n / (float(factor) ** unit)))[0:width - 1]
             if truncated_float[-1] == '.':
                 truncated_float = " " + truncated_float[0:-1]
         else:
@@ -86,6 +90,12 @@ class Module(MgrModule):
         else:
             return formatted
 
+    def format_dimless(self, n, width, colored=True):
+        return self.format_units(n, width, colored, decimal=True)
+    
+    def format_bytes(self, n, width, colored=True):
+        return self.format_units(n, width, colored, decimal=False)
+        
     def get_latest(self, daemon_type, daemon_name, stat):
         data = self.get_counter(daemon_type, daemon_name, stat)[stat]
         #self.log.error("get_latest {0} data={1}".format(stat, data))
@@ -209,8 +219,8 @@ class Module(MgrModule):
                 stats = pool_stats[pool_id]
                 pools_table.add_row([
                     pools[pool_id]['pool_name'], pool_type,
-                    self.format_dimless(stats['bytes_used'], 5),
-                    self.format_dimless(stats['max_avail'], 5)
+                    self.format_bytes(stats['bytes_used'], 5),
+                    self.format_bytes(stats['max_avail'], 5)
                 ])
 
             output += "{0} - {1} clients\n".format(
@@ -273,13 +283,13 @@ class Module(MgrModule):
             stats = osd_stats[osd_id]
 
             osd_table.add_row([osd_id, metadata['hostname'],
-                               self.format_dimless(stats['kb_used'] * 1024, 5),
-                               self.format_dimless(stats['kb_avail'] * 1024, 5),
+                               self.format_bytes(stats['kb_used'] * 1024, 5),
+                               self.format_bytes(stats['kb_avail'] * 1024, 5),
                                self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_w") +
                                self.get_rate("osd", osd_id.__str__(), "osd.op_rw"), 5),
-                               self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_in_bytes"), 5),
+                               self.format_bytes(self.get_rate("osd", osd_id.__str__(), "osd.op_in_bytes"), 5),
                                self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_r"), 5),
-                               self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_out_bytes"), 5),
+                               self.format_bytes(self.get_rate("osd", osd_id.__str__(), "osd.op_out_bytes"), 5),
                                ])
 
         return 0, "", osd_table.get_string()
index eb2eba11a1ffe56e0348a2fb4b6dcdf194a24604..c2125a942eed19b07033f4bc5ff59535f67d4086 100644 (file)
@@ -12,7 +12,10 @@ from mgr_module import MgrModule
 
 
 def avg(data):
-    return sum(data) / float(len(data))
+    if len(data):
+        return sum(data) / float(len(data))
+    else:
+        return 0
 
 
 class ZabbixSender(object):
@@ -257,11 +260,12 @@ class Module(MgrModule):
         while self.run:
             self.log.debug('Waking up for new iteration')
 
-            # Sometimes fetching data fails, should be fixed by PR #16020
             try:
                 self.send()
             except Exception as exc:
-                self.log.error(exc)
+                # Shouldn't happen, but let's log it and retry next interval,
+                # rather than dying completely.
+                self.log.exception("Unexpected error during send():")
 
             interval = self.config['interval']
             self.log.debug('Sleeping for %d seconds', interval)
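
The avg() guard above prevents a ZeroDivisionError when a counter has not yet
produced any samples, and the new exception handler keeps one failed send()
from killing the module. The division guard in brief:

    def avg(data):
        return sum(data) / float(len(data)) if data else 0

    print(avg([]))         # 0 rather than ZeroDivisionError
    print(avg([2, 4, 9]))  # 5.0
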
index 25de4648398d62aef955c92c3d60a88b26aff6d3..3b840aef94c7b2962a204d13703637c677ac5b35 100755 (executable)
@@ -58,19 +58,19 @@ do_map() {
 
 unmount_unmap() {
        local rbd_dev=$1
-       local mnt=$(findmnt --mtab --source ${rbd_dev} --noheadings \
+       local mnts=$(findmnt --mtab --source ${rbd_dev} --noheadings \
                                                        | awk '{print $1'})
 
        logger -p "daemon.debug" -t rbdmap "Unmapping '${rbd_dev}'"
-       if [ -n "${mnt}" ]; then
+       for mnt in ${mnts}; do
            logger -p "daemon.debug" -t rbdmap "Unmounting '${mnt}'"
            umount "${mnt}" >>/dev/null 2>&1
-       fi
-       if mountpoint -q "${mnt}"; then
-           ## Un-mounting failed.
-           logger -p "daemon.warning" -t rbdmap "Failed to unmount '${mnt}'"
-           return 1
-       fi
+           if mountpoint -q "${mnt}"; then
+                 ## Un-mounting failed.
+                 logger -p "daemon.warning" -t rbdmap "Failed to unmount '${mnt}'"
+                 return 1
+           fi
+       done
        ## Un-mapping.
        rbd unmap $rbd_dev >>/dev/null 2>&1
        if [ $? -ne 0 ]; then
index 9c0826822de8e28a2966749fd499192fb4e3d6e6..bd770160710c169d7e17c2b6c50ff5b05b39acd2 100644 (file)
@@ -9,6 +9,7 @@
 #include <boost/optional.hpp>
 
 #include "auth/Crypto.h"
+#include "compressor/Compressor.h"
 
 #include "common/armor.h"
 #include "common/ceph_json.h"
@@ -4250,6 +4251,13 @@ int main(int argc, const char **argv)
           cerr << "ERROR: --placement-id not specified" << std::endl;
           return EINVAL;
         }
+        // validate compression type
+        if (compression_type && *compression_type != "random"
+            && !Compressor::get_comp_alg_type(*compression_type)) {
+          std::cerr << "Unrecognized compression type" << std::endl;
+          return EINVAL;
+        }
+
        RGWZoneParams zone(zone_id, zone_name);
        int ret = zone.init(g_ceph_context, store);
         if (ret < 0) {
index 9369864259f045a00465cc37f6dee059c24c5fd4..1c851a933e1138baf7528428f113acc5663da3ae 100644 (file)
@@ -36,7 +36,7 @@ class ExternalAuthStrategy : public rgw::auth::Strategy,
   using keystone_cache_t = rgw::keystone::TokenCache;
   using EC2Engine = rgw::auth::keystone::EC2Engine;
 
-  EC2Engine keystone_engine;
+  boost::optional <EC2Engine> keystone_engine;
   LDAPEngine ldap_engine;
 
   aplptr_t create_apl_remote(CephContext* const cct,
@@ -56,16 +56,18 @@ public:
                        RGWRados* const store,
                        AWSEngine::VersionAbstractor* const ver_abstractor)
     : store(store),
-      keystone_engine(cct, ver_abstractor,
-                      static_cast<rgw::auth::RemoteApplier::Factory*>(this),
-                      keystone_config_t::get_instance(),
-                      keystone_cache_t::get_instance<keystone_config_t>()),
       ldap_engine(cct, store, *ver_abstractor,
                   static_cast<rgw::auth::RemoteApplier::Factory*>(this)) {
 
     if (cct->_conf->rgw_s3_auth_use_keystone &&
         ! cct->_conf->rgw_keystone_url.empty()) {
-      add_engine(Control::SUFFICIENT, keystone_engine);
+
+      keystone_engine.emplace(cct, ver_abstractor,
+                              static_cast<rgw::auth::RemoteApplier::Factory*>(this),
+                              keystone_config_t::get_instance(),
+                              keystone_cache_t::get_instance<keystone_config_t>());
+      add_engine(Control::SUFFICIENT, *keystone_engine);
+
     }
 
     if (cct->_conf->rgw_s3_auth_use_ldap &&
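
Wrapping keystone_engine in boost::optional defers its construction until the
configuration actually enables Keystone authentication, so an unused engine
is never built at all. The same lazy-construction idea sketched in Python
(ExpensiveEngine and the config keys are stand-ins, not RGW types):

    class ExpensiveEngine(object):
        def __init__(self, url):
            print("connecting to %s" % url)  # imagine costly setup here
            self.url = url

    class AuthStrategy(object):
        def __init__(self, conf):
            self.engines = []
            self.keystone = None  # analogous to an unset boost::optional
            if conf.get('use_keystone') and conf.get('keystone_url'):
                # analogous to optional::emplace -- built only when enabled
                self.keystone = ExpensiveEngine(conf['keystone_url'])
                self.engines.append(self.keystone)

    AuthStrategy({'use_keystone': False})     # engine never constructed
    AuthStrategy({'use_keystone': True,
                  'keystone_url': 'http://keystone:5000'})
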
index 31e9d3a32acb4d4aac93aac5cadd993f44468e70..b4707b1a7088770b95b695a15ae5ac54f82eb360 100644 (file)
@@ -160,12 +160,10 @@ public:
   }
 
   const std::string& get_tenant() const {
-    ceph_assert(t != Wildcard);
     return u.tenant;
   }
 
   const std::string& get_id() const {
-    ceph_assert(t != Wildcard && t != Tenant);
     return u.id;
   }
 
index 0da6dd587b672226d14d35bdd9967d840954d713..2a38733aa2966740b3dcbc1dec48eba6e8fb56fe 100644 (file)
@@ -106,10 +106,9 @@ int rgw_read_user_buckets(RGWRados * store,
 {
   int ret;
   buckets.clear();
-  string buckets_obj_id;
+  std::string buckets_obj_id;
   rgw_get_buckets_obj(user_id, buckets_obj_id);
   rgw_raw_obj obj(store->get_zone_params().user_uid_pool, buckets_obj_id);
-  list<cls_user_bucket_entry> entries;
 
   bool truncated = false;
   string m = marker;
@@ -121,15 +120,18 @@ int rgw_read_user_buckets(RGWRados * store,
   }
 
   do {
+    std::list<cls_user_bucket_entry> entries;
     ret = store->cls_user_list_buckets(obj, m, end_marker, max - total, entries, &m, &truncated);
-    if (ret == -ENOENT)
+    if (ret == -ENOENT) {
       ret = 0;
+    }
 
-    if (ret < 0)
+    if (ret < 0) {
       return ret;
+    }
 
-    for (const auto& entry : entries) {
-      buckets.add(RGWBucketEnt(user_id, entry));
+    for (auto& entry : entries) {
+      buckets.add(RGWBucketEnt(user_id, std::move(entry)));
       total++;
     }
 
@@ -178,7 +180,11 @@ int rgw_bucket_sync_user_stats(RGWRados *store, const string& tenant_name, const
   return 0;
 }
 
-int rgw_link_bucket(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket, real_time creation_time, bool update_entrypoint)
+int rgw_link_bucket(RGWRados* const store,
+                    const rgw_user& user_id,
+                    rgw_bucket& bucket,
+                    ceph::real_time creation_time,
+                    bool update_entrypoint)
 {
   int ret;
   string& tenant_name = bucket.tenant;
@@ -476,7 +482,8 @@ void check_bad_user_bucket_mapping(RGWRados *store, const rgw_user& user_id,
         cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl;
         if (fix) {
           cout << "fixing" << std::endl;
-          r = rgw_link_bucket(store, user_id, actual_bucket, bucket_info.creation_time);
+          r = rgw_link_bucket(store, user_id, actual_bucket,
+                              bucket_info.creation_time);
           if (r < 0) {
             cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl;
           }
@@ -893,7 +900,8 @@ int RGWBucket::link(RGWBucketAdminOpState& op_state, std::string *err_msg)
       return r;
     }
 
-    r = rgw_link_bucket(store, user_info.user_id, bucket_info.bucket, real_time());
+    r = rgw_link_bucket(store, user_info.user_id, bucket_info.bucket,
+                        ceph::real_time());
     if (r < 0) {
       return r;
     }
@@ -2121,7 +2129,8 @@ public:
     if (be.linked) {
       ret = rgw_link_bucket(store, be.owner, be.bucket, be.creation_time, false);
     } else {
-      ret = rgw_unlink_bucket(store, be.owner, be.bucket.tenant, be.bucket.name, false);
+      ret = rgw_unlink_bucket(store, be.owner, be.bucket.tenant,
+                              be.bucket.name, false);
     }
 
     return ret;
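
The rewritten loop in rgw_read_user_buckets() is standard marker-based
pagination: call the listing op with the last marker until it stops reporting
truncation, treating ENOENT as an empty result. The shape of the loop,
generically (fetch is a stand-in for cls_user_list_buckets, not a real API):

    def read_all(fetch, max_entries=1000):
        """fetch(marker, limit) -> (entries, next_marker, truncated)"""
        out = []
        marker = ''
        truncated = True
        while truncated and len(out) < max_entries:
            entries, marker, truncated = fetch(marker, max_entries - len(out))
            out.extend(entries)
        return out

    pages = iter([(['a', 'b'], 'b', True), (['c'], '', False)])
    print(read_all(lambda m, n: next(pages)))  # ['a', 'b', 'c']
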
index 6cc20eb8f6149107e5b6900fc1b85f2c3accb978..62b29232588279b64f93c1af978d2e16e369794c 100644 (file)
@@ -104,10 +104,14 @@ public:
  */
 class RGWUserBuckets
 {
-  map<string, RGWBucketEnt> buckets;
+  std::map<std::string, RGWBucketEnt> buckets;
 
 public:
-  RGWUserBuckets() {}
+  RGWUserBuckets() = default;
+  RGWUserBuckets(RGWUserBuckets&&) = default;
+
+  RGWUserBuckets& operator=(const RGWUserBuckets&) = default;
+
   void encode(bufferlist& bl) const {
     ::encode(buckets, bl);
   }
@@ -172,7 +176,11 @@ extern int rgw_read_user_buckets(RGWRados *store,
                                 bool* is_truncated,
                                  uint64_t default_amount = 1000);
 
-extern int rgw_link_bucket(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket, real_time creation_time, bool update_entrypoint = true);
+extern int rgw_link_bucket(RGWRados* store,
+                           const rgw_user& user_id,
+                           rgw_bucket& bucket,
+                           ceph::real_time creation_time,
+                           bool update_entrypoint = true);
 extern int rgw_unlink_bucket(RGWRados *store, const rgw_user& user_id,
                              const string& tenant_name, const string& bucket_name, bool update_entrypoint = true);
 
index 02b807efc1e4281da9362dff523ce39737032694..d10577ef455cb19900b9b6aabcfb019254901ef6 100644 (file)
@@ -76,6 +76,7 @@ rgw_http_errors rgw_http_s3_errors({
     { ERR_AMZ_CONTENT_SHA256_MISMATCH, {400, "XAmzContentSHA256Mismatch" }},
     { ERR_INVALID_TAG, {400, "InvalidTag"}},
     { ERR_MALFORMED_ACL_ERROR, {400, "MalformedACLError" }},
+    { ERR_INVALID_ENCRYPTION_ALGORITHM, {400, "InvalidEncryptionAlgorithmError" }},
     { ERR_LENGTH_REQUIRED, {411, "MissingContentLength" }},
     { EACCES, {403, "AccessDenied" }},
     { EPERM, {403, "AccessDenied" }},
@@ -118,11 +119,14 @@ rgw_http_errors rgw_http_s3_errors({
 rgw_http_errors rgw_http_swift_errors({
     { EACCES, {403, "AccessDenied" }},
     { EPERM, {401, "AccessDenied" }},
+    { ENAMETOOLONG, {400, "Metadata name too long" }},
     { ERR_USER_SUSPENDED, {401, "UserSuspended" }},
     { ERR_INVALID_UTF8, {412, "Invalid UTF8" }},
     { ERR_BAD_URL, {412, "Bad URL" }},
     { ERR_NOT_SLO_MANIFEST, {400, "Not an SLO manifest" }},
     { ERR_QUOTA_EXCEEDED, {413, "QuotaExceeded" }},
+    { ENOTEMPTY, {409, "There was a conflict when trying "
+                       "to complete your request." }},
     /* FIXME(rzarzynski): we need to find a way to apply Swift's error handling
     * procedures also for ERR_ZERO_IN_URL. This is a problem as the validation
      * is performed very early, even before setting the req_state::proto_flags. */
@@ -341,7 +345,17 @@ void set_req_state_err(struct req_state* s, int err_no, const string& err_msg)
 {
   if (s) {
     set_req_state_err(s, err_no);
-    s->err.message = err_msg;
+    if (s->prot_flags & RGW_REST_SWIFT && !err_msg.empty()) {
+      /* TODO(rzarzynski): there should never be a check like this one.
+       * It's here only for the sake of the patch's backportability. Further
+       * commits will move the logic to a per-RGWHandler replacement of
+       * the end_header() function. Alternatively, we might consider doing
+       * that just for the dump(). Please take a look at @cbodley's comments
+       * in PR #10690 (https://github.com/ceph/ceph/pull/10690). */
+      s->err.err_code = err_msg;
+    } else {
+      s->err.message = err_msg;
+    }
   }
 }
 
index 8d299cf39d475c4818491ec3c6c3fee0ef785d20..6146d1cb33cc720d5a7cde51f6d0e069355c30ad 100644 (file)
@@ -216,6 +216,7 @@ using ceph::crypto::MD5;
 #define ERR_ZERO_IN_URL          2211
 #define ERR_MALFORMED_ACL_ERROR  2212
 #define ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION 2213
+#define ERR_INVALID_ENCRYPTION_ALGORITHM                 2214
 
 #define ERR_BUSY_RESHARDING      2300
 
@@ -437,6 +438,7 @@ enum RGWOpType {
   RGW_OP_STAT_ACCOUNT,
   RGW_OP_LIST_BUCKET,
   RGW_OP_GET_BUCKET_LOGGING,
+  RGW_OP_GET_BUCKET_LOCATION,
   RGW_OP_GET_BUCKET_VERSIONING,
   RGW_OP_SET_BUCKET_VERSIONING,
   RGW_OP_GET_BUCKET_WEBSITE,
@@ -774,11 +776,12 @@ struct RGWUserInfo
 WRITE_CLASS_ENCODER(RGWUserInfo)
 
 struct rgw_pool {
-  string name;
-  string ns;
+  std::string name;
+  std::string ns;
 
-  rgw_pool() {}
+  rgw_pool() = default;
   rgw_pool(const rgw_pool& _p) : name(_p.name), ns(_p.ns) {}
+  rgw_pool(rgw_pool&&) = default;
   rgw_pool(const string& _s) {
     from_str(_s);
   }
@@ -834,6 +837,8 @@ struct rgw_pool {
     DECODE_FINISH(bl);
   }
 
+  rgw_pool& operator=(const rgw_pool&) = default;
+
   bool operator==(const rgw_pool& p) const {
     return (compare(p) == 0);
   }
@@ -855,10 +860,20 @@ struct rgw_data_placement_target {
   rgw_pool data_extra_pool;
   rgw_pool index_pool;
 
-  rgw_data_placement_target() {}
+  rgw_data_placement_target() = default;
+  rgw_data_placement_target(const rgw_data_placement_target&) = default;
+  rgw_data_placement_target(rgw_data_placement_target&&) = default;
+
+  rgw_data_placement_target(const rgw_pool& data_pool,
+                            const rgw_pool& data_extra_pool,
+                            const rgw_pool& index_pool)
+    : data_pool(data_pool),
+      data_extra_pool(data_extra_pool),
+      index_pool(index_pool) {
+  }
 
-  rgw_data_placement_target(const rgw_pool& _data_pool, const rgw_pool& _data_extra_pool, const rgw_pool& _index_pool) 
-         : data_pool(_data_pool), data_extra_pool(_data_extra_pool), index_pool(_index_pool) {}
+  rgw_data_placement_target&
+  operator=(const rgw_data_placement_target&) = default;
 
   const rgw_pool& get_data_extra_pool() const {
     if (data_extra_pool.empty()) {
@@ -984,6 +999,8 @@ struct rgw_bucket {
     explicit_placement(b.explicit_placement.data_pool,
                        b.explicit_placement.data_extra_pool,
                        b.explicit_placement.index_pool) {}
+  rgw_bucket(const rgw_bucket&) = default;
+  rgw_bucket(rgw_bucket&&) = default;
 
   void convert(cls_user_bucket *b) const {
     b->name = name;
@@ -1069,6 +1086,8 @@ struct rgw_bucket {
   void decode_json(JSONObj *obj);
   static void generate_test_instances(list<rgw_bucket*>& o);
 
+  rgw_bucket& operator=(const rgw_bucket&) = default;
+
   bool operator<(const rgw_bucket& b) const {
     return name.compare(b.name) < 0;
   }
@@ -1860,19 +1879,31 @@ struct RGWBucketEnt {
   rgw_bucket bucket;
   size_t size;
   size_t size_rounded;
-  real_time creation_time;
+  ceph::real_time creation_time;
   uint64_t count;
 
-  RGWBucketEnt() : size(0), size_rounded(0), count(0) {}
+  /* The placement_rule is necessary to calculate per-storage-policy statistics
+   * for the Swift API. Although the info is available in RGWBucketInfo, we
+   * duplicate it here so as not to affect the performance of bucket listing. */
+  std::string placement_rule;
 
-  explicit RGWBucketEnt(const rgw_user& u, const cls_user_bucket_entry& e)
-    : bucket(u, e.bucket),
+  RGWBucketEnt()
+    : size(0),
+      size_rounded(0),
+      count(0) {
+  }
+  RGWBucketEnt(const RGWBucketEnt&) = default;
+  RGWBucketEnt(RGWBucketEnt&&) = default;
+  explicit RGWBucketEnt(const rgw_user& u, cls_user_bucket_entry&& e)
+    : bucket(u, std::move(e.bucket)),
       size(e.size),
       size_rounded(e.size_rounded),
       creation_time(e.creation_time),
       count(e.count) {
   }
 
+  RGWBucketEnt& operator=(const RGWBucketEnt&) = default;
+
   void convert(cls_user_bucket_entry *b) const {
     bucket.convert(&b->bucket);
     b->size = size;
@@ -1882,7 +1913,7 @@ struct RGWBucketEnt {
   }
 
   void encode(bufferlist& bl) const {
-    ENCODE_START(6, 5, bl);
+    ENCODE_START(7, 5, bl);
     uint64_t s = size;
     __u32 mt = ceph::real_clock::to_time_t(creation_time);
     string empty_str;  // originally had the bucket name here, but we encode bucket later
@@ -1894,6 +1925,7 @@ struct RGWBucketEnt {
     s = size_rounded;
     ::encode(s, bl);
     ::encode(creation_time, bl);
+    ::encode(placement_rule, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
@@ -1917,6 +1949,8 @@ struct RGWBucketEnt {
     size_rounded = s;
     if (struct_v >= 6)
       ::decode(creation_time, bl);
+    if (struct_v >= 7)
+      ::decode(placement_rule, bl);
     DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
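
The RGWBucketEnt encoder bump (ENCODE_START(7, 5, ...) paired with a
struct_v >= 7 guard in decode) is Ceph's usual compatible-versioning pattern:
new fields are appended, and older decoders simply stop before them. The idea
in miniature, as a Python sketch rather than Ceph's real wire format:

    import json

    def encode(ent, version=7):
        payload = {'v': version, 'size': ent['size']}
        if version >= 7:
            payload['placement_rule'] = ent.get('placement_rule', '')
        return json.dumps(payload)

    def decode(blob):
        p = json.loads(blob)
        ent = {'size': p['size']}
        if p['v'] >= 7:            # like: if (struct_v >= 7) ::decode(...)
            ent['placement_rule'] = p['placement_rule']
        return ent

    print(decode(encode({'size': 42}, version=6)))  # no placement_rule field
    print(decode(encode({'size': 42, 'placement_rule': 'default-placement'})))
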
index 4d5b00ff6baafd0edce91996d6e64c49bed9530c..81a84ad698af6939832aa81fd9badabc8efbeee4 100644 (file)
@@ -905,7 +905,16 @@ static int get_actual_key_from_kms(CephContext *cct,
 
   map<string, string>::iterator it = str_map.find(std::string(key_id));
   if (it != str_map.end() ) {
-    std::string master_key = from_base64((*it).second);
+    std::string master_key;
+    try {
+      master_key = from_base64((*it).second);
+    } catch (...) {
+      ldout(cct, 5) << "ERROR: get_actual_key_from_kms invalid encryption key id "
+                    << "which contains character that is not base64 encoded."
+                    << dendl;
+      return -EINVAL;
+    }
+
     if (master_key.length() == AES_256_KEYSIZE) {
       uint8_t _actual_key[AES_256_KEYSIZE];
       if (AES_256_ECB_encrypt(cct,
@@ -1025,26 +1034,57 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
         ldout(s->cct, 5) << "ERROR: Invalid value for header "
                          << "x-amz-server-side-encryption-customer-algorithm"
                          << dendl;
-        return -ERR_INVALID_REQUEST;
+        s->err.message = "The requested encryption algorithm is not valid, must be AES256.";
+        return -ERR_INVALID_ENCRYPTION_ALGORITHM;
       }
       if (s->cct->_conf->rgw_crypt_require_ssl &&
           !s->info.env->exists("SERVER_PORT_SECURE")) {
         ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
         return -ERR_INVALID_REQUEST;
       }
-      std::string key_bin = from_base64(
+
+      std::string key_bin;
+      try {
+        key_bin = from_base64(
           get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY) );
+      } catch (...) {
+        ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption "
+                         << "key which contains character that is not base64 encoded."
+                         << dendl;
+        s->err.message = "Requests specifying Server Side Encryption with Customer "
+                         "provided keys must provide an appropriate secret key.";
+        return -EINVAL;
+      }
+
       if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) {
         ldout(s->cct, 5) << "ERROR: invalid encryption key size" << dendl;
-        return -ERR_INVALID_REQUEST;
+        s->err.message = "Requests specifying Server Side Encryption with Customer "
+                         "provided keys must provide an appropriate secret key.";
+        return -EINVAL;
       }
+
       boost::string_view keymd5 =
           get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5);
-      std::string keymd5_bin = from_base64(keymd5);
+
+      std::string keymd5_bin;
+      try {
+        keymd5_bin = from_base64(keymd5);
+      } catch (...) {
+        ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption key "
+                         << "md5 which contains character that is not base64 encoded."
+                         << dendl;
+        s->err.message = "Requests specifying Server Side Encryption with Customer "
+                         "provided keys must provide an appropriate secret key md5.";
+        return -EINVAL;
+      }
+
       if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) {
         ldout(s->cct, 5) << "ERROR: Invalid key md5 size" << dendl;
-        return -ERR_INVALID_DIGEST;
+        s->err.message = "Requests specifying Server Side Encryption with Customer "
+                         "provided keys must provide an appropriate secret key md5.";
+        return -EINVAL;
       }
+
       MD5 key_hash;
       byte key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
       key_hash.Update(reinterpret_cast<const byte*>(key_bin.c_str()), key_bin.size());
@@ -1052,7 +1092,8 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
 
       if (memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) {
         ldout(s->cct, 5) << "ERROR: Invalid key md5 hash" << dendl;
-        return -ERR_INVALID_DIGEST;
+        s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided.";
+        return -EINVAL;
       }
 
       set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-C-AES256");
@@ -1067,7 +1108,30 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
       crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256";
       crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = keymd5.to_string();
       return 0;
+    } else {
+      boost::string_view customer_key =
+          get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY);
+      if (!customer_key.empty()) {
+        ldout(s->cct, 5) << "ERROR: SSE-C encryption request is missing the header "
+                         << "x-amz-server-side-encryption-customer-algorithm"
+                         << dendl;
+        s->err.message = "Requests specifying Server Side Encryption with Customer "
+                         "provided keys must provide a valid encryption algorithm.";
+        return -EINVAL;
+      }
+
+      boost::string_view customer_key_md5 =
+          get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5);
+      if (!customer_key_md5.empty()) {
+        ldout(s->cct, 5) << "ERROR: SSE-C encryption request is missing the header "
+                         << "x-amz-server-side-encryption-customer-algorithm"
+                         << dendl;
+        s->err.message = "Requests specifying Server Side Encryption with Customer "
+                         "provided keys must provide a valid encryption algorithm.";
+        return -EINVAL;
+      }
     }
+
     /* AMAZON server side encryption with KMS (key management service) */
     boost::string_view req_sse =
         get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION);
@@ -1075,7 +1139,9 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
       if (req_sse != "aws:kms") {
         ldout(s->cct, 5) << "ERROR: Invalid value for header x-amz-server-side-encryption"
                          << dendl;
-        return -ERR_INVALID_REQUEST;
+        s->err.message = "Server Side Encryption with KMS managed key requires "
+                         "HTTP header x-amz-server-side-encryption : aws:kms";
+        return -EINVAL;
       }
       if (s->cct->_conf->rgw_crypt_require_ssl &&
           !s->info.env->exists("SERVER_PORT_SECURE")) {
@@ -1085,17 +1151,24 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
       boost::string_view key_id =
           get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID);
       if (key_id.empty()) {
+        ldout(s->cct, 5) << "ERROR: not provide a valid key id" << dendl;
+        s->err.message = "Server Side Encryption with KMS managed key requires "
+                         "HTTP header x-amz-server-side-encryption-aws-kms-key-id";
         return -ERR_INVALID_ACCESS_KEY;
       }
       /* try to retrieve actual key */
       std::string key_selector = create_random_key_selector(s->cct);
       std::string actual_key;
       res = get_actual_key_from_kms(s->cct, key_id, key_selector, actual_key);
-      if (res != 0)
+      if (res != 0) {
+        ldout(s->cct, 5) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
+        s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id.to_string();
         return res;
+      }
       if (actual_key.size() != AES_256_KEYSIZE) {
         ldout(s->cct, 5) << "ERROR: key obtained from key_id:" <<
             key_id << " is not 256 bit size" << dendl;
+        s->err.message = "KMS provided an invalid key for the given kms-keyid.";
         return -ERR_INVALID_ACCESS_KEY;
       }
       set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-KMS");
@@ -1108,13 +1181,37 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
         *block_crypt = std::move(aes);
       }
       actual_key.replace(0, actual_key.length(), actual_key.length(), '\000');
+
+      crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms";
+      crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = key_id.to_string();
       return 0;
+    } else {
+      boost::string_view key_id =
+          get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID);
+      if (!key_id.empty()) {
+        ldout(s->cct, 5) << "ERROR: SSE-KMS encryption request is missing the header "
+                         << "x-amz-server-side-encryption"
+                         << dendl;
+        s->err.message = "Server Side Encryption with KMS managed key requires "
+                         "HTTP header x-amz-server-side-encryption : aws:kms";
+        return -EINVAL;
+      }
     }
 
     /* no other encryption mode, check if default encryption is selected */
     if (s->cct->_conf->rgw_crypt_default_encryption_key != "") {
-      std::string master_encryption_key =
-          from_base64(s->cct->_conf->rgw_crypt_default_encryption_key);
+      std::string master_encryption_key;
+      try {
+        master_encryption_key = from_base64(s->cct->_conf->rgw_crypt_default_encryption_key);
+      } catch (...) {
+        ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid default encryption key "
+                         << "which contains character that is not base64 encoded."
+                         << dendl;
+        s->err.message = "Requests specifying Server Side Encryption with Customer "
+                         "provided keys must provide an appropriate secret key.";
+        return -EINVAL;
+      }
+
       if (master_encryption_key.size() != 256 / 8) {
         ldout(s->cct, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl;
         /* not an error to return; missing encryption does not inhibit processing */
@@ -1170,26 +1267,58 @@ int rgw_s3_prepare_decrypt(struct req_state* s,
     const char *req_cust_alg =
         s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM", NULL);
 
-    if ((nullptr == req_cust_alg) || (strcmp(req_cust_alg, "AES256") != 0)) {
-      ldout(s->cct, 5) << "ERROR: Invalid value for header "
+    if (nullptr == req_cust_alg) {
+      ldout(s->cct, 5) << "ERROR: Request for SSE-C encrypted object missing "
                        << "x-amz-server-side-encryption-customer-algorithm"
                        << dendl;
-      return -ERR_INVALID_REQUEST;
+      s->err.message = "Requests specifying Server Side Encryption with Customer "
+                       "provided keys must provide a valid encryption algorithm.";
+      return -EINVAL;
+    } else if (strcmp(req_cust_alg, "AES256") != 0) {
+      ldout(s->cct, 5) << "ERROR: The requested encryption algorithm is not valid, must be AES256." << dendl;
+      s->err.message = "The requested encryption algorithm is not valid, must be AES256.";
+      return -ERR_INVALID_ENCRYPTION_ALGORITHM;
+    }
+
+    std::string key_bin;
+    try {
+      key_bin = from_base64(s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", ""));
+    } catch (...) {
+      ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key "
+                       << "which contains character that is not base64 encoded."
+                       << dendl;
+      s->err.message = "Requests specifying Server Side Encryption with Customer "
+                       "provided keys must provide an appropriate secret key.";
+      return -EINVAL;
     }
 
-    std::string key_bin =
-        from_base64(s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", ""));
     if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) {
       ldout(s->cct, 5) << "ERROR: Invalid encryption key size" << dendl;
-      return -ERR_INVALID_REQUEST;
+      s->err.message = "Requests specifying Server Side Encryption with Customer "
+                       "provided keys must provide an appropriate secret key.";
+      return -EINVAL;
     }
 
     std::string keymd5 =
         s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5", "");
-    std::string keymd5_bin = from_base64(keymd5);
+    std::string keymd5_bin;
+    try {
+      keymd5_bin = from_base64(keymd5);
+    } catch (...) {
+      ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key md5 "
+                       << "which contains character that is not base64 encoded."
+                       << dendl;
+      s->err.message = "Requests specifying Server Side Encryption with Customer "
+                       "provided keys must provide an appropriate secret key md5.";
+      return -EINVAL;
+    }
+
+
     if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) {
       ldout(s->cct, 5) << "ERROR: Invalid key md5 size " << dendl;
-      return -ERR_INVALID_DIGEST;
+      s->err.message = "Requests specifying Server Side Encryption with Customer "
+                       "provided keys must provide an appropriate secret key md5.";
+      return -EINVAL;
     }
 
     MD5 key_hash;
@@ -1199,7 +1328,8 @@ int rgw_s3_prepare_decrypt(struct req_state* s,
 
     if ((memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) ||
         (get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYMD5) != keymd5_bin)) {
-      return -ERR_INVALID_DIGEST;
+      s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided.";
+      return -EINVAL;
     }
     auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s->cct));
     aes->set_key(reinterpret_cast<const uint8_t*>(key_bin.c_str()), AES_256_CBC::AES_256_KEYSIZE);
@@ -1222,12 +1352,14 @@ int rgw_s3_prepare_decrypt(struct req_state* s,
     std::string actual_key;
     res = get_actual_key_from_kms(s->cct, key_id, key_selector, actual_key);
     if (res != 0) {
-      ldout(s->cct, 10) << "No encryption key for key-id=" << key_id << dendl;
+      ldout(s->cct, 10) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
+      s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id;
       return res;
     }
     if (actual_key.size() != AES_256_KEYSIZE) {
       ldout(s->cct, 0) << "ERROR: key obtained from key_id:" <<
           key_id << " is not 256 bit size" << dendl;
+      s->err.message = "KMS provided an invalid key for the given kms-keyid.";
       return -ERR_INVALID_ACCESS_KEY;
     }
 
@@ -1242,8 +1374,17 @@ int rgw_s3_prepare_decrypt(struct req_state* s,
   }
 
   if (stored_mode == "RGW-AUTO") {
-    std::string master_encryption_key =
-        from_base64(std::string(s->cct->_conf->rgw_crypt_default_encryption_key));
+    std::string master_encryption_key;
+    try {
+      master_encryption_key = from_base64(std::string(s->cct->_conf->rgw_crypt_default_encryption_key));
+    } catch (...) {
+      ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid default encryption key "
+                       << "which contains character that is not base64 encoded."
+                       << dendl;
+      s->err.message = "The default encryption key is not valid base64.";
+      return -EINVAL;
+    }
+
     if (master_encryption_key.size() != 256 / 8) {
       ldout(s->cct, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl;
       return -EIO;
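
A recurring fix in the hunks above: every from_base64() call on user- or
admin-supplied key material is now wrapped in try/catch, so malformed input
yields a clean EINVAL plus an explanatory message instead of an uncaught
exception. The equivalent defensive decode in Python terms:

    import base64
    import binascii

    def from_base64_strict(s):
        try:
            # validate=True rejects characters outside the base64 alphabet
            return base64.b64decode(s, validate=True)
        except (binascii.Error, ValueError):
            raise ValueError('input contains a character that is not '
                             'base64 encoded')

    print(from_base64_strict('c2VjcmV0'))   # b'secret'
    try:
        from_base64_strict('not*base64!')
    except ValueError as e:
        print('rejected:', e)
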
index 8fe6497f29b61a0d2ddc3b3da13b5e351189a6d6..daaffb7cde9eb7b2c10b9eeb9490c7e9c581b588 100644 (file)
@@ -735,19 +735,20 @@ public:
 
   int operate() override {
     reenter(this) {
-      entries_index = new RGWShardedOmapCRManager(sync_env->async_rados, store, this, num_shards,
-                                                 store->get_zone_params().log_pool,
-                                                  oid_prefix);
       yield {
         string entrypoint = string("/admin/metadata/bucket.instance");
         /* FIXME: need a better scaling solution here, requires streaming output */
         call(new RGWReadRESTResourceCR<list<string> >(store->ctx(), sync_env->conn, sync_env->http_manager,
                                                       entrypoint, NULL, &result));
       }
-      if (get_ret_status() < 0) {
+      if (retcode < 0) {
         ldout(sync_env->cct, 0) << "ERROR: failed to fetch metadata for section bucket.index" << dendl;
-        return set_state(RGWCoroutine_Error);
+        return set_cr_error(retcode);
       }
+      entries_index = new RGWShardedOmapCRManager(sync_env->async_rados, store, this, num_shards,
+                                                 store->get_zone_params().log_pool,
+                                                  oid_prefix);
+      yield; // yield so OmapAppendCRs can start
       for (iter = result.begin(); iter != result.end(); ++iter) {
         ldout(sync_env->cct, 20) << "list metadata: section=bucket.index key=" << *iter << dendl;
 
@@ -1595,8 +1596,9 @@ class RGWDataSyncControlCR : public RGWBackoffControlCR
   RGWDataSyncEnv *sync_env;
   uint32_t num_shards;
 
+  static constexpr bool exit_on_error = false; // retry on all errors
 public:
-  RGWDataSyncControlCR(RGWDataSyncEnv *_sync_env, uint32_t _num_shards) : RGWBackoffControlCR(_sync_env->cct, true),
+  RGWDataSyncControlCR(RGWDataSyncEnv *_sync_env, uint32_t _num_shards) : RGWBackoffControlCR(_sync_env->cct, exit_on_error),
                                                       sync_env(_sync_env), num_shards(_num_shards) {
   }
 
index 83df3ddbf01bb94ede7be806e23b69f3006cf627..84f8e961e4b691a65685928fd98a1e0867a486b2 100644 (file)
@@ -92,10 +92,10 @@ namespace rgw {
        auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
        auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
        if (ux_key && ux_attrs) {
-         bool old_key = rgw_fh->decode_attrs(ux_key, ux_attrs);
-         if (old_key) {
-           update_fhk(rgw_fh);
-         }
+         DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+         if (get<0>(dar) || get<1>(dar)) {
+           update_fh(rgw_fh);
+          }
        }
        if (! (flags & RGWFileHandle::FLAG_LOCKED)) {
          rgw_fh->mtx.unlock();
@@ -147,10 +147,10 @@ namespace rgw {
            auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
            auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
            if (ux_key && ux_attrs) {
-             bool old_key = rgw_fh->decode_attrs(ux_key, ux_attrs);
-             if (old_key) {
-               update_fhk(rgw_fh);
-             }
+              DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+              if (get<0>(dar) || get<1>(dar)) {
+                update_fh(rgw_fh);
+              }
            }
          }
          goto done;
@@ -181,10 +181,10 @@ namespace rgw {
            auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
            auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
            if (ux_key && ux_attrs) {
-             bool old_key = rgw_fh->decode_attrs(ux_key, ux_attrs);
-             if (old_key) {
-               update_fhk(rgw_fh);
-             }
+              DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+              if (get<0>(dar) || get<1>(dar)) {
+                update_fh(rgw_fh);
+              }
            }
          }
          goto done;
@@ -744,7 +744,7 @@ namespace rgw {
   } /* RGWLibFS::setattr */
 
   /* called under rgw_fh->mtx held */
-  void RGWLibFS::update_fhk(RGWFileHandle *rgw_fh)
+  void RGWLibFS::update_fh(RGWFileHandle *rgw_fh)
   {
     int rc, rc2;
     string obj_name{rgw_fh->relative_object_name()};
@@ -757,15 +757,15 @@ namespace rgw {
 
     lsubdout(get_context(), rgw, 17)
       << __func__
-      << " update old versioned fhk : " << obj_name
+      << " update old versioned fh : " << obj_name
       << dendl;
 
     RGWSetAttrsRequest req(cct, get_user(), rgw_fh->bucket_name(), obj_name);
 
     rgw_fh->encode_attrs(ux_key, ux_attrs);
 
-    /* update ux_key only */
     req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+    req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
 
     rc = rgwlib.get_fe()->execute_req(&req);
     rc2 = req.get_ret();
@@ -773,10 +773,10 @@ namespace rgw {
     if ((rc != 0) || (rc2 != 0)) {
       lsubdout(get_context(), rgw, 17)
        << __func__
-       << " update fhk failed : " << obj_name
+       << " update fh failed : " << obj_name
        << dendl;
     }
-  } /* RGWLibFS::update_fhk */
+  } /* RGWLibFS::update_fh */
 
   void RGWLibFS::close()
   {
@@ -786,7 +786,7 @@ namespace rgw {
     {
       RGWLibFS* fs;
     public:
-      ObjUnref(RGWLibFS* fs) : fs(fs) {}
+      ObjUnref(RGWLibFS* _fs) : fs(_fs) {}
       void operator()(RGWFileHandle* fh) const {
        lsubdout(fs->get_context(), rgw, 5)
          << __func__
@@ -956,7 +956,7 @@ namespace rgw {
       fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_LOCK);
     }
     /* cond-unref parent */
-    if (parent && (! parent->is_root())) {
+    if (parent && (! parent->is_mount())) {
       /* safe because if parent->unref causes its deletion,
        * there are a) by refcnt, no other objects/paths pointing
        * to it and b) by the semantics of valid iteration of
@@ -976,23 +976,26 @@ namespace rgw {
     rgw::encode(*this, ux_attrs1);
   } /* RGWFileHandle::encode_attrs */
 
-  bool RGWFileHandle::decode_attrs(const ceph::buffer::list* ux_key1,
-                                  const ceph::buffer::list* ux_attrs1)
+  DecodeAttrsResult RGWFileHandle::decode_attrs(const ceph::buffer::list* ux_key1,
+                                                const ceph::buffer::list* ux_attrs1)
   {
-    bool old_key = false;
+    DecodeAttrsResult dar { false, false };
     fh_key fhk;
     auto bl_iter_key1  = const_cast<buffer::list*>(ux_key1)->begin();
     rgw::decode(fhk, bl_iter_key1);
     if (fhk.version >= 2) {
       assert(this->fh.fh_hk == fhk.fh_hk);
     } else {
-      old_key = true;
+      get<0>(dar) = true;
     }
 
     auto bl_iter_unix1 = const_cast<buffer::list*>(ux_attrs1)->begin();
     rgw::decode(*this, bl_iter_unix1);
+    if (this->state.version < 2) {
+      get<1>(dar) = true;
+    }
 
-    return old_key;
+    return dar;
   } /* RGWFileHandle::decode_attrs */
 
   bool RGWFileHandle::reclaim() {
@@ -1020,26 +1023,29 @@ namespace rgw {
     return false;
   }
 
-  int RGWFileHandle::readdir(rgw_readdir_cb rcb, void *cb_arg, uint64_t *offset,
+  std::ostream& operator<<(std::ostream &os,
+                          RGWFileHandle::readdir_offset const &offset)
+  {
+    using boost::get;
+    if (unlikely(!! get<uint64_t*>(&offset))) {
+      uint64_t* ioff = get<uint64_t*>(offset);
+      os << *ioff;
+    }
+    else
+      os << get<const char*>(offset);
+    return os;
+  }
+
+  int RGWFileHandle::readdir(rgw_readdir_cb rcb, void *cb_arg,
+                            readdir_offset offset,
                             bool *eof, uint32_t flags)
   {
     using event = RGWLibFS::event;
+    using boost::get;
     int rc = 0;
     struct timespec now;
     CephContext* cct = fs->get_context();
 
-    if ((*offset == 0) &&
-       (flags & RGW_READDIR_FLAG_DOTDOT)) {
-      /* send '.' and '..' with their NFS-defined offsets */
-      rcb(".", cb_arg, 1, RGW_LOOKUP_FLAG_DIR);
-      rcb("..", cb_arg, 2, RGW_LOOKUP_FLAG_DIR);
-    }
-
-    lsubdout(fs->get_context(), rgw, 15)
-      << __func__
-      << " offset=" << *offset
-      << dendl;
-
     directory* d = get<directory>(&variant_type);
     if (d) {
       (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
@@ -1047,6 +1053,13 @@ namespace rgw {
       d->last_readdir = now;
     }
 
+    bool initial_off;
+    if (likely(!! get<const char*>(&offset))) {
+      initial_off = ! get<const char*>(offset);
+    } else {
+      initial_off = (*get<uint64_t*>(offset) == 0);
+    }
+
     if (is_root()) {
       RGWListBucketsRequest req(cct, fs->get_user(), this, rcb, cb_arg,
                                offset);
@@ -1055,7 +1068,7 @@ namespace rgw {
        (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
        lock_guard guard(mtx);
        state.atime = now;
-       if (*offset == 0)
+       if (initial_off)
          set_nlink(2);
        inc_nlink(req.d_count);
        *eof = req.eof();
@@ -1070,7 +1083,7 @@ namespace rgw {
        (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
        lock_guard guard(mtx);
        state.atime = now;
-       if (*offset == 0)
+       if (initial_off)
          set_nlink(2);
        inc_nlink(req.d_count);
        *eof = req.eof();
@@ -1155,6 +1168,15 @@ namespace rgw {
       }
     }
 
+    int overlap = 0;
+    if ((static_cast<off_t>(off) < f->write_req->real_ofs) &&
+        ((f->write_req->real_ofs - off) <= len)) {
+      overlap = f->write_req->real_ofs - off;
+      off = f->write_req->real_ofs;
+      buffer = static_cast<char*>(buffer) + overlap;
+      len -= overlap;
+    }
+
     buffer::list bl;
     /* XXXX */
 #if 0
@@ -1191,7 +1213,7 @@ namespace rgw {
       rc = -EIO;
     }
 
-    *bytes_written = (rc == 0) ? len : 0;
+    *bytes_written = (rc == 0) ? (len + overlap) : 0;
     return rc;
   } /* RGWFileHandle::write */
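
The overlap handling added to RGWFileHandle::write() copes with clients that
resend a range partially behind the current sequential upload offset: the
already-written prefix is skipped, only the remainder is appended, and
bytes_written still reports the full requested length. Worked through with
illustrative numbers:

    def clip_overlap(off, length, real_ofs):
        """Adjust a write at (off, length) against a sequential upload whose
        next expected offset is real_ofs; returns (new_off, skipped, new_len)."""
        overlap = 0
        if off < real_ofs and (real_ofs - off) <= length:
            overlap = real_ofs - off
        return off + overlap, overlap, length - overlap

    # 100 bytes already uploaded; client resends 30 bytes at offset 90
    print(clip_overlap(90, 30, 100))  # (100, 10, 20): skip 10, append 20,
                                      # still report 30 bytes written
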
 
@@ -1394,7 +1416,7 @@ namespace rgw {
     struct timespec omtime = rgw_fh->get_mtime();
     real_time appx_t = real_clock::now();
 
-    s->obj_size = ofs; // XXX check ofs
+    s->obj_size = bytes_written;
     perfcounter->inc(l_rgw_put_b, s->obj_size);
 
     op_ret = get_store()->check_quota(s->bucket_owner.get_id(), s->bucket,
@@ -1403,7 +1425,8 @@ namespace rgw {
       goto done;
     }
 
-    op_ret = get_store()->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota);
+    op_ret = get_store()->check_bucket_shards(s->bucket_info, s->bucket,
+                                             bucket_quota);
     if (op_ret < 0) {
       goto done;
     }
@@ -1449,7 +1472,10 @@ namespace rgw {
       attrbl.append(val.c_str(), val.size() + 1);
     }
 
-    rgw_get_request_metadata(s->cct, s->info, attrs);
+    op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+    if (op_ret < 0) {
+      goto done;
+    }
     encode_delete_at_attr(delete_at, attrs);
 
     /* Add a custom metadata to expose the information whether an object
@@ -1504,7 +1530,38 @@ void rgwfile_version(int *major, int *minor, int *extra)
 
   /* stash access data for "mount" */
   RGWLibFS* new_fs = new RGWLibFS(static_cast<CephContext*>(rgw), uid, acc_key,
-                                 sec_key);
+                                 sec_key, "/");
+  assert(new_fs);
+
+  rc = new_fs->authorize(rgwlib.get_store());
+  if (rc != 0) {
+    delete new_fs;
+    return -EINVAL;
+  }
+
+  /* register fs for shared gc */
+  rgwlib.get_fe()->get_process()->register_fs(new_fs);
+
+  struct rgw_fs *fs = new_fs->get_fs();
+  fs->rgw = rgw;
+
+  /* XXX we no longer assume "/" is unique, but we aren't tracking the
+   * roots atm */
+
+  *rgw_fs = fs;
+
+  return 0;
+}
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *acc_key,
+               const char *sec_key, const char *root, struct rgw_fs **rgw_fs,
+               uint32_t flags)
+{
+  int rc = 0;
+
+  /* stash access data for "mount" */
+  RGWLibFS* new_fs = new RGWLibFS(static_cast<CephContext*>(rgw), uid, acc_key,
+                                 sec_key, root);
   assert(new_fs);
 
   rc = new_fs->authorize(rgwlib.get_store());
@@ -1566,8 +1623,8 @@ int rgw_statfs(struct rgw_fs *rgw_fs,
   vfs_st->f_bavail = UINT64_MAX;
   vfs_st->f_files = 1024; /* object count, do we have an est? */
   vfs_st->f_ffree = UINT64_MAX;
-  vfs_st->f_fsid[0] = fs->get_inst();
-  vfs_st->f_fsid[1] = fs->get_inst();
+  vfs_st->f_fsid[0] = fs->get_fsid();
+  vfs_st->f_fsid[1] = fs->get_fsid();
   vfs_st->f_flag = 0;
   vfs_st->f_namemax = 4096;
   return 0;
@@ -1841,9 +1898,50 @@ int rgw_readdir(struct rgw_fs *rgw_fs,
     /* bad parent */
     return -EINVAL;
   }
+
+  lsubdout(parent->get_fs()->get_context(), rgw, 15)
+    << __func__
+    << " offset=" << *offset
+    << dendl;
+
+  if ((*offset == 0) &&
+      (flags & RGW_READDIR_FLAG_DOTDOT)) {
+    /* send '.' and '..' with their NFS-defined offsets */
+    rcb(".", cb_arg, 1, RGW_LOOKUP_FLAG_DIR);
+    rcb("..", cb_arg, 2, RGW_LOOKUP_FLAG_DIR);
+  }
+
   int rc = parent->readdir(rcb, cb_arg, offset, eof, flags);
   return rc;
-}
+} /* rgw_readdir */
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+                struct rgw_file_handle *parent_fh, const char *name,
+                rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+                uint32_t flags)
+{
+  RGWFileHandle* parent = get_rgwfh(parent_fh);
+  if (! parent) {
+    /* bad parent */
+    return -EINVAL;
+  }
+
+  lsubdout(parent->get_fs()->get_context(), rgw, 15)
+    << __func__
+    << " offset=" << name
+    << dendl;
+
+  if ((! name) &&
+      (flags & RGW_READDIR_FLAG_DOTDOT)) {
+    /* send '.' and '..' with their NFS-defined offsets */
+    rcb(".", cb_arg, 1, RGW_LOOKUP_FLAG_DIR);
+    rcb("..", cb_arg, 2, RGW_LOOKUP_FLAG_DIR);
+  }
+
+  int rc = parent->readdir(rcb, cb_arg, name, eof, flags);
+  return rc;
+} /* rgw_readdir2 */
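
rgw_readdir2() resumes a listing from the last dirent name instead of a
numeric cookie, which maps directly onto RGW's marker-based listing. Over a
sorted set of names, continuation is simply "everything strictly after the
marker" (an illustrative sketch, not the librgw implementation):

    import bisect

    def readdir2(entries, marker=None):
        # entries must be sorted; marker=None restarts from the beginning
        start = 0 if marker is None else bisect.bisect_right(entries, marker)
        return entries[start:]

    names = ['bar', 'baz', 'foo', 'qux']
    print(readdir2(names))         # full listing
    print(readdir2(names, 'baz'))  # ['foo', 'qux'] -- continue after 'baz'
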
 
 /* project offset of dirent name */
 int rgw_dirent_offset(struct rgw_fs *rgw_fs,
@@ -1891,8 +1989,14 @@ int rgw_write(struct rgw_fs *rgw_fs,
   if (! rgw_fh->is_file())
     return -EISDIR;
 
-  if (! rgw_fh->is_open())
-    return -EPERM;
+  if (! rgw_fh->is_open()) {
+    if (flags & RGW_OPEN_FLAG_V3) {
+      rc = rgw_fh->open(flags);
+      if (!! rc)
+       return rc;
+    } else
+      return -EPERM;
+  }
 
   rc = rgw_fh->write(offset, length, bytes_written, buffer);
 
index 34f3f0bcec8616ab4e6911b1a03138eff50a492b..2f8b8bad140cffa2df3327d82e592d1ea85be5bb 100644 (file)
@@ -173,6 +173,8 @@ namespace rgw {
   using boost::variant;
   using boost::container::flat_map;
 
+  typedef std::tuple<bool, bool> DecodeAttrsResult;
+
   class RGWFileHandle : public cohort::lru::Object
   {
     struct rgw_file_handle fh;
@@ -204,8 +206,9 @@ namespace rgw {
       struct timespec ctime;
       struct timespec mtime;
       struct timespec atime;
+      uint32_t version;
       State() : dev(0), size(0), nlink(1), owner_uid(0), owner_gid(0),
-               ctime{0,0}, mtime{0,0}, atime{0,0} {}
+               ctime{0,0}, mtime{0,0}, atime{0,0}, version(0) {}
     } state;
 
     struct file {
@@ -250,6 +253,7 @@ namespace rgw {
     static constexpr uint32_t FLAG_LOCKED = 0x0200;
     static constexpr uint32_t FLAG_STATELESS_OPEN = 0x0400;
     static constexpr uint32_t FLAG_EXACT_MATCH = 0x0800;
+    static constexpr uint32_t FLAG_MOUNT = 0x1000;
 
 #define CREATE_FLAGS(x) \
     ((x) & ~(RGWFileHandle::FLAG_CREATE|RGWFileHandle::FLAG_LOCK))
@@ -257,33 +261,47 @@ namespace rgw {
     friend class RGWLibFS;
 
   private:
-    RGWFileHandle(RGWLibFS* _fs, uint32_t fs_inst)
+    RGWFileHandle(RGWLibFS* _fs)
       : fs(_fs), bucket(nullptr), parent(nullptr), variant_type{directory()},
-       depth(0), flags(FLAG_ROOT)
+       depth(0), flags(FLAG_NONE)
       {
        /* root */
        fh.fh_type = RGW_FS_TYPE_DIRECTORY;
        variant_type = directory();
        /* stat */
-       state.dev = fs_inst;
        state.unix_mode = RGW_RWXMODE|S_IFDIR;
        /* pointer to self */
        fh.fh_private = this;
       }
 
-    void init_rootfs(std::string& fsid, const std::string& object_name) {
+    uint64_t init_fsid(std::string& uid) {
+      return XXH64(uid.c_str(), uid.length(), fh_key::seed);
+    }
+
+    void init_rootfs(std::string& fsid, const std::string& object_name,
+                     bool is_bucket) {
       /* fh_key */
       fh.fh_hk.bucket = XXH64(fsid.c_str(), fsid.length(), fh_key::seed);
       fh.fh_hk.object = XXH64(object_name.c_str(), object_name.length(),
                              fh_key::seed);
       fhk = fh.fh_hk;
       name = object_name;
+
+      state.dev = init_fsid(fsid);
+
+      if (is_bucket) {
+        flags |= RGWFileHandle::FLAG_BUCKET | RGWFileHandle::FLAG_MOUNT;
+        bucket = this;
+        depth = 1;
+      } else {
+        flags |= RGWFileHandle::FLAG_ROOT | RGWFileHandle::FLAG_MOUNT;
+      }
     }
 
   public:
-    RGWFileHandle(RGWLibFS* fs, uint32_t fs_inst, RGWFileHandle* _parent,
+    RGWFileHandle(RGWLibFS* _fs, RGWFileHandle* _parent,
                  const fh_key& _fhk, std::string& _name, uint32_t _flags)
-      : fs(fs), bucket(nullptr), parent(_parent), name(std::move(_name)),
+      : fs(_fs), bucket(nullptr), parent(_parent), name(std::move(_name)),
        fhk(_fhk), flags(_flags) {
 
       if (parent->is_root()) {
@@ -307,8 +325,8 @@ namespace rgw {
       /* save constant fhk */
       fh.fh_hk = fhk.fh_hk; /* XXX redundant in fh_hk */
 
-      /* stat */
-      state.dev = fs_inst;
+      /* inherits parent's fsid */
+      state.dev = parent->state.dev;
 
       switch (fh.fh_type) {
       case RGW_FS_TYPE_DIRECTORY:
@@ -515,6 +533,7 @@ namespace rgw {
 
     bool is_open() const { return flags & FLAG_OPEN; }
     bool is_root() const { return flags & FLAG_ROOT; }
+    bool is_mount() const { return flags & FLAG_MOUNT; }
     bool is_bucket() const { return flags & FLAG_BUCKET; }
     bool is_object() const { return !is_bucket(); }
     bool is_file() const { return (fh.fh_type == RGW_FS_TYPE_FILE); }
@@ -536,8 +555,11 @@ namespace rgw {
       return -EPERM;
     }
 
-    int readdir(rgw_readdir_cb rcb, void *cb_arg, uint64_t *offset, bool *eof,
-               uint32_t flags);
+    typedef boost::variant<uint64_t*, const char*> readdir_offset;
+
+    int readdir(rgw_readdir_cb rcb, void *cb_arg, readdir_offset offset,
+               bool *eof, uint32_t flags);
+
     int write(uint64_t off, size_t len, size_t *nbytes, void *buffer);
 
     int commit(uint64_t offset, uint64_t length, uint32_t flags) {
@@ -594,7 +616,7 @@ namespace rgw {
     }
 
     void encode(buffer::list& bl) const {
-      ENCODE_START(1, 1, bl);
+      ENCODE_START(2, 1, bl);
       ::encode(uint32_t(fh.fh_type), bl);
       ::encode(state.dev, bl);
       ::encode(state.size, bl);
@@ -605,11 +627,12 @@ namespace rgw {
       for (const auto& t : { state.ctime, state.mtime, state.atime }) {
        ::encode(real_clock::from_timespec(t), bl);
       }
+      ::encode((uint32_t)2, bl);
       ENCODE_FINISH(bl);
     }
 
     void decode(bufferlist::iterator& bl) {
-      DECODE_START(1, bl);
+      DECODE_START(2, bl);
       uint32_t fh_type;
       ::decode(fh_type, bl);
       assert(fh.fh_type == fh_type);
@@ -624,14 +647,17 @@ namespace rgw {
        ::decode(enc_time, bl);
        *t = real_clock::to_timespec(enc_time);
       }
+      if (struct_v >= 2) {
+        ::decode(state.version, bl);
+      }
       DECODE_FINISH(bl);
     }
 
     void encode_attrs(ceph::buffer::list& ux_key1,
                      ceph::buffer::list& ux_attrs1);
 
-    bool decode_attrs(const ceph::buffer::list* ux_key1,
-                     const ceph::buffer::list* ux_attrs1);
+    DecodeAttrsResult decode_attrs(const ceph::buffer::list* ux_key1,
+                                   const ceph::buffer::list* ux_attrs1);
 
     void invalidate();
 
@@ -694,7 +720,6 @@ namespace rgw {
     {
     public:
       RGWLibFS* fs;
-      uint32_t fs_inst;
       RGWFileHandle* parent;
       const fh_key& fhk;
       std::string& name;
@@ -702,20 +727,20 @@ namespace rgw {
 
       Factory() = delete;
 
-      Factory(RGWLibFS* fs, uint32_t fs_inst, RGWFileHandle* parent,
-             const fh_key& fhk, std::string& name, uint32_t flags)
-       : fs(fs), fs_inst(fs_inst), parent(parent), fhk(fhk), name(name),
-         flags(flags) {}
+      Factory(RGWLibFS* _fs, RGWFileHandle* _parent,
+             const fh_key& _fhk, std::string& _name, uint32_t _flags)
+       : fs(_fs), parent(_parent), fhk(_fhk), name(_name),
+         flags(_flags) {}
 
       void recycle (cohort::lru::Object* o) override {
        /* re-use an existing object */
        o->~Object(); // call lru::Object virtual dtor
        // placement new!
-       new (o) RGWFileHandle(fs, fs_inst, parent, fhk, name, flags);
+       new (o) RGWFileHandle(fs, parent, fhk, name, flags);
       }
 
       cohort::lru::Object* alloc() override {
-       return new RGWFileHandle(fs, fs_inst, parent, fhk, name, flags);
+       return new RGWFileHandle(fs, parent, fhk, name, flags);
       }
     }; /* Factory */
 
@@ -768,7 +793,6 @@ namespace rgw {
     static std::atomic<uint32_t> fs_inst_counter;
 
     static uint32_t write_completion_interval_s;
-    std::string fsid;
 
     using lock_guard = std::lock_guard<std::mutex>;
     using unique_lock = std::unique_lock<std::mutex>;
@@ -837,8 +861,8 @@ namespace rgw {
     };
 
     RGWLibFS(CephContext* _cct, const char *_uid, const char *_user_id,
-           const char* _key)
-      : cct(_cct), root_fh(this, new_inst()), invalidate_cb(nullptr),
+           const char* _key, const char *root)
+      : cct(_cct), root_fh(this), invalidate_cb(nullptr),
        invalidate_arg(nullptr), shutdown(false), refcnt(1),
        fh_cache(cct->_conf->rgw_nfs_fhcache_partitions,
                 cct->_conf->rgw_nfs_fhcache_size),
@@ -846,17 +870,19 @@ namespace rgw {
               cct->_conf->rgw_nfs_lru_lane_hiwat),
        uid(_uid), key(_user_id, _key) {
 
-      /* no bucket may be named rgw_fs_inst-(.*) */
-      fsid = RGWFileHandle::root_name + "rgw_fs_inst-" +
-       std::to_string(get_inst());
-
-      root_fh.init_rootfs(fsid /* bucket */, RGWFileHandle::root_name);
+      if (!root || !strcmp(root, "/")) {
+        root_fh.init_rootfs(uid, RGWFileHandle::root_name, false);
+      } else {
+        root_fh.init_rootfs(uid, root, true);
+      }
 
       /* pointer to self */
       fs.fs_private = this;
 
       /* expose public root fh */
       fs.root_fh = root_fh.get_fh();
+
+      new_inst();
     }
 
     friend void intrusive_ptr_add_ref(const RGWLibFS* fs) {
@@ -1031,7 +1057,7 @@ namespace rgw {
            fh->mtx.unlock(); /* ! LOCKED */
       } else {
        /* make or re-use handle */
-       RGWFileHandle::Factory prototype(this, get_inst(), parent, fhk,
+       RGWFileHandle::Factory prototype(this, parent, fhk,
                                         obj_name, CREATE_FLAGS(flags));
        fh = static_cast<RGWFileHandle*>(
          fh_lru.insert(&prototype,
@@ -1045,7 +1071,7 @@ namespace rgw {
          fh_cache.insert_latched(fh, lat, RGWFileHandle::FHCache::FLAG_UNLOCK);
          get<1>(fhr) |= RGWFileHandle::FLAG_CREATE;
          /* ref parent (non-initial ref cannot fail on valid object) */
-         if (! parent->is_root()) {
+         if (! parent->is_mount()) {
            (void) fh_lru.ref(parent, cohort::lru::FLAG_NONE);
          }
          goto out; /* !LATCHED */
@@ -1066,13 +1092,13 @@ namespace rgw {
     } /*  lookup_fh(RGWFileHandle*, const char *, const uint32_t) */
 
     inline void unref(RGWFileHandle* fh) {
-      if (likely(! fh->is_root())) {
+      if (likely(! fh->is_mount())) {
        (void) fh_lru.unref(fh, cohort::lru::FLAG_NONE);
       }
     }
 
     inline RGWFileHandle* ref(RGWFileHandle* fh) {
-      if (likely(! fh->is_root())) {
+      if (likely(! fh->is_mount())) {
        fh_lru.ref(fh, cohort::lru::FLAG_NONE);
       }
       return fh;
@@ -1083,8 +1109,7 @@ namespace rgw {
     int setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask,
                uint32_t flags);
 
-    void update_fhk(RGWFileHandle *rgw_fh);
-
+    void update_fh(RGWFileHandle *rgw_fh);
 
     LookupFHResult stat_bucket(RGWFileHandle* parent, const char *path,
                               RGWLibFS::BucketStats& bs,
@@ -1166,7 +1191,7 @@ namespace rgw {
 
     struct rgw_fs* get_fs() { return &fs; }
 
-    uint32_t get_inst() { return root_fh.state.dev; }
+    uint64_t get_fsid() { return root_fh.state.dev; }
 
     RGWUserInfo* get_user() { return &user; }
 
@@ -1193,20 +1218,32 @@ class RGWListBucketsRequest : public RGWLibRequest,
 {
 public:
   RGWFileHandle* rgw_fh;
-  uint64_t* offset;
+  RGWFileHandle::readdir_offset offset;
   void* cb_arg;
   rgw_readdir_cb rcb;
+  uint64_t* ioff;
   size_t ix;
   uint32_t d_count;
 
   RGWListBucketsRequest(CephContext* _cct, RGWUserInfo *_user,
                        RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb,
-                       void* _cb_arg, uint64_t* _offset)
+                       void* _cb_arg, RGWFileHandle::readdir_offset& _offset)
     : RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), offset(_offset),
-      cb_arg(_cb_arg), rcb(_rcb), ix(0), d_count(0) {
-    const auto& mk = rgw_fh->find_marker(*offset);
-    if (mk) {
-      marker = mk->name;
+      cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0) {
+
+    using boost::get;
+
+    if (unlikely(!! get<uint64_t*>(&offset))) {
+      ioff = get<uint64_t*>(offset);
+      const auto& mk = rgw_fh->find_marker(*ioff);
+      if (mk) {
+       marker = mk->name;
+      }
+    } else {
+      const char* mk = get<const char*>(offset);
+      if (mk) {
+       marker = mk;
+      }
     }
     op = this;
   }
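
readdir_offset is now a variant that carries either a uint64_t* (the hashed NFS cookie, the old behaviour) or a const char* (a literal name marker). The constructor above dispatches on which alternative is held; a compact sketch of that dispatch, assuming readdir_offset is boost::variant<uint64_t*, const char*> as the surrounding code implies:

    #include <boost/variant.hpp>
    #include <cstdint>
    #include <iostream>
    #include <string>

    using readdir_offset = boost::variant<uint64_t*, const char*>;

    // Mirrors the constructor logic: a held uint64_t* means "resume from a
    // hashed cookie", a held const char* means "resume from this name".
    std::string pick_marker(readdir_offset& offset) {
      using boost::get;
      if (uint64_t** ioff = get<uint64_t*>(&offset)) {  // pointer form: no throw
        return "cookie:" + std::to_string(**ioff);
      }
      const char* mk = get<const char*>(offset);
      return mk ? std::string("name:") + mk : std::string("start");
    }

    int main() {
      uint64_t cookie = 42;
      readdir_offset a{&cookie};
      readdir_offset b{static_cast<const char*>("obj1")};
      std::cout << pick_marker(a) << " " << pick_marker(b) << "\n";
    }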
@@ -1277,7 +1314,9 @@ public:
   int operator()(const boost::string_ref& name,
                 const boost::string_ref& marker) {
     uint64_t off = XXH64(name.data(), name.length(), fh_key::seed);
-    *offset = off;
+    if (!! ioff) {
+      *ioff = off;
+    }
     /* update traversal cache */
     rgw_fh->add_marker(off, rgw_obj_key{marker.data(), ""},
                       RGW_FS_TYPE_DIRECTORY);
@@ -1286,7 +1325,7 @@ public:
   }
 
   bool eof() {
-    lsubdout(cct, rgw, 15) << "READDIR offset: " << *offset
+    lsubdout(cct, rgw, 15) << "READDIR offset: " << offset
                           << " is_truncated: " << is_truncated
                           << dendl;
     return !is_truncated;
@@ -1303,21 +1342,37 @@ class RGWReaddirRequest : public RGWLibRequest,
 {
 public:
   RGWFileHandle* rgw_fh;
-  uint64_t* offset;
+  RGWFileHandle::readdir_offset offset;
   void* cb_arg;
   rgw_readdir_cb rcb;
+  uint64_t* ioff;
   size_t ix;
   uint32_t d_count;
 
   RGWReaddirRequest(CephContext* _cct, RGWUserInfo *_user,
                    RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb,
-                   void* _cb_arg, uint64_t* _offset)
+                   void* _cb_arg, RGWFileHandle::readdir_offset& _offset)
     : RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), offset(_offset),
-      cb_arg(_cb_arg), rcb(_rcb), ix(0), d_count(0) {
-    const auto& mk = rgw_fh->find_marker(*offset);
-    if (mk) {
-      marker = *mk;
+      cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0) {
+
+    using boost::get;
+
+    if (unlikely(!! get<uint64_t*>(&offset))) {
+      ioff = get<uint64_t*>(offset);
+      const auto& mk = rgw_fh->find_marker(*ioff);
+      if (mk) {
+       marker = *mk;
+      }
+    } else {
+      const char* mk = get<const char*>(offset);
+      if (mk) {
+       std::string tmark{rgw_fh->relative_object_name()};
+       tmark += "/";
+       tmark += mk;
+       marker = rgw_obj_key{std::move(tmark), "", ""};
+      }
     }
+
     default_max = 1000; // XXX was being omitted
     op = this;
   }
@@ -1366,7 +1421,9 @@ public:
 
     /* hash offset of name in parent (short name) for NFS readdir cookie */
     uint64_t off = XXH64(name.data(), name.length(), fh_key::seed);
-    *offset = off;
+    if (unlikely(!! ioff)) {
+      *ioff = off;
+    }
     /* update traversal cache */
     rgw_fh->add_marker(off, marker, type);
     ++d_count;
@@ -1456,7 +1513,7 @@ public:
   }
 
   bool eof() {
-    lsubdout(cct, rgw, 15) << "READDIR offset: " << *offset
+    lsubdout(cct, rgw, 15) << "READDIR offset: " << offset
                           << " next marker: " << next_marker
                           << " is_truncated: " << is_truncated
                           << dendl;
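
Throughout this file the readdir cookie is derived by hashing the entry's short name with XXH64 under a fixed seed, and each (cookie, marker) pair is remembered in the handle's traversal cache so a later readdir can resume from the cookie. A sketch of that cookie scheme, assuming the xxhash library is available and using a std::map as a stand-in for the traversal cache (the seed value is illustrative):

    #include <xxhash.h>
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>

    static constexpr uint64_t kSeed = 8675309;  // stand-in for fh_key::seed

    int main() {
      std::map<uint64_t, std::string> marker_cache;  // cookie -> resume marker

      for (const std::string name : {"alpha", "beta", "gamma"}) {
        // Hash the short name into a stable 64-bit readdir cookie.
        uint64_t off = XXH64(name.data(), name.length(), kSeed);
        marker_cache[off] = name;     // add_marker() equivalent
        std::cout << name << " -> " << off << "\n";
      }

      // A later readdir(cookie) finds where to resume, like find_marker().
      uint64_t resume = XXH64("beta", 4, kSeed);
      std::cout << "resume marker: " << marker_cache[resume] << "\n";
    }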
index 28b97d04d5c3ee2918abd50d3c1243f83b6176a7..cff4fa493351373f482e5a40d8d22e1a82499929 100644 (file)
@@ -221,7 +221,9 @@ optional<ARN> ARN::parse(const string& s, bool wildcards) {
   if ((s == "*") && wildcards) {
     return ARN(Partition::wildcard, Service::wildcard, "*", "*", "*");
   } else if (regex_match(s, match, wildcards ? rx_wild : rx_no_wild)) {
-    ceph_assert(match.size() == 6);
+    if (match.size() != 6) {
+      return boost::none;
+    }
 
     ARN a;
     {
@@ -771,7 +773,9 @@ static optional<Principal> parse_principal(CephContext* cct, TokenID t,
                          ECMAScript | optimize);
     smatch match;
     if (regex_match(a->resource, match, rx)) {
-      ceph_assert(match.size() == 3);
+      if (match.size() != 3) {
+       return boost::none;
+      }
 
       if (match[1] == "user") {
        return Principal::user(std::move(a->account),
@@ -839,7 +843,9 @@ bool ParseState::do_string(CephContext* cct, const char* s, size_t l) {
     // Principals
 
   } else if (w->kind == TokenKind::princ_type) {
-    ceph_assert(pp->s.size() > 1);
+    if (pp->s.size() <= 1) {
+      return false;
+    }
     auto& pri = pp->s[pp->s.size() - 2].w->id == TokenID::Principal ?
       t->princ : t->noprinc;
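
Both hunks in this file replace a ceph_assert on the regex submatch count with a graceful failure, so a malformed ARN or principal string can no longer abort the radosgw process. A sketch of the same defensive pattern with std::regex and boost::optional (the ARN grammar below is simplified for illustration):

    #include <boost/optional.hpp>
    #include <iostream>
    #include <regex>
    #include <string>

    // Parse "arn:partition:service:region:account:resource" defensively:
    // on any mismatch return boost::none instead of asserting.
    boost::optional<std::string> parse_account(const std::string& s) {
      static const std::regex rx("arn:([^:]*):([^:]*):([^:]*):([^:]*):(.*)");
      std::smatch match;
      if (!std::regex_match(s, match, rx) || match.size() != 6) {
        return boost::none;          // was: ceph_assert(match.size() == 6)
      }
      return std::string(match[4]);  // the account field
    }

    int main() {
      auto a = parse_account("arn:aws:iam:us-east-1:12345:user/foo");
      std::cout << (a ? *a : "<invalid>") << "\n";
    }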
 
index 5236e6b02192179af17101d59518cffa19e5bbf1..59117456e0cc4a11c59213e29f7f37fe8fc9146d 100644 (file)
@@ -29,8 +29,6 @@
 #include "rgw_iam_policy_keywords.h"
 #include "rgw_string.h"
 
-#include "include/assert.h" // razzin' frazzin' ...grrr.
-
 class RGWRados;
 namespace rgw {
 namespace auth {
@@ -254,7 +252,6 @@ string to_string(const MaskedIP& m);
 inline bool operator ==(const MaskedIP& l, const MaskedIP& r) {
   auto shift = std::max((l.v6 ? 128 : 32) - l.prefix,
                        (r.v6 ? 128 : 32) - r.prefix);
-  ceph_assert(shift > 0);
   return (l.addr >> shift) == (r.addr >> shift);
 }
 
index 4792e62c35694eae1a706d739abd15cdc253bfb6..ad7b941bf3ad53f9906bcadd9157d6832afc674e 100644 (file)
@@ -798,6 +798,7 @@ void RGWBucketEnt::dump(Formatter *f) const
   utime_t ut(creation_time);
   encode_json("mtime", ut, f); /* mtime / creation time discrepency needed for backward compatibility */
   encode_json("count", count, f);
+  encode_json("placement_rule", placement_rule, f);
 }
 
 void RGWUploadPartInfo::dump(Formatter *f) const
index 3add9ae8a4c091b6905641e359ea644bbfd3a418..1cdbae519c94f5e01beced11688859d8c1f4e154 100644 (file)
@@ -277,8 +277,11 @@ class TokenCache {
   ~TokenCache() {
     down_flag = true;
 
-    revocator.stop();
-    revocator.join();
+    // Only stop and join if revocator thread is started.
+    if (revocator.is_started()) {
+      revocator.stop();
+      revocator.join();
+    }
   }
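
The destructor now stops and joins the revocation thread only when it was actually started, since joining a never-started thread is an error. The analogous guard in standard C++ is std::thread::joinable(); a minimal sketch of the same dtor discipline:

    #include <atomic>
    #include <chrono>
    #include <thread>

    class TokenCache {
      std::atomic<bool> down_flag{false};
      std::thread revocator;           // may never be started

    public:
      void start() {
        revocator = std::thread([this] {
          while (!down_flag.load()) {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
          }
        });
      }

      ~TokenCache() {
        down_flag = true;
        if (revocator.joinable()) {    // analogous to is_started() above
          revocator.join();
        }
      }
    };

    int main() {
      TokenCache never_started;        // dtor must not join here
      TokenCache started;
      started.start();                 // dtor joins cleanly here
    }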
 
 public:
index cad0304e3527cb15e5f5b8f1bed0fa9f21f0e077..cdcfaffff77cc7345352f5ff09f807e3f0a2755a 100644 (file)
@@ -202,8 +202,10 @@ bool RGWLC::if_already_run_today(time_t& start_date)
   localtime_r(&start_date, &bdt);
 
   if (cct->_conf->rgw_lc_debug_interval > 0) {
-         /* We're debugging, so say we can run */
-         return false;
+    if (now - start_date < cct->_conf->rgw_lc_debug_interval)
+      return true;
+    else
+      return false;
   }
 
   bdt.tm_hour = 0;
@@ -674,13 +676,11 @@ int RGWLC::process(int index, int max_lock_secs)
     l.unlock(&store->lc_pool_ctx, obj_names[index]);
     ret = bucket_lc_process(entry.first);
     bucket_lc_post(index, max_lock_secs, entry, ret);
-    return 0;
+  } while (1);
+
 exit:
     l.unlock(&store->lc_pool_ctx, obj_names[index]);
     return 0;
-
-  }while(1);
-
 }
 
 void RGWLC::start_processor()
index ac6f7b70081aece857f51238abbd19cf778f217e..11fdbd4ebc3fdb95863baf710c0cd52da11a9574 100644 (file)
@@ -495,6 +495,8 @@ int rgw_build_bucket_policies(RGWRados* store, struct req_state* s)
        */
       if (store->get_zonegroup().is_master_zonegroup() && s->system_request) {
         /*If this is the master, don't redirect*/
+      } else if (s->op_type == RGW_OP_GET_BUCKET_LOCATION) {
+        /* If op is get bucket location, don't redirect */
       } else if (!s->local_source ||
           (s->op != OP_PUT && s->op != OP_COPY) ||
           s->object.empty()) {
@@ -1638,8 +1640,13 @@ void RGWGetObj::execute()
   /* start gettorrent */
   if (torrent.get_flag())
   {
+    attr_iter = attrs.find(RGW_ATTR_CRYPT_MODE);
+    if (attr_iter != attrs.end() && attr_iter->second.to_str() == "SSE-C-AES256") {
+      op_ret = -ERR_INVALID_REQUEST;
+      goto done_err;
+    }
     torrent.init(s, store);
-    torrent.get_torrent_file(op_ret, read_op, total_len, bl, obj);
+    op_ret = torrent.get_torrent_file(read_op, total_len, bl, obj);
     if (op_ret < 0)
     {
       ldout(s->cct, 0) << "ERROR: failed to get_torrent_file ret= " << op_ret
@@ -1807,7 +1814,7 @@ void RGWListBuckets::execute()
   bool started = false;
   uint64_t total_count = 0;
 
-  uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk;
+  const uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk;
 
   op_ret = get_params();
   if (op_ret < 0) {
@@ -1842,15 +1849,32 @@ void RGWListBuckets::execute()
                        << s->user->user_id << dendl;
       break;
     }
-    map<string, RGWBucketEnt>& m = buckets.get_buckets();
-    map<string, RGWBucketEnt>::iterator iter;
-    for (iter = m.begin(); iter != m.end(); ++iter) {
-      RGWBucketEnt& bucket = iter->second;
-      buckets_size += bucket.size;
-      buckets_size_rounded += bucket.size_rounded;
-      buckets_objcount += bucket.count;
+
+    /* We need to have stats for all our policies - even if a given policy
+     * isn't actually used in a given account. In such a situation its usage
+     * stats would simply be full of zeros. */
+    for (const auto& policy : store->get_zonegroup().placement_targets) {
+      policies_stats.emplace(policy.second.name,
+                             decltype(policies_stats)::mapped_type());
+    }
+
+    std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets();
+    for (const auto& kv : m) {
+      const auto& bucket = kv.second;
+
+      global_stats.bytes_used += bucket.size;
+      global_stats.bytes_used_rounded += bucket.size_rounded;
+      global_stats.objects_count += bucket.count;
+
+      /* operator[] can still create a new entry for a storage policy seen
+       * for the first time. */
+      auto& policy_stats = policies_stats[bucket.placement_rule];
+      policy_stats.bytes_used += bucket.size;
+      policy_stats.bytes_used_rounded += bucket.size_rounded;
+      policy_stats.buckets_count++;
+      policy_stats.objects_count += bucket.count;
     }
-    buckets_count += m.size();
+    global_stats.buckets_count += m.size();
     total_count += m.size();
 
     done = (m.size() < read_count || (limit >= 0 && total_count >= (uint64_t)limit));
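
The rewritten loop keeps two tallies per chunk of listed buckets: account-wide totals in global_stats and a per-placement-policy breakdown in policies_stats, pre-seeded with zeroed entries for every policy in the zonegroup so unused policies still show up. A condensed sketch of that accumulation (bucket fields and policy names simplified):

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    struct UsageStats {              // mirrors RGWUsageStats
      uint64_t bytes_used = 0, buckets_count = 0, objects_count = 0;
    };

    struct BucketEnt { std::string placement_rule; uint64_t size, count; };

    int main() {
      std::vector<std::string> known_policies = {"default", "cold"};
      std::vector<BucketEnt> buckets = {{"default", 100, 3}, {"default", 50, 1}};

      UsageStats global;
      std::map<std::string, UsageStats> per_policy;
      for (const auto& p : known_policies) {
        per_policy.emplace(p, UsageStats{});  // unused policies show as zeros
      }

      for (const auto& b : buckets) {
        global.bytes_used += b.size;
        global.objects_count += b.count;
        auto& ps = per_policy[b.placement_rule];  // may still insert new rules
        ps.bytes_used += b.size;
        ps.buckets_count++;
        ps.objects_count += b.count;
      }
      global.buckets_count += buckets.size();

      for (const auto& kv : per_policy) {
        std::cout << kv.first << ": " << kv.second.bytes_used << " bytes\n";
      }
    }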
@@ -1861,10 +1885,10 @@ void RGWListBuckets::execute()
     }
 
     if (!m.empty()) {
-      send_response_data(buckets);
-
       map<string, RGWBucketEnt>::reverse_iterator riter = m.rbegin();
       marker = riter->first;
+
+      handle_listing_chunk(std::move(buckets));
     }
   } while (is_truncated && !done);
 
@@ -1968,17 +1992,31 @@ void RGWStatAccount::execute()
                        << s->user->user_id << dendl;
       break;
     } else {
-      map<string, RGWBucketEnt>& m = buckets.get_buckets();
-      map<string, RGWBucketEnt>::iterator iter;
-      for (iter = m.begin(); iter != m.end(); ++iter) {
-        RGWBucketEnt& bucket = iter->second;
-        buckets_size += bucket.size;
-        buckets_size_rounded += bucket.size_rounded;
-        buckets_objcount += bucket.count;
-
-        marker = iter->first;
+      /* We need to have stats for all our policies - even if a given policy
+       * isn't actually used in a given account. In such a situation its usage
+       * stats would simply be full of zeros. */
+      for (const auto& policy : store->get_zonegroup().placement_targets) {
+        policies_stats.emplace(policy.second.name,
+                               decltype(policies_stats)::mapped_type());
+      }
+
+      std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets();
+      for (const auto& kv : m) {
+        const auto& bucket = kv.second;
+
+        global_stats.bytes_used += bucket.size;
+        global_stats.bytes_used_rounded += bucket.size_rounded;
+        global_stats.objects_count += bucket.count;
+
+        /* operator[] can still create a new entry for a storage policy seen
+         * for the first time. */
+        auto& policy_stats = policies_stats[bucket.placement_rule];
+        policy_stats.bytes_used += bucket.size;
+        policy_stats.bytes_used_rounded += bucket.size_rounded;
+        policy_stats.buckets_count++;
+        policy_stats.objects_count += bucket.count;
       }
-      buckets_count += m.size();
+      global_stats.buckets_count += m.size();
 
     }
   } while (is_truncated);
@@ -1986,11 +2024,16 @@ void RGWStatAccount::execute()
 
 int RGWGetBucketVersioning::verify_permission()
 {
-  if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3GetBucketVersioning,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
   }
-
-  return 0;
+  return -EACCES;
 }
 
 void RGWGetBucketVersioning::pre_exec()
@@ -2006,11 +2049,16 @@ void RGWGetBucketVersioning::execute()
 
 int RGWSetBucketVersioning::verify_permission()
 {
-  if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3PutBucketVersioning,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
   }
-
-  return 0;
+  return -EACCES;
 }
 
 void RGWSetBucketVersioning::pre_exec()
@@ -2050,10 +2098,17 @@ void RGWSetBucketVersioning::execute()
 
 int RGWGetBucketWebsite::verify_permission()
 {
-  if (s->user->user_id.compare(s->bucket_owner.get_id()) != 0)
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3GetBucketWebsite,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
+  }
 
-  return 0;
+  return -EACCES;
 }
 
 void RGWGetBucketWebsite::pre_exec()
@@ -2070,10 +2125,17 @@ void RGWGetBucketWebsite::execute()
 
 int RGWSetBucketWebsite::verify_permission()
 {
-  if (s->user->user_id.compare(s->bucket_owner.get_id()) != 0)
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3PutBucketWebsite,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
+  }
 
-  return 0;
+  return -EACCES;
 }
 
 void RGWSetBucketWebsite::pre_exec()
@@ -2258,11 +2320,16 @@ int RGWGetBucketLogging::verify_permission()
 
 int RGWGetBucketLocation::verify_permission()
 {
-  if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3GetBucketLocation,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
   }
-
-  return 0;
+  return -EACCES;
 }
 
 int RGWCreateBucket::verify_permission()
@@ -2639,7 +2706,10 @@ void RGWCreateBucket::execute()
   if (need_metadata_upload()) {
     /* It's supposed that following functions WILL NOT change any special
      * attributes (like RGW_ATTR_ACL) if they are already present in attrs. */
-    rgw_get_request_metadata(s->cct, s->info, attrs, false);
+    op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+    if (op_ret < 0) {
+      return;
+    }
     prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
     populate_with_generic_attrs(s, attrs);
 
@@ -2732,7 +2802,10 @@ void RGWCreateBucket::execute()
 
       attrs.clear();
 
-      rgw_get_request_metadata(s->cct, s->info, attrs, false);
+      op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+      if (op_ret < 0) {
+        return;
+      }
       prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
       populate_with_generic_attrs(s, attrs);
       op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket_info.quota);
@@ -3531,7 +3604,10 @@ void RGWPutObj::execute()
   emplace_attr(RGW_ATTR_ETAG, std::move(bl));
 
   populate_with_generic_attrs(s, attrs);
-  rgw_get_request_metadata(s->cct, s->info, attrs);
+  op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+  if (op_ret < 0) {
+    goto done;
+  }
   encode_delete_at_attr(delete_at, attrs);
   encode_obj_tags_attr(obj_tags.get(), attrs);
 
@@ -3845,7 +3921,10 @@ int RGWPutMetadataAccount::init_processing()
     attrs.emplace(RGW_ATTR_ACL, std::move(acl_bl));
   }
 
-  rgw_get_request_metadata(s->cct, s->info, attrs, false);
+  op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+  if (op_ret < 0) {
+    return op_ret;
+  }
   prepare_add_del_attrs(orig_attrs, rmattr_names, attrs);
   populate_with_generic_attrs(s, attrs);
 
@@ -3937,7 +4016,10 @@ void RGWPutMetadataBucket::execute()
     return;
   }
 
-  rgw_get_request_metadata(s->cct, s->info, attrs, false);
+  op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+  if (op_ret < 0) {
+    return;
+  }
 
   if (!placement_rule.empty() &&
       placement_rule != s->bucket_info.placement_rule) {
@@ -4024,7 +4106,11 @@ void RGWPutMetadataObject::execute()
     return;
   }
 
-  rgw_get_request_metadata(s->cct, s->info, attrs);
+  op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+  if (op_ret < 0) {
+    return;
+  }
+
   /* check if obj exists, read orig attrs */
   op_ret = get_obj_attrs(store, s, obj, orig_attrs);
   if (op_ret < 0) {
@@ -4398,7 +4484,10 @@ int RGWCopyObj::init_common()
   dest_policy.encode(aclbl);
   emplace_attr(RGW_ATTR_ACL, std::move(aclbl));
 
-  rgw_get_request_metadata(s->cct, s->info, attrs);
+  op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+  if (op_ret < 0) {
+    return op_ret;
+  }
   populate_with_generic_attrs(s, attrs);
 
   return 0;
@@ -4868,11 +4957,16 @@ void RGWDeleteLC::execute()
 
 int RGWGetCORS::verify_permission()
 {
-  if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3PutBucketCORS,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
   }
-
-  return 0;
+  return -EACCES;
 }
 
 void RGWGetCORS::execute()
@@ -4890,11 +4984,16 @@ void RGWGetCORS::execute()
 
 int RGWPutCORS::verify_permission()
 {
-  if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3PutBucketCORS,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
   }
-
-  return 0;
+  return -EACCES;
 }
 
 void RGWPutCORS::execute()
@@ -5017,6 +5116,12 @@ void RGWOptionsCORS::execute()
 
 int RGWGetRequestPayment::verify_permission()
 {
+  if (s->iam_policy &&
+      s->iam_policy->eval(s->env, *s->auth.identity,
+                         rgw::IAM::s3GetBucketRequestPayment,
+                         ARN(s->bucket)) != Effect::Allow) {
+      return -EACCES;
+  }
   return 0;
 }
 
@@ -5032,11 +5137,16 @@ void RGWGetRequestPayment::execute()
 
 int RGWSetRequestPayment::verify_permission()
 {
-  if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
-    return -EACCES;
+  if (s->iam_policy) {
+    if (s->iam_policy->eval(s->env, *s->auth.identity,
+                           rgw::IAM::s3PutBucketRequestPayment,
+                           ARN(s->bucket)) == Effect::Allow) {
+      return 0;
+    }
+  } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+    return 0;
   }
-
-  return 0;
+  return -EACCES;
 }
 
 void RGWSetRequestPayment::pre_exec()
@@ -5108,7 +5218,10 @@ void RGWInitMultipart::execute()
   if (op_ret != 0)
     return;
 
-  rgw_get_request_metadata(s->cct, s->info, attrs);
+  op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+  if (op_ret < 0) {
+    return;
+  }
 
   do {
     char buf[33];
@@ -5287,22 +5400,16 @@ void RGWCompleteMultipart::execute()
     from deleting the parts*/
   rgw_pool meta_pool;
   rgw_raw_obj raw_obj;
-  librados::ObjectWriteOperation op;
-  librados::IoCtx ioctx;
-  rados::cls::lock::Lock l("RGWCompleteMultipart");
-  int max_lock_secs_mp = s->cct->_conf->get_val<int64_t>("rgw_mp_lock_max_time");
+  int max_lock_secs_mp =
+    s->cct->_conf->get_val<int64_t>("rgw_mp_lock_max_time");
+  utime_t dur(max_lock_secs_mp, 0);
 
-  op.assert_exists();
   store->obj_to_raw((s->bucket_info).placement_rule, meta_obj, &raw_obj);
-  store->get_obj_data_pool((s->bucket_info).placement_rule,meta_obj,&meta_pool);
-  store->open_pool_ctx(meta_pool, ioctx);
-
-  const string raw_meta_oid = raw_obj.oid;
-  utime_t time(max_lock_secs_mp, 0);
-  l.set_duration(time);
-  l.lock_exclusive(&op);
-  op_ret = ioctx.operate(raw_meta_oid, &op);
+  store->get_obj_data_pool((s->bucket_info).placement_rule,
+                          meta_obj,&meta_pool);
+  store->open_pool_ctx(meta_pool, serializer.ioctx);
 
+  op_ret = serializer.try_lock(raw_obj.oid, dur);
   if (op_ret < 0) {
     dout(0) << "RGWCompleteMultipart::execute() failed to acquire lock " << dendl;
     op_ret = -ERR_INTERNAL_ERROR;
@@ -5452,6 +5559,7 @@ void RGWCompleteMultipart::execute()
   obj_op.meta.owner = s->owner.get_id();
   obj_op.meta.flags = PUT_OBJ_CREATE;
   obj_op.meta.modify_tail = true;
+  obj_op.meta.completeMultipart = true;
   op_ret = obj_op.write_meta(ofs, accounted_size, attrs);
   if (op_ret < 0)
     return;
@@ -5459,13 +5567,41 @@ void RGWCompleteMultipart::execute()
   // remove the upload obj
   int r = store->delete_obj(*static_cast<RGWObjectCtx *>(s->obj_ctx),
                            s->bucket_info, meta_obj, 0);
-  if (r < 0) {
-    ldout(store->ctx(), 0) << "WARNING: failed to remove object " << meta_obj << dendl;
-    r = l.unlock(&ioctx, raw_meta_oid);
+  if (r >= 0)  {
+    /* serializer's exclusive lock is released */
+    serializer.clear_locked();
+  } else {
+      ldout(store->ctx(), 0) << "WARNING: failed to remove object "
+                            << meta_obj << dendl;
+  }
+}
+
+int RGWCompleteMultipart::MPSerializer::try_lock(
+  const std::string& _oid,
+  utime_t dur)
+{
+  oid = _oid;
+  op.assert_exists();
+  lock.set_duration(dur);
+  lock.lock_exclusive(&op);
+  int ret = ioctx.operate(oid, &op);
+  if (! ret) {
+    locked = true;
+  }
+  return ret;
+}
+
+void RGWCompleteMultipart::complete()
+{
+  /* release the exclusive lock if it hasn't been released already */
+  if (unlikely(serializer.locked)) {
+    int r = serializer.unlock();
     if (r < 0) {
-      ldout(store->ctx(), 0) << "WARNING: failed to unlock " << raw_meta_oid << dendl;
+      ldout(store->ctx(), 0) << "WARNING: failed to unlock "
+                            << serializer.oid << dendl;
     }
   }
+  send_response();
 }
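
MPSerializer wraps the cls_lock exclusive-lock dance that used to be inlined in execute(): assert the meta object exists, take a time-bounded exclusive lock, remember whether it is held, and release it at complete() unless removing the meta object already dropped it. A sketch of the same state machine with the RADOS calls stubbed out (try_lock_rados/unlock_rados stand in for the ioctx.operate()/lock.unlock() calls above):

    #include <iostream>
    #include <string>

    // Stubs standing in for librados + rados::cls::lock calls.
    static int try_lock_rados(const std::string&, int) { return 0; }
    static int unlock_rados(const std::string&) { return 0; }

    struct MPSerializer {
      std::string oid;
      bool locked = false;

      int try_lock(const std::string& _oid, int dur_secs) {
        oid = _oid;
        int ret = try_lock_rados(oid, dur_secs);
        if (ret == 0) {
          locked = true;             // remember we hold the lock
        }
        return ret;
      }
      void clear_locked() { locked = false; }  // meta obj gone: lock is gone
      int unlock() { locked = false; return unlock_rados(oid); }
    };

    int main() {
      MPSerializer s;
      if (s.try_lock("mp-meta-obj", 30) < 0) return 1;
      bool meta_removed = true;      // e.g. delete_obj() succeeded
      if (meta_removed) {
        s.clear_locked();            // deleting the object released the lock
      }
      if (s.locked) {                // complete(): release only if still held
        s.unlock();
      }
      std::cout << "done\n";
    }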
 
 int RGWAbortMultipart::verify_permission()
index d9ce2b400011ff815b3c800cad4b9364e9d04809..68b83a45f13388d6d3e7bba5cc6406d882a6914a 100644 (file)
@@ -42,6 +42,8 @@
 #include "rgw_lc.h"
 #include "rgw_torrent.h"
 #include "rgw_tag.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
 
 #include "include/assert.h"
 
@@ -582,40 +584,51 @@ public:
 }; /* RGWBulkUploadOp::AlignedStreamGetter */
 
 
+struct RGWUsageStats {
+  uint64_t bytes_used = 0;
+  uint64_t bytes_used_rounded = 0;
+  uint64_t buckets_count = 0;
+  uint64_t objects_count = 0;
+};
+
 #define RGW_LIST_BUCKETS_LIMIT_MAX 10000
 
 class RGWListBuckets : public RGWOp {
 protected:
   bool sent_data;
-  string marker;
-  string end_marker;
+  std::string marker;
+  std::string end_marker;
   int64_t limit;
   uint64_t limit_max;
-  uint32_t buckets_count;
-  uint64_t buckets_objcount;
-  uint64_t buckets_size;
-  uint64_t buckets_size_rounded;
-  map<string, bufferlist> attrs;
+  std::map<std::string, ceph::bufferlist> attrs;
   bool is_truncated;
 
+  RGWUsageStats global_stats;
+  std::map<std::string, RGWUsageStats> policies_stats;
+
   virtual uint64_t get_default_max() const {
     return 1000;
   }
 
 public:
-  RGWListBuckets() : sent_data(false) {
-    limit = limit_max = RGW_LIST_BUCKETS_LIMIT_MAX;
-    buckets_count = 0;
-    buckets_objcount = 0;
-    buckets_size = 0;
-    buckets_size_rounded = 0;
-    is_truncated = false;
+  RGWListBuckets()
+    : sent_data(false),
+      limit(RGW_LIST_BUCKETS_LIMIT_MAX),
+      limit_max(RGW_LIST_BUCKETS_LIMIT_MAX),
+      is_truncated(false) {
   }
 
   int verify_permission() override;
   void execute() override;
 
   virtual int get_params() = 0;
+  virtual void handle_listing_chunk(RGWUserBuckets&& buckets) {
+    /* The default implementation, used by e.g. S3, just generates a new
+     * part of the listing and sends it to the client immediately. Swift can behave
+     * differently: when the reverse option is requested, all incoming
+     * instances of RGWUserBuckets are buffered and finally reversed. */
+    return send_response_data(buckets);
+  }
   virtual void send_response_begin(bool has_buckets) = 0;
   virtual void send_response_data(RGWUserBuckets& buckets) = 0;
   virtual void send_response_end() = 0;
@@ -659,24 +672,17 @@ public:
 
 class RGWStatAccount : public RGWOp {
 protected:
-  uint32_t buckets_count;
-  uint64_t buckets_objcount;
-  uint64_t buckets_size;
-  uint64_t buckets_size_rounded;
+  RGWUsageStats global_stats;
+  std::map<std::string, RGWUsageStats> policies_stats;
 
 public:
-  RGWStatAccount() {
-    buckets_count = 0;
-    buckets_objcount = 0;
-    buckets_size = 0;
-    buckets_size_rounded = 0;
-  }
+  RGWStatAccount() = default;
 
   int verify_permission() override;
   void execute() override;
 
   void send_response() override = 0;
-  const string name() override { return "stat_account"; }
+  const std::string name() override { return "stat_account"; }
   RGWOpType get_type() override { return RGW_OP_STAT_ACCOUNT; }
   uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
 };
@@ -739,6 +745,7 @@ public:
 
   void send_response() override = 0;
   const string name() override { return "get_bucket_location"; }
+  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOCATION; }
   uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
 };
 
@@ -1590,6 +1597,27 @@ protected:
   char *data;
   int len;
 
+  struct MPSerializer {
+    librados::IoCtx ioctx;
+    rados::cls::lock::Lock lock;
+    librados::ObjectWriteOperation op;
+    std::string oid;
+    bool locked;
+
+    MPSerializer() : lock("RGWCompleteMultipart"), locked(false)
+      {}
+
+    int try_lock(const std::string& oid, utime_t dur);
+
+    int unlock() {
+      return lock.unlock(&ioctx, oid);
+    }
+
+    void clear_locked() {
+      locked = false;
+    }
+  } serializer;
+
 public:
   RGWCompleteMultipart() {
     data = NULL;
@@ -1602,6 +1630,7 @@ public:
   int verify_permission() override;
   void pre_exec() override;
   void execute() override;
+  void complete() override;
 
   virtual int get_params() = 0;
   void send_response() override = 0;
@@ -1885,38 +1914,73 @@ static inline void format_xattr(std::string &xattr)
  * map(<attr_name, attr_contents>, where attr_name is RGW_ATTR_PREFIX.HTTP_NAME)
  * s: The request state
  * attrs: will be filled up with attrs mapped as <attr_name, attr_contents>
+ * On success returns 0.
+ * On failure returns a negative error code.
  *
  */
-static inline void rgw_get_request_metadata(CephContext *cct,
-                                           struct req_info& info,
-                                           map<string, bufferlist>& attrs,
-                                           const bool allow_empty_attrs = true)
+static inline int rgw_get_request_metadata(CephContext* const cct,
+                                           struct req_info& info,
+                                           std::map<std::string, ceph::bufferlist>& attrs,
+                                           const bool allow_empty_attrs = true)
 {
   static const std::set<std::string> blacklisted_headers = {
       "x-amz-server-side-encryption-customer-algorithm",
       "x-amz-server-side-encryption-customer-key",
       "x-amz-server-side-encryption-customer-key-md5"
   };
-  map<string, string>::iterator iter;
-  for (iter = info.x_meta_map.begin(); iter != info.x_meta_map.end(); ++iter) {
-    const string &name(iter->first);
-    string &xattr(iter->second);
+
+  size_t valid_meta_count = 0;
+  for (auto& kv : info.x_meta_map) {
+    const std::string& name = kv.first;
+    std::string& xattr = kv.second;
+
     if (blacklisted_headers.count(name) == 1) {
       lsubdout(cct, rgw, 10) << "skipping x>> " << name << dendl;
       continue;
-    }
-    if (allow_empty_attrs || !xattr.empty()) {
+    } else if (allow_empty_attrs || !xattr.empty()) {
       lsubdout(cct, rgw, 10) << "x>> " << name << ":" << xattr << dendl;
       format_xattr(xattr);
-      string attr_name(RGW_ATTR_PREFIX);
+
+      std::string attr_name(RGW_ATTR_PREFIX);
       attr_name.append(name);
-      map<string, bufferlist>::value_type v(attr_name, bufferlist());
-      std::pair < map<string, bufferlist>::iterator, bool >
-       rval(attrs.insert(v));
-      bufferlist& bl(rval.first->second);
+
+      /* Check roughly whether we aren't exceeding the limit on attribute
+       * name length. Passing here doesn't guarantee that an OSD will accept
+       * it, as ObjectStore::get_max_attr_name_length() can set the limit
+       * even lower than the "osd_max_attr_name_len" configurable.  */
+      const size_t max_attr_name_len = \
+        cct->_conf->get_val<size_t>("rgw_max_attr_name_len");
+      if (max_attr_name_len && attr_name.length() > max_attr_name_len) {
+        return -ENAMETOOLONG;
+      }
+
+      /* Similar remarks apply to the check for value size. We're verifying
+       * it early on the RGW side, as the limit is advertised in /info. */
+      const size_t max_attr_size = \
+        cct->_conf->get_val<size_t>("rgw_max_attr_size");
+      if (max_attr_size && xattr.length() > max_attr_size) {
+        return -EFBIG;
+      }
+
+      /* Swift allows administrators to limit the number of metadata items
+       * sent _in a single request_. */
+      const auto rgw_max_attrs_num_in_req = \
+        cct->_conf->get_val<size_t>("rgw_max_attrs_num_in_req");
+      if (rgw_max_attrs_num_in_req &&
+          ++valid_meta_count > rgw_max_attrs_num_in_req) {
+        return -E2BIG;
+      }
+
+      auto rval = attrs.emplace(std::move(attr_name), ceph::bufferlist());
+      /* At the moment the value of the freshly created attribute key-value
+       * pair is an empty bufferlist. */
+
+      ceph::bufferlist& bl = rval.first->second;
       bl.append(xattr.c_str(), xattr.size() + 1);
     }
   }
+
+  return 0;
 } /* rgw_get_request_metadata */
 
 static inline void encode_delete_at_attr(boost::optional<ceph::real_time> delete_at,
index d94dc0ca4cf74798d0f5b1317eaa601b8d5df434..ce3d1265facd45b8ab2e210511553486e3a7feca 100644 (file)
@@ -110,10 +110,9 @@ bool RGWQuotaCache<T>::can_use_cached_stats(RGWQuotaInfo& quota, RGWStorageStats
       quota.max_size_soft_threshold = quota.max_size * store->ctx()->_conf->rgw_bucket_quota_soft_threshold;
     }
 
-    const auto cached_stats_num_kb_rounded = rgw_rounded_kb(cached_stats.size_rounded);
-    if (cached_stats_num_kb_rounded >= (uint64_t)quota.max_size_soft_threshold) {
+    if (cached_stats.size_rounded  >= (uint64_t)quota.max_size_soft_threshold) {
       ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (size): "
-        << cached_stats_num_kb_rounded << " >= " << quota.max_size_soft_threshold << dendl;
+        << cached_stats.size_rounded << " >= " << quota.max_size_soft_threshold << dendl;
       return false;
     }
   }
index 9df547a19ade5cabb20b5cc96fd146aa71b47f37..a44fc01a850342f51fa5064a7f556f6e7270d125 100644 (file)
@@ -5557,17 +5557,13 @@ int RGWRados::Bucket::List::list_objects(int64_t max,
   result->clear();
 
   rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
-
-  rgw_obj_key end_marker_obj;
-  rgw_obj_index_key cur_end_marker;
-  if (!params.ns.empty()) {
-    end_marker_obj = rgw_obj_key(params.end_marker.name, params.end_marker.instance, params.ns);
-    end_marker_obj.ns = params.ns;
-    end_marker_obj.get_index_key(&cur_end_marker);
-  }
   rgw_obj_index_key cur_marker;
   marker_obj.get_index_key(&cur_marker);
 
+  rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
+                             params.ns);
+  rgw_obj_index_key cur_end_marker;
+  end_marker_obj.get_index_key(&cur_end_marker);
   const bool cur_end_marker_valid = !params.end_marker.empty();
 
   rgw_obj_key prefix_obj(params.prefix);
@@ -7013,8 +7009,14 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si
   meta.canceled = false;
 
   /* update quota cache */
-  store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
-                                     accounted_size, orig_size);
+  if (meta.completeMultipart) {
+    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
+                                       0, orig_size);
+  }
+  else {
+    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
+                                       accounted_size, orig_size);
+  }
   return 0;
 
 done_cancel:
@@ -8142,9 +8144,11 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
     if (tail_placement.bucket.name.empty()) {
       manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
     }
+    string ref_tag;
     for (; miter != astate->manifest.obj_end(); ++miter) {
       ObjectWriteOperation op;
-      cls_refcount_get(op, tag, true);
+      ref_tag = tag + '\0';
+      cls_refcount_get(op, ref_tag, true);
       const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
       ref.ioctx.locator_set_key(loc.loc);
 
@@ -8898,7 +8902,6 @@ int RGWRados::Object::Delete::delete_obj()
   index_op.set_zones_trace(params.zones_trace);
   index_op.set_bilog_flags(params.bilog_flags);
 
-
   r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
   if (r < 0)
     return r;
@@ -9728,10 +9731,13 @@ int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& ob
       return r;
 
     bl.append(tag.c_str(), tag.size() + 1);
-
     op.setxattr(RGW_ATTR_ID_TAG,  bl);
   }
 
+
+  real_time mtime = real_clock::now();
+  struct timespec mtime_ts = real_clock::to_timespec(mtime);
+  op.mtime2(&mtime_ts);
   r = ref.ioctx.operate(ref.oid, &op);
   if (state) {
     if (r >= 0) {
@@ -9742,7 +9748,6 @@ int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& ob
       string content_type(content_type_bl.c_str(), content_type_bl.length());
       uint64_t epoch = ref.ioctx.get_last_version();
       int64_t poolid = ref.ioctx.get_id();
-      real_time mtime = real_clock::now();
       r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
                             mtime, etag, content_type, &acl_bl,
                             RGW_OBJ_CATEGORY_MAIN, NULL);
@@ -12185,6 +12190,10 @@ int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
         ent.size_rounded += stats.total_size_rounded;
       }
     }
+
+    // fill in placement_rule from the bucket instance for use in Swift's
+    // per-storage-policy statistics
+    ent.placement_rule = std::move(bucket_info.placement_rule);
   }
 
   return m.size();
@@ -12821,8 +12830,11 @@ int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_
     const string& name = vcurrents[pos]->first;
     struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
 
-    bool force_check = force_check_filter && force_check_filter(dirent.key.name);
-    if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) {
+    bool force_check = force_check_filter &&
+        force_check_filter(dirent.key.name);
+    if ((!dirent.exists && !dirent.is_delete_marker()) ||
+        !dirent.pending_map.empty() ||
+        force_check) {
       /* there are uncommitted ops. We need to check the current state,
        * and if the tags are old we need to do cleanup as well. */
       librados::IoCtx sub_ctx;
index 6984192f0f497cdd999c63ada76ce236e3f43087..da916a59925904207dd63a61e09d35233684189d 100644 (file)
@@ -1860,7 +1860,7 @@ public:
   int get_zonegroup(RGWZoneGroup& zonegroup,
                    const string& zonegroup_id);
 
-  bool is_single_zonegroup()
+  bool is_single_zonegroup() const
   {
       return (period_map.zonegroups.size() == 1);
   }
@@ -2809,11 +2809,12 @@ public:
         const string *user_data;
         rgw_zone_set *zones_trace;
         bool modify_tail;
+        bool completeMultipart;
 
         MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
                  remove_objs(NULL), category(RGW_OBJ_CATEGORY_MAIN), flags(0),
                  if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false), user_data(nullptr), zones_trace(nullptr),
-                 modify_tail(false) {}
+                 modify_tail(false),  completeMultipart(false) {}
       } meta;
 
       explicit Write(RGWRados::Object *_target) : target(_target) {}
@@ -3593,6 +3594,11 @@ public:
       (get_zonegroup().zones.size() > 1 || current_period.is_multi_zonegroups_with_zones());
   }
 
+  bool can_reshard() const {
+    return current_period.get_id().empty() ||
+      (zonegroup.zones.size() == 1 && current_period.is_single_zonegroup());
+  }
+
   librados::Rados* get_rados_handle();
 
   int delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles);
index 0389ca33dce293c4fba356db6b3568d5416b1e76..a3a712c767234f2cf414c7fd12c7d212efdb062d 100644 (file)
@@ -595,6 +595,11 @@ void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& buc
 
 int RGWReshard::add(cls_rgw_reshard_entry& entry)
 {
+  if (!store->can_reshard()) {
+    ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl;
+    return 0;
+  }
+
   string logshard_oid;
 
   get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
@@ -856,6 +861,10 @@ void  RGWReshard::get_logshard_oid(int shard_num, string *logshard)
 
 int RGWReshard::process_all_logshards()
 {
+  if (!store->can_reshard()) {
+    ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl;
+    return 0;
+  }
   int ret = 0;
 
   for (int i = 0; i < num_logshards; i++) {
@@ -899,14 +908,11 @@ void *RGWReshard::ReshardWorker::entry() {
   utime_t last_run;
   do {
     utime_t start = ceph_clock_now();
-    ldout(cct, 2) << "object expiration: start" << dendl;
     if (reshard->process_all_logshards()) {
       /* All shards have been processed properly. Next time we can start
        * from this moment. */
       last_run = start;
     }
-    ldout(cct, 2) << "object expiration: stop" << dendl;
-
 
     if (reshard->going_down())
       break;
index f780ab4abacd75b55d2171bb438d9ede5ca69439..515057dfcd679c8b622603477ed3f23022903239 100644 (file)
@@ -662,6 +662,7 @@ extern void dump_header(struct req_state* s,
 extern void dump_header(struct req_state* s,
                         const boost::string_ref& name,
                         const utime_t& val);
+
 template <class... Args>
 static inline void dump_header_prefixed(struct req_state* s,
                                         const boost::string_ref& name_prefix,
@@ -677,6 +678,24 @@ static inline void dump_header_prefixed(struct req_state* s,
   return dump_header(s, std::move(full_name), std::forward<Args>(args)...);
 }
 
+template <class... Args>
+static inline void dump_header_infixed(struct req_state* s,
+                                       const boost::string_ref& prefix,
+                                       const boost::string_ref& infix,
+                                       const boost::string_ref& sufix,
+                                       Args&&... args) {
+  char full_name_buf[prefix.size() + infix.size() + sufix.size() + 1];
+  const auto len = snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s%.*s",
+                            static_cast<int>(prefix.length()),
+                            prefix.data(),
+                            static_cast<int>(infix.length()),
+                            infix.data(),
+                            static_cast<int>(sufix.length()),
+                            sufix.data());
+  boost::string_ref full_name(full_name_buf, len);
+  return dump_header(s, std::move(full_name), std::forward<Args>(args)...);
+}
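
dump_header_infixed assembles prefix + infix + suffix into a single header name on the stack and forwards it to dump_header; the Swift account-stats code later in this diff uses it to emit headers such as X-Account-Storage-Policy-<Name>-Bytes-Used. A sketch of the same name assembly with std::string concatenation and a stubbed sink (dump_header below is a stand-in, not the real RGW function):

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Stub sink standing in for dump_header(req_state*, name, value).
    static void dump_header(const std::string& name, uint64_t val) {
      std::cout << name << ": " << val << "\n";
    }

    // Same idea as dump_header_infixed: prefix + infix + suffix -> one name.
    static void dump_header_infixed(const std::string& prefix,
                                    const std::string& infix,
                                    const std::string& suffix,
                                    uint64_t val) {
      dump_header(prefix + infix + suffix, val);
    }

    int main() {
      // Mirrors the Swift per-policy stats emitted further down in this diff.
      dump_header_infixed("X-Account-Storage-Policy-", "Gold",
                          "-Bytes-Used", 1048576);
    }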
+
 template <class... Args>
 static inline void dump_header_quoted(struct req_state* s,
                                       const boost::string_ref& name,
index 96f7cb7e5053201709969eba074f9a5f632a00f8..ba1822f38a16ae37f4f745f2724e82e15e840e82 100644 (file)
@@ -2,6 +2,7 @@
 // vim: ts=8 sw=2 smarttab
 
 #include <boost/algorithm/string/predicate.hpp>
+#include <boost/format.hpp>
 #include <boost/optional.hpp>
 #include <boost/utility/in_place_factory.hpp>
 
@@ -38,10 +39,15 @@ int RGWListBuckets_ObjStore_SWIFT::get_params()
   prefix = s->info.args.get("prefix");
   marker = s->info.args.get("marker");
   end_marker = s->info.args.get("end_marker");
+  wants_reversed = s->info.args.exists("reverse");
 
-  string limit_str = s->info.args.get("limit");
+  if (wants_reversed) {
+    std::swap(marker, end_marker);
+  }
+
+  std::string limit_str = s->info.args.get("limit");
   if (!limit_str.empty()) {
-    string err;
+    std::string err;
     long l = strict_strtol(limit_str.c_str(), 10, &err);
     if (!err.empty()) {
       return -EINVAL;
@@ -73,10 +79,8 @@ int RGWListBuckets_ObjStore_SWIFT::get_params()
 }
 
 static void dump_account_metadata(struct req_state * const s,
-                                  const uint32_t buckets_count,
-                                  const uint64_t buckets_object_count,
-                                  const uint64_t buckets_size,
-                                  const uint64_t buckets_size_rounded,
+                                  const RGWUsageStats& global_stats,
+                                  const std::map<std::string, RGWUsageStats> policies_stats,
                                   /* const */map<string, bufferlist>& attrs,
                                   const RGWQuotaInfo& quota,
                                   const RGWAccessControlPolicy_SWIFTAcct &policy)
@@ -84,10 +88,24 @@ static void dump_account_metadata(struct req_state * const s,
   /* Adding X-Timestamp to keep align with Swift API */
   dump_header(s, "X-Timestamp", ceph_clock_now());
 
-  dump_header(s, "X-Account-Container-Count", buckets_count);
-  dump_header(s, "X-Account-Object-Count", buckets_object_count);
-  dump_header(s, "X-Account-Bytes-Used", buckets_size);
-  dump_header(s, "X-Account-Bytes-Used-Actual", buckets_size_rounded);
+  dump_header(s, "X-Account-Container-Count", global_stats.buckets_count);
+  dump_header(s, "X-Account-Object-Count", global_stats.objects_count);
+  dump_header(s, "X-Account-Bytes-Used", global_stats.bytes_used);
+  dump_header(s, "X-Account-Bytes-Used-Actual", global_stats.bytes_used_rounded);
+
+  for (const auto& kv : policies_stats) {
+    const auto& policy_name = camelcase_dash_http_attr(kv.first);
+    const auto& policy_stats = kv.second;
+
+    dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+                        "-Container-Count", policy_stats.buckets_count);
+    dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+                        "-Object-Count", policy_stats.objects_count);
+    dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+                        "-Bytes-Used", policy_stats.bytes_used);
+    dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+                        "-Bytes-Used-Actual", policy_stats.bytes_used_rounded);
+  }
 
   /* Dump TempURL-related stuff */
   if (s->perm_mask == RGW_PERM_FULL_CONTROL) {
@@ -150,10 +168,8 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
   if (! s->cct->_conf->rgw_swift_enforce_content_length) {
     /* Adding account stats in the header to keep align with Swift API */
     dump_account_metadata(s,
-            buckets_count,
-            buckets_objcount,
-            buckets_size,
-            buckets_size_rounded,
+            global_stats,
+            policies_stats,
             attrs,
             user_quota,
             static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
@@ -170,6 +186,17 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
   }
 }
 
+void RGWListBuckets_ObjStore_SWIFT::handle_listing_chunk(RGWUserBuckets&& buckets)
+{
+  if (wants_reversed) {
+    /* Just store in the reversal buffer. Its content will be handled later,
+     * in send_response_end(). */
+    reverse_buffer.emplace(std::begin(reverse_buffer), std::move(buckets));
+  } else {
+    return send_response_data(buckets);
+  }
+}
+
 void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets)
 {
   if (! sent_data) {
@@ -184,23 +211,61 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets)
   for (auto iter = m.lower_bound(prefix);
        iter != m.end() && boost::algorithm::starts_with(iter->first, prefix);
        ++iter) {
-    const RGWBucketEnt& obj = iter->second;
+    dump_bucket_entry(iter->second);
+  }
+}
 
-    s->formatter->open_object_section("container");
-    s->formatter->dump_string("name", obj.bucket.name);
-    if (need_stats) {
-      s->formatter->dump_int("count", obj.count);
-      s->formatter->dump_int("bytes", obj.size);
-    }
-    s->formatter->close_section();
-    if (! s->cct->_conf->rgw_swift_enforce_content_length) {
-      rgw_flush_formatter(s, s->formatter);
-    }
+void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const RGWBucketEnt& obj)
+{
+  s->formatter->open_object_section("container");
+  s->formatter->dump_string("name", obj.bucket.name);
+
+  if (need_stats) {
+    s->formatter->dump_int("count", obj.count);
+    s->formatter->dump_int("bytes", obj.size);
+  }
+
+  s->formatter->close_section();
+
+  if (! s->cct->_conf->rgw_swift_enforce_content_length) {
+    rgw_flush_formatter(s, s->formatter);
+  }
+}
+
+void RGWListBuckets_ObjStore_SWIFT::send_response_data_reversed(RGWUserBuckets& buckets)
+{
+  if (! sent_data) {
+    return;
+  }
+
+  /* Take care of the prefix parameter of the Swift API. There is no point
+   * in applying the filter earlier, as we really need to go through all
+   * entries regardless of it (headers like X-Account-Container-Count
+   * aren't affected by specifying a prefix). */
+  std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets();
+
+  auto iter = m.rbegin();
+  for (/* initialized above */;
+       iter != m.rend() && !boost::algorithm::starts_with(iter->first, prefix);
+       ++iter) {
+    /* NOP */;
+  }
+
+  for (/* iter carried */;
+       iter != m.rend() && boost::algorithm::starts_with(iter->first, prefix);
+       ++iter) {
+    dump_bucket_entry(iter->second);
   }
 }
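
Under ?reverse, handle_listing_chunk() does not stream chunks: each incoming RGWUserBuckets is pushed onto the front of reverse_buffer, and send_response_end() replays that buffer with every chunk's map iterated back-to-front, yielding a globally reversed listing. A compact sketch of the buffer-and-replay scheme, with plain maps standing in for RGWUserBuckets:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    using Chunk = std::map<std::string, int>;  // name -> stats

    int main() {
      bool wants_reversed = true;
      std::vector<Chunk> reverse_buffer;

      // handle_listing_chunk(): buffer the newest chunk at the front.
      for (Chunk chunk : {Chunk{{"a", 1}, {"b", 2}}, Chunk{{"c", 3}}}) {
        if (wants_reversed) {
          reverse_buffer.emplace(reverse_buffer.begin(), std::move(chunk));
        } // else: stream it immediately, as the non-reversed path does
      }

      // send_response_end(): replay chunks, each iterated back-to-front.
      for (const auto& chunk : reverse_buffer) {
        for (auto it = chunk.rbegin(); it != chunk.rend(); ++it) {
          std::cout << it->first << "\n";  // prints c, b, a
        }
      }
    }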
 
 void RGWListBuckets_ObjStore_SWIFT::send_response_end()
 {
+  if (wants_reversed) {
+    for (auto& buckets : reverse_buffer) {
+      send_response_data_reversed(buckets);
+    }
+  }
+
   if (sent_data) {
     s->formatter->close_section();
   }
@@ -208,15 +273,13 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_end()
   if (s->cct->_conf->rgw_swift_enforce_content_length) {
     /* Adding account stats in the header to keep align with Swift API */
     dump_account_metadata(s,
-            buckets_count,
-            buckets_objcount,
-            buckets_size,
-            buckets_size_rounded,
+            global_stats,
+            policies_stats,
             attrs,
             user_quota,
             static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
     dump_errno(s);
-    end_header(s, NULL, NULL, s->formatter->get_len(), true);
+    end_header(s, nullptr, nullptr, s->formatter->get_len(), true);
   }
 
   if (sent_data || s->cct->_conf->rgw_swift_enforce_content_length) {
@@ -470,10 +533,8 @@ void RGWStatAccount_ObjStore_SWIFT::send_response()
   if (op_ret >= 0) {
     op_ret = STATUS_NO_CONTENT;
     dump_account_metadata(s,
-            buckets_count,
-            buckets_objcount,
-            buckets_size,
-            buckets_size_rounded,
+            global_stats,
+            policies_stats,
             attrs,
             user_quota,
             static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
@@ -646,13 +707,41 @@ int RGWCreateBucket_ObjStore_SWIFT::get_params()
   return get_swift_versioning_settings(s, swift_ver_location);
 }
 
+static inline int handle_metadata_errors(req_state* const s, const int op_ret)
+{
+  if (op_ret == -EFBIG) {
+    /* Handle the custom error message of exceeding maximum custom attribute
+     * (stored as xattr) size. */
+    const auto error_message = boost::str(
+      boost::format("Metadata value longer than %lld")
+        % s->cct->_conf->get_val<size_t>("rgw_max_attr_size"));
+    set_req_state_err(s, EINVAL, error_message);
+    return -EINVAL;
+  } else if (op_ret == -E2BIG) {
+    const auto error_message = boost::str(
+      boost::format("Too many metadata items; max %lld")
+        % s->cct->_conf->get_val<size_t>("rgw_max_attrs_num_in_req"));
+    set_req_state_err(s, EINVAL, error_message);
+    return -EINVAL;
+  }
+
+  return op_ret;
+}
+
 void RGWCreateBucket_ObjStore_SWIFT::send_response()
 {
-  if (! op_ret)
-    op_ret = STATUS_CREATED;
-  else if (op_ret == -ERR_BUCKET_EXISTS)
-    op_ret = STATUS_ACCEPTED;
-  set_req_state_err(s, op_ret);
+  const auto meta_ret = handle_metadata_errors(s, op_ret);
+  if (meta_ret != op_ret) {
+    op_ret = meta_ret;
+  } else {
+    if (!op_ret) {
+      op_ret = STATUS_CREATED;
+    } else if (op_ret == -ERR_BUCKET_EXISTS) {
+      op_ret = STATUS_ACCEPTED;
+    }
+    set_req_state_err(s, op_ret);
+  }
+
   dump_errno(s);
   /* Propose ending HTTP header with 0 Content-Length header. */
   end_header(s, NULL, NULL, 0);
@@ -819,8 +908,14 @@ int RGWPutObj_ObjStore_SWIFT::get_params()
 
 void RGWPutObj_ObjStore_SWIFT::send_response()
 {
-  if (! op_ret) {
-    op_ret = STATUS_CREATED;
+  const auto meta_ret = handle_metadata_errors(s, op_ret);
+  if (meta_ret) {
+    op_ret = meta_ret;
+  } else {
+    if (!op_ret) {
+      op_ret = STATUS_CREATED;
+    }
+    set_req_state_err(s, op_ret);
   }
 
   if (! lo_etag.empty()) {
@@ -892,10 +987,16 @@ int RGWPutMetadataAccount_ObjStore_SWIFT::get_params()
 
 void RGWPutMetadataAccount_ObjStore_SWIFT::send_response()
 {
-  if (! op_ret) {
-    op_ret = STATUS_NO_CONTENT;
+  const auto meta_ret = handle_metadata_errors(s, op_ret);
+  if (meta_ret != op_ret) {
+    op_ret = meta_ret;
+  } else {
+    if (!op_ret) {
+      op_ret = STATUS_NO_CONTENT;
+    }
+    set_req_state_err(s, op_ret);
   }
-  set_req_state_err(s, op_ret);
+
   dump_errno(s);
   end_header(s, this);
   rgw_flush_formatter_and_reset(s, s->formatter);
@@ -922,10 +1023,16 @@ int RGWPutMetadataBucket_ObjStore_SWIFT::get_params()
 
 void RGWPutMetadataBucket_ObjStore_SWIFT::send_response()
 {
-  if (!op_ret && (op_ret != -EINVAL)) {
-    op_ret = STATUS_NO_CONTENT;
+  const auto meta_ret = handle_metadata_errors(s, op_ret);
+  if (meta_ret != op_ret) {
+    op_ret = meta_ret;
+  } else {
+    if (!op_ret && (op_ret != -EINVAL)) {
+      op_ret = STATUS_NO_CONTENT;
+    }
+    set_req_state_err(s, op_ret);
   }
-  set_req_state_err(s, op_ret);
+
   dump_errno(s);
   end_header(s, this);
   rgw_flush_formatter_and_reset(s, s->formatter);
@@ -952,13 +1059,20 @@ int RGWPutMetadataObject_ObjStore_SWIFT::get_params()
 
 void RGWPutMetadataObject_ObjStore_SWIFT::send_response()
 {
-  if (! op_ret) {
-    op_ret = STATUS_ACCEPTED;
+  const auto meta_ret = handle_metadata_errors(s, op_ret);
+  if (meta_ret != op_ret) {
+    op_ret = meta_ret;
+  } else {
+    if (!op_ret) {
+      op_ret = STATUS_ACCEPTED;
+    }
+    set_req_state_err(s, op_ret);
   }
-  set_req_state_err(s, op_ret);
+
   if (!s->is_err()) {
     dump_content_length(s, 0);
   }
+
   dump_errno(s);
   end_header(s, this);
   rgw_flush_formatter_and_reset(s, s->formatter);
@@ -1660,7 +1774,25 @@ void RGWInfo_ObjStore_SWIFT::list_swift_data(Formatter& formatter,
 
   string ceph_version(CEPH_GIT_NICE_VER);
   formatter.dump_string("version", ceph_version);
-  formatter.dump_int("max_meta_name_length", 81);
+
+  const size_t max_attr_name_len = \
+    g_conf->get_val<size_t>("rgw_max_attr_name_len");
+  if (max_attr_name_len) {
+    const size_t meta_name_limit = \
+      max_attr_name_len - strlen(RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX);
+    formatter.dump_int("max_meta_name_length", meta_name_limit);
+  }
+
+  const size_t meta_value_limit = g_conf->get_val<size_t>("rgw_max_attr_size");
+  if (meta_value_limit) {
+    formatter.dump_int("max_meta_value_length", meta_value_limit);
+  }
+
+  const size_t meta_num_limit = \
+    g_conf->get_val<size_t>("rgw_max_attrs_num_in_req");
+  if (meta_num_limit) {
+    formatter.dump_int("max_meta_count", meta_num_limit);
+  }
 
   formatter.open_array_section("policies");
   RGWZoneGroup& zonegroup = store.get_zonegroup();
@@ -2591,11 +2723,22 @@ int RGWHandler_REST_SWIFT::postauth_init()
 
 int RGWHandler_REST_SWIFT::validate_bucket_name(const string& bucket)
 {
-  int ret = RGWHandler_REST::validate_bucket_name(bucket);
-  if (ret < 0)
-    return ret;
+  const size_t len = bucket.size();
 
-  int len = bucket.size();
+  if (len > MAX_BUCKET_NAME_LEN) {
+    /* Bucket name too long. Generate a custom error message that tells
+     * the client the configured limit. */
+    const auto msg = boost::str(
+      boost::format("Container name length of %lld longer than %lld")
+        % len % int(MAX_BUCKET_NAME_LEN));
+    set_req_state_err(s, ERR_INVALID_BUCKET_NAME, msg);
+    return -ERR_INVALID_BUCKET_NAME;
+  }
+
+  const auto ret = RGWHandler_REST::validate_bucket_name(bucket);
+  if (ret < 0) {
+    return ret;
+  }
 
   if (len == 0)
     return 0;
@@ -2608,7 +2751,7 @@ int RGWHandler_REST_SWIFT::validate_bucket_name(const string& bucket)
 
   const char *s = bucket.c_str();
 
-  for (int i = 0; i < len; ++i, ++s) {
+  for (size_t i = 0; i < len; ++i, ++s) {
     if (*(unsigned char *)s == 0xff)
       return -ERR_INVALID_BUCKET_NAME;
   }
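Note the reordering above: the length check now runs before the generic RGWHandler_REST validation, so Swift clients get a descriptive "Container name length of N longer than M" message, and the trailing loop still rejects any 0xff byte. A sketch under an assumed length cap:

MAX_BUCKET_NAME_LEN = 255  # illustrative; the real constant comes from the RGW headers

def validate_container_name(name_bytes):
    if len(name_bytes) > MAX_BUCKET_NAME_LEN:
        return "Container name length of %d longer than %d" % (
            len(name_bytes), MAX_BUCKET_NAME_LEN)
    # the generic REST-level validation would run here
    if b"\xff" in name_bytes:          # mirrors the byte-wise 0xff scan
        return "invalid container name"
    return None

assert validate_container_name(b"a" * 300) is not None
assert validate_container_name(b"movies") is None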
index 296b83acde49b5772f7340c191873e65d8c123c5..a0fd8cfd05798859fe34da0d433787e521cba7ca 100644 (file)
@@ -37,18 +37,27 @@ public:
 
 class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore {
   bool need_stats;
+  bool wants_reversed;
   std::string prefix;
+  std::vector<RGWUserBuckets> reverse_buffer;
 
   uint64_t get_default_max() const override {
     return 0;
   }
+
 public:
-  RGWListBuckets_ObjStore_SWIFT() : need_stats(true) {}
+  RGWListBuckets_ObjStore_SWIFT()
+    : need_stats(true),
+      wants_reversed(false) {
+  }
   ~RGWListBuckets_ObjStore_SWIFT() override {}
 
   int get_params() override;
+  void handle_listing_chunk(RGWUserBuckets&& buckets) override;
   void send_response_begin(bool has_buckets) override;
   void send_response_data(RGWUserBuckets& buckets) override;
+  void send_response_data_reversed(RGWUserBuckets& buckets);
+  void dump_bucket_entry(const RGWBucketEnt& obj);
   void send_response_end() override;
 
   bool should_get_stats() override { return need_stats; }
@@ -381,7 +390,7 @@ public:
   }
   ~RGWHandler_REST_SWIFT() override = default;
 
-  static int validate_bucket_name(const string& bucket);
+  int validate_bucket_name(const string& bucket);
 
   int init(RGWRados *store, struct req_state *s, rgw::io::BasicClient *cio) override;
   int authorize() override;
index 8539c3ed766409a923b70e836b88a1429e68d0b8..c8d8dd74265c8ed116d53699d65e717d2deb7c87 100644 (file)
@@ -32,6 +32,7 @@ void RGWOp_User_Info::execute()
 
   std::string uid_str;
   bool fetch_stats;
+  bool sync_stats;
 
   RESTArgs::get_string(s, "uid", uid_str, &uid_str);
 
@@ -47,8 +48,11 @@ void RGWOp_User_Info::execute()
 
   RESTArgs::get_bool(s, "stats", false, &fetch_stats);
 
+  RESTArgs::get_bool(s, "sync", false, &sync_stats);
+
   op_state.set_user_id(uid);
   op_state.set_fetch_stats(fetch_stats);
+  op_state.set_sync_stats(sync_stats);
 
   http_ret = RGWUserAdminOp_User::info(store, op_state, flusher);
 }
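Alongside the existing "stats" flag, the admin user-info op now accepts a "sync" boolean that is threaded into op_state as sync_stats; both default to false. A sketch of the query-string handling (parameter names from the hunk, parsing details illustrative):

def parse_user_info_args(query):
    def as_bool(key):
        return query.get(key, "false").lower() in ("true", "1", "yes")
    return {"fetch_stats": as_bool("stats"), "sync_stats": as_bool("sync")}

assert parse_user_info_args({"stats": "True", "sync": "True"}) == \
    {"fetch_stats": True, "sync_stats": True}
assert parse_user_info_args({}) == {"fetch_stats": False, "sync_stats": False}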
index 525671c34b4ad0c96b3fb6eead805cad69157596..15dfe032b3aa3989c0b3abfda4729f18837dd226 100644 (file)
@@ -203,13 +203,17 @@ class TempURLEngine::PrefixableSignatureHelper
   const boost::optional<const std::string&> prefix;
 
 public:
-  PrefixableSignatureHelper(const std::string& decoded_uri,
+  PrefixableSignatureHelper(const std::string& _decoded_uri,
                            const std::string& object_name,
                             const boost::optional<const std::string&> prefix)
-    : decoded_uri(decoded_uri),
+    : decoded_uri(_decoded_uri),
       object_name(object_name),
       prefix(prefix) {
-    /* Transform: v1/acct/cont/obj - > v1/acct/cont/ */
+    /* Transform: v1/acct/cont/obj -> v1/acct/cont/
+     *
+     * NOTE(rzarzynski): we really want to substr() on boost::string_view,
+     * not std::string. Otherwise we would end with no_obj_uri referencing
+     * a temporary. */
     no_obj_uri = \
       decoded_uri.substr(0, decoded_uri.length() - object_name.length());
   }
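The lifetime note in the comment is C++-specific (substr() must run on the boost::string_view so no_obj_uri does not end up referencing a destroyed temporary), but the transform itself is simple: strip the object name off the decoded path to get the container-scoped URI. In Python terms:

def container_uri(decoded_uri, object_name):
    # v1/acct/cont/obj -> v1/acct/cont/
    assert decoded_uri.endswith(object_name)
    return decoded_uri[:len(decoded_uri) - len(object_name)]

assert container_uri("v1/acct/cont/obj", "obj") == "v1/acct/cont/"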
index afab8e069745a38db28b741c95b339295bccd330..cc508202db855fd5e07be02eec58d6df0026d48f 100644 (file)
@@ -168,7 +168,7 @@ class DefaultStrategy : public rgw::auth::Strategy,
   /* The engines. */
   const rgw::auth::swift::TempURLEngine tempurl_engine;
   const rgw::auth::swift::SignedTokenEngine signed_engine;
-  const rgw::auth::keystone::TokenEngine keystone_engine;
+  boost::optional<const rgw::auth::keystone::TokenEngine> keystone_engine;
   const rgw::auth::swift::ExternalTokenEngine external_engine;
   const rgw::auth::swift::SwiftAnonymousEngine anon_engine;
 
@@ -229,11 +229,6 @@ public:
                     store,
                     static_cast<rgw::auth::TokenExtractor*>(this),
                     static_cast<rgw::auth::LocalApplier::Factory*>(this)),
-      keystone_engine(cct,
-                      static_cast<rgw::auth::TokenExtractor*>(this),
-                      static_cast<rgw::auth::RemoteApplier::Factory*>(this),
-                      keystone_config_t::get_instance(),
-                      keystone_cache_t::get_instance<keystone_config_t>()),
       external_engine(cct,
                       store,
                       static_cast<rgw::auth::TokenExtractor*>(this),
@@ -251,7 +246,13 @@ public:
     /* The auth strategy is responsible for deciding whether a particular
      * engine is disabled or not. */
     if (! cct->_conf->rgw_keystone_url.empty()) {
-      add_engine(Control::SUFFICIENT, keystone_engine);
+      keystone_engine.emplace(cct,
+                              static_cast<rgw::auth::TokenExtractor*>(this),
+                              static_cast<rgw::auth::RemoteApplier::Factory*>(this),
+                              keystone_config_t::get_instance(),
+                              keystone_cache_t::get_instance<keystone_config_t>());
+
+      add_engine(Control::SUFFICIENT, *keystone_engine);
     }
     if (! cct->_conf->rgw_swift_auth_url.empty()) {
       add_engine(Control::SUFFICIENT, external_engine);
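Wrapping keystone_engine in boost::optional lets the strategy skip constructing the engine entirely unless rgw_keystone_url is set, instead of always building it and merely not registering it. The registration logic, sketched with illustrative names:

def build_auth_engines(conf):
    engines = []
    if conf.get("rgw_keystone_url"):
        engines.append(("SUFFICIENT", "keystone"))   # built only on demand
    if conf.get("rgw_swift_auth_url"):
        engines.append(("SUFFICIENT", "external"))
    return engines

assert build_auth_engines({}) == []
assert build_auth_engines({"rgw_keystone_url": "http://keystone:5000"}) == \
    [("SUFFICIENT", "keystone")]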
index c1f8aaac0bd00c35f36fc2a52bffcbb00db32e9e..ad85148b3308e7b6ce30829d41b9e66da35aca62 100644 (file)
@@ -37,8 +37,10 @@ void seed::init(struct req_state *p_req, RGWRados *p_store)
   store = p_store;
 }
 
-void seed::get_torrent_file(int &op_ret, RGWRados::Object::Read &read_op, uint64_t &total_len, 
-  bufferlist &bl_data, rgw_obj &obj)
+int seed::get_torrent_file(RGWRados::Object::Read &read_op,
+                           uint64_t &total_len,
+                           ceph::bufferlist &bl_data,
+                           rgw_obj &obj)
 {
   /* add other field if config is set */
   dencode.bencode_dict(bl);
@@ -63,11 +65,12 @@ void seed::get_torrent_file(int &op_ret, RGWRados::Object::Read &read_op, uint64
   ldout(s->cct, 0) << "NOTICE: head obj oid= " << oid << dendl;
 
   obj_key.insert(RGW_OBJ_TORRENT);
-  op_ret = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
+  const int op_ret = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
   if (op_ret < 0)
   {
-    ldout(s->cct, 0) << "ERROR: failed to omap_get_vals_by_keys op_ret = " << op_ret << dendl;
-    return;
+    ldout(s->cct, 0) << "ERROR: failed to omap_get_vals_by_keys op_ret = "
+                     << op_ret << dendl;
+    return op_ret;
   }
 
   map<string, bufferlist>::iterator iter;
@@ -81,7 +84,7 @@ void seed::get_torrent_file(int &op_ret, RGWRados::Object::Read &read_op, uint64
 
   bl_data = bl;
   total_len = bl.length();
-  return;
+  return 0;
 }
 
 bool seed::get_flag()
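get_torrent_file() previously reported failure through an "int &op_ret" out-parameter while returning void; it now returns the code directly, so the omap lookup failure becomes an ordinary early return. A rough Python analogue of the new shape:

def get_torrent_file(omap, key):
    """Return (rc, data); rc < 0 mirrors the errno-style failure path."""
    if key not in omap:
        return -2, None        # stands in for a failed omap_get_vals_by_keys
    return 0, omap[key]

rc, data = get_torrent_file({"rgw.torrent": b"d8:announce..."}, "rgw.torrent")
assert rc == 0 and data.startswith(b"d8:")
assert get_torrent_file({}, "rgw.torrent") == (-2, None)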
index b33aac09b74c779aa6b5d97026b2abe16ea37404..fd6a0d6ed5882a16d0780faee9a54493db02e58d 100644 (file)
@@ -115,8 +115,10 @@ public:
 
   int get_params();
   void init(struct req_state *p_req, RGWRados *p_store);
-  void get_torrent_file(int &op_ret, RGWRados::Object::Read &read_op, 
-    uint64_t &total_len, bufferlist &bl_data, rgw_obj &obj);
+  int get_torrent_file(RGWRados::Object::Read &read_op,
+                       uint64_t &total_len,
+                       ceph::bufferlist &bl_data,
+                       rgw_obj &obj);
   
   off_t get_data_len();
   bool get_flag();
index ebe795e7d6f6bb5deac831d4d87000ce9171b96d..7fe88421805f13b5f1e08b1dd7d8365f2d6ceb01 100644 (file)
@@ -1760,7 +1760,7 @@ int RGWUser::init(RGWUserAdminOpState& op_state)
 {
   bool found = false;
   std::string swift_user;
-  rgw_user& uid = op_state.get_user_id();
+  user_id = op_state.get_user_id();
   std::string user_email = op_state.get_user_email();
   std::string access_key = op_state.get_access_key();
   std::string subuser = op_state.get_subuser();
@@ -1775,16 +1775,16 @@ int RGWUser::init(RGWUserAdminOpState& op_state)
 
   clear_populated();
 
-  if (uid.empty() && !subuser.empty()) {
+  if (user_id.empty() && !subuser.empty()) {
     size_t pos = subuser.find(':');
     if (pos != string::npos) {
-      uid = subuser.substr(0, pos);
-      op_state.set_user_id(uid);
+      user_id = subuser.substr(0, pos);
+      op_state.set_user_id(user_id);
     }
   }
 
-  if (!uid.empty() && (uid.compare(RGW_USER_ANON_ID) != 0)) {
-    found = (rgw_get_user_info_by_uid(store, uid, user_info, &op_state.objv) >= 0);
+  if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) {
+    found = (rgw_get_user_info_by_uid(store, user_id, user_info, &op_state.objv) >= 0);
     op_state.found_by_uid = found;
   }
   if (!user_email.empty() && !found) {
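The init path now keeps the resolved id in the user_id member rather than mutating a reference into op_state, and the subuser fallback still derives the uid from a "uid:subuser" spec when no uid was supplied. That fallback, sketched:

def uid_from_subuser(uid, subuser):
    if not uid and subuser and ":" in subuser:
        return subuser.split(":", 1)[0]
    return uid

assert uid_from_subuser("", "alice:backup") == "alice"
assert uid_from_subuser("bob", "alice:backup") == "bob"  # explicit uid wins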
@@ -1809,7 +1809,9 @@ int RGWUser::init(RGWUserAdminOpState& op_state)
     set_populated();
   }
 
-  user_id = user_info.user_id;
+  if (user_id.empty()) {
+    user_id = user_info.user_id;
+  }
   op_state.set_initialized();
 
   // this may have been called by a helper object
@@ -2363,6 +2365,13 @@ int RGWUserAdminOp_User::info(RGWRados *store, RGWUserAdminOpState& op_state,
   if (ret < 0)
     return ret;
 
+  if (op_state.sync_stats) {
+    ret = rgw_user_sync_all_stats(store, info.user_id);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
   RGWStorageStats stats;
   RGWStorageStats *arg_stats = NULL;
   if (op_state.fetch_stats) {
index 5e6754b0380ccee2fef8fc5a7f0af8e5311b7a51..516617a295574c985bcee22c3bbbca60de174569 100644 (file)
@@ -163,6 +163,7 @@ struct RGWUserAdminOpState {
   __u8 system;
   __u8 exclusive;
   __u8 fetch_stats;
+  __u8 sync_stats;
   std::string caps;
   RGWObjVersionTracker objv;
   uint32_t op_mask;
@@ -334,6 +335,10 @@ struct RGWUserAdminOpState {
     fetch_stats = is_fetch_stats;
   }
 
+  void set_sync_stats(__u8 is_sync_stats) {
+    sync_stats = is_sync_stats;
+  }
+
   void set_user_info(RGWUserInfo& user_info) {
     user_id = user_info.user_id;
     info = user_info;
index 50b35769106f1d08d06322058289f43418438252..f3999e3b0114abfdc56cae6c5501a03e470a8c71 100644 (file)
@@ -14,6 +14,7 @@ add_subdirectory(cls_log)
 add_subdirectory(cls_numops)
 add_subdirectory(cls_sdk)
 if(WITH_RBD)
+  add_subdirectory(cls_journal)
   add_subdirectory(cls_rbd)
 endif(WITH_RBD)
 add_subdirectory(cls_refcount)
@@ -347,6 +348,20 @@ target_link_libraries(ceph_test_librgw_file_aw
   ${EXTRALIBS}
   )
 
+# ceph_test_librgw_file_marker (READDIR with string and uint64 offsets)
+add_executable(ceph_test_librgw_file_marker
+  librgw_file_marker.cc
+  )
+set_target_properties(ceph_test_librgw_file_marker PROPERTIES COMPILE_FLAGS
+  ${UNITTEST_CXX_FLAGS})
+target_link_libraries(ceph_test_librgw_file_marker
+  rgw
+  librados
+  ceph-common
+  ${UNITTEST_LIBS}
+  ${EXTRALIBS}
+  )
+
 # ceph_test_rgw_token
 add_executable(ceph_test_rgw_token
   test_rgw_token.cc
diff --git a/ceph/src/test/ceph_objectstore_tool.py b/ceph/src/test/ceph_objectstore_tool.py
deleted file mode 100755 (executable)
index bae12f4..0000000
+++ /dev/null
@@ -1,1997 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import print_function
-from subprocess import call
-try:
-    from subprocess import check_output
-except ImportError:
-    def check_output(*popenargs, **kwargs):
-        import subprocess
-        # backported from python 2.7 stdlib
-        process = subprocess.Popen(
-            stdout=subprocess.PIPE, *popenargs, **kwargs)
-        output, unused_err = process.communicate()
-        retcode = process.poll()
-        if retcode:
-            cmd = kwargs.get("args")
-            if cmd is None:
-                cmd = popenargs[0]
-            error = subprocess.CalledProcessError(retcode, cmd)
-            error.output = output
-            raise error
-        return output
-
-import filecmp
-import os
-import subprocess
-import math
-import time
-import sys
-import re
-import logging
-import json
-import tempfile
-import platform
-
-try:
-    from subprocess import DEVNULL
-except ImportError:
-    DEVNULL = open(os.devnull, "wb")
-
-logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
-
-
-if sys.version_info[0] >= 3:
-    def decode(s):
-        return s.decode('utf-8')
-
-    def check_output(*args, **kwargs):
-        return decode(subprocess.check_output(*args, **kwargs))
-else:
-    def decode(s):
-        return s
-
-
-
-def wait_for_health():
-    print("Wait for health_ok...", end="")
-    tries = 0
-    while call("{path}/ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null".format(path=CEPH_BIN), shell=True) == 0:
-        tries += 1
-        if tries == 150:
-            raise Exception("Time exceeded to go to health")
-        time.sleep(1)
-    print("DONE")
-
-
-def get_pool_id(name, nullfd):
-    cmd = "{path}/ceph osd pool stats {pool}".format(pool=name, path=CEPH_BIN).split()
-    # "pool {pool} id #" .... grab the 4th field
-    return check_output(cmd, stderr=nullfd).split()[3]
-
-
-# return a list of unique PGS given an osd subdirectory
-def get_osd_pgs(SUBDIR, ID):
-    PGS = []
-    if ID:
-        endhead = re.compile("{id}.*_head$".format(id=ID))
-    DIR = os.path.join(SUBDIR, "current")
-    PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID is None or endhead.match(f))]
-    PGS = [re.sub("_head", "", p) for p in PGS if "_head" in p]
-    return PGS
-
-
-# return a sorted list of unique PGs given a directory
-def get_pgs(DIR, ID):
-    OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
-    PGS = []
-    for d in OSDS:
-        SUBDIR = os.path.join(DIR, d)
-        PGS += get_osd_pgs(SUBDIR, ID)
-    return sorted(set(PGS))
-
-
-# return a sorted list of PGS a subset of ALLPGS that contain objects with prefix specified
-def get_objs(ALLPGS, prefix, DIR, ID):
-    OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
-    PGS = []
-    for d in OSDS:
-        DIRL2 = os.path.join(DIR, d)
-        SUBDIR = os.path.join(DIRL2, "current")
-        for p in ALLPGS:
-            PGDIR = p + "_head"
-            if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
-                continue
-            FINALDIR = os.path.join(SUBDIR, PGDIR)
-            # See if there are any objects there
-            if any(f for f in [val for _, _, fl in os.walk(FINALDIR) for val in fl] if f.startswith(prefix)):
-                PGS += [p]
-    return sorted(set(PGS))
-
-
-# return a sorted list of OSDS which have data from a given PG
-def get_osds(PG, DIR):
-    ALLOSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
-    OSDS = []
-    for d in ALLOSDS:
-        DIRL2 = os.path.join(DIR, d)
-        SUBDIR = os.path.join(DIRL2, "current")
-        PGDIR = PG + "_head"
-        if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
-            continue
-        OSDS += [d]
-    return sorted(OSDS)
-
-
-def get_lines(filename):
-    tmpfd = open(filename, "r")
-    line = True
-    lines = []
-    while line:
-        line = tmpfd.readline().rstrip('\n')
-        if line:
-            lines += [line]
-    tmpfd.close()
-    os.unlink(filename)
-    return lines
-
-
-def cat_file(level, filename):
-    if level < logging.getLogger().getEffectiveLevel():
-        return
-    print("File: " + filename)
-    with open(filename, "r") as f:
-        while True:
-            line = f.readline().rstrip('\n')
-            if not line:
-                break
-            print(line)
-    print("<EOF>")
-
-
-def vstart(new, opt=""):
-    print("vstarting....", end="")
-    NEW = new and "-n" or "-N"
-    call("MON=1 OSD=4 MDS=0 MGR=1 CEPH_PORT=7400 {path}/src/vstart.sh --short -l {new} -d {opt} > /dev/null 2>&1".format(new=NEW, opt=opt, path=CEPH_ROOT), shell=True)
-    print("DONE")
-
-
-def test_failure(cmd, errmsg, tty=False):
-    if tty:
-        try:
-            ttyfd = open("/dev/tty", "rwb")
-        except Exception as e:
-            logging.info(str(e))
-            logging.info("SKIP " + cmd)
-            return 0
-    TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
-    tmpfd = open(TMPFILE, "wb")
-
-    logging.debug(cmd)
-    if tty:
-        ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
-        ttyfd.close()
-    else:
-        ret = call(cmd, shell=True, stderr=tmpfd)
-    tmpfd.close()
-    if ret == 0:
-        logging.error(cmd)
-        logging.error("Should have failed, but got exit 0")
-        return 1
-    lines = get_lines(TMPFILE)
-    matched = [ l for l in lines if errmsg in l ]
-    if any(matched):
-        logging.info("Correctly failed with message \"" + matched[0] + "\"")
-        return 0
-    else:
-        logging.error("Command: " + cmd )
-        logging.error("Bad messages to stderr \"" + str(lines) + "\"")
-        logging.error("Expected \"" + errmsg + "\"")
-        return 1
-
-
-def get_nspace(num):
-    if num == 0:
-        return ""
-    return "ns{num}".format(num=num)
-
-
-def verify(DATADIR, POOL, NAME_PREFIX, db):
-    TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
-    ERRORS = 0
-    for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
-        nsfile = rawnsfile.split("__")[0]
-        clone = rawnsfile.split("__")[1]
-        nspace = nsfile.split("-")[0]
-        file = nsfile.split("-")[1]
-        # Skip clones
-        if clone != "head":
-            continue
-        path = os.path.join(DATADIR, rawnsfile)
-        try:
-            os.unlink(TMPFILE)
-        except:
-            pass
-        cmd = "{path}/rados -p {pool} -N '{nspace}' get {file} {out}".format(pool=POOL, file=file, out=TMPFILE, nspace=nspace, path=CEPH_BIN)
-        logging.debug(cmd)
-        call(cmd, shell=True, stdout=DEVNULL, stderr=DEVNULL)
-        cmd = "diff -q {src} {result}".format(src=path, result=TMPFILE)
-        logging.debug(cmd)
-        ret = call(cmd, shell=True)
-        if ret != 0:
-            logging.error("{file} data not imported properly".format(file=file))
-            ERRORS += 1
-        try:
-            os.unlink(TMPFILE)
-        except:
-            pass
-        for key, val in db[nspace][file]["xattr"].items():
-            cmd = "{path}/rados -p {pool} -N '{nspace}' getxattr {name} {key}".format(pool=POOL, name=file, key=key, nspace=nspace, path=CEPH_BIN)
-            logging.debug(cmd)
-            getval = check_output(cmd, shell=True, stderr=DEVNULL)
-            logging.debug("getxattr {key} {val}".format(key=key, val=getval))
-            if getval != val:
-                logging.error("getxattr of key {key} returned wrong val: {get} instead of {orig}".format(key=key, get=getval, orig=val))
-                ERRORS += 1
-                continue
-        hdr = db[nspace][file].get("omapheader", "")
-        cmd = "{path}/rados -p {pool} -N '{nspace}' getomapheader {name} {file}".format(pool=POOL, name=file, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
-        logging.debug(cmd)
-        ret = call(cmd, shell=True, stderr=DEVNULL)
-        if ret != 0:
-            logging.error("rados getomapheader returned {ret}".format(ret=ret))
-            ERRORS += 1
-        else:
-            getlines = get_lines(TMPFILE)
-            assert(len(getlines) == 0 or len(getlines) == 1)
-            if len(getlines) == 0:
-                gethdr = ""
-            else:
-                gethdr = getlines[0]
-            logging.debug("header: {hdr}".format(hdr=gethdr))
-            if gethdr != hdr:
-                logging.error("getomapheader returned wrong val: {get} instead of {orig}".format(get=gethdr, orig=hdr))
-                ERRORS += 1
-        for key, val in db[nspace][file]["omap"].items():
-            cmd = "{path}/rados -p {pool} -N '{nspace}' getomapval {name} {key} {file}".format(pool=POOL, name=file, key=key, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stderr=DEVNULL)
-            if ret != 0:
-                logging.error("getomapval returned {ret}".format(ret=ret))
-                ERRORS += 1
-                continue
-            getlines = get_lines(TMPFILE)
-            if len(getlines) != 1:
-                logging.error("Bad data from getomapval {lines}".format(lines=getlines))
-                ERRORS += 1
-                continue
-            getval = getlines[0]
-            logging.debug("getomapval {key} {val}".format(key=key, val=getval))
-            if getval != val:
-                logging.error("getomapval returned wrong val: {get} instead of {orig}".format(get=getval, orig=val))
-                ERRORS += 1
-        try:
-            os.unlink(TMPFILE)
-        except:
-            pass
-    return ERRORS
-
-
-def check_journal(jsondict):
-    errors = 0
-    if 'header' not in jsondict:
-        logging.error("Key 'header' not in dump-journal")
-        errors += 1
-    elif 'max_size' not in jsondict['header']:
-        logging.error("Key 'max_size' not in dump-journal header")
-        errors += 1
-    else:
-        print("\tJournal max_size = {size}".format(size=jsondict['header']['max_size']))
-    if 'entries' not in jsondict:
-        logging.error("Key 'entries' not in dump-journal output")
-        errors += 1
-    elif len(jsondict['entries']) == 0:
-        logging.info("No entries in journal found")
-    else:
-        errors += check_journal_entries(jsondict['entries'])
-    return errors
-
-
-def check_journal_entries(entries):
-    errors = 0
-    for enum in range(len(entries)):
-        if 'offset' not in entries[enum]:
-            logging.error("No 'offset' key in entry {e}".format(e=enum))
-            errors += 1
-        if 'seq' not in entries[enum]:
-            logging.error("No 'seq' key in entry {e}".format(e=enum))
-            errors += 1
-        if 'transactions' not in entries[enum]:
-            logging.error("No 'transactions' key in entry {e}".format(e=enum))
-            errors += 1
-        elif len(entries[enum]['transactions']) == 0:
-            logging.error("No transactions found in entry {e}".format(e=enum))
-            errors += 1
-        else:
-            errors += check_entry_transactions(entries[enum], enum)
-    return errors
-
-
-def check_entry_transactions(entry, enum):
-    errors = 0
-    for tnum in range(len(entry['transactions'])):
-        if 'trans_num' not in entry['transactions'][tnum]:
-            logging.error("Key 'trans_num' missing from entry {e} trans {t}".format(e=enum, t=tnum))
-            errors += 1
-        elif entry['transactions'][tnum]['trans_num'] != tnum:
-            ft = entry['transactions'][tnum]['trans_num']
-            logging.error("Bad trans_num ({ft}) entry {e} trans {t}".format(ft=ft, e=enum, t=tnum))
-            errors += 1
-        if 'ops' not in entry['transactions'][tnum]:
-            logging.error("Key 'ops' missing from entry {e} trans {t}".format(e=enum, t=tnum))
-            errors += 1
-        else:
-            errors += check_transaction_ops(entry['transactions'][tnum]['ops'], enum, tnum)
-    return errors
-
-
-def check_transaction_ops(ops, enum, tnum):
-    if len(ops) == 0:
-        logging.warning("No ops found in entry {e} trans {t}".format(e=enum, t=tnum))
-    errors = 0
-    for onum in range(len(ops)):
-        if 'op_num' not in ops[onum]:
-            logging.error("Key 'op_num' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
-            errors += 1
-        elif ops[onum]['op_num'] != onum:
-            fo = ops[onum]['op_num']
-            logging.error("Bad op_num ({fo}) from entry {e} trans {t} op {o}".format(fo=fo, e=enum, t=tnum, o=onum))
-            errors += 1
-        if 'op_name' not in ops[onum]:
-            logging.error("Key 'op_name' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
-            errors += 1
-    return errors
-
-
-def test_dump_journal(CFSD_PREFIX, osds):
-    ERRORS = 0
-    pid = os.getpid()
-    TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
-
-    for osd in osds:
-        # Test --op dump-journal by loading json
-        cmd = (CFSD_PREFIX + "--op dump-journal --format json").format(osd=osd)
-        logging.debug(cmd)
-        tmpfd = open(TMPFILE, "wb")
-        ret = call(cmd, shell=True, stdout=tmpfd)
-        if ret != 0:
-            logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
-            ERRORS += 1
-            continue
-        tmpfd.close()
-        tmpfd = open(TMPFILE, "r")
-        jsondict = json.load(tmpfd)
-        tmpfd.close()
-        os.unlink(TMPFILE)
-
-        journal_errors = check_journal(jsondict)
-        if journal_errors != 0:
-            logging.error(jsondict)
-        ERRORS += journal_errors
-
-    return ERRORS
-
-CEPH_BUILD_DIR = os.environ.get('CEPH_BUILD_DIR')
-CEPH_BIN = os.environ.get('CEPH_BIN')
-CEPH_ROOT = os.environ.get('CEPH_ROOT')
-
-if not CEPH_BUILD_DIR:
-    CEPH_BUILD_DIR=os.getcwd()
-    os.putenv('CEPH_BUILD_DIR', CEPH_BUILD_DIR)
-    CEPH_BIN=CEPH_BUILD_DIR
-    os.putenv('CEPH_BIN', CEPH_BIN)
-    CEPH_ROOT=os.path.dirname(CEPH_BUILD_DIR)
-    os.putenv('CEPH_ROOT', CEPH_ROOT)
-    CEPH_LIB=os.path.join(CEPH_BIN, '.libs')
-    os.putenv('CEPH_LIB', CEPH_LIB)
-
-CEPH_DIR = CEPH_BUILD_DIR + "/cot_dir"
-CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf')
-
-def kill_daemons():
-    call("{path}/init-ceph -c {conf} stop > /dev/null 2>&1".format(conf=CEPH_CONF, path=CEPH_BIN), shell=True)
-
-
-def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
-    repcount = 0
-    ERRORS = 0
-    for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
-        nsfile = rawnsfile.split("__")[0]
-        clone = rawnsfile.split("__")[1]
-        nspace = nsfile.split("-")[0]
-        file = nsfile.split("-")[1] + "__" + clone
-        # Skip clones
-        if clone != "head":
-            continue
-        path = os.path.join(DATADIR, rawnsfile)
-        tmpfd = open(TMPFILE, "wb")
-        cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
-        logging.debug(cmd)
-        ret = call(cmd, shell=True, stdout=tmpfd)
-        if ret:
-            logging.critical("INTERNAL ERROR")
-            return 1
-        tmpfd.close()
-        obj_locs = get_lines(TMPFILE)
-        if len(obj_locs) == 0:
-            logging.error("Can't find imported object {name}".format(name=file))
-            ERRORS += 1
-        for obj_loc in obj_locs:
-            # For btrfs skip snap_* dirs
-            if re.search("/snap_[0-9]*/", obj_loc) is not None:
-                continue
-            repcount += 1
-            cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True)
-            if ret != 0:
-                logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
-                ERRORS += 1
-    return ERRORS, repcount
-
-
-def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
-    # change the weight of each osd in osd_ids to the given weight in the newest osdmap of the given osd
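-    # Flow: extract the newest osdmap from the osd's store, export its crush
-    # map, reweight each osd in the crush map with crushtool, import the
-    # edited crush map back into the osdmap, then write the osdmap into the
-    # store with --force (after a --dry-run smoke test).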
-    osdmap_file = tempfile.NamedTemporaryFile(delete=True)
-    cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
-                                                                        osdmap_file=osdmap_file.name)
-    output = check_output(cmd, shell=True)
-    epoch = int(re.findall('#(\d+)', output)[0])
-
-    new_crush_file = tempfile.NamedTemporaryFile(delete=True)
-    old_crush_file = tempfile.NamedTemporaryFile(delete=True)
-    ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
-                                                                          crush_file=old_crush_file.name, path=CEPH_BIN),
-               stdout=DEVNULL,
-               stderr=DEVNULL,
-               shell=True)
-    assert(ret == 0)
-
-    for osd_id in osd_ids:
-        cmd = "{path}/crushtool -i {crush_file} --reweight-item osd.{osd} {weight} -o {new_crush_file}".format(osd=osd_id,
-                                                                                                          crush_file=old_crush_file.name,
-                                                                                                          weight=weight,
-                                                                                                          new_crush_file=new_crush_file.name, path=CEPH_BIN)
-        ret = call(cmd, stdout=DEVNULL, shell=True)
-        assert(ret == 0)
-        old_crush_file, new_crush_file = new_crush_file, old_crush_file
-
-    # change them back, since we don't need to prepare for another round
-    old_crush_file, new_crush_file = new_crush_file, old_crush_file
-    old_crush_file.close()
-
-    ret = call("{path}/osdmaptool --import-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
-                                                                               crush_file=new_crush_file.name, path=CEPH_BIN),
-               stdout=DEVNULL,
-               stderr=DEVNULL,
-               shell=True)
-    assert(ret == 0)
-
-    # Minimum test of --dry-run by using it, but not checking anything
-    cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force --dry-run"
-    cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
-    ret = call(cmd, stdout=DEVNULL, shell=True)
-    assert(ret == 0)
-
-    # osdmaptool increases the epoch of the changed osdmap, so we need to force the tool
-    # to use a different epoch than the one in osdmap
-    cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force"
-    cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
-    ret = call(cmd, stdout=DEVNULL, shell=True)
-
-    return ret == 0
-
-def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path):
-    osdmap_file = tempfile.NamedTemporaryFile(delete=True)
-    cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
-                                                                        osdmap_file=osdmap_file.name)
-    ret = call(cmd, stdout=DEVNULL, shell=True)
-    if ret != 0:
-        return None
-    # we have to read the weights from the crush map; we could also query them
-    # with osdmaptool, but keep in mind the two are different:
-    #    item weights in the crush map versus the weight associated with each osd in the osdmap
-    crush_file = tempfile.NamedTemporaryFile(delete=True)
-    ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
-                                                                               crush_file=crush_file.name, path=CEPH_BIN),
-               stdout=DEVNULL,
-               shell=True)
-    assert(ret == 0)
-    output = check_output("{path}/crushtool --tree -i {crush_file} | tail -n {num_osd}".format(crush_file=crush_file.name,
-                                                                                          num_osd=len(osd_ids), path=CEPH_BIN),
-                          stderr=DEVNULL,
-                          shell=True)
-    weights = []
-    for line in output.strip().split('\n'):
-        print(line)
-        linev = re.split('\s+', line)
-        if linev[0] == '':
-            linev.pop(0)
-        print('linev %s' % linev)
-        weights.append(float(linev[1]))
-
-    return weights
-
-
-def test_get_set_osdmap(CFSD_PREFIX, osd_ids, osd_paths):
-    print("Testing get-osdmap and set-osdmap")
-    errors = 0
-    kill_daemons()
-    weight = 1 / math.e           # just some magic number in [0, 1]
-    changed = []
-    for osd_path in osd_paths:
-        if set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
-            changed.append(osd_path)
-        else:
-            logging.warning("Failed to change the weights: {0}".format(osd_path))
-    # it is an error if none of the stores was changed
-    if not changed:
-        errors += 1
-
-    for osd_path in changed:
-        weights = get_osd_weights(CFSD_PREFIX, osd_ids, osd_path)
-        if not weights:
-            errors += 1
-            continue
-        if any(abs(w - weight) > 1e-5 for w in weights):
-            logging.warning("Weight is not changed: {0} != {1}".format(weights, weight))
-            errors += 1
-    return errors
-
-def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path):
-    # incrementals are not used unless we need to build an MOSDMap to update
-    # OSD's peers, so an obvious way to test it is simply overwrite an epoch
-    # with a different copy, and read it back to see if it matches.
-    kill_daemons()
-    file_e2 = tempfile.NamedTemporaryFile(delete=True)
-    cmd = (CFSD_PREFIX + "--op get-inc-osdmap --file {file}").format(osd=osd_path,
-                                                                     file=file_e2.name)
-    output = check_output(cmd, shell=True)
-    epoch = int(re.findall('#(\d+)', output)[0])
-    # backup e1 incremental before overwriting it
-    epoch -= 1
-    file_e1_backup = tempfile.NamedTemporaryFile(delete=True)
-    cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
-    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
-    if ret: return 1
-    # overwrite e1 with e2
-    cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}"
-    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True)
-    if ret: return 1
-    # Use dry-run to set back to e1 which shouldn't happen
-    cmd = CFSD_PREFIX + "--op set-inc-osdmap --dry-run --epoch {epoch} --file {file}"
-    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
-    if ret: return 1
-    # read from e1
-    file_e1_read = tempfile.NamedTemporaryFile(delete=True)
-    cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
-    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_read.name), shell=True)
-    if ret: return 1
-    errors = 0
-    try:
-        if not filecmp.cmp(file_e2.name, file_e1_read.name, shallow=False):
-            logging.error("{{get,set}}-inc-osdmap mismatch {0} != {1}".format(file_e2.name, file_e1_read.name))
-            errors += 1
-    finally:
-        # revert the change with file_e1_backup
-        cmd = CFSD_PREFIX + "--op set-inc-osdmap --epoch {epoch} --file {file}"
-        ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
-        if ret:
-            logging.error("Failed to revert the changed inc-osdmap")
-            errors += 1
-
-    return errors
-
-
-def test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS):
-    # Test removeall
-    TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
-    nullfd = open(os.devnull, "w")
-    errors=0
-    print("Test removeall")
-    kill_daemons()
-    for nspace in db.keys():
-        for basename in db[nspace].keys():
-            JSON = db[nspace][basename]['json']
-            for pg in OBJREPPGS:
-                OSDS = get_osds(pg, OSDDIR)
-                for osd in OSDS:
-                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
-                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
-                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
-                    if not fnames:
-                        continue
-
-                    if int(basename.split(REP_NAME)[1]) <= int(NUM_CLONED_REP_OBJECTS):
-                        cmd = (CFSD_PREFIX + "'{json}' remove").format(osd=osd, json=JSON)
-                        errors += test_failure(cmd, "Snapshots are present, use removeall to delete everything")
-
-                    cmd = (CFSD_PREFIX + " --force --dry-run '{json}' remove").format(osd=osd, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-                    if ret != 0:
-                        logging.error("remove with --force failed for {json}".format(json=JSON))
-                        errors += 1
-
-                    cmd = (CFSD_PREFIX + " --dry-run '{json}' removeall").format(osd=osd, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-                    if ret != 0:
-                        logging.error("removeall failed for {json}".format(json=JSON))
-                        errors += 1
-
-                    cmd = (CFSD_PREFIX + " '{json}' removeall").format(osd=osd, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-                    if ret != 0:
-                        logging.error("removeall failed for {json}".format(json=JSON))
-                        errors += 1
-
-                    tmpfd = open(TMPFILE, "w")
-                    cmd = (CFSD_PREFIX + "--op list --pgid {pg} --namespace {ns} {name}").format(osd=osd, pg=pg, ns=nspace, name=basename)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=tmpfd)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
-                        errors += 1
-                    tmpfd.close()
-                    lines = get_lines(TMPFILE)
-                    if len(lines) != 0:
-                        logging.error("Removeall didn't remove all objects {ns}/{name} : {lines}".format(ns=nspace, name=basename, lines=lines))
-                        errors += 1
-    vstart(new=False)
-    wait_for_health()
-    cmd = "{path}/rados -p {pool} rmsnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
-    logging.debug(cmd)
-    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    if ret != 0:
-        logging.error("rados rmsnap failed")
-        errors += 1
-    time.sleep(2)
-    wait_for_health()
-    return errors
-
-
-def main(argv):
-    if sys.version_info[0] < 3:
-        sys.stdout = stdout = os.fdopen(sys.stdout.fileno(), 'wb', 0)
-    else:
-        stdout = sys.stdout.buffer
-    if len(argv) > 1 and argv[1] == "debug":
-        nullfd = stdout
-    else:
-        nullfd = DEVNULL
-
-    call("rm -fr {dir}; mkdir {dir}".format(dir=CEPH_DIR), shell=True)
-    os.environ["CEPH_DIR"] = CEPH_DIR
-    OSDDIR = os.path.join(CEPH_DIR, "dev")
-    REP_POOL = "rep_pool"
-    REP_NAME = "REPobject"
-    EC_POOL = "ec_pool"
-    EC_NAME = "ECobject"
-    if len(argv) > 0 and argv[0] == 'large':
-        PG_COUNT = 12
-        NUM_REP_OBJECTS = 800
-        NUM_CLONED_REP_OBJECTS = 100
-        NUM_EC_OBJECTS = 12
-        NUM_NSPACES = 4
-        # Larger data sets for first object per namespace
-        DATALINECOUNT = 50000
-        # Number of objects to do xattr/omap testing on
-        ATTR_OBJS = 10
-    else:
-        PG_COUNT = 4
-        NUM_REP_OBJECTS = 2
-        NUM_CLONED_REP_OBJECTS = 2
-        NUM_EC_OBJECTS = 2
-        NUM_NSPACES = 2
-        # Larger data sets for first object per namespace
-        DATALINECOUNT = 10
-        # Number of objects to do xattr/omap testing on
-        ATTR_OBJS = 2
-    ERRORS = 0
-    pid = os.getpid()
-    TESTDIR = "/tmp/test.{pid}".format(pid=pid)
-    DATADIR = "/tmp/data.{pid}".format(pid=pid)
-    CFSD_PREFIX = CEPH_BIN + "/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} "
-    PROFNAME = "testecprofile"
-
-    os.environ['CEPH_CONF'] = CEPH_CONF
-    vstart(new=True)
-    wait_for_health()
-
-    cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=REP_POOL, pg=PG_COUNT, path=CEPH_BIN)
-    logging.debug(cmd)
-    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    REPID = get_pool_id(REP_POOL, nullfd)
-
-    print("Created Replicated pool #{repid}".format(repid=REPID))
-
-    cmd = "{path}/ceph osd erasure-code-profile set {prof} crush-failure-domain=osd".format(prof=PROFNAME, path=CEPH_BIN)
-    logging.debug(cmd)
-    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    cmd = "{path}/ceph osd erasure-code-profile get {prof}".format(prof=PROFNAME, path=CEPH_BIN)
-    logging.debug(cmd)
-    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    cmd = "{path}/ceph osd pool create {pool} {pg} {pg} erasure {prof}".format(pool=EC_POOL, prof=PROFNAME, pg=PG_COUNT, path=CEPH_BIN)
-    logging.debug(cmd)
-    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    ECID = get_pool_id(EC_POOL, nullfd)
-
-    print("Created Erasure coded pool #{ecid}".format(ecid=ECID))
-
-    print("Creating {objs} objects in replicated pool".format(objs=(NUM_REP_OBJECTS*NUM_NSPACES)))
-    cmd = "mkdir -p {datadir}".format(datadir=DATADIR)
-    logging.debug(cmd)
-    call(cmd, shell=True)
-
-    db = {}
-
-    objects = range(1, NUM_REP_OBJECTS + 1)
-    nspaces = range(NUM_NSPACES)
-    for n in nspaces:
-        nspace = get_nspace(n)
-
-        db[nspace] = {}
-
-        for i in objects:
-            NAME = REP_NAME + "{num}".format(num=i)
-            LNAME = nspace + "-" + NAME
-            DDNAME = os.path.join(DATADIR, LNAME)
-            DDNAME += "__head"
-
-            cmd = "rm -f " + DDNAME
-            logging.debug(cmd)
-            call(cmd, shell=True)
-
-            if i == 1:
-                dataline = range(DATALINECOUNT)
-            else:
-                dataline = range(1)
-            fd = open(DDNAME, "w")
-            data = "This is the replicated data for " + LNAME + "\n"
-            for _ in dataline:
-                fd.write(data)
-            fd.close()
-
-            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stderr=nullfd)
-            if ret != 0:
-                logging.critical("Rados put command failed with {ret}".format(ret=ret))
-                return 1
-
-            db[nspace][NAME] = {}
-
-            if i < ATTR_OBJS + 1:
-                keys = range(i)
-            else:
-                keys = range(0)
-            db[nspace][NAME]["xattr"] = {}
-            for k in keys:
-                if k == 0:
-                    continue
-                mykey = "key{i}-{k}".format(i=i, k=k)
-                myval = "val{i}-{k}".format(i=i, k=k)
-                cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True)
-                if ret != 0:
-                    logging.error("setxattr failed with {ret}".format(ret=ret))
-                    ERRORS += 1
-                db[nspace][NAME]["xattr"][mykey] = myval
-
-            # Create omap header in all objects but REPobject1
-            if i < ATTR_OBJS + 1 and i != 1:
-                myhdr = "hdr{i}".format(i=i)
-                cmd = "{path}/rados -p {pool} -N '{nspace}' setomapheader {name} {hdr}".format(pool=REP_POOL, name=NAME, hdr=myhdr, nspace=nspace, path=CEPH_BIN)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True)
-                if ret != 0:
-                    logging.critical("setomapheader failed with {ret}".format(ret=ret))
-                    ERRORS += 1
-                db[nspace][NAME]["omapheader"] = myhdr
-
-            db[nspace][NAME]["omap"] = {}
-            for k in keys:
-                if k == 0:
-                    continue
-                mykey = "okey{i}-{k}".format(i=i, k=k)
-                myval = "oval{i}-{k}".format(i=i, k=k)
-                cmd = "{path}/rados -p {pool} -N '{nspace}' setomapval {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True)
-                if ret != 0:
-                    logging.critical("setomapval failed with {ret}".format(ret=ret))
-                db[nspace][NAME]["omap"][mykey] = myval
-
-    # Create some clones
-    cmd = "{path}/rados -p {pool} mksnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
-    logging.debug(cmd)
-    call(cmd, shell=True)
-
-    objects = range(1, NUM_CLONED_REP_OBJECTS + 1)
-    nspaces = range(NUM_NSPACES)
-    for n in nspaces:
-        nspace = get_nspace(n)
-
-        for i in objects:
-            NAME = REP_NAME + "{num}".format(num=i)
-            LNAME = nspace + "-" + NAME
-            DDNAME = os.path.join(DATADIR, LNAME)
-            # First clone
-            CLONENAME = DDNAME + "__1"
-            DDNAME += "__head"
-
-            cmd = "mv -f " + DDNAME + " " + CLONENAME
-            logging.debug(cmd)
-            call(cmd, shell=True)
-
-            if i == 1:
-                dataline = range(DATALINECOUNT)
-            else:
-                dataline = range(1)
-            fd = open(DDNAME, "w")
-            data = "This is the replicated data after a snapshot for " + LNAME + "\n"
-            for _ in dataline:
-                fd.write(data)
-            fd.close()
-
-            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stderr=nullfd)
-            if ret != 0:
-                logging.critical("Rados put command failed with {ret}".format(ret=ret))
-                return 1
-
-    print("Creating {objs} objects in erasure coded pool".format(objs=(NUM_EC_OBJECTS*NUM_NSPACES)))
-
-    objects = range(1, NUM_EC_OBJECTS + 1)
-    nspaces = range(NUM_NSPACES)
-    for n in nspaces:
-        nspace = get_nspace(n)
-
-        for i in objects:
-            NAME = EC_NAME + "{num}".format(num=i)
-            LNAME = nspace + "-" + NAME
-            DDNAME = os.path.join(DATADIR, LNAME)
-            DDNAME += "__head"
-
-            cmd = "rm -f " + DDNAME
-            logging.debug(cmd)
-            call(cmd, shell=True)
-
-            if i == 1:
-                dataline = range(DATALINECOUNT)
-            else:
-                dataline = range(1)
-            fd = open(DDNAME, "w")
-            data = "This is the erasure coded data for " + LNAME + "\n"
-            for j in dataline:
-                fd.write(data)
-            fd.close()
-
-            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=EC_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stderr=nullfd)
-            if ret != 0:
-                logging.critical("Erasure coded pool creation failed with {ret}".format(ret=ret))
-                return 1
-
-            db[nspace][NAME] = {}
-
-            db[nspace][NAME]["xattr"] = {}
-            if i < ATTR_OBJS + 1:
-                keys = range(i)
-            else:
-                keys = range(0)
-            for k in keys:
-                if k == 0:
-                    continue
-                mykey = "key{i}-{k}".format(i=i, k=k)
-                myval = "val{i}-{k}".format(i=i, k=k)
-                cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=EC_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True)
-                if ret != 0:
-                    logging.error("setxattr failed with {ret}".format(ret=ret))
-                    ERRORS += 1
-                db[nspace][NAME]["xattr"][mykey] = myval
-
-            # Omap isn't supported in EC pools
-            db[nspace][NAME]["omap"] = {}
-
-    logging.debug(db)
-
-    kill_daemons()
-
-    if ERRORS:
-        logging.critical("Unable to set up test")
-        return 1
-
-    ALLREPPGS = get_pgs(OSDDIR, REPID)
-    logging.debug(ALLREPPGS)
-    ALLECPGS = get_pgs(OSDDIR, ECID)
-    logging.debug(ALLECPGS)
-
-    OBJREPPGS = get_objs(ALLREPPGS, REP_NAME, OSDDIR, REPID)
-    logging.debug(OBJREPPGS)
-    OBJECPGS = get_objs(ALLECPGS, EC_NAME, OSDDIR, ECID)
-    logging.debug(OBJECPGS)
-
-    ONEPG = ALLREPPGS[0]
-    logging.debug(ONEPG)
-    osds = get_osds(ONEPG, OSDDIR)
-    ONEOSD = osds[0]
-    logging.debug(ONEOSD)
-
-    print("Test invalid parameters")
-    # On export can't use stdout to a terminal
-    cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
-
-    # On export can't use stdout to a terminal
-    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
-
-    # Prep a valid ec export file for import failure tests
-    ONEECPG = ALLECPGS[0]
-    osds = get_osds(ONEECPG, OSDDIR)
-    ONEECOSD = osds[0]
-    OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
-    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=ONEECPG, file=OTHERFILE)
-    logging.debug(cmd)
-    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-
-    # On import can't specify a different shard
-    BADPG = ONEECPG.split('s')[0] + "s10"
-    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=BADPG, file=OTHERFILE)
-    ERRORS += test_failure(cmd, "Can't specify a different shard, must be")
-
-    os.unlink(OTHERFILE)
-
-    # Prep a valid export file for import failure tests
-    OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
-    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
-    logging.debug(cmd)
-    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-
-    # On import can't specify a PG with a non-existent pool
-    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg="10.0", file=OTHERFILE)
-    ERRORS += test_failure(cmd, "Can't specify a different pgid pool, must be")
-
-    # On import can't specify shard for a replicated export
-    cmd = (CFSD_PREFIX + "--op import --pgid {pg}s0 --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
-    ERRORS += test_failure(cmd, "Can't specify a sharded pgid with a non-sharded export")
-
-    # On import can't specify a PG with a bad seed
-    TMPPG="{pool}.80".format(pool=REPID)
-    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg=TMPPG, file=OTHERFILE)
-    ERRORS += test_failure(cmd, "Illegal pgid, the seed is larger than current pg_num")
-
-    os.unlink(OTHERFILE)
-    cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE)
-    ERRORS += test_failure(cmd, "file: {FOO}: No such file or directory".format(FOO=OTHERFILE))
-
-    cmd = "{path}/ceph-objectstore-tool --data-path BAD_DATA_PATH --op list".format(osd=ONEOSD, path=CEPH_BIN)
-    ERRORS += test_failure(cmd, "data-path: BAD_DATA_PATH: No such file or directory")
-
-    cmd = "{path}/ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal".format(path=CEPH_BIN)
-    ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory")
-
-    # On import can't use stdin from a terminal
-    cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
-
-    # On import can't use stdin from a terminal
-    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
-
-    # Specify a bad --type
-    os.mkdir(OSDDIR + "/fakeosd")
-    cmd = ("{path}/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} --type foobar --op list --pgid {pg}").format(osd="fakeosd", pg=ONEPG, path=CEPH_BIN)
-    ERRORS += test_failure(cmd, "Unable to create store of type foobar")
-
-    # Don't specify a data-path
-    cmd = "{path}/ceph-objectstore-tool --type memstore --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG, path=CEPH_BIN)
-    ERRORS += test_failure(cmd, "Must provide --data-path")
-
-    cmd = (CFSD_PREFIX + "--op remove").format(osd=ONEOSD)
-    ERRORS += test_failure(cmd, "Must provide pgid")
-
-    # Don't specify an --op or object command
-    cmd = CFSD_PREFIX.format(osd=ONEOSD)
-    ERRORS += test_failure(cmd, "Must provide --op or object command...")
-
-    # Specify a bad --op command
-    cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD)
-    ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, export, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)")
-
-    # Provide just the object param not a command
-    cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD)
-    ERRORS += test_failure(cmd, "Invalid syntax, missing command")
-
-    # Provide an object name that doesn't exist
-    cmd = (CFSD_PREFIX + "NON_OBJECT get-bytes").format(osd=ONEOSD)
-    ERRORS += test_failure(cmd, "No object id 'NON_OBJECT' found")
-
-    # Provide an invalid object command
-    cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "Unknown object command 'notacommand'")
-
-    cmd = (CFSD_PREFIX + "foo list-omap").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "No object id 'foo' found or invalid JSON specified")
-
-    cmd = (CFSD_PREFIX + "'{{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}}' list-omap").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "Without --pgid the object '{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}' must be a JSON array")
-
-    cmd = (CFSD_PREFIX + "'[]' list-omap").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "Object '[]' must be a JSON array with 2 elements")
-
-    cmd = (CFSD_PREFIX + "'[\"1.0\"]' list-omap").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "Object '[\"1.0\"]' must be a JSON array with 2 elements")
-
-    cmd = (CFSD_PREFIX + "'[\"1.0\", 5, 8, 9]' list-omap").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "Object '[\"1.0\", 5, 8, 9]' must be a JSON array with 2 elements")
-
-    cmd = (CFSD_PREFIX + "'[1, 2]' list-omap").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "Object '[1, 2]' must be a JSON array with the first element a string")
-
-    cmd = (CFSD_PREFIX + "'[\"1.3\",{{\"snapid\":\"not an int\"}}]' list-omap").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure(cmd, "Decode object JSON error: value type is 2 not 4")
-
-    TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
-    ALLPGS = OBJREPPGS + OBJECPGS
-    OSDS = get_osds(ALLPGS[0], OSDDIR)
-    osd = OSDS[0]
-
-    print("Test all --op dump-journal")
-    ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
-    ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
-
-    # Test --op list and generate json for all objects
-    print("Test --op list variants")
-
-    # retrieve all objects from all PGs
-    tmpfd = open(TMPFILE, "wb")
-    cmd = (CFSD_PREFIX + "--op list --format json").format(osd=osd)
-    logging.debug(cmd)
-    ret = call(cmd, shell=True, stdout=tmpfd)
-    if ret != 0:
-        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
-        ERRORS += 1
-    tmpfd.close()
-    lines = get_lines(TMPFILE)
-    JSONOBJ = sorted(set(lines))
-    (pgid, coll, jsondict) = json.loads(JSONOBJ[0])[0]
-
-    # retrieve all objects in a given PG
-    tmpfd = open(OTHERFILE, "ab")
-    cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
-    logging.debug(cmd)
-    ret = call(cmd, shell=True, stdout=tmpfd)
-    if ret != 0:
-        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
-        ERRORS += 1
-    tmpfd.close()
-    lines = get_lines(OTHERFILE)
-    JSONOBJ = sorted(set(lines))
-    (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
-
-    if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
-        logging.error("the first line of --op list is different "
-                      "from the first line of --op list --pgid {pg}".format(pg=pgid))
-        ERRORS += 1
-
-    # retrieve all objects with a given name in a given PG
-    tmpfd = open(OTHERFILE, "wb")
-    cmd = (CFSD_PREFIX + "--op list --pgid {pg} {object} --format json").format(osd=osd, pg=pgid, object=jsondict['oid'])
-    logging.debug(cmd)
-    ret = call(cmd, shell=True, stdout=tmpfd)
-    if ret != 0:
-        logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
-        ERRORS += 1
-    tmpfd.close()
-    lines = get_lines(OTHERFILE)
-    JSONOBJ = sorted(set(lines))
-    (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
-
-    if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
-        logging.error("the first line of --op list is different "
-                      "from the first line of --op list --pgid {pg} {object}".format(pg=pgid, object=jsondict['oid']))
-        ERRORS += 1
-
-    print("Test --op list by generating json for all objects using default format")
-    for pg in ALLPGS:
-        OSDS = get_osds(pg, OSDDIR)
-        for osd in OSDS:
-            tmpfd = open(TMPFILE, "ab")
-            cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stdout=tmpfd)
-            tmpfd.close()
-            if ret != 0:
-                logging.error("Bad exit status {ret} from --op list request".format(ret=ret))
-                ERRORS += 1
-
-    lines = get_lines(TMPFILE)
-    JSONOBJ = sorted(set(lines))
-    for JSON in JSONOBJ:
-        (pgid, jsondict) = json.loads(JSON)
-        # Skip clones for now
-        if jsondict['snapid'] != -2:
-            continue
-        db[jsondict['namespace']][jsondict['oid']]['json'] = json.dumps((pgid, jsondict))
-        # print db[jsondict['namespace']][jsondict['oid']]['json']
-        if jsondict['oid'].find(EC_NAME) == 0 and 'shard_id' not in jsondict:
-            logging.error("Malformed JSON {json}".format(json=JSON))
-            ERRORS += 1
-
-    # Test get-bytes
-    print("Test get-bytes and set-bytes")
-    for nspace in db.keys():
-        for basename in db[nspace].keys():
-            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
-            JSON = db[nspace][basename]['json']
-            GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
-            TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid)
-            SETNAME = "/tmp/setbytes.{pid}".format(pid=pid)
-            BADNAME = "/tmp/badbytes.{pid}".format(pid=pid)
-            for pg in OBJREPPGS:
-                OSDS = get_osds(pg, OSDDIR)
-                for osd in OSDS:
-                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
-                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
-                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
-                    if not fnames:
-                        continue
-                    try:
-                        os.unlink(GETNAME)
-                    except:
-                        pass
-                    cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-bytes {fname}").format(osd=osd, pg=pg, json=JSON, fname=GETNAME)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret}".format(ret=ret))
-                        ERRORS += 1
-                        continue
-                    cmd = "diff -q {file} {getfile}".format(file=file, getfile=GETNAME)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Data from get-bytes differ")
-                        logging.debug("Got:")
-                        cat_file(logging.DEBUG, GETNAME)
-                        logging.debug("Expected:")
-                        cat_file(logging.DEBUG, file)
-                        ERRORS += 1
-                    fd = open(SETNAME, "w")
-                    data = "put-bytes going into {file}\n".format(file=file)
-                    fd.write(data)
-                    fd.close()
-                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=SETNAME)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from set-bytes".format(ret=ret))
-                        ERRORS += 1
-                    fd = open(TESTNAME, "wb")
-                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=fd)
-                    fd.close()
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
-                        ERRORS += 1
-                    cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Data after set-bytes differ")
-                        logging.debug("Got:")
-                        cat_file(logging.DEBUG, TESTNAME)
-                        logging.debug("Expected:")
-                        cat_file(logging.DEBUG, SETNAME)
-                        ERRORS += 1
-
-                    # Use set-bytes with --dry-run and make sure contents haven't changed
-                    fd = open(BADNAME, "w")
-                    data = "Bad data for --dry-run in {file}\n".format(file=file)
-                    fd.write(data)
-                    fd.close()
-                    cmd = (CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=BADNAME)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from set-bytes --dry-run".format(ret=ret))
-                        ERRORS += 1
-                    fd = open(TESTNAME, "wb")
-                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=fd)
-                    fd.close()
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
-                        ERRORS += 1
-                    cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Data after set-bytes --dry-run changed!")
-                        logging.debug("Got:")
-                        cat_file(logging.DEBUG, TESTNAME)
-                        logging.debug("Expected:")
-                        cat_file(logging.DEBUG, SETNAME)
-                        ERRORS += 1
-
-                    fd = open(file, "rb")
-                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes").format(osd=osd, pg=pg, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdin=fd)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from set-bytes to restore object".format(ret=ret))
-                        ERRORS += 1
-                    fd.close()
-
-    try:
-        os.unlink(GETNAME)
-    except:
-        pass
-    try:
-        os.unlink(TESTNAME)
-    except:
-        pass
-    try:
-        os.unlink(SETNAME)
-    except:
-        pass
-    try:
-        os.unlink(BADNAME)
-    except:
-        pass
-
-    # Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap
-    print("Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap")
-    for nspace in db.keys():
-        for basename in db[nspace].keys():
-            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
-            JSON = db[nspace][basename]['json']
-            for pg in OBJREPPGS:
-                OSDS = get_osds(pg, OSDDIR)
-                for osd in OSDS:
-                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
-                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
-                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
-                    if not fnames:
-                        continue
-                    for key, val in db[nspace][basename]["xattr"].items():
-                        attrkey = "_" + key
-                        cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key=attrkey)
-                        logging.debug(cmd)
-                        getval = check_output(cmd, shell=True)
-                        if getval != val:
-                            logging.error("get-attr of key {key} returned wrong val: {get} instead of {orig}".format(key=attrkey, get=getval, orig=val))
-                            ERRORS += 1
-                            continue
-                        # set-attr to bogus value "foobar"
-                        cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
-                            ERRORS += 1
-                            continue
-                        # Test set-attr with dry-run
-                        cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True, stdout=nullfd)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
-                            ERRORS += 1
-                            continue
-                        # Check the set-attr
-                        cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
-                        logging.debug(cmd)
-                        # check_output() raises CalledProcessError on failure
-                        try:
-                            getval = check_output(cmd, shell=True)
-                        except subprocess.CalledProcessError as e:
-                            logging.error("Bad exit status {ret} from get-attr".format(ret=e.returncode))
-                            ERRORS += 1
-                            continue
-                        if getval != "foobar":
-                            logging.error("Check of set-attr failed because we got {val}".format(val=getval))
-                            ERRORS += 1
-                            continue
-                        # Test rm-attr
-                        cmd = (CFSD_PREFIX + "'{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
-                            ERRORS += 1
-                            continue
-                        # Check rm-attr with dry-run
-                        cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True, stdout=nullfd)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
-                            ERRORS += 1
-                            continue
-                        cmd = (CFSD_PREFIX + "'{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
-                        if ret == 0:
-                            logging.error("For rm-attr expect get-attr to fail, but it succeeded")
-                            ERRORS += 1
-                        # Put back value
-                        cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey, val=val)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
-                            ERRORS += 1
-                            continue
-
-                    hdr = db[nspace][basename].get("omapheader", "")
-                    cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, json=JSON)
-                    logging.debug(cmd)
-                    gethdr = check_output(cmd, shell=True)
-                    if gethdr != hdr:
-                        logging.error("get-omaphdr was wrong: {get} instead of {orig}".format(get=gethdr, orig=hdr))
-                        ERRORS += 1
-                        continue
-                    # set-omaphdr to bogus value "foobar"
-                    cmd = ("echo -n foobar | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
-                        ERRORS += 1
-                        continue
-                    # Check the set-omaphdr
-                    cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, pg=pg, json=JSON)
-                    logging.debug(cmd)
-                    # check_output() raises CalledProcessError on failure
-                    try:
-                        gethdr = check_output(cmd, shell=True)
-                    except subprocess.CalledProcessError as e:
-                        logging.error("Bad exit status {ret} from get-omaphdr".format(ret=e.returncode))
-                        ERRORS += 1
-                        continue
-                    if gethdr != "foobar":
-                        logging.error("Check of set-omaphdr failed because we got {val}".format(val=getval))
-                        ERRORS += 1
-                        continue
-                    # Test dry-run with set-omaphdr
-                    cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=nullfd)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
-                        ERRORS += 1
-                        continue
-                    # Put back value
-                    cmd = ("echo -n {val} | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON, val=hdr)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
-                        ERRORS += 1
-                        continue
-
-                    for omapkey, val in db[nspace][basename]["omap"].items():
-                        cmd = (CFSD_PREFIX + " '{json}' get-omap {key}").format(osd=osd, json=JSON, key=omapkey)
-                        logging.debug(cmd)
-                        getval = check_output(cmd, shell=True)
-                        if getval != val:
-                            logging.error("get-omap of key {key} returned wrong val: {get} instead of {orig}".format(key=omapkey, get=getval, orig=val))
-                            ERRORS += 1
-                            continue
-                        # set-omap to bogus value "foobar"
-                        cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
-                            ERRORS += 1
-                            continue
-                        # Check set-omap with dry-run
-                        cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True, stdout=nullfd)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
-                            ERRORS += 1
-                            continue
-                        # Check the set-omap
-                        cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
-                        logging.debug(cmd)
-                        # check_output() raises CalledProcessError on failure
-                        try:
-                            getval = check_output(cmd, shell=True)
-                        except subprocess.CalledProcessError as e:
-                            logging.error("Bad exit status {ret} from get-omap".format(ret=e.returncode))
-                            ERRORS += 1
-                            continue
-                        if getval != "foobar":
-                            logging.error("Check of set-omap failed because we got {val}".format(val=getval))
-                            ERRORS += 1
-                            continue
-                        # Test rm-omap
-                        cmd = (CFSD_PREFIX + "'{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
-                            ERRORS += 1
-                        # Check rm-omap with dry-run
-                        cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True, stdout=nullfd)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
-                            ERRORS += 1
-                        cmd = (CFSD_PREFIX + "'{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
-                        if ret == 0:
-                            logging.error("For rm-omap expect get-omap to fail, but it succeeded")
-                            ERRORS += 1
-                        # Put back value
-                        cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey, val=val)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True)
-                        if ret != 0:
-                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
-                            ERRORS += 1
-                            continue
-
-    # Test dump
-    print("Test dump")
-    for nspace in db.keys():
-        for basename in db[nspace].keys():
-            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
-            JSON = db[nspace][basename]['json']
-            GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
-            for pg in OBJREPPGS:
-                OSDS = get_osds(pg, OSDDIR)
-                for osd in OSDS:
-                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
-                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
-                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
-                    if not fnames:
-                        continue
-                    if int(basename.split(REP_NAME)[1]) > int(NUM_CLONED_REP_OBJECTS):
-                        continue
-                    cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True)
-                    if ret != 0:
-                        logging.error("Invalid dump for {json}".format(json=JSON))
-                        ERRORS += 1
-
-    print("Test list-attrs get-attr")
-    ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid)
-    VALFILE = r"/tmp/val.{pid}".format(pid=pid)
-    for nspace in db.keys():
-        for basename in db[nspace].keys():
-            file = os.path.join(DATADIR, nspace + "-" + basename)
-            JSON = db[nspace][basename]['json']
-            jsondict = json.loads(JSON)
-
-            if 'shard_id' in jsondict:
-                logging.debug("ECobject " + JSON)
-                found = 0
-                for pg in OBJECPGS:
-                    OSDS = get_osds(pg, OSDDIR)
-                    # Fix shard_id since we only have one json instance for each object
-                    jsondict['shard_id'] = int(pg.split('s')[1])
-                    JSON = json.dumps(jsondict)
-                    for osd in OSDS:
-                        cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr hinfo_key").format(osd=osd, pg=pg, json=JSON)
-                        logging.debug("TRY: " + cmd)
-                        try:
-                            out = check_output(cmd, shell=True, stderr=subprocess.STDOUT)
-                            logging.debug("FOUND: {json} in {osd} has value '{val}'".format(osd=osd, json=JSON, val=out))
-                            found += 1
-                        except subprocess.CalledProcessError as e:
-                            if "No such file or directory" not in e.output and "No data available" not in e.output:
-                                raise
-                # Assuming k=2 m=1 for the default ec pool
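-                # (k + m = 3 shards, so each object's hinfo_key should be found on 3 OSDs)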
-                if found != 3:
-                    logging.error("{json} hinfo_key found {found} times instead of 3".format(json=JSON, found=found))
-                    ERRORS += 1
-
-            for pg in ALLPGS:
-                # Make sure rep obj with rep pg or ec obj with ec pg
-                if ('shard_id' in jsondict) != (pg.find('s') > 0):
-                    continue
-                if 'shard_id' in jsondict:
-                    # Fix shard_id since we only have one json instance for each object
-                    jsondict['shard_id'] = int(pg.split('s')[1])
-                    JSON = json.dumps(jsondict)
-                OSDS = get_osds(pg, OSDDIR)
-                for osd in OSDS:
-                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
-                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
-                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
-                    if not fnames:
-                        continue
-                    afd = open(ATTRFILE, "wb")
-                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' list-attrs").format(osd=osd, pg=pg, json=JSON)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=afd)
-                    afd.close()
-                    if ret != 0:
-                        logging.error("list-attrs failed with {ret}".format(ret=ret))
-                        ERRORS += 1
-                        continue
-                    keys = get_lines(ATTRFILE)
-                    values = dict(db[nspace][basename]["xattr"])
-                    for key in keys:
-                        if key == "_" or key == "snapset" or key == "hinfo_key":
-                            continue
-                        key = key.strip("_")
-                        if key not in values:
-                            logging.error("Unexpected key {key} present".format(key=key))
-                            ERRORS += 1
-                            continue
-                        exp = values.pop(key)
-                        vfd = open(VALFILE, "wb")
-                        cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key="_" + key)
-                        logging.debug(cmd)
-                        ret = call(cmd, shell=True, stdout=vfd)
-                        vfd.close()
-                        if ret != 0:
-                            logging.error("get-attr failed with {ret}".format(ret=ret))
-                            ERRORS += 1
-                            continue
-                        lines = get_lines(VALFILE)
-                        val = lines[0]
-                        if exp != val:
-                            logging.error("For key {key} got value {got} instead of {expected}".format(key=key, got=val, expected=exp))
-                            ERRORS += 1
-                    if len(values) != 0:
-                        logging.error("Not all keys found, remaining keys:")
-                        print(values)
-
-    print("Test --op meta-list")
-    tmpfd = open(TMPFILE, "wb")
-    cmd = (CFSD_PREFIX + "--op meta-list").format(osd=ONEOSD)
-    logging.debug(cmd)
-    ret = call(cmd, shell=True, stdout=tmpfd)
-    if ret != 0:
-        logging.error("Bad exit status {ret} from --op meta-list request".format(ret=ret))
-        ERRORS += 1
-
-    print("Test get-bytes on meta")
-    tmpfd.close()
-    lines = get_lines(TMPFILE)
-    JSONOBJ = sorted(set(lines))
-    for JSON in JSONOBJ:
-        (pgid, jsondict) = json.loads(JSON)
-        if pgid != "meta":
-            logging.error("pgid incorrect for --op meta-list {pgid}".format(pgid=pgid))
-            ERRORS += 1
-        if jsondict['namespace'] != "":
-            logging.error("namespace non null --op meta-list {ns}".format(ns=jsondict['namespace']))
-            ERRORS += 1
-        logging.info(JSON)
-        try:
-            os.unlink(GETNAME)
-        except:
-            pass
-        cmd = (CFSD_PREFIX + "'{json}' get-bytes {fname}").format(osd=ONEOSD, json=JSON, fname=GETNAME)
-        logging.debug(cmd)
-        ret = call(cmd, shell=True)
-        if ret != 0:
-            logging.error("Bad exit status {ret}".format(ret=ret))
-            ERRORS += 1
-
-    try:
-        os.unlink(GETNAME)
-    except:
-        pass
-    try:
-        os.unlink(TESTNAME)
-    except:
-        pass
-
-    print("Test pg info")
-    for pg in ALLREPPGS + ALLECPGS:
-        for osd in get_osds(pg, OSDDIR):
-            cmd = (CFSD_PREFIX + "--op info --pgid {pg} | grep '\"pgid\": \"{pg}\"'").format(osd=osd, pg=pg)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stdout=nullfd)
-            if ret != 0:
-                logging.error("Getting info failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-                ERRORS += 1
-
-    print("Test pg logging")
-    if len(ALLREPPGS + ALLECPGS) == len(OBJREPPGS + OBJECPGS):
-        logging.warning("All PGs have objects, so no log without modify entries")
-    for pg in ALLREPPGS + ALLECPGS:
-        for osd in get_osds(pg, OSDDIR):
-            tmpfd = open(TMPFILE, "wb")
-            cmd = (CFSD_PREFIX + "--op log --pgid {pg}").format(osd=osd, pg=pg)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stdout=tmpfd)
-            tmpfd.close()
-            if ret != 0:
-                logging.error("Getting log failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-                ERRORS += 1
-            HASOBJ = pg in OBJREPPGS + OBJECPGS
-            MODOBJ = False
-            for line in get_lines(TMPFILE):
-                if line.find("modify") != -1:
-                    MODOBJ = True
-                    break
-            if HASOBJ != MODOBJ:
-                logging.error("Bad log for pg {pg} from {osd}".format(pg=pg, osd=osd))
-                MSG = "" if HASOBJ else "NOT "
-                print("Log should {msg}have a modify entry".format(msg=MSG))
-                ERRORS += 1
-
-    try:
-        os.unlink(TMPFILE)
-    except:
-        pass
-
-    print("Test list-pgs")
-    for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
-
-        CHECK_PGS = get_osd_pgs(os.path.join(OSDDIR, osd), None)
-        CHECK_PGS = sorted(CHECK_PGS)
-
-        cmd = (CFSD_PREFIX + "--op list-pgs").format(osd=osd)
-        logging.debug(cmd)
-        TEST_PGS = check_output(cmd, shell=True).split("\n")
-        TEST_PGS = sorted(TEST_PGS)[1:]  # Skip extra blank line
-
-        if TEST_PGS != CHECK_PGS:
-            logging.error("list-pgs got wrong result for osd.{osd}".format(osd=osd))
-            logging.error("Expected {pgs}".format(pgs=CHECK_PGS))
-            logging.error("Got {pgs}".format(pgs=TEST_PGS))
-            ERRORS += 1
-
-    EXP_ERRORS = 0
-    print("Test pg export --dry-run")
-    pg = ALLREPPGS[0]
-    osd = get_osds(pg, OSDDIR)[0]
-    fname = "/tmp/fname.{pid}".format(pid=pid)
-    cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
-    logging.debug(cmd)
-    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    if ret != 0:
-        logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-        EXP_ERRORS += 1
-    elif os.path.exists(fname):
-        logging.error("Exporting --dry-run created file")
-        EXP_ERRORS += 1
-
-    cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
-    logging.debug(cmd)
-    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    if ret != 0:
-        logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-        EXP_ERRORS += 1
-    else:
-        outdata = get_lines(fname)
-        if len(outdata) > 0:
-            logging.error("Exporting --dry-run to stdout not empty")
-            logging.error("Data: " + outdata)
-            EXP_ERRORS += 1
-
-    os.mkdir(TESTDIR)
-    for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
-        os.mkdir(os.path.join(TESTDIR, osd))
-    print("Test pg export")
-    for pg in ALLREPPGS + ALLECPGS:
-        for osd in get_osds(pg, OSDDIR):
-            mydir = os.path.join(TESTDIR, osd)
-            fname = os.path.join(mydir, pg)
-            if pg == ALLREPPGS[0]:
-                cmd = (CFSD_PREFIX + "--op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
-            elif pg == ALLREPPGS[1]:
-                cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file - > {file}").format(osd=osd, pg=pg, file=fname)
-            else:
-                cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-            if ret != 0:
-                logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-                EXP_ERRORS += 1
-
-    ERRORS += EXP_ERRORS
-
-    print("Test pg removal")
-    RM_ERRORS = 0
-    for pg in ALLREPPGS + ALLECPGS:
-        for osd in get_osds(pg, OSDDIR):
-            # This should do nothing
-            cmd = (CFSD_PREFIX + "--op remove --pgid {pg} --dry-run").format(pg=pg, osd=osd)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stdout=nullfd)
-            if ret != 0:
-                logging.error("Removing --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-                RM_ERRORS += 1
-            cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stdout=nullfd)
-            if ret != 0:
-                logging.error("Removing failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-                RM_ERRORS += 1
-
-    ERRORS += RM_ERRORS
-
-    IMP_ERRORS = 0
-    if EXP_ERRORS == 0 and RM_ERRORS == 0:
-        print("Test pg import")
-        for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
-            dir = os.path.join(TESTDIR, osd)
-            PGS = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
-            for pg in PGS:
-                file = os.path.join(dir, pg)
-                # This should do nothing
-                cmd = (CFSD_PREFIX + "--op import --file {file} --dry-run").format(osd=osd, file=file)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True, stdout=nullfd)
-                if ret != 0:
-                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
-                    IMP_ERRORS += 1
-                if pg == PGS[0]:
-                    cmd = ("cat {file} |".format(file=file) + CFSD_PREFIX + "--op import").format(osd=osd)
-                elif pg == PGS[1]:
-                    cmd = (CFSD_PREFIX + "--op import --file - --pgid {pg} < {file}").format(osd=osd, file=file, pg=pg)
-                else:
-                    cmd = (CFSD_PREFIX + "--op import --file {file}").format(osd=osd, file=file)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True, stdout=nullfd)
-                if ret != 0:
-                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
-                    IMP_ERRORS += 1
-    else:
-        logging.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
-
-    ERRORS += IMP_ERRORS
-
-    if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
-        print("Verify replicated import data")
-        data_errors, _ = check_data(DATADIR, TMPFILE, OSDDIR, REP_NAME)
-        ERRORS += data_errors
-    else:
-        logging.warning("SKIPPING CHECKING IMPORT DATA DUE TO PREVIOUS FAILURES")
-
-    print("Test all --op dump-journal again")
-    ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
-    ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
-
-    vstart(new=False)
-    wait_for_health()
-
-    if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
-        print("Verify erasure coded import data")
-        ERRORS += verify(DATADIR, EC_POOL, EC_NAME, db)
-        # Check replicated data/xattr/omap using rados
-        print("Verify replicated import data using rados")
-        ERRORS += verify(DATADIR, REP_POOL, REP_NAME, db)
-
-    if EXP_ERRORS == 0:
-        NEWPOOL = "rados-import-pool"
-        cmd = "{path}/rados mkpool {pool}".format(pool=NEWPOOL, path=CEPH_BIN)
-        logging.debug(cmd)
-        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-
-        print("Test rados import")
-        first = True
-        for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
-            dir = os.path.join(TESTDIR, osd)
-            for pg in [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]:
-                if pg.find("{id}.".format(id=REPID)) != 0:
-                    continue
-                file = os.path.join(dir, pg)
-                if first:
-                    first = False
-                    # This should do nothing
-                    cmd = "{path}/rados import -p {pool} --dry-run {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
-                    logging.debug(cmd)
-                    ret = call(cmd, shell=True, stdout=nullfd)
-                    if ret != 0:
-                        logging.error("Rados import --dry-run failed from {file} with {ret}".format(file=file, ret=ret))
-                        ERRORS += 1
-                    cmd = "{path}/rados -p {pool} ls".format(pool=NEWPOOL, path=CEPH_BIN)
-                    logging.debug(cmd)
-                    data = check_output(cmd, shell=True)
-                    if data:
-                        logging.error("'{data}'".format(data=data))
-                        logging.error("Found objects after dry-run")
-                        ERRORS += 1
-                cmd = "{path}/rados import -p {pool} {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True, stdout=nullfd)
-                if ret != 0:
-                    logging.error("Rados import failed from {file} with {ret}".format(file=file, ret=ret))
-                    ERRORS += 1
-                cmd = "{path}/rados import -p {pool} --no-overwrite {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True, stdout=nullfd)
-                if ret != 0:
-                    logging.error("Rados import --no-overwrite failed from {file} with {ret}".format(file=file, ret=ret))
-                    ERRORS += 1
-
-        ERRORS += verify(DATADIR, NEWPOOL, REP_NAME, db)
-    else:
-        logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES")
-
-    # Clear directories of previous portion
-    call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
-    call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
-    os.mkdir(TESTDIR)
-    os.mkdir(DATADIR)
-
-    # Cause SPLIT_POOL to split and test import with object/log filtering
-    print("Testing import all objects after a split")
-    SPLIT_POOL = "split_pool"
-    PG_COUNT = 1
-    SPLIT_OBJ_COUNT = 5
-    SPLIT_NSPACE_COUNT = 2
-    SPLIT_NAME = "split"
-    cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=SPLIT_POOL, pg=PG_COUNT, path=CEPH_BIN)
-    logging.debug(cmd)
-    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-    SPLITID = get_pool_id(SPLIT_POOL, nullfd)
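-    # "ceph osd pool get <pool> size" prints e.g. "size: 3"; take the number after the colon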
-    pool_size = int(check_output("{path}/ceph osd pool get {pool} size".format(pool=SPLIT_POOL, path=CEPH_BIN), shell=True, stderr=nullfd).split(" ")[1])
-    EXP_ERRORS = 0
-    RM_ERRORS = 0
-    IMP_ERRORS = 0
-
-    objects = range(1, SPLIT_OBJ_COUNT + 1)
-    nspaces = range(SPLIT_NSPACE_COUNT)
-    for n in nspaces:
-        nspace = get_nspace(n)
-
-        for i in objects:
-            NAME = SPLIT_NAME + "{num}".format(num=i)
-            LNAME = nspace + "-" + NAME
-            DDNAME = os.path.join(DATADIR, LNAME)
-            DDNAME += "__head"
-
-            cmd = "rm -f " + DDNAME
-            logging.debug(cmd)
-            call(cmd, shell=True)
-
-            if i == 1:
-                dataline = range(DATALINECOUNT)
-            else:
-                dataline = range(1)
-            fd = open(DDNAME, "w")
-            data = "This is the split data for " + LNAME + "\n"
-            for _ in dataline:
-                fd.write(data)
-            fd.close()
-
-            cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=SPLIT_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stderr=nullfd)
-            if ret != 0:
-                logging.critical("Rados put command failed with {ret}".format(ret=ret))
-                return 1
-
-    wait_for_health()
-    kill_daemons()
-
-    for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
-        os.mkdir(os.path.join(TESTDIR, osd))
-
-    pg = "{pool}.0".format(pool=SPLITID)
-    EXPORT_PG = pg
-
-    export_osds = get_osds(pg, OSDDIR)
-    for osd in export_osds:
-        mydir = os.path.join(TESTDIR, osd)
-        fname = os.path.join(mydir, pg)
-        cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
-        logging.debug(cmd)
-        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-        if ret != 0:
-            logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
-            EXP_ERRORS += 1
-
-    ERRORS += EXP_ERRORS
-
-    if EXP_ERRORS == 0:
-        vstart(new=False)
-        wait_for_health()
-
-        cmd = "{path}/ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL, path=CEPH_BIN)
-        logging.debug(cmd)
-        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-        time.sleep(5)
-        wait_for_health()
-
-        kill_daemons()
-
-        # Now 2 PGs, poolid.0 and poolid.1
-        for seed in range(2):
-            pg = "{pool}.{seed}".format(pool=SPLITID, seed=seed)
-
-            which = 0
-            for osd in get_osds(pg, OSDDIR):
-                cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True, stdout=nullfd)
-
-                # This is weird.  The export files are based on only the EXPORT_PG
-                # and where that pg was before the split.  Use 'which' to use all
-                # export copies in import.
-                mydir = os.path.join(TESTDIR, export_osds[which])
-                fname = os.path.join(mydir, EXPORT_PG)
-                which += 1
-                cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True, stdout=nullfd)
-                if ret != 0:
-                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
-                    IMP_ERRORS += 1
-
-        ERRORS += IMP_ERRORS
-
-        # Start up again to make sure imports didn't corrupt anything
-        if IMP_ERRORS == 0:
-            print("Verify split import data")
-            data_errors, count = check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME)
-            ERRORS += data_errors
-            if count != (SPLIT_OBJ_COUNT * SPLIT_NSPACE_COUNT * pool_size):
-                logging.error("Incorrect number of replicas seen {count}".format(count=count))
-                ERRORS += 1
-            vstart(new=False)
-            wait_for_health()
-
-    call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
-    call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
-
-    ERRORS += test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS)
-
-    # vstart() starts 4 OSDs
-    ERRORS += test_get_set_osdmap(CFSD_PREFIX, list(range(4)), ALLOSDS)
-    ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0])
-    if ERRORS == 0:
-        print("TEST PASSED")
-        return 0
-    else:
-        print("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
-        return 1
-
-
-def remove_btrfs_subvolumes(path):
-    if platform.system() == "FreeBSD":
-        return
-    result = subprocess.Popen("stat -f -c '%%T' %s" % path, shell=True, stdout=subprocess.PIPE)
-    for line in result.stdout:
-        filesystem = decode(line).rstrip('\n')
-    if filesystem == "btrfs":
-        result = subprocess.Popen("sudo btrfs subvolume list %s" % path, shell=True, stdout=subprocess.PIPE)
-        for line in result.stdout:
-            subvolume = decode(line).split()[8]
-            # extracting the relative volume name
-            m = re.search(".*(%s.*)" % path, subvolume)
-            if m:
-                found = m.group(1)
-                call("sudo btrfs subvolume delete %s" % found, shell=True)
-
-
-if __name__ == "__main__":
-    status = 1
-    try:
-        status = main(sys.argv[1:])
-    finally:
-        kill_daemons()
-        remove_btrfs_subvolumes(CEPH_DIR)
-        call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True)
-    sys.exit(status)
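
The script above exercises ceph-objectstore-tool through a test_failure() helper that is defined earlier in the same file, outside this hunk. A minimal sketch of the contract its call sites imply (run a command that must fail and verify the expected error text appears); the helper name and signature come from the calls above, while the body is an assumption rather than the file's actual implementation:

    import logging
    import subprocess

    def test_failure(cmd, errmsg):
        # Sketch: expect `cmd` to exit non-zero and emit `errmsg` on stderr.
        logging.debug(cmd)
        proc = subprocess.Popen(cmd, shell=True,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _, err = proc.communicate()
        if proc.returncode == 0:
            logging.error("Should have failed, but exited 0: " + cmd)
            return 1
        if errmsg not in err.decode():
            logging.error("Expected '{msg}' in stderr".format(msg=errmsg))
            return 1
        return 0

Each call adds the return value to ERRORS, so 0 means the failure mode behaved as expected.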
index eadd96942811379a0af94d61de071d1a6443b377..4061d8fb4a43b27819bfb983e1496487179df9b1 100644 (file)
@@ -14,7 +14,7 @@
 # display a warning if there is more than one root
 #
   $ crushtool --outfn "$map" --build --num_osds 5 node straw 2 rack straw 1 
-  .* The crush rulesets will use the root rack0 (re)
+  The crush rulesets will use the root rack0 (re)
   and ignore the others.
   There are 3 roots, they can be
   grouped into a single root by appending something like:
index e3e6cd091fc15638fa5501f2245b0751bfbe11e0..8fd313a765add05da214ba4bc2f24ae58956bdc8 100644 (file)
@@ -8,6 +8,7 @@
      --test-map-pgs-dump-all [--pool <poolid>] map all pgs to osds
      --health                dump health checks
      --mark-up-in            mark osds up and in (but do not persist)
+     --mark-out <osdid>      mark an osd as out (but do not persist)
      --with-default-pool     include default pool when creating map
      --clear-temp            clear pg_temp and primary_temp
      --test-random           do random placements
index 97ed692cb81eeb872e9b7ecaf3c402737e91b1e1..db1745bd857d56e9c1bb406e5a92151f399fc960 100644 (file)
@@ -8,6 +8,7 @@
      --test-map-pgs-dump-all [--pool <poolid>] map all pgs to osds
      --health                dump health checks
      --mark-up-in            mark osds up and in (but do not persist)
+     --mark-out <osdid>      mark an osd as out (but do not persist)
      --with-default-pool     include default pool when creating map
      --clear-temp            clear pg_temp and primary_temp
      --test-random           do random placements
diff --git a/ceph/src/test/cli/osdmaptool/upmap-out.t b/ceph/src/test/cli/osdmaptool/upmap-out.t
new file mode 100644 (file)
index 0000000..bc0a28a
--- /dev/null
@@ -0,0 +1,23 @@
+  $ osdmaptool --create-from-conf om -c $TESTDIR/ceph.conf.withracks --with-default-pool
+  osdmaptool: osdmap file 'om'
+  osdmaptool: writing epoch 1 to om
+  $ osdmaptool om --mark-up-in --mark-out 147 --upmap-max 11 --upmap c
+  osdmaptool: osdmap file 'om'
+  marking all OSDs up and in
+  marking OSD@147 as out
+  writing upmap command output to: c
+  checking for upmap cleanups
+  upmap, max-count 11, max deviation 0.01
+  $ cat c
+  ceph osd pg-upmap-items 1.7 142 145
+  ceph osd pg-upmap-items 1.8 219 223 99 103
+  ceph osd pg-upmap-items 1.17 171 173 201 202
+  ceph osd pg-upmap-items 1.1a 201 202 115 114
+  ceph osd pg-upmap-items 1.1c 171 173 201 202 127 130
+  ceph osd pg-upmap-items 1.20 88 87 201 202
+  ceph osd pg-upmap-items 1.21 207 206 142 145
+  ceph osd pg-upmap-items 1.51 201 202 65 64 186 189
+  ceph osd pg-upmap-items 1.62 219 223
+  ceph osd pg-upmap-items 1.6f 219 223 108 111
+  ceph osd pg-upmap-items 1.82 219 223 157 158 6 3
+  $ rm -f om c
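
The file `c` produced by this test holds plain `ceph osd pg-upmap-items` CLI commands, one per line. The test only inspects the generated text; on a live cluster the commands could be applied line by line, as in this sketch (assuming a reachable cluster and a `ceph` binary on PATH):

    import subprocess

    # Feed each generated upmap command to the cluster.
    with open("c") as f:
        for line in f:
            line = line.strip()
            if line:
                subprocess.check_call(line, shell=True)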
diff --git a/ceph/src/test/cls_journal/CMakeLists.txt b/ceph/src/test/cls_journal/CMakeLists.txt
new file mode 100644 (file)
index 0000000..6e99cdc
--- /dev/null
@@ -0,0 +1,18 @@
+# cls_test_cls_journal
+add_executable(ceph_test_cls_journal
+  test_cls_journal.cc
+  $<TARGET_OBJECTS:common_texttable_obj>)
+set_target_properties(ceph_test_cls_journal PROPERTIES COMPILE_FLAGS
+  ${UNITTEST_CXX_FLAGS})
+target_link_libraries(ceph_test_cls_journal
+  cls_journal_client
+  librados
+  global
+  ${UNITTEST_LIBS}
+  ${CMAKE_DL_LIBS}
+  ${CRYPTO_LIBS}
+  ${EXTRALIBS}
+  radostest)
+install(TARGETS
+  ceph_test_cls_journal
+  DESTINATION ${CMAKE_INSTALL_BINDIR})
index 2e1123607752e2864bc6586b93c3d4dbb4e1d3f5..7d948c976e62dc1889b7c987e65333bf9868ef02 100644 (file)
@@ -322,15 +322,21 @@ TEST_F(TestClsJournal, ClientUnregisterPruneTags) {
                                   bufferlist()));
   ASSERT_EQ(0, client::tag_create(ioctx, oid, 1, Tag::TAG_CLASS_NEW,
                                   bufferlist()));
-  ASSERT_EQ(0, client::tag_create(ioctx, oid, 2, 1, bufferlist()));
+
+  for (uint32_t i = 2; i <= 96; ++i) {
+    ASSERT_EQ(0, client::tag_create(ioctx, oid, i, 1, bufferlist()));
+  }
 
   librados::ObjectWriteOperation op1;
-  client::client_commit(&op1, "id1", {{{1, 2, 120}}});
+  client::client_commit(&op1, "id1", {{{1, 32, 120}}});
   ASSERT_EQ(0, ioctx.operate(oid, &op1));
 
   ASSERT_EQ(0, client::client_unregister(ioctx, oid, "id2"));
 
-  std::set<Tag> expected_tags = {{0, 0, {}}, {2, 1, {}}};
+  std::set<Tag> expected_tags = {{0, 0, {}}};
+  for (uint32_t i = 32; i <= 96; ++i) {
+    expected_tags.insert({i, 1, {}});
+  }
   std::set<Tag> tags;
   ASSERT_EQ(0, client::tag_list(ioctx, oid, "id1",
                                 boost::optional<uint64_t>(), &tags));
@@ -547,6 +553,14 @@ TEST_F(TestClsJournal, TagList) {
   ASSERT_EQ(0, client::tag_list(ioctx, oid, "id1", boost::optional<uint64_t>(0),
                                 &tags));
   ASSERT_EQ(expected_filtered_tags, tags);
+
+  librados::ObjectWriteOperation op1;
+  client::client_commit(&op1, "id1", {{{96, 0, 120}}});
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  ASSERT_EQ(0, client::tag_list(ioctx, oid, "id1", boost::optional<uint64_t>(),
+                                &tags));
+  ASSERT_EQ(expected_all_tags, tags);
 }
 
 TEST_F(TestClsJournal, GuardAppend) {
index f5b0b26dc25309cf8c9d418cc819bc5a03e67bfd..a486bf46d07b2e98d821cf08e10bef156b2bece3 100644 (file)
@@ -248,3 +248,31 @@ TYPED_TEST(BitVectorTest, data_crc) {
   ASSERT_THROW(bit_vector2.decode_data(data_it, byte_offset),
               buffer::malformed_input);
 }
+
+TYPED_TEST(BitVectorTest, iterator) {
+  typename TestFixture::bit_vector_t bit_vector;
+
+  uint64_t radix = 1 << bit_vector.BIT_COUNT;
+  uint64_t size = 25 * (1ULL << 20);
+  uint64_t offset = 0;
+
+  // create fragmented in-memory bufferlist layout
+  uint64_t resize = 0;
+  while (resize < size) {
+    resize += 4096;
+    if (resize > size) {
+      resize = size;
+    }
+    bit_vector.resize(resize);
+  }
+
+  for (auto it = bit_vector.begin(); it != bit_vector.end(); ++it, ++offset) {
+    *it = offset % radix;
+  }
+
+  offset = 123;
+  auto end_it = bit_vector.begin() + (size - 1024);
+  for (auto it = bit_vector.begin() + offset; it != end_it; ++it, ++offset) {
+    ASSERT_EQ(offset % radix, *it);
+  }
+}
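
The iterator test writes `offset % radix` into every element, where `radix = 1 << BIT_COUNT` is the number of distinct values a BIT_COUNT-bit element can hold, then re-reads a sub-range via iterator arithmetic. The same value pattern in a standalone sketch (BIT_COUNT assumed to be 2 here; the real width comes from the typed test fixture):

    BIT_COUNT = 2               # assumed element width in bits
    radix = 1 << BIT_COUNT      # 2-bit entries hold values 0..3
    values = [offset % radix for offset in range(10)]
    assert values == [0, 1, 2, 3, 0, 1, 2, 3, 0, 1]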
index 5cfd0855195e4b1c560420aef40e052276edca03..fc3a5c7a9859747a35d67523a139fdd25fae46d1 100644 (file)
@@ -32,80 +32,80 @@ using std::string;
 TEST(DaemonConfig, SimpleSet) {
   int ret;
   ret = g_ceph_context->_conf->set_val("log_graylog_port", "21");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   g_ceph_context->_conf->apply_changes(NULL);
   char buf[128];
   memset(buf, 0, sizeof(buf));
   char *tmp = buf;
   ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("21"), string(buf));
 }
 
 TEST(DaemonConfig, Substitution) {
   int ret;
   ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("host", "foo");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("public_network", "bar$host.baz", false);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   g_ceph_context->_conf->apply_changes(NULL);
   char buf[128];
   memset(buf, 0, sizeof(buf));
   char *tmp = buf;
   ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("barfoo.baz"), string(buf));
 }
 
 TEST(DaemonConfig, SubstitutionTrailing) {
   int ret;
   ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("host", "foo");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("public_network", "bar$host", false);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   g_ceph_context->_conf->apply_changes(NULL);
   char buf[128];
   memset(buf, 0, sizeof(buf));
   char *tmp = buf;
   ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("barfoo"), string(buf));
 }
 
 TEST(DaemonConfig, SubstitutionBraces) {
   int ret;
   ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("host", "foo");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("public_network", "bar${host}baz", false);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   g_ceph_context->_conf->apply_changes(NULL);
   char buf[128];
   memset(buf, 0, sizeof(buf));
   char *tmp = buf;
   ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("barfoobaz"), string(buf));
 }
 TEST(DaemonConfig, SubstitutionBracesTrailing) {
   int ret;
   ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("host", "foo");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("public_network", "bar${host}", false);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   g_ceph_context->_conf->apply_changes(NULL);
   char buf[128];
   memset(buf, 0, sizeof(buf));
   char *tmp = buf;
   ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("barfoo"), string(buf));
 }
 
@@ -113,15 +113,15 @@ TEST(DaemonConfig, SubstitutionBracesTrailing) {
 TEST(DaemonConfig, SubstitutionMultiple) {
   int ret;
   ret = g_ceph_context->_conf->set_val("mon_host", "localhost", false);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->set_val("keyring", "$mon_host/$cluster.keyring,$mon_host/$cluster.mon.keyring", false);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   g_ceph_context->_conf->apply_changes(NULL);
   char buf[512];
   memset(buf, 0, sizeof(buf));
   char *tmp = buf;
   ret = g_ceph_context->_conf->get_val("keyring", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("localhost/ceph.keyring,localhost/ceph.mon.keyring"), tmp);
   ASSERT_TRUE(strchr(buf, '$') == NULL);
 }
@@ -143,12 +143,12 @@ TEST(DaemonConfig, ArgV) {
   char *tmp = buf;
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("keyfile", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("/tmp/my-keyfile"), string(buf));
 
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("22"), string(buf));
 
   ASSERT_EQ(0, g_ceph_context->_conf->set_val("internal_safe_to_start_threads",
@@ -159,25 +159,25 @@ TEST(DaemonConfig, InjectArgs) {
   int ret;
   std::string injection("--log-graylog-port 56 --leveldb-max-open-files 42");
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 
   char buf[128];
   char *tmp = buf;
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("leveldb_max_open_files", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("42"), string(buf));
 
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("56"), string(buf));
 
   injection = "--log-graylog-port 57";
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("57"), string(buf));
 }
 
@@ -191,30 +191,35 @@ TEST(DaemonConfig, InjectArgsReject) {
   // We should complain about the garbage in the input
   std::string injection("--random-garbage-in-injectargs 26 --log-graylog-port 28");
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, -EINVAL); 
+  ASSERT_EQ(-EINVAL, ret);
 
   // But, log_graylog_port should still be set...
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("28"), string(buf));
 
   // What's the current value of osd_data?
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("osd_data", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 
   // Injectargs shouldn't let us change this, since it is a string-valued
   // variable and there isn't an observer for it.
   std::string injection2("--osd_data /tmp/some-other-directory --log-graylog-port 4");
   ret = g_ceph_context->_conf->injectargs(injection2, &cout);
-  ASSERT_EQ(ret, -ENOSYS); 
+  ASSERT_EQ(-ENOSYS, ret);
 
   // It should be unchanged.
   memset(buf2, 0, sizeof(buf2));
   ret = g_ceph_context->_conf->get_val("osd_data", &tmp2, sizeof(buf2));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string(buf), string(buf2));
+
+  // We should complain about the missing arguments.
+  std::string injection3("--log-graylog-port 28 --debug_ms");
+  ret = g_ceph_context->_conf->injectargs(injection3, &cout);
+  ASSERT_EQ(-EINVAL, ret);
 }
 
 TEST(DaemonConfig, InjectArgsBooleans) {
@@ -225,51 +230,51 @@ TEST(DaemonConfig, InjectArgsBooleans) {
   // Change log_to_syslog
   std::string injection("--log_to_syslog --log-graylog-port 28");
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 
   // log_to_syslog should be set...
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("true"), string(buf));
 
   // Turn off log_to_syslog
   injection = "--log_to_syslog=false --log-graylog-port 28";
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 
   // log_to_syslog should be cleared...
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("false"), string(buf));
 
   // Turn on log_to_syslog
   injection = "--log-graylog-port=1 --log_to_syslog=true --leveldb-max-open-files 40";
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 
   // log_to_syslog should be set...
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("true"), string(buf));
 
   // parse error
   injection = "--log-graylog-port 1 --log_to_syslog=falsey --leveldb-max-open-files 42";
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, -EINVAL);
+  ASSERT_EQ(-EINVAL, ret);
 
   // log_to_syslog should still be set...
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("true"), string(buf));
 
   // leveldb-max-open-files should still become 42...
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("leveldb_max_open_files", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("42"), string(buf));
 }
 
@@ -285,25 +290,25 @@ TEST(DaemonConfig, InjectArgsLogfile) {
   injection += tmpfile;
   // We're allowed to change log_file because there is an observer.
   ret = g_ceph_context->_conf->injectargs(injection, &cout);
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 
   // It should have taken effect.
   char buf[128];
   char *tmp = buf;
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("log_file", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string(buf), string(tmpfile));
 
   // The logfile should exist.
-  ASSERT_EQ(access(tmpfile, R_OK), 0);
+  ASSERT_EQ(0, access(tmpfile, R_OK));
 
   // Let's turn off the logfile.
   ret = g_ceph_context->_conf->set_val("log_file", "");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   g_ceph_context->_conf->apply_changes(NULL);
   ret = g_ceph_context->_conf->get_val("log_file", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string(""), string(buf));
 
   // Clean up the garbage
@@ -315,7 +320,7 @@ TEST(DaemonConfig, ThreadSafety1) {
   // Verify that we can't change this, since internal_safe_to_start_threads has
   // been set.
   ret = g_ceph_context->_conf->set_val("osd_data", "");
-  ASSERT_EQ(ret, -ENOSYS);
+  ASSERT_EQ(-ENOSYS, ret);
 
   ASSERT_EQ(0, g_ceph_context->_conf->set_val("internal_safe_to_start_threads",
                                       "false"));
@@ -324,24 +329,24 @@ TEST(DaemonConfig, ThreadSafety1) {
   // OSD threads running, we know changing osd_data won't actually blow up the
   // world.
   ret = g_ceph_context->_conf->set_val("osd_data", "/tmp/crazydata");
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 
   char buf[128];
   char *tmp = buf;
   memset(buf, 0, sizeof(buf));
   ret = g_ceph_context->_conf->get_val("osd_data", &tmp, sizeof(buf));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
   ASSERT_EQ(string("/tmp/crazydata"), string(buf));
 
   ASSERT_EQ(0, g_ceph_context->_conf->set_val("internal_safe_to_start_threads",
                                       "false"));
-  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(0, ret);
 }
 
 TEST(DaemonConfig, InvalidIntegers) {
   {
     int ret = g_ceph_context->_conf->set_val("log_graylog_port", "rhubarb");
-    ASSERT_EQ(ret, -EINVAL);
+    ASSERT_EQ(-EINVAL, ret);
   }
 
   {
@@ -349,7 +354,7 @@ TEST(DaemonConfig, InvalidIntegers) {
     string str = boost::lexical_cast<string>(max);
     str = str + "999"; // some extra digits to take us out of bounds
     int ret = g_ceph_context->_conf->set_val("log_graylog_port", str);
-    ASSERT_EQ(ret, -EINVAL);
+    ASSERT_EQ(-EINVAL, ret);
   }
 }
 
@@ -358,17 +363,17 @@ TEST(DaemonConfig, InvalidFloats) {
     double bad_value = 2 * (double)std::numeric_limits<float>::max();
     string str = boost::lexical_cast<string>(-bad_value);
     int ret = g_ceph_context->_conf->set_val("log_stop_at_utilization", str);
-    ASSERT_EQ(ret, -EINVAL);
+    ASSERT_EQ(-EINVAL, ret);
   }
   {
     double bad_value = 2 * (double)std::numeric_limits<float>::max();
     string str = boost::lexical_cast<string>(bad_value);
     int ret = g_ceph_context->_conf->set_val("log_stop_at_utilization", str);
-    ASSERT_EQ(ret, -EINVAL);
+    ASSERT_EQ(-EINVAL, ret);
   }
   {
     int ret = g_ceph_context->_conf->set_val("log_stop_at_utilization", "not a float");
-    ASSERT_EQ(ret, -EINVAL);
+    ASSERT_EQ(-EINVAL, ret);
   }
 }
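
The mechanical `ASSERT_EQ(ret, 0)` -> `ASSERT_EQ(0, ret)` rewrites above bring these tests in line with the gtest convention that the expected value comes first and the actual value second, so failure diagnostics label the two sides correctly. A minimal sketch of the difference (hypothetical test names, not part of this commit):

    #include "gtest/gtest.h"

    // With the expected value first, a failure reads roughly
    //   Value of: ret   Actual: -22   Expected: 0
    // (exact wording varies by gtest release): the constant is shown as
    // the expectation and the variable as what actually happened.
    TEST(ArgOrderSketch, ExpectedFirst) {
      int ret = -22;            // stand-in for a call failing with -EINVAL
      ASSERT_EQ(0, ret);
    }

    // Reversed arguments compare the same values, but the diagnostic
    // presents -22 as the "expected" side, which misleads anyone scanning
    // the failure output.
    TEST(ArgOrderSketch, ActualFirst) {
      int ret = -22;
      ASSERT_EQ(ret, 0);
    }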
 
index ea4d3e713e7aca1400faf08a4c53209275841bd1..f7ce348b0a0f863e77d95147709e053ac6aec4df 100644 (file)
@@ -28,6 +28,7 @@ using std::string;
 
 typedef RadosTest LibRadosMisc;
 typedef RadosTestPP LibRadosMiscPP;
+typedef RadosTestECPP LibRadosMiscECPP;
 
 TEST(LibRadosMiscVersion, Version) {
   int major, minor, extra;
@@ -1356,3 +1357,24 @@ TEST_F(LibRadosMiscPP, Applications) {
   ASSERT_EQ(0, ioctx.application_metadata_list("app1", &meta));
   ASSERT_EQ(expected_meta, meta);
 }
+
+TEST_F(LibRadosMiscECPP, CompareExtentRange) {
+  bufferlist bl1;
+  bl1.append("ceph");
+  ObjectWriteOperation write;
+  write.write(0, bl1);
+  ASSERT_EQ(0, ioctx.operate("foo", &write));
+
+  bufferlist bl2;
+  bl2.append("ph");
+  bl2.append(std::string(2, '\0'));
+  ObjectReadOperation read1;
+  read1.cmpext(2, bl2, nullptr);
+  ASSERT_EQ(0, ioctx.operate("foo", &read1, nullptr));
+
+  bufferlist bl3;
+  bl3.append(std::string(4, '\0'));
+  ObjectReadOperation read2;
+  read2.cmpext(2097152, bl3, nullptr);
+  ASSERT_EQ(0, ioctx.operate("foo", &read2, nullptr));
+}
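
The new LibRadosMiscECPP case above exercises `cmpext`: the read op fails unless the specified byte range of the object matches the supplied bufferlist, and a range beyond the end of the object compares as zero bytes — which is why both the "ph" plus two NULs at offset 2 and the four NULs at offset 2097152 succeed against an object holding "ceph". A minimal usage sketch, assuming an open `librados::IoCtx` (the helper name is hypothetical):

    #include <rados/librados.hpp>   // installed librados C++ API header

    // Hypothetical helper: returns 0 when the object's bytes at `off`
    // match `expected`, or a negative error on mismatch.
    int check_extent(librados::IoCtx& ioctx, const std::string& oid,
                     uint64_t off, const std::string& expected) {
      librados::bufferlist bl;
      bl.append(expected);

      librados::ObjectReadOperation op;
      // Third argument can receive a per-op result; nullptr ignores it.
      op.cmpext(off, bl, nullptr);

      // operate() returns the op's result: 0 on match, negative otherwise.
      return ioctx.operate(oid, &op, nullptr);
    }
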
index 2833d22e68a2629f17a9b8763ee22112773df589..7c095f482db3662e028ac9b3ee8393c71f74dd5c 100644 (file)
@@ -64,6 +64,7 @@ set(unittest_librbd_srcs
   operation/test_mock_SnapshotRemoveRequest.cc
   operation/test_mock_SnapshotRollbackRequest.cc
   operation/test_mock_SnapshotUnprotectRequest.cc
+  operation/test_mock_TrimRequest.cc
   watcher/test_mock_RewatchRequest.cc
   )
 add_executable(unittest_librbd
index be439b985f722badaf5b2abcded7ffb0c57f80a6..92330a1ddafd11ff347871db61b23e4c6142589a 100644 (file)
@@ -156,6 +156,8 @@ struct MockImageCtx {
                                             cls::rbd::SnapshotNamespace *out_snap_namespace));
   MOCK_CONST_METHOD2(get_parent_spec, int(librados::snap_t in_snap_id,
                                           ParentSpec *pspec));
+  MOCK_CONST_METHOD2(get_parent_overlap, int(librados::snap_t in_snap_id,
+                                             uint64_t *overlap));
 
   MOCK_CONST_METHOD2(is_snap_protected, int(librados::snap_t in_snap_id,
                                             bool *is_protected));
@@ -204,6 +206,9 @@ struct MockImageCtx {
   MOCK_METHOD8(write_to_cache, void(object_t, const bufferlist&, size_t,
                                     uint64_t, Context *, int, uint64_t, ZTracer::Trace *));
 
+  MOCK_CONST_METHOD0(get_stripe_count, uint64_t());
+  MOCK_CONST_METHOD0(get_stripe_period, uint64_t());
+
   ImageCtx *image_ctx;
   CephContext *cct;
   PerfCounters *perfcounter;
index 9ace5e374800e83c3d1a2f63f26e566f7b607f20..26e979eed5c201b384d4adff61ebf6f632a1142d 100644 (file)
@@ -19,7 +19,7 @@ struct MockObjectMap {
   MOCK_METHOD3(aio_resize, void(uint64_t new_size, uint8_t default_object_state,
                                 Context *on_finish));
 
-  template <typename T, void(T::*MF)(int)>
+  template <typename T, void(T::*MF)(int) = &T::complete>
   bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state,
                   const boost::optional<uint8_t> &current_state,
                   const ZTracer::Trace &parent_trace, T *callback_object) {
@@ -28,23 +28,31 @@ struct MockObjectMap {
                              callback_object);
   }
 
-  template <typename T, void(T::*MF)(int)>
+  template <typename T, void(T::*MF)(int) = &T::complete>
   bool aio_update(uint64_t snap_id, uint64_t start_object_no,
                   uint64_t end_object_no, uint8_t new_state,
                   const boost::optional<uint8_t> &current_state,
                   const ZTracer::Trace &parent_trace, T *callback_object) {
-    return aio_update(snap_id, start_object_no, end_object_no, new_state,
-                      current_state, parent_trace,
-                      util::create_context_callback<T, MF>(callback_object));
+    auto ctx = util::create_context_callback<T, MF>(callback_object);
+    bool updated = aio_update(snap_id, start_object_no, end_object_no,
+                              new_state, current_state, parent_trace, ctx);
+    if (!updated) {
+      delete ctx;
+    }
+    return updated;
   }
   MOCK_METHOD7(aio_update, bool(uint64_t snap_id, uint64_t start_object_no,
                                 uint64_t end_object_no, uint8_t new_state,
                                 const boost::optional<uint8_t> &current_state,
                                 const ZTracer::Trace &parent_trace,
                                 Context *on_finish));
+
   MOCK_METHOD2(snapshot_add, void(uint64_t snap_id, Context *on_finish));
   MOCK_METHOD2(snapshot_remove, void(uint64_t snap_id, Context *on_finish));
   MOCK_METHOD2(rollback, void(uint64_t snap_id, Context *on_finish));
+
+  MOCK_CONST_METHOD1(object_may_exist, bool(uint64_t));
+
 };
 
 } // namespace librbd
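
Two changes in this mock: the member-function template parameter now defaults to `&T::complete`, and the range overload of `aio_update()` only leaves ownership of the wrapper context with the mocked call when an update was actually queued, deleting it otherwise so a `false` return cannot leak it. A self-contained sketch of that ownership rule (all names hypothetical):

    #include <iostream>

    struct Context {
      virtual ~Context() = default;
      virtual void complete(int r) { std::cout << "completed r=" << r << "\n"; }
    };

    // Hypothetical async facility: takes ownership of ctx only when it
    // accepts the work (returns true); on false the caller keeps ownership.
    bool maybe_queue_update(Context* ctx, bool accept) {
      if (accept) {
        ctx->complete(0);  // stand-in for a deferred completion
        delete ctx;
      }
      return accept;
    }

    // Mirrors the mock's fix: delete the callback context when the update
    // was not queued, so a `false` return cannot leak it.
    bool safe_update(bool accept) {
      Context* ctx = new Context();
      bool queued = maybe_queue_update(ctx, accept);
      if (!queued) {
        delete ctx;  // nothing will ever call ctx->complete()
      }
      return queued;
    }

    int main() {
      safe_update(true);   // completion runs, callee frees ctx
      safe_update(false);  // caller frees ctx; no leak, no completion
    }
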
index 7f47be28176ef5a401b11ed39059453f63e86351..453e49cd6367e8bba7b3caa93dda2fa68e65fb0a 100644 (file)
@@ -18,12 +18,22 @@ namespace object_map {
 
 using ::testing::_;
 using ::testing::DoDefault;
+using ::testing::InSequence;
 using ::testing::Return;
 using ::testing::StrEq;
 
 class TestMockObjectMapUpdateRequest : public TestMockFixture {
 public:
-  void expect_update(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {
+  void expect_update(librbd::ImageCtx *ictx, uint64_t snap_id,
+                     uint64_t start_object_no, uint64_t end_object_no,
+                     uint8_t new_state,
+                     const boost::optional<uint8_t>& current_state, int r) {
+    bufferlist bl;
+    ::encode(start_object_no, bl);
+    ::encode(end_object_no, bl);
+    ::encode(new_state, bl);
+    ::encode(current_state, bl);
+
     std::string oid(ObjectMap<>::object_map_name(ictx->id, snap_id));
     if (snap_id == CEPH_NOSNAP) {
       EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
@@ -33,11 +43,13 @@ public:
 
     if (r < 0) {
       EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
-                  exec(oid, _, StrEq("rbd"), StrEq("object_map_update"), _, _, _))
+                  exec(oid, _, StrEq("rbd"), StrEq("object_map_update"),
+                       ContentsEqual(bl), _, _))
                     .WillOnce(Return(r));
     } else {
       EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
-                  exec(oid, _, StrEq("rbd"), StrEq("object_map_update"), _, _, _))
+                  exec(oid, _, StrEq("rbd"), StrEq("object_map_update"),
+                       ContentsEqual(bl), _, _))
                     .WillOnce(DoDefault());
     }
   }
@@ -92,7 +104,7 @@ TEST_F(TestMockObjectMapUpdateRequest, UpdateHeadOnDisk) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
 
-  expect_update(ictx, CEPH_NOSNAP, 0);
+  expect_update(ictx, CEPH_NOSNAP, 0, 1, OBJECT_NONEXISTENT, OBJECT_EXISTS, 0);
 
   ceph::BitVector<2> object_map;
   object_map.resize(1);
@@ -122,7 +134,7 @@ TEST_F(TestMockObjectMapUpdateRequest, UpdateSnapOnDisk) {
                                "snap1"));
 
   uint64_t snap_id = ictx->snap_id;
-  expect_update(ictx, snap_id, 0);
+  expect_update(ictx, snap_id, 0, 1, OBJECT_NONEXISTENT, OBJECT_EXISTS, 0);
 
   ceph::BitVector<2> object_map;
   object_map.resize(1);
@@ -148,7 +160,8 @@ TEST_F(TestMockObjectMapUpdateRequest, UpdateOnDiskError) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
 
-  expect_update(ictx, CEPH_NOSNAP, -EINVAL);
+  expect_update(ictx, CEPH_NOSNAP, 0, 1, OBJECT_NONEXISTENT, OBJECT_EXISTS,
+                -EINVAL);
   expect_invalidate(ictx);
 
   ceph::BitVector<2> object_map;
@@ -178,7 +191,8 @@ TEST_F(TestMockObjectMapUpdateRequest, RebuildSnapOnDisk) {
   ASSERT_EQ(CEPH_NOSNAP, ictx->snap_id);
 
   uint64_t snap_id = ictx->snap_info.rbegin()->first;
-  expect_update(ictx, snap_id, 0);
+  expect_update(ictx, snap_id, 0, 1, OBJECT_EXISTS_CLEAN,
+                boost::optional<uint8_t>(), 0);
   expect_unlock_exclusive_lock(*ictx);
 
   ceph::BitVector<2> object_map;
@@ -199,5 +213,40 @@ TEST_F(TestMockObjectMapUpdateRequest, RebuildSnapOnDisk) {
   ASSERT_NE(OBJECT_EXISTS_CLEAN, object_map[0]);
 }
 
+TEST_F(TestMockObjectMapUpdateRequest, BatchUpdate) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  librbd::NoOpProgressContext no_progress;
+  ASSERT_EQ(0, ictx->operations->resize(712312 * ictx->get_object_size(), false,
+                                        no_progress));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  InSequence seq;
+  expect_update(ictx, CEPH_NOSNAP, 0, 262144, OBJECT_NONEXISTENT, OBJECT_EXISTS,
+                0);
+  expect_update(ictx, CEPH_NOSNAP, 262144, 524288, OBJECT_NONEXISTENT,
+                OBJECT_EXISTS, 0);
+  expect_update(ictx, CEPH_NOSNAP, 524288, 712312, OBJECT_NONEXISTENT,
+                OBJECT_EXISTS, 0);
+  expect_unlock_exclusive_lock(*ictx);
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(712312);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new UpdateRequest<>(
+    *ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
+    OBJECT_EXISTS, {}, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    RWLock::WLocker object_map_locker(ictx->object_map_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
 } // namespace object_map
 } // namespace librbd
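
A note on the new BatchUpdate case: the image is resized to 712312 objects and the expectations show the on-disk object map update split into spans of at most 262144 (2^18) objects — [0,262144), [262144,524288), [524288,712312) — i.e. ceil(712312/262144) = 3 cls calls. Reading 2^18 as the per-request batch limit is an inference from these expectations, not a documented constant; a tiny sketch of the implied chunking:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
      const uint64_t kBatch = 262144;          // 2^18, inferred above
      const uint64_t start = 0, end = 712312;
      for (uint64_t off = start; off < end; off += kBatch) {
        uint64_t chunk_end = std::min(end, off + kBatch);
        std::cout << "aio_update(" << off << ", " << chunk_end << ")\n";
      }
      // prints (0,262144), (262144,524288), (524288,712312)
    }
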
index 845c36fde88b5b35fe897dd3e035269ae6a0fc00..7124df5b367bf5c9d00ef751e01bead8e790c9f0 100644 (file)
@@ -43,7 +43,7 @@ public:
   }
 
   void expect_allocate_snap_id(MockImageCtx &mock_image_ctx, int r) {
-    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx),
                                selfmanaged_snap_create(_));
     if (r < 0 && r != -ESTALE) {
       expect.WillOnce(Return(r));
@@ -53,7 +53,7 @@ public:
   }
 
   void expect_release_snap_id(MockImageCtx &mock_image_ctx, int r) {
-    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx),
                                selfmanaged_snap_remove(_));
     if (r < 0) {
       expect.WillOnce(Return(r));
index 3f5fe186865a69a587c0bb9127b753b368bb672e..d16e0a4d2dc5ad4f84015f8e75358e1ce73e38e0 100644 (file)
@@ -110,7 +110,7 @@ public:
   }
 
   void expect_release_snap_id(MockImageCtx &mock_image_ctx) {
-    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx),
                                 selfmanaged_snap_remove(_))
                                   .WillOnce(DoDefault());
   }
diff --git a/ceph/src/test/librbd/operation/test_mock_TrimRequest.cc b/ceph/src/test/librbd/operation/test_mock_TrimRequest.cc
new file mode 100644 (file)
index 0000000..7a8cb43
--- /dev/null
@@ -0,0 +1,496 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/AsyncRequest.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/operation/TrimRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace {
+
+struct MockTestImageCtx : public MockImageCtx {
+  MockTestImageCtx(ImageCtx &image_ctx) : MockImageCtx(image_ctx) {
+  }
+};
+
+} // anonymous namespace
+
+template<>
+struct AsyncRequest<librbd::MockTestImageCtx> {
+  librbd::MockTestImageCtx& m_image_ctx;
+  Context *on_finish;
+
+  AsyncRequest(librbd::MockTestImageCtx& image_ctx, Context* on_finish)
+    : m_image_ctx(image_ctx), on_finish(on_finish) {
+  }
+  virtual ~AsyncRequest() {
+  }
+
+  Context* create_callback_context() {
+    return util::create_context_callback(this);
+  }
+
+  Context* create_async_callback_context() {
+    return util::create_context_callback<AsyncRequest,
+                                         &AsyncRequest::async_complete>(this);
+  }
+
+  void complete(int r) {
+    if (should_complete(r)) {
+      async_complete(r);
+    }
+  }
+
+  void async_complete(int r) {
+    on_finish->complete(r);
+  }
+
+  bool is_canceled() const {
+    return false;
+  }
+
+  virtual void send() = 0;
+  virtual bool should_complete(int r) = 0;
+};
+
+namespace io {
+
+template <>
+struct ObjectRequest<librbd::MockTestImageCtx> : public ObjectRequestHandle {
+  static ObjectRequest* s_instance;
+  Context *on_finish = nullptr;
+
+  static ObjectRequest* create_truncate(librbd::MockTestImageCtx *ictx,
+                                        const std::string &oid,
+                                        uint64_t object_no,
+                                        uint64_t object_off,
+                                        const ::SnapContext &snapc,
+                                        const ZTracer::Trace &parent_trace,
+                                        Context *completion) {
+    assert(s_instance != nullptr);
+    s_instance->on_finish = completion;
+    s_instance->construct_truncate();
+    return s_instance;
+  }
+
+  static ObjectRequest* create_trim(librbd::MockTestImageCtx *ictx,
+                                    const std::string &oid,
+                                    uint64_t object_no,
+                                    const ::SnapContext &snapc,
+                                    bool post_object_map_update,
+                                    Context *completion) {
+    assert(s_instance != nullptr);
+    s_instance->on_finish = completion;
+    s_instance->construct_trim();
+    return s_instance;
+  }
+
+  ObjectRequest() {
+    s_instance = this;
+  }
+
+  MOCK_METHOD0(construct_truncate, void());
+  MOCK_METHOD0(construct_trim, void());
+  MOCK_METHOD0(send, void());
+  MOCK_METHOD1(complete, void(int));
+};
+
+ObjectRequest<librbd::MockTestImageCtx>* ObjectRequest<librbd::MockTestImageCtx>::s_instance = nullptr;
+
+} // namespace io
+} // namespace librbd
+
+// template definitions
+#include "librbd/AsyncObjectThrottle.cc"
+#include "librbd/operation/TrimRequest.cc"
+
+namespace librbd {
+namespace operation {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::InSequence;
+using ::testing::Invoke;
+using ::testing::Return;
+using ::testing::StrEq;
+using ::testing::WithArg;
+
+class TestMockOperationTrimRequest : public TestMockFixture {
+public:
+  typedef TrimRequest<MockTestImageCtx> MockTrimRequest;
+  typedef librbd::io::ObjectRequest<MockTestImageCtx> MockObjectRequest;
+
+  int create_snapshot(const char *snap_name) {
+    librbd::ImageCtx *ictx;
+    int r = open_image(m_image_name, &ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    r = snap_create(*ictx, snap_name);
+    if (r < 0) {
+      return r;
+    }
+
+    r = snap_protect(*ictx, snap_name);
+    if (r < 0) {
+      return r;
+    }
+    close_image(ictx);
+    return 0;
+  }
+
+  void expect_is_lock_owner(MockTestImageCtx &mock_image_ctx) {
+    if (mock_image_ctx.exclusive_lock != nullptr) {
+      EXPECT_CALL(*mock_image_ctx.exclusive_lock, is_lock_owner())
+                    .WillRepeatedly(Return(true));
+    }
+  }
+
+  void expect_object_map_update(MockTestImageCtx &mock_image_ctx,
+                                uint64_t start_object, uint64_t end_object,
+                                uint8_t state, uint8_t current_state,
+                                bool updated, int ret_val) {
+    if (mock_image_ctx.object_map != nullptr) {
+      EXPECT_CALL(*mock_image_ctx.object_map,
+                  aio_update(CEPH_NOSNAP, start_object, end_object, state,
+                             boost::optional<uint8_t>(current_state), _, _))
+        .WillOnce(WithArg<6>(Invoke([&mock_image_ctx, updated, ret_val](Context *ctx) {
+                               if (updated) {
+                                 mock_image_ctx.op_work_queue->queue(ctx, ret_val);
+                               }
+                               return updated;
+                             })));
+    }
+  }
+
+  void expect_get_parent_overlap(MockTestImageCtx &mock_image_ctx,
+                                 uint64_t overlap) {
+    EXPECT_CALL(mock_image_ctx, get_parent_overlap(CEPH_NOSNAP, _))
+      .WillOnce(WithArg<1>(Invoke([overlap](uint64_t *o) {
+                             *o = overlap;
+                             return 0;
+                           })));
+  }
+
+  void expect_object_may_exist(MockTestImageCtx &mock_image_ctx,
+                               uint64_t object_no, bool exists) {
+    if (mock_image_ctx.object_map != nullptr) {
+      EXPECT_CALL(*mock_image_ctx.object_map, object_may_exist(object_no))
+        .WillOnce(Return(exists));
+    }
+  }
+
+  void expect_get_object_name(MockTestImageCtx &mock_image_ctx,
+                              uint64_t object_no, const std::string& oid) {
+    EXPECT_CALL(mock_image_ctx, get_object_name(object_no))
+      .WillOnce(Return(oid));
+  }
+
+  void expect_aio_remove(MockTestImageCtx &mock_image_ctx,
+                         const std::string& oid, int ret_val) {
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx), remove(oid, _))
+      .WillOnce(Return(ret_val));
+  }
+
+  void expect_object_trim(MockImageCtx &mock_image_ctx,
+                          MockObjectRequest &mock_object_request, int ret_val) {
+    EXPECT_CALL(mock_object_request, construct_trim());
+    EXPECT_CALL(mock_object_request, send())
+      .WillOnce(Invoke([&mock_image_ctx, &mock_object_request, ret_val]() {
+                         mock_image_ctx.op_work_queue->queue(mock_object_request.on_finish, ret_val);
+                       }));
+  }
+
+  void expect_object_truncate(MockImageCtx &mock_image_ctx,
+                              MockObjectRequest &mock_object_request,
+                              int ret_val) {
+    EXPECT_CALL(mock_object_request, construct_truncate());
+    EXPECT_CALL(mock_object_request, send())
+      .WillOnce(Invoke([&mock_image_ctx, &mock_object_request, ret_val]() {
+                         mock_image_ctx.op_work_queue->queue(mock_object_request.on_finish, ret_val);
+                       }));
+  }
+};
+
+TEST_F(TestMockOperationTrimRequest, SuccessRemove) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockTestImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock mock_exclusive_lock;
+  MockJournal mock_journal;
+  MockObjectMap mock_object_map;
+  initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+                      mock_object_map);
+  expect_op_work_queue(mock_image_ctx);
+  expect_is_lock_owner(mock_image_ctx);
+
+  InSequence seq;
+  EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+  EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+  // pre
+  expect_object_map_update(mock_image_ctx, 0, 1, OBJECT_PENDING, OBJECT_EXISTS,
+                           true, 0);
+
+  // copy-up
+  expect_get_parent_overlap(mock_image_ctx, 0);
+
+  // remove
+  expect_object_may_exist(mock_image_ctx, 0, true);
+  expect_get_object_name(mock_image_ctx, 0, "object0");
+  expect_aio_remove(mock_image_ctx, "object0", 0);
+
+  // post
+  expect_object_map_update(mock_image_ctx, 0, 1, OBJECT_NONEXISTENT,
+                           OBJECT_PENDING, true, 0);
+
+  C_SaferCond cond_ctx;
+  librbd::NoOpProgressContext progress_ctx;
+  MockTrimRequest *req = new MockTrimRequest(
+    mock_image_ctx, &cond_ctx, m_image_size, 0, progress_ctx);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, SuccessCopyUp) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING)
+  ASSERT_EQ(0, create_snapshot("snap1"));
+
+  int order = 22;
+  uint64_t features;
+  ASSERT_TRUE(::get_features(&features));
+  std::string clone_name = get_temp_image_name();
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+                             clone_name.c_str(), features, &order, 0, 0));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(clone_name, &ictx));
+  ASSERT_EQ(0, snap_create(*ictx, "snap"));
+
+  MockTestImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock mock_exclusive_lock;
+  MockJournal mock_journal;
+  MockObjectMap mock_object_map;
+  initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+                      mock_object_map);
+  expect_op_work_queue(mock_image_ctx);
+  expect_is_lock_owner(mock_image_ctx);
+
+  InSequence seq;
+  EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+  EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+  // pre
+  expect_object_map_update(mock_image_ctx, 0, 2, OBJECT_PENDING, OBJECT_EXISTS,
+                           true, 0);
+
+  // copy-up
+  expect_get_parent_overlap(mock_image_ctx, ictx->get_object_size());
+  expect_get_object_name(mock_image_ctx, 0, "object0");
+
+  MockObjectRequest mock_object_request;
+  expect_object_trim(mock_image_ctx, mock_object_request, 0);
+
+  // remove
+  expect_object_may_exist(mock_image_ctx, 1, true);
+  expect_get_object_name(mock_image_ctx, 1, "object1");
+  expect_aio_remove(mock_image_ctx, "object1", 0);
+
+  // post
+  expect_object_map_update(mock_image_ctx, 0, 2, OBJECT_NONEXISTENT,
+                           OBJECT_PENDING, true, 0);
+
+  C_SaferCond cond_ctx;
+  librbd::NoOpProgressContext progress_ctx;
+  MockTrimRequest *req = new MockTrimRequest(
+    mock_image_ctx, &cond_ctx, 2 * ictx->get_object_size(), 0, progress_ctx);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, SuccessBoundary) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockTestImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock mock_exclusive_lock;
+  MockJournal mock_journal;
+  MockObjectMap mock_object_map;
+  initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+                      mock_object_map);
+  expect_op_work_queue(mock_image_ctx);
+  expect_is_lock_owner(mock_image_ctx);
+
+  InSequence seq;
+  EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+  EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+  // boundary
+  MockObjectRequest mock_object_request;
+  expect_object_truncate(mock_image_ctx, mock_object_request, 0);
+
+  C_SaferCond cond_ctx;
+  librbd::NoOpProgressContext progress_ctx;
+  MockTrimRequest *req = new MockTrimRequest(
+    mock_image_ctx, &cond_ctx, ictx->get_object_size(), 1, progress_ctx);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, SuccessNoOp) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockTestImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock mock_exclusive_lock;
+  MockJournal mock_journal;
+  MockObjectMap mock_object_map;
+  initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+                      mock_object_map);
+}
+
+TEST_F(TestMockOperationTrimRequest, RemoveError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockTestImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock mock_exclusive_lock;
+  MockJournal mock_journal;
+  MockObjectMap mock_object_map;
+  initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+                      mock_object_map);
+  expect_op_work_queue(mock_image_ctx);
+  expect_is_lock_owner(mock_image_ctx);
+
+  InSequence seq;
+  EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+  EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+  // pre
+  expect_object_map_update(mock_image_ctx, 0, 1, OBJECT_PENDING, OBJECT_EXISTS,
+                           false, 0);
+
+  // copy-up
+  expect_get_parent_overlap(mock_image_ctx, 0);
+
+  // remove
+  expect_object_may_exist(mock_image_ctx, 0, true);
+  expect_get_object_name(mock_image_ctx, 0, "object0");
+  expect_aio_remove(mock_image_ctx, "object0", -EPERM);
+
+  C_SaferCond cond_ctx;
+  librbd::NoOpProgressContext progress_ctx;
+  MockTrimRequest *req = new MockTrimRequest(
+    mock_image_ctx, &cond_ctx, m_image_size, 0, progress_ctx);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EPERM, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, CopyUpError) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING)
+  ASSERT_EQ(0, create_snapshot("snap1"));
+
+  int order = 22;
+  uint64_t features;
+  ASSERT_TRUE(::get_features(&features));
+  std::string clone_name = get_temp_image_name();
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+                             clone_name.c_str(), features, &order, 0, 0));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(clone_name, &ictx));
+  ASSERT_EQ(0, snap_create(*ictx, "snap"));
+
+  MockTestImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock mock_exclusive_lock;
+  MockJournal mock_journal;
+  MockObjectMap mock_object_map;
+  initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+                      mock_object_map);
+  expect_op_work_queue(mock_image_ctx);
+  expect_is_lock_owner(mock_image_ctx);
+
+  InSequence seq;
+  EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+  EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+  // pre
+  expect_object_map_update(mock_image_ctx, 0, 2, OBJECT_PENDING, OBJECT_EXISTS,
+                           false, 0);
+
+  // copy-up
+  expect_get_parent_overlap(mock_image_ctx, ictx->get_object_size());
+  expect_get_object_name(mock_image_ctx, 0, "object0");
+
+  MockObjectRequest mock_object_request;
+  expect_object_trim(mock_image_ctx, mock_object_request, -EINVAL);
+
+  C_SaferCond cond_ctx;
+  librbd::NoOpProgressContext progress_ctx;
+  MockTrimRequest *req = new MockTrimRequest(
+    mock_image_ctx, &cond_ctx, 2 * ictx->get_object_size(), 0, progress_ctx);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, BoundaryError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockTestImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock mock_exclusive_lock;
+  MockJournal mock_journal;
+  MockObjectMap mock_object_map;
+  initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+                      mock_object_map);
+  expect_op_work_queue(mock_image_ctx);
+  expect_is_lock_owner(mock_image_ctx);
+
+  InSequence seq;
+  EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+  EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+  // boundary
+  MockObjectRequest mock_object_request;
+  expect_object_truncate(mock_image_ctx, mock_object_request, -EINVAL);
+
+  C_SaferCond cond_ctx;
+  librbd::NoOpProgressContext progress_ctx;
+  MockTrimRequest *req = new MockTrimRequest(
+    mock_image_ctx, &cond_ctx, ictx->get_object_size(), 1, progress_ctx);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+} // namespace operation
+} // namespace librbd
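
The TrimRequest tests above rely on librbd's usual compile-time mocking: `AsyncRequest<>` and `io::ObjectRequest<>` are specialized for `MockTestImageCtx` before `TrimRequest.cc` is #included as a template definition, so the production code binds to the mocks without any virtual interface. A minimal self-contained sketch of the same idea (hypothetical names):

    // Production-side template: the logic is parameterized on its
    // context type instead of depending on a concrete class.
    template <typename Ctx>
    struct Flusher {
      int flush(Ctx& ctx) { return ctx.write_back(); }
    };

    // Test side: provide a mock context with the same interface and
    // instantiate the production template against it.
    #include <cassert>

    struct MockCtx {
      int write_back_calls = 0;
      int write_back() { ++write_back_calls; return 0; }
    };

    int main() {
      MockCtx ctx;
      Flusher<MockCtx> flusher;            // real logic, mock dependency
      assert(flusher.flush(ctx) == 0);
      assert(ctx.write_back_calls == 1);   // interaction checked directly
    }
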
index 0fc6bbe75c9c39365238d1807939eac5cc5821a2..c8dea47baa7b8adcfd8467a8cef29a28b9883148 100644 (file)
@@ -53,8 +53,8 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
-                     &fs, RGW_MOUNT_FLAG_NONE);
+  int ret = rgw_mount2(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
+                       "/", &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
 }
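
This hunk and the following librgw ones all migrate from `rgw_mount` to `rgw_mount2`, whose one new argument is the path inside the export to mount; every test passes "/", the export root. A hedged call sketch with placeholder credentials:

    #include <rados/librgw.h>
    #include <rados/rgw_file.h>

    // Minimal sketch; user and keys are placeholders. rgw_mount2 takes
    // the mount path ("/" = export root) that rgw_mount left implicit.
    int mount_root(librgw_t rgw, struct rgw_fs** fs) {
      return rgw_mount2(rgw, "testuser", "ACCESS_KEY", "SECRET_KEY",
                        "/", fs, RGW_MOUNT_FLAG_NONE);
    }
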
index 9a29fd235cd8fb24337cc5d2f61ccd8a343b6795..337cd25a03bb2be92c0e5ed3c9c610f95a2cb0a5 100644 (file)
@@ -176,8 +176,8 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw, userid.c_str(), access_key.c_str(),
-                     secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
+  int ret = rgw_mount2(rgw, userid.c_str(), access_key.c_str(),
+                       secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
 }
index 0dd2a4d996f577402fe31eb79f170101872602df..9b6af4b6a8705fd528b61d3d7d797d8369c82cb3 100644 (file)
@@ -57,8 +57,8 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw, userid.c_str(), access_key.c_str(),
-                     secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
+  int ret = rgw_mount2(rgw, userid.c_str(), access_key.c_str(),
+                       secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
 }
index d2b00fc830044859c49f0998c5219a0b91a44aea..2b591e468988759f852fc934183ff02277e2d3f7 100644 (file)
@@ -179,8 +179,8 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
-                     &fs, RGW_MOUNT_FLAG_NONE);
+  int ret = rgw_mount2(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
+                       "/", &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
 }
diff --git a/ceph/src/test/librgw_file_marker.cc b/ceph/src/test/librgw_file_marker.cc
new file mode 100644 (file)
index 0000000..74199df
--- /dev/null
@@ -0,0 +1,488 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdint.h>
+#include <tuple>
+#include <iostream>
+#include <fstream>
+#include <stack>
+
+#include "include/rados/librgw.h"
+#include "include/rados/rgw_file.h"
+#include "rgw/rgw_file.h"
+#include "rgw/rgw_lib_frontend.h" // direct requests
+
+#include "gtest/gtest.h"
+#include "common/backport14.h"
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "global/global_init.h"
+#include "include/assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace {
+
+  using namespace rgw;
+  using std::get;
+  using std::string;
+
+  librgw_t rgw_h = nullptr;
+  string userid("testuser");
+  string access_key("");
+  string secret_key("");
+  struct rgw_fs *fs = nullptr;
+  CephContext* cct = nullptr;
+
+  uint32_t owner_uid = 867;
+  uint32_t owner_gid = 5309;
+
+  uint32_t create_mask = RGW_SETATTR_UID | RGW_SETATTR_GID | RGW_SETATTR_MODE;
+
+  string bucket_name("nfsroot");
+
+  class obj_rec
+  {
+  public:
+    string name;
+    struct rgw_file_handle* fh;
+    struct rgw_file_handle* parent_fh;
+    RGWFileHandle* rgw_fh; // alias into fh
+
+    struct state {
+      bool readdir;
+      state() : readdir(false) {}
+    } state;
+
+    obj_rec(string _name, struct rgw_file_handle* _fh,
+           struct rgw_file_handle* _parent_fh, RGWFileHandle* _rgw_fh)
+      : name(std::move(_name)), fh(_fh), parent_fh(_parent_fh),
+       rgw_fh(_rgw_fh) {}
+
+    void clear() {
+      fh = nullptr;
+      rgw_fh = nullptr;
+    }
+
+    void sync() {
+      if (fh)
+       rgw_fh = get_rgwfh(fh);
+    }
+
+    friend ostream& operator<<(ostream& os, const obj_rec& rec);
+  };
+
+  ostream& operator<<(ostream& os, const obj_rec& rec)
+  {
+    RGWFileHandle* rgw_fh = rec.rgw_fh;
+    if (rgw_fh) {
+      const char* type = rgw_fh->is_dir() ? "DIR " : "FILE ";
+      os << rec.rgw_fh->full_object_name()
+        << " (" << rec.rgw_fh->object_name() << "): "
+        << type;
+    }
+    return os;
+  }
+  
+  std::stack<obj_rec> obj_stack;
+  std::deque<obj_rec> cleanup_queue;
+
+  typedef std::vector<obj_rec> obj_vec;
+  typedef std::tuple<obj_rec, obj_vec> dirs1_rec;
+  typedef std::vector<dirs1_rec> dirs1_vec;
+
+  dirs1_vec dirs_vec;
+
+  struct obj_rec_st
+  {
+    const obj_rec& obj;
+    const struct stat& st;
+
+    obj_rec_st(const obj_rec& _obj, const struct stat& _st)
+      : obj(_obj), st(_st) {}
+  };
+
+  ostream& operator<<(ostream& os, const obj_rec_st& rec)
+  {
+    RGWFileHandle* rgw_fh = rec.obj.rgw_fh;
+    if (rgw_fh) {
+      const char* type = rgw_fh->is_dir() ? "DIR " : "FILE ";
+      os << rgw_fh->full_object_name()
+        << " (" << rgw_fh->object_name() << "): "
+        << type;
+      const struct stat& st = rec.st;
+      switch(uint8_t(rgw_fh->is_dir())) {
+      case 1:
+       os << " mode: " << st.st_mode;
+       os << " nlinks: " << st.st_nlink;
+       break;
+      case 0:
+      default:
+       os << " mode: " << st.st_mode;
+       os << " size: " << st.st_size;
+       // xxx
+       break;
+      }
+    }
+    return os;
+  }
+
+  bool do_marker1 = false;
+  bool do_marker2 = true;
+  bool do_create = false;
+  bool do_delete = false;
+  bool verbose = false;
+
+  string marker_dir("nfs_marker");
+  struct rgw_file_handle *bucket_fh = nullptr;
+  struct rgw_file_handle *marker_fh;
+  static constexpr int marker_nobjs = 2*1024;
+  std::deque<obj_rec> marker_objs;
+
+  using dirent_t = std::tuple<std::string, uint64_t>;
+  struct dirent_vec
+  {
+    std::vector<dirent_t> obj_names;
+    uint32_t count;
+    dirent_vec() : count(0) {}
+  };
+
+  struct {
+    int argc;
+    char **argv;
+  } saved_args;
+}
+
+TEST(LibRGW, TVAR) {
+  typedef boost::variant<uint64_t*, const char*> readdir_offset;
+
+  uint64_t i1{64001};
+  std::string s1{"blunderbuss"};
+
+  readdir_offset v1{&i1};
+  readdir_offset v2{s1.c_str()};
+  readdir_offset v3{static_cast<const char*>(nullptr)};
+
+  uint64_t* pi1 = get<uint64_t*>(v1);
+  ASSERT_NE(pi1, nullptr);
+  std::cout << "read i1: " << *pi1 << std::endl;
+
+  const char* ps1 = get<const char*>(v2);
+  ASSERT_NE(ps1, nullptr);
+  std::cout << "read s1: " << ps1 << std::endl;
+
+  const char* ps3 = get<const char*>(v3);
+  ASSERT_EQ(ps3, nullptr);
+  std::cout << "read s3: " << ps3 << std::endl;
+}
+
+TEST(LibRGW, INIT) {
+  int ret = librgw_create(&rgw_h, saved_args.argc, saved_args.argv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_NE(rgw_h, nullptr);
+}
+
+TEST(LibRGW, MOUNT) {
+  int ret = rgw_mount2(rgw_h, userid.c_str(), access_key.c_str(),
+                       secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
+  ASSERT_EQ(ret, 0);
+  ASSERT_NE(fs, nullptr);
+
+  cct = static_cast<RGWLibFS*>(fs->fs_private)->get_context();
+}
+
+TEST(LibRGW, MARKER1_SETUP_BUCKET) {
+  /* "large" directory enumeration test.  this one deals only with
+   * file objects */
+  struct stat st;
+  int ret;
+
+  st.st_uid = owner_uid;
+  st.st_gid = owner_gid;
+  st.st_mode = 755;
+
+  (void) rgw_lookup(fs, fs->root_fh, bucket_name.c_str(), &bucket_fh,
+                   RGW_LOOKUP_FLAG_NONE);
+  if (! bucket_fh) {
+    if (do_create) {
+      struct stat st;
+
+      st.st_uid = owner_uid;
+      st.st_gid = owner_gid;
+      st.st_mode = 755;
+
+      ret = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), &st, create_mask,
+                     &bucket_fh, RGW_MKDIR_FLAG_NONE);
+      ASSERT_EQ(ret, 0);
+    }
+  }
+
+  ASSERT_NE(bucket_fh, nullptr);
+
+  (void) rgw_lookup(fs, bucket_fh, marker_dir.c_str(), &marker_fh,
+                   RGW_LOOKUP_FLAG_NONE);
+  if (! marker_fh) {
+    if (do_create) {
+      ret = rgw_mkdir(fs, bucket_fh, marker_dir.c_str(), &st, create_mask,
+                     &marker_fh, RGW_MKDIR_FLAG_NONE);
+      ASSERT_EQ(ret, 0);
+    }
+  }
+
+  ASSERT_NE(marker_fh, nullptr);
+}
+
+TEST(LibRGW, MARKER1_SETUP_OBJECTS)
+{
+  /* "large" directory enumeration test.  this one deals only with
+   * file objects */
+  if (do_create) {
+    int ret;
+
+    for (int ix = 0; ix < marker_nobjs; ++ix) {
+      std::string object_name("f_");
+      object_name += to_string(ix);
+      obj_rec obj{object_name, nullptr, marker_fh, nullptr};
+      // lookup object--all operations are by handle
+      ret = rgw_lookup(fs, marker_fh, obj.name.c_str(), &obj.fh,
+                      RGW_LOOKUP_FLAG_CREATE);
+      ASSERT_EQ(ret, 0);
+      obj.rgw_fh = get_rgwfh(obj.fh);
+      // open object--open transaction
+      ret = rgw_open(fs, obj.fh, 0 /* posix flags */, RGW_OPEN_FLAG_NONE);
+      ASSERT_EQ(ret, 0);
+      ASSERT_TRUE(obj.rgw_fh->is_open());
+      // unstable write data
+      size_t nbytes;
+      string data("data for ");
+      data += object_name;
+      int ret = rgw_write(fs, obj.fh, 0, data.length(), &nbytes,
+                         (void*) data.c_str(), RGW_WRITE_FLAG_NONE);
+      ASSERT_EQ(ret, 0);
+      ASSERT_EQ(nbytes, data.length());
+      // commit transaction (write on close)
+      ret = rgw_close(fs, obj.fh, 0 /* flags */);
+      ASSERT_EQ(ret, 0);
+      // save for cleanup
+      marker_objs.push_back(obj);
+    }
+  }
+}
+
+extern "C" {
+  static bool r2_cb(const char* name, void *arg, uint64_t offset,
+                   uint32_t flags) {
+    dirent_vec& dvec =
+      *(static_cast<dirent_vec*>(arg));
+    lsubdout(cct, rgw, 10) << __func__
+                          << " bucket=" << bucket_name
+                          << " dir=" << marker_dir
+                          << " iv count=" << dvec.count
+                          << " called back name=" << name
+                          << " flags=" << flags
+                          << dendl;
+
+    std::cout << __func__
+              << " bucket=" << bucket_name
+              << " dir=" << marker_dir
+              << " iv count=" << dvec.count
+              << " called back name=" << name
+              << " flags=" << flags
+              << std::endl;
+
+    string name_str{name};
+    if (! ((name_str == ".") ||
+          (name_str == ".."))) {
+      dvec.obj_names.push_back(dirent_t{std::move(name_str), offset});
+    }
+    return true; /* XXX */
+  }
+}
+
+TEST(LibRGW, MARKER1_READDIR)
+{
+  if (do_marker1) {
+    using std::get;
+
+    dirent_vec dvec;
+    uint64_t offset = 0;
+    bool eof = false;
+
+    /* because RGWReaddirRequest::default_max is 1000 (XXX make
+     * configurable?) and marker_nobjs is 2*1024, the number
+     * of required rgw_readdir operations N should be
+     * marker_nobjs/1000 < N < marker_nobjs/1000+1, i.e., 3 when
+     * marker_nobjs==2*1024 */
+    uint32_t max_iterations = marker_nobjs/1000+1;
+
+    do {
+      ASSERT_TRUE(dvec.count <= max_iterations);
+      int ret = rgw_readdir(fs, marker_fh, &offset, r2_cb, &dvec, &eof,
+                           RGW_READDIR_FLAG_DOTDOT);
+      ASSERT_EQ(ret, 0);
+      ASSERT_EQ(offset, get<1>(dvec.obj_names.back())); // cookie check
+      ++dvec.count;
+    } while(!eof);
+    std::cout << "Read " << dvec.obj_names.size() << " objects in "
+             << marker_dir.c_str() << std::endl;
+  }
+}
+
+TEST(LibRGW, MARKER2_READDIR)
+{
+  if (do_marker2) {
+    using std::get;
+
+    dirent_vec dvec;
+    std::string marker{""};
+    bool eof = false;
+
+    /* because RGWReaddirRequest::default_max is 1000 (XXX make
+     * configurable?) and marker_nobjs is 2*1024, the number
+     * of required rgw_readdir operations N should be
+     * marker_nobjs/1000 < N < marker_nobjs/1000+1, i.e., 3 when
+     * marker_nobjs==2*1024 */
+    uint32_t max_iterations = marker_nobjs/1000+1;
+
+    do {
+      ASSERT_TRUE(dvec.count <= max_iterations);
+      int ret = rgw_readdir2(fs, marker_fh,
+                            (marker.length() > 0) ? marker.c_str() : nullptr,
+                            r2_cb, &dvec, &eof,
+                            RGW_READDIR_FLAG_DOTDOT);
+      ASSERT_EQ(ret, 0);
+      marker = get<0>(dvec.obj_names.back());
+      ++dvec.count;
+    } while((!eof) && dvec.count < 4);
+    std::cout << "Read " << dvec.obj_names.size() << " objects in "
+             << marker_dir.c_str() << std::endl;
+  }
+}
+
+TEST(LibRGW, MARKER1_OBJ_CLEANUP)
+{
+  int rc;
+  for (auto& obj : marker_objs) {
+    if (obj.fh) {
+      if (do_delete) {
+       if (verbose) {
+         std::cout << "unlinking: " << bucket_name << ":" << obj.name
+                   << std::endl;
+       }
+       rc = rgw_unlink(fs, marker_fh, obj.name.c_str(), RGW_UNLINK_FLAG_NONE);
+      }
+      rc = rgw_fh_rele(fs, obj.fh, 0 /* flags */);
+      ASSERT_EQ(rc, 0);
+    }
+  }
+  marker_objs.clear();
+}
+
+TEST(LibRGW, CLEANUP) {
+  int rc;
+
+  if (do_marker1) {
+    cleanup_queue.push_back(
+      obj_rec{bucket_name, bucket_fh, fs->root_fh, get_rgwfh(fs->root_fh)});
+  }
+
+  for (auto& elt : cleanup_queue) {
+    if (elt.fh) {
+      rc = rgw_fh_rele(fs, elt.fh, 0 /* flags */);
+      ASSERT_EQ(rc, 0);
+    }
+  }
+  cleanup_queue.clear();
+}
+
+TEST(LibRGW, UMOUNT) {
+  if (! fs)
+    return;
+
+  int ret = rgw_umount(fs, RGW_UMOUNT_FLAG_NONE);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST(LibRGW, SHUTDOWN) {
+  librgw_shutdown(rgw_h);
+}
+
+int main(int argc, char *argv[])
+{
+  char *v{nullptr};
+  string val;
+  vector<const char*> args;
+
+  argv_to_vec(argc, const_cast<const char**>(argv), args);
+  env_to_vec(args);
+
+  v = getenv("AWS_ACCESS_KEY_ID");
+  if (v) {
+    access_key = v;
+  }
+
+  v = getenv("AWS_SECRET_ACCESS_KEY");
+  if (v) {
+    secret_key = v;
+  }
+
+  for (auto arg_iter = args.begin(); arg_iter != args.end();) {
+    if (ceph_argparse_witharg(args, arg_iter, &val, "--access",
+                             (char*) nullptr)) {
+      access_key = val;
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret",
+                                    (char*) nullptr)) {
+      secret_key = val;
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--userid",
+                                    (char*) nullptr)) {
+      userid = val;
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--bn",
+                                    (char*) nullptr)) {
+      bucket_name = val;
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+                                    (char*) nullptr)) {
+      owner_uid = std::stoi(val);
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--gid",
+                                    (char*) nullptr)) {
+      owner_gid = std::stoi(val);
+    } else if (ceph_argparse_flag(args, arg_iter, "--marker1",
+                                           (char*) nullptr)) {
+      do_marker1 = true;
+    } else if (ceph_argparse_flag(args, arg_iter, "--create",
+                                           (char*) nullptr)) {
+      do_create = true;
+    } else if (ceph_argparse_flag(args, arg_iter, "--delete",
+                                           (char*) nullptr)) {
+      do_delete = true;
+    } else if (ceph_argparse_flag(args, arg_iter, "--verbose",
+                                           (char*) nullptr)) {
+      verbose = true;
+    } else {
+      ++arg_iter;
+    }
+  }
+
+  /* don't accidentally run as anonymous */
+  if ((access_key == "") ||
+      (secret_key == "")) {
+    std::cout << argv[0] << " no AWS credentials, exiting" << std::endl;
+    return EPERM;
+  }
+
+  saved_args.argc = argc;
+  saved_args.argv = argv;
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
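
The point of the new librgw_file_marker.cc is the contrast between the two enumeration styles: `rgw_readdir` resumes from a numeric offset cookie, while `rgw_readdir2` resumes from the last name returned (a marker), the way S3 listing paginates. A self-contained simulation of the marker loop, independent of librgw:

    #include <iostream>
    #include <string>
    #include <vector>

    // One "listing" call: return up to `max` names strictly greater than
    // `marker` (names are assumed sorted, as RGW returns them).
    std::vector<std::string> list_after(const std::vector<std::string>& all,
                                        const std::string& marker,
                                        size_t max) {
      std::vector<std::string> out;
      for (const auto& name : all) {
        if (name > marker && out.size() < max)
          out.push_back(name);
      }
      return out;
    }

    int main() {
      const std::vector<std::string> objs = {"f_0", "f_1", "f_2", "f_3"};
      std::string marker;                  // empty: start at the beginning
      for (;;) {
        auto page = list_after(objs, marker, 2);
        if (page.empty())
          break;                           // eof
        for (const auto& n : page)
          std::cout << n << "\n";
        marker = page.back();              // resume token for the next call
      }
    }
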
index 347aee4c6fb356cfb9fbb4bfa5660b25d3d95dd1..92a0a6815e278c2c0cac1ea1b55dba4d1e322d38 100644 (file)
@@ -190,8 +190,8 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw_h, userid.c_str(), access_key.c_str(),
-                     secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
+  int ret = rgw_mount2(rgw_h, userid.c_str(), access_key.c_str(),
+                       secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
 
index d7e8a18fe41c893ac5647100bbf8f87cfbd2a343..37648b9e52bd7f615ed08d429465abfe0d703369 100644 (file)
@@ -175,7 +175,7 @@ TEST(pgmap, dump_object_stat_sum_0)
   float copies_rate =
     (static_cast<float>(sum.num_object_copies - sum.num_objects_degraded) /
      sum.num_object_copies);
-  float used_bytes = sum.num_bytes * copies_rate;
+  float used_bytes = sum.num_bytes * copies_rate * pool.get_size();
   float used_percent = used_bytes / (used_bytes + avail) * 100;
   unsigned col = 0;
   ASSERT_EQ(stringify(si_t(sum.num_bytes)), tbl.get(0, col++));
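
The pgmap change above multiplies by `pool.get_size()` so USED reports the raw capacity consumed across all replicas rather than one copy's worth of data, with `copies_rate` discounting degraded copies. A worked example, assuming a size-3 replicated pool holding 10 GiB of logical data with 1 degraded copy in 30:

    copies_rate = (num_object_copies - num_objects_degraded) / num_object_copies
                = (30 - 1) / 30 ≈ 0.967
    used_bytes  = num_bytes * copies_rate * pool.get_size()
                = 10 GiB * 0.967 * 3 ≈ 29 GiB
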
index cadecb42d5e14d7c5b248c604a15f723fc0eec1c..182af5429addcd4dcd3c2725ce66b0b0d5ff03b6 100644 (file)
@@ -182,7 +182,7 @@ TEST(PerfCounters, MultiplePerfCounters) {
   ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":13,\"element2\":0.000000000,"
            "\"element3\":{\"avgcount\":0,\"sum\":0.000000000,\"avgtime\":0.000000000}}}"), msg);
   ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf schema\", \"format\": \"json\" }", &msg));
-  ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"metric_type\":\"gauge\",\"value_type\":\"integer\",\"description\":\"\",\"nick\":\"\"},\"element2\":{\"type\":1,\"metric_type\":\"gauge\",\"value_type\":\"real\",\"description\":\"\",\"nick\":\"\"},\"element3\":{\"type\":5,\"metric_type\":\"gauge\",\"value_type\":\"real-integer-pair\",\"description\":\"\",\"nick\":\"\"}}}"), msg);
+  ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"metric_type\":\"gauge\",\"value_type\":\"integer\",\"description\":\"\",\"nick\":\"\",\"priority\":0},\"element2\":{\"type\":1,\"metric_type\":\"gauge\",\"value_type\":\"real\",\"description\":\"\",\"nick\":\"\",\"priority\":0},\"element3\":{\"type\":5,\"metric_type\":\"gauge\",\"value_type\":\"real-integer-pair\",\"description\":\"\",\"nick\":\"\",\"priority\":0}}}"), msg);
   coll->clear();
   ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg));
   ASSERT_EQ("{}", msg);
index 98cccd87bbd58a013bfb472807dacd74acec51ad..c3b9f7cccc8a1f4111703b21e4a4c774c10d67b2 100644 (file)
@@ -785,8 +785,9 @@ double perf_timer()
   uint64_t start = Cycles::rdtsc();
   Mutex::Locker l(lock);
   for (int i = 0; i < count; i++) {
-    timer.add_event_after(12345, c[i]);
-    timer.cancel_event(c[i]);
+    if (timer.add_event_after(12345, c[i])) {
+      timer.cancel_event(c[i]);
+    }
   }
   uint64_t stop = Cycles::rdtsc();
   delete[] c;
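
The loop fix above reflects SafeTimer's contract change: `add_event_after` now returns the scheduled Context (or NULL when scheduling is refused, e.g. during shutdown), and cancelling an event that was never added would operate on a callback the caller no longer owns. A self-contained sketch of the guarded pattern (the timer type here is hypothetical):

    struct Context {
      virtual ~Context() = default;
      virtual void finish(int) {}
    };

    // Hypothetical timer mirroring the new contract: add_event_after
    // returns the context it scheduled, or nullptr if it refused (and
    // disposed of) it.
    struct Timer {
      bool shutting_down = false;
      Context* add_event_after(double /*seconds*/, Context* ctx) {
        if (shutting_down) { delete ctx; return nullptr; }
        return ctx;                      // scheduled
      }
      bool cancel_event(Context* ctx) { delete ctx; return true; }
    };

    int main() {
      Timer t;
      Context* c = new Context();
      // Guarded pattern from perf_timer(): only cancel what was scheduled.
      if (t.add_event_after(12345, c)) {
        t.cancel_event(c);
      }
    }
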
index c372a4b9fc3970ca15403304d019e4e2d99ef78e..4dff2f0a2c6c2ba051ce5c1632e45a17b01686b7 100644 (file)
@@ -929,6 +929,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, PrimaryRemote) {
 
   // register missing client in remote journal
   librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta;
+  mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
   client_data.client_meta = mirror_peer_client_meta;
   expect_journaler_register_client(mock_journaler, client_data, 0);
 
@@ -1019,6 +1020,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, PrimaryRemoteLocalDeleted) {
   // re-register the client
   expect_journaler_unregister_client(mock_journaler, 0);
   mirror_peer_client_meta = {};
+  mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
   client_data.client_meta = mirror_peer_client_meta;
   expect_journaler_register_client(mock_journaler, client_data, 0);
 
index 3de5fbcdbf5cc4e24a5c83fc2fba6bfc63a1300b..32d58471d4a4b0aefbe4a94f84a78988e031b0ff 100644 (file)
@@ -9,7 +9,7 @@
 struct Context;
 
 struct MockSafeTimer {
-  MOCK_METHOD2(add_event_after, void(double, Context*));
+  MOCK_METHOD2(add_event_after, Context*(double, Context*));
   MOCK_METHOD1(cancel_event, bool(Context *));
 };
 
index 7a0bb6706f64393596a75baff2e60b00e728ecf6..9e2006c9d5190a250dd6243d6e0e8ab1097332f2 100644 (file)
@@ -105,6 +105,7 @@ using ::testing::InSequence;
 using ::testing::Invoke;
 using ::testing::MatcherCast;
 using ::testing::Return;
+using ::testing::ReturnArg;
 using ::testing::SetArgPointee;
 using ::testing::WithArg;
 
@@ -356,9 +357,10 @@ public:
   void expect_add_event_after_repeatedly(MockThreads &mock_threads) {
     EXPECT_CALL(*mock_threads.timer, add_event_after(_, _))
       .WillRepeatedly(
-        Invoke([this](double seconds, Context *ctx) {
-          m_threads->timer->add_event_after(seconds, ctx);
-        }));
+        DoAll(Invoke([this](double seconds, Context *ctx) {
+                      m_threads->timer->add_event_after(seconds, ctx);
+                    }),
+         ReturnArg<1>()));
     EXPECT_CALL(*mock_threads.timer, cancel_event(_))
       .WillRepeatedly(
         Invoke([this](Context *ctx) {
index 1903c55f2c9032b84c585614bf534155cea12c4c..02bc0886df514655f92a5834f9d5ea3f16e07438 100644 (file)
@@ -121,9 +121,11 @@ namespace rbd {
 namespace mirror {
 
 using ::testing::_;
+using ::testing::DoAll;
 using ::testing::InSequence;
 using ::testing::Invoke;
 using ::testing::Return;
+using ::testing::ReturnArg;
 using ::testing::ReturnRef;
 using ::testing::WithArg;
 
@@ -146,8 +148,8 @@ public:
   void expect_add_event_after(MockThreads &mock_threads,
                               Context** timer_ctx = nullptr) {
     EXPECT_CALL(*mock_threads.timer, add_event_after(_, _))
-      .WillOnce(WithArg<1>(
-        Invoke([this, &mock_threads, timer_ctx](Context *ctx) {
+      .WillOnce(DoAll(
+        WithArg<1>(Invoke([this, &mock_threads, timer_ctx](Context *ctx) {
           assert(mock_threads.timer_lock.is_locked());
           if (timer_ctx != nullptr) {
             *timer_ctx = ctx;
@@ -159,7 +161,8 @@ public:
                 ctx->complete(0);
               }), 0);
           }
-        })));
+        })),
+        ReturnArg<1>()));
   }
 
   void expect_cancel_event(MockThreads &mock_threads, bool canceled) {
index 1b7877434ad96bb24a59fbd45bc5d0dd0d8bb8b1..4c7463d660c552df8e20ad29371c031f5035149e 100644 (file)
@@ -145,6 +145,7 @@ using ::testing::DoAll;
 using ::testing::InSequence;
 using ::testing::Invoke;
 using ::testing::Return;
+using ::testing::ReturnArg;
 using ::testing::StrEq;
 using ::testing::WithArg;
 using ::testing::WithoutArgs;
@@ -238,13 +239,15 @@ public:
 
   void expect_timer_add_event(MockThreads &mock_threads) {
     EXPECT_CALL(*mock_threads.timer, add_event_after(_, _))
-      .WillOnce(WithArg<1>(Invoke([this](Context *ctx) {
-          auto wrapped_ctx = new FunctionContext([this, ctx](int r) {
-              Mutex::Locker timer_locker(m_threads->timer_lock);
-              ctx->complete(r);
-            });
-          m_threads->work_queue->queue(wrapped_ctx, 0);
-        })));
+      .WillOnce(DoAll(WithArg<1>(Invoke([this](Context *ctx) {
+                        auto wrapped_ctx =
+                         new FunctionContext([this, ctx](int r) {
+                             Mutex::Locker timer_locker(m_threads->timer_lock);
+                             ctx->complete(r);
+                           });
+                       m_threads->work_queue->queue(wrapped_ctx, 0);
+                      })),
+                      ReturnArg<1>()));
   }
 
   int when_shut_down(MockPoolWatcher &mock_pool_watcher) {
index 0109a02138db9ae358ee7d874808328878206fec..61e531af92cc43caa0dcf71d3501f502cae4a594 100644 (file)
@@ -10,6 +10,7 @@ try:
 except ImportError:
     from itertools import zip_longest
 from itertools import combinations
+from cStringIO import StringIO
 
 import boto
 import boto.s3.connection
@@ -931,6 +932,27 @@ def test_bucket_sync_disable_enable():
     for bucket_name in buckets:
         zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name)
 
+def test_multipart_object_sync():
+    zonegroup = realm.master_zonegroup()
+    zonegroup_conns = ZonegroupConns(zonegroup)
+    buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns)
+
+    _, bucket = zone_bucket[0]
+
+    # initiate a multipart upload
+    upload = bucket.initiate_multipart_upload('MULTIPART')
+    mp = boto.s3.multipart.MultiPartUpload(bucket)
+    mp.key_name = upload.key_name
+    mp.id = upload.id
+    part_size = 5 * 1024 * 1024 # 5M min part size
+    mp.upload_part_from_file(StringIO('a' * part_size), 1)
+    mp.upload_part_from_file(StringIO('b' * part_size), 2)
+    mp.upload_part_from_file(StringIO('c' * part_size), 3)
+    mp.upload_part_from_file(StringIO('d' * part_size), 4)
+    mp.complete_upload()
+
+    zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name)
+
 def test_encrypted_object_sync():
     zonegroup = realm.master_zonegroup()
     zonegroup_conns = ZonegroupConns(zonegroup)
index 6f5a42f8fabc30f542d9f690f08415f22c2e26b9..8cd0281af157f611dee9d8000da329e0a4de1fb6 100644 (file)
@@ -1,4 +1,6 @@
 #include "include/ipaddr.h"
+#include "common/pick_address.h"
+#include "global/global_context.h"
 #include "gtest/gtest.h"
 
 #if defined(__FreeBSD__)
@@ -537,3 +539,52 @@ TEST(CommonIPAddr, ParseNetwork_IPv6_9000)
   ipv6(&want, "2001:1234:5678:90ab::dead:beef");
   ASSERT_EQ(0, memcmp(want.sin6_addr.s6_addr, network.sin6_addr.s6_addr, sizeof(network.sin6_addr.s6_addr)));
 }
+
+TEST(pick_address, find_ip_in_subnet_list)
+{
+  struct ifaddrs one, two;
+  struct sockaddr_in a_one;
+  struct sockaddr_in a_two;
+  const struct sockaddr *result;
+
+  one.ifa_next = &two;
+  one.ifa_addr = (struct sockaddr*)&a_one;
+  one.ifa_name = eth0;
+
+  two.ifa_next = NULL;
+  two.ifa_addr = (struct sockaddr*)&a_two;
+  two.ifa_name = eth1;
+
+  ipv4(&a_one, "10.1.1.2");
+  ipv4(&a_two, "10.2.1.123");
+
+  // match by network
+  result = find_ip_in_subnet_list(
+    g_ceph_context,
+    &one,
+    "10.1.0.0/16",
+    "eth0");
+  ASSERT_EQ((struct sockaddr*)&a_one, result);
+
+  result = find_ip_in_subnet_list(
+    g_ceph_context,
+    &one,
+    "10.2.0.0/16",
+    "eth1");
+  ASSERT_EQ((struct sockaddr*)&a_two, result);
+
+  // match by eth name
+  result = find_ip_in_subnet_list(
+    g_ceph_context,
+    &one,
+    "10.0.0.0/8",
+    "eth0");
+  ASSERT_EQ((struct sockaddr*)&a_one, result);
+
+  result = find_ip_in_subnet_list(
+    g_ceph_context,
+    &one,
+    "10.0.0.0/8",
+    "eth1");
+  ASSERT_EQ((struct sockaddr*)&a_two, result);
+}
index ed19c63bc3c7da87dfb9ee6fa043d95683f1dc0b..7502085895afe77af48cfd55f1ddbf89c3c48353 100644 (file)
@@ -26,7 +26,9 @@ add_executable(ceph-osdomap-tool ceph_osdomap_tool.cc)
 target_link_libraries(ceph-osdomap-tool os global Boost::program_options)
 install(TARGETS ceph-osdomap-tool DESTINATION bin)
 
-add_executable(ceph-monstore-tool ceph_monstore_tool.cc)
+add_executable(ceph-monstore-tool
+  ceph_monstore_tool.cc
+  ../mgr/mgr_commands.cc)
 target_link_libraries(ceph-monstore-tool os global Boost::program_options)
 install(TARGETS ceph-monstore-tool DESTINATION bin)
 install(PROGRAMS
index 6d7ef7313ff2831ac703f7ae007c3e7debd4e2b9..e9f31091c1c9d5837e8389cea64f99a694075a6e 100644 (file)
@@ -36,7 +36,13 @@ using namespace std;
 
 class StoreTool
 {
-  boost::scoped_ptr<KeyValueDB> db;
+  boost::scoped_ptr<BlueStore> bluestore;
+
+  // TODO: make KeyValueDB enable_shared_from_this
+  // bluestore also holds *db*, so owning it here via a
+  // unique_ptr/shared_ptr would double free it.
+  KeyValueDB* db;
+
   string store_path;
 
   public:
@@ -46,7 +52,7 @@ class StoreTool
 #ifdef HAVE_LIBAIO
       // note: we'll leak this!  the only user is ceph-kvstore-tool and
       // we don't care.
-      BlueStore *bluestore = new BlueStore(g_ceph_context, path);
+      bluestore.reset(new BlueStore(g_ceph_context, path));
       int r = bluestore->start_kv_only(&db_ptr);
       if (r < 0) {
        exit(1);
@@ -64,7 +70,18 @@ class StoreTool
        exit(1);
       }
     }
-    db.reset(db_ptr);
+    db = db_ptr;
+  }
+
+  ~StoreTool() {
+    if (bluestore) {
+      bluestore->umount();
+    }
+    else {
+      if (db) {
+        delete db;
+      }
+    }
   }
 
   uint32_t traverse(const string &prefix,
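
With the BlueStore handle now owned by a scoped_ptr and the KeyValueDB pointer deliberately left raw, the new destructor umounts the store (which also tears down its embedded DB) instead of double-freeing it; the plain-DB path still deletes db directly. This is the path exercised when pointing the tool at a BlueStore OSD, roughly as follows (the store path is illustrative, with bluestore-kv as the store type for this case):

    ceph-kvstore-tool bluestore-kv /var/lib/ceph/osd/ceph-0 list
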
index 8c941443d818ffe65ea787bf70825cd0df241fb3..bf607ffa9c415f44151874c6e88f99689f533f77 100644 (file)
@@ -24,6 +24,7 @@
 #include "auth/cephx/CephxKeyServer.h"
 #include "global/global_init.h"
 #include "include/stringify.h"
+#include "mgr/mgr_commands.h"
 #include "mon/AuthMonitor.h"
 #include "mon/MonitorDBStore.h"
 #include "mon/Paxos.h"
@@ -588,6 +589,36 @@ static int update_monitor(MonitorDBStore& st)
   return 0;
 }
 
+static int update_mgrmap(MonitorDBStore& st)
+{
+  auto t = make_shared<MonitorDBStore::Transaction>();
+
+  {
+    MgrMap map;
+    // mgr expects epoch > 1
+    map.epoch++;
+    auto initial_modules =
+      get_str_vec(g_ceph_context->_conf->get_val<string>("mgr_initial_modules"));
+    copy(begin(initial_modules),
+        end(initial_modules),
+        inserter(map.modules, end(map.modules)));
+    bufferlist bl;
+    map.encode(bl, CEPH_FEATURES_ALL);
+    t->put("mgr", map.epoch, bl);
+    t->put("mgr", "last_committed", map.epoch);
+  }
+  {
+    auto mgr_command_descs = mgr_commands;
+    for (auto& c : mgr_command_descs) {
+      c.set_flag(MonCommand::FLAG_MGR);
+    }
+    bufferlist bl;
+    ::encode(mgr_command_descs, bl);
+    t->put("mgr_command_desc", "", bl);
+  }
+  return st.apply_transaction(t);
+}
+
 static int update_paxos(MonitorDBStore& st)
 {
   // build a pending paxos proposal from all non-permanent k/v pairs. once the
@@ -598,6 +629,7 @@ static int update_paxos(MonitorDBStore& st)
   {
     MonitorDBStore::Transaction t;
     vector<string> prefixes = {"auth", "osdmap",
+                              "mgr", "mgr_command_desc",
                               "pgmap", "pgmap_pg", "pgmap_meta"};
     for (const auto& prefix : prefixes) {
       for (auto i = st.get_iterator(prefix); i->valid(); i->next()) {
@@ -706,6 +738,9 @@ int rebuild_monstore(const char* progname,
   if ((r = update_monitor(st))) {
     return r;
   }
+  if ((r = update_mgrmap(st))) {
+    return r;
+  }
   return 0;
 }
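
update_mgrmap() seeds the rebuilt store with a minimal MgrMap (epoch bumped past its initial value, modules taken from mgr_initial_modules) plus the MGR-flagged command descriptions, and update_paxos() folds the new "mgr" and "mgr_command_desc" prefixes into the proposal, so a store rebuilt by this tool yields a cluster whose mgr daemons can start. For context, the rebuild this feeds is normally invoked along these lines (store path and keyring location are placeholders):

    ceph-monstore-tool /tmp/mon-store rebuild -- --keyring /etc/ceph/ceph.client.admin.keyring
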
 
index 397303deaafbc24b68e4b0bb4b002a4f635d014b..de679a95676d72c66a9a87cd2b5fcfc357cd512c 100644 (file)
@@ -73,7 +73,7 @@ CompatSet get_test_compat_set() {
 const ssize_t max_read = 1024 * 1024;
 const int fd_none = INT_MIN;
 bool outistty;
-bool dry_run = false;
+bool dry_run;
 
 struct action_on_object_t {
   virtual ~action_on_object_t() {}
@@ -294,7 +294,7 @@ ghobject_t log_oid;
 ghobject_t biginfo_oid;
 
 int file_fd = fd_none;
-bool debug = false;
+bool debug;
 super_header sh;
 uint64_t testalign;
 
@@ -2534,16 +2534,16 @@ int main(int argc, char **argv)
     ("journal-path", po::value<string>(&jpath),
      "path to journal, use if tool can't find it")
     ("pgid", po::value<string>(&pgidstr),
-     "PG id, mandatory for info, log, remove, export, rm-past-intervals, mark-complete, and mandatory for apply-layout-settings if --pool is not specified")
+     "PG id, mandatory for info, log, remove, export, export-remove, rm-past-intervals, mark-complete, and mandatory for apply-layout-settings if --pool is not specified")
     ("pool", po::value<string>(&pool),
      "Pool name, mandatory for apply-layout-settings if --pgid is not specified")
     ("op", po::value<string>(&op),
-     "Arg is one of [info, log, remove, mkfs, fsck, fuse, dup, export, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
+     "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
      "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, apply-layout-settings, update-mon-db]")
     ("epoch", po::value<unsigned>(&epoch),
      "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
     ("file", po::value<string>(&file),
-     "path of file to export, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
+     "path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
     ("mon-store-path", po::value<string>(&mon_store_path),
      "path of monstore to update-mon-db")
     ("fsid", po::value<string>(&fsid),
@@ -2597,23 +2597,15 @@ int main(int argc, char **argv)
     return 1;
   }
 
-  if (!vm.count("debug")) {
-    debug = false;
-  } else {
-    debug = true;
-  }
+  debug = (vm.count("debug") > 0);
 
-  if (!vm.count("force")) {
-    force = false;
-  } else {
-    force = true;
-  }
+  force = (vm.count("force") > 0);
 
   if (vm.count("namespace"))
     nspace = argnspace;
 
-  if (vm.count("dry-run"))
-    dry_run = true;
+  dry_run = (vm.count("dry-run") > 0);
+
   osflagbits_t flags = 0;
   if (dry_run || vm.count("skip-journal-replay"))
     flags |= SKIP_JOURNAL_REPLAY;
@@ -2621,6 +2613,7 @@ int main(int argc, char **argv)
     flags |= SKIP_MOUNT_OMAP;
   if (op == "update-mon-db")
     flags |= SKIP_JOURNAL_REPLAY;
+
   head = (vm.count("head") > 0);
 
   vector<const char *> ceph_options;
@@ -2690,7 +2683,7 @@ int main(int argc, char **argv)
   outistty = isatty(STDOUT_FILENO);
 
   file_fd = fd_none;
-  if ((op == "export" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
+  if ((op == "export" || op == "export-remove" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
     if (!vm.count("file") || file == "-") {
       if (outistty) {
         cerr << "stdout is a tty and no --file filename specified" << std::endl;
@@ -2715,7 +2708,7 @@ int main(int argc, char **argv)
   ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run);
 
   if (vm.count("file") && file_fd == fd_none && !dry_run) {
-    cerr << "--file option only applies to import, export, "
+    cerr << "--file option only applies to import, export, export-remove, "
         << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl;
     return 1;
   }
@@ -2777,6 +2770,20 @@ int main(int argc, char **argv)
     return 1;
   }
 
+  // Verify that the journal-path really exists
+  if (type == "filestore") {
+    if (::stat(jpath.c_str(), &st) == -1) {
+      string err = string("journal-path: ") + jpath;
+      perror(err.c_str());
+      return 1;
+    }
+    if (S_ISDIR(st.st_mode)) {
+      cerr << "journal-path: " << jpath << ": "
+          << cpp_strerror(EISDIR) << std::endl;
+      return 1;
+    }
+  }
+
   ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
   if (fs == NULL) {
     cerr << "Unable to create store of type " << type << std::endl;
@@ -2796,6 +2803,19 @@ int main(int argc, char **argv)
     cout << "fsck found no errors" << std::endl;
     return 0;
   }
+  if (op == "repair" || op == "repair-deep") {
+    int r = fs->repair(op == "repair-deep");
+    if (r < 0) {
+      cerr << "repair failed: " << cpp_strerror(r) << std::endl;
+      return 1;
+    }
+    if (r > 0) {
+      cerr << "repair found " << r << " errors" << std::endl;
+      return 1;
+    }
+    cout << "repair found no errors" << std::endl;
+    return 0;
+  }
   if (op == "mkfs") {
     if (fsid.length()) {
       uuid_d f;
@@ -2808,7 +2828,7 @@ int main(int argc, char **argv)
     }
     int r = fs->mkfs();
     if (r < 0) {
-      cerr << "fsck failed: " << cpp_strerror(r) << std::endl;
+      cerr << "mkfs failed: " << cpp_strerror(r) << std::endl;
       return 1;
     }
     return 0;
@@ -3008,7 +3028,7 @@ int main(int argc, char **argv)
   // The ops which require --pgid option are checked here and
   // mentioned in the usage for --pgid.
   if ((op == "info" || op == "log" || op == "remove" || op == "export"
-      || op == "rm-past-intervals" || op == "mark-complete") &&
+      || op == "export-remove" || op == "rm-past-intervals" || op == "mark-complete") &&
       pgidstr.length() == 0) {
     cerr << "Must provide pgid" << std::endl;
     usage(desc);
@@ -3114,6 +3134,11 @@ int main(int argc, char **argv)
   biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
 
   if (op == "remove") {
+    if (!force && !dry_run) {
+      cerr << "Please use export-remove or you must use --force option" << std::endl;
+      ret = -EINVAL;
+      goto out;
+    }
     ret = initiate_new_remove_pg(fs, pgid, *osr);
     if (ret < 0) {
       cerr << "PG '" << pgid << "' not found" << std::endl;
@@ -3206,8 +3231,8 @@ int main(int argc, char **argv)
 
   // If not an object command nor any of the ops handled below, then output this usage
   // before complaining about a bad pgid
-  if (!vm.count("objcmd") && op != "export" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") {
-    cerr << "Must provide --op (info, log, remove, mkfs, fsck, export, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
+  if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") {
+    cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
       "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)"
         << std::endl;
     usage(desc);
@@ -3481,10 +3506,17 @@ int main(int argc, char **argv)
     if (debug)
       cerr << "struct_v " << (int)struct_ver << std::endl;
 
-    if (op == "export") {
+    if (op == "export" || op == "export-remove") {
       ret = tool.do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock, past_intervals);
-      if (ret == 0)
+      if (ret == 0) {
         cerr << "Export successful" << std::endl;
+        if (op == "export-remove") {
+          ret = initiate_new_remove_pg(fs, pgid, *osr);
+          // Export succeeded, so pgid is there
+          assert(ret == 0);
+          cerr << "Remove successful" << std::endl;
+        }
+      }
     } else if (op == "info") {
       formatter->open_object_section("info");
       info.dump(formatter);
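
Taken together, these hunks introduce an export-remove op that removes the PG only after a successful export, add a repair/repair-deep op patterned after fsck, verify the journal path up front for filestore, and make a bare remove refuse to run without --force. A sketch of the resulting usage, assuming the tool's usual --data-path flag (OSD path and pgid are placeholders):

    # export a PG and remove it only if the export succeeded
    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 \
        --pgid 1.0 --op export-remove --file /tmp/pg.1.0.export

    # a destructive remove now needs an explicit override
    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 \
        --pgid 1.0 --op remove --force
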
index 21cf60c5aedd7b3a521cc9ca9b2bd8826a9433ec..52fa7ae710c64159e52b77242b8c73c53820615d 100644 (file)
@@ -27,7 +27,7 @@ using namespace std;
 
 int main(int argc, char **argv) {
   po::options_description desc("Allowed options");
-  string store_path, cmd, out_path, oid;
+  string store_path, cmd, oid, backend;
   bool debug = false;
   desc.add_options()
     ("help", "produce help message")
@@ -38,6 +38,8 @@ int main(int argc, char **argv) {
     ("oid", po::value<string>(&oid), "Restrict to this object id when dumping objects")
     ("command", po::value<string>(&cmd),
      "command arg is one of [dump-raw-keys, dump-raw-key-vals, dump-objects, dump-objects-with-keys, check, dump-headers, repair], mandatory")
+    ("backend", po::value<string>(&backend),
+     "DB backend (default rocksdb)")
     ;
   po::positional_options_description p;
   p.add("command", 1);
@@ -96,7 +98,15 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  KeyValueDB* store(KeyValueDB::create(g_ceph_context, "leveldb", store_path));
+  if (vm.count("backend") == 0) {
+    backend = "rocksdb";
+  }
+
+  KeyValueDB* store(KeyValueDB::create(g_ceph_context, backend, store_path));
+  if (store == NULL) {
+    std::cerr << "Invalid backend '" << backend << "' specified" << std::endl;
+    return 1;
+  }
   /*if (vm.count("paranoid")) {
     std::cerr << "Enabling paranoid checks" << std::endl;
     store->options.paranoid_checks = true;
@@ -113,6 +123,11 @@ int main(int argc, char **argv) {
   // the DBObjectMap which we might want to examine for diagnostic
   // reasons.  Instead use --command repair.
 
+  omap.get_state();
+  std::cout << "Version: " << (int)omap.state.v << std::endl;
+  std::cout << "Seq: " << omap.state.seq << std::endl;
+  std::cout << "legacy: " << (omap.state.legacy ? "true" : "false") << std::endl;
+
   if (cmd == "dump-raw-keys") {
     KeyValueDB::WholeSpaceIterator i = store->get_iterator();
     for (i->seek_to_first(); i->valid(); i->next()) {
@@ -164,7 +179,7 @@ int main(int argc, char **argv) {
   } else if (cmd == "check" || cmd == "repair") {
     ostringstream ss;
     bool repair = (cmd == "repair");
-    r = omap.check(ss, repair);
+    r = omap.check(ss, repair, true);
     if (r) {
       std::cerr << ss.str() << std::endl;
       if (r > 0) {
@@ -184,6 +199,10 @@ int main(int argc, char **argv) {
     for (auto i : headers)
       std::cout << i << std::endl;
     return 0;
+  } else if (cmd == "resetv2") {
+    omap.state.v = 2;
+    omap.state.legacy = false;
+    omap.set_state();
   } else {
     std::cerr << "Did not recognize command " << cmd << std::endl;
     return 1;
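
Because the omap store may now be rocksdb or leveldb, the tool grows a --backend option, defaults to rocksdb, and fails cleanly on an unknown backend instead of crashing in KeyValueDB::create(); it also prints the DBObjectMap version/seq/legacy state on startup. A hypothetical check of an older leveldb-backed omap (the --omap-path flag name and the path are assumptions, not shown in this hunk):

    ceph-osdomap-tool --omap-path /var/lib/ceph/osd/ceph-0/current/omap \
        --backend leveldb --command check
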
index 2a1bc83b8f86fda509dc51cd9f597bd20174dfba..7ecae9529ce9a3cf08ac265b948b58c4c1255538 100644 (file)
@@ -829,14 +829,15 @@ int main(int argc, const char **argv)
 
     {
       set<int> roots;
-      crush.find_roots(roots);
-      if (roots.size() > 1)
-       dout(1) << "The crush rulesets will use the root " << root << "\n"
-               << "and ignore the others.\n"
-               << "There are " << roots.size() << " roots, they can be\n"
-               << "grouped into a single root by appending something like:\n"
-               << "  root straw 0\n"
-               << dendl;
+      crush.find_roots(&roots);
+      if (roots.size() > 1) {
+       cerr << "The crush rulesets will use the root " << root << "\n"
+            << "and ignore the others.\n"
+            << "There are " << roots.size() << " roots, they can be\n"
+            << "grouped into a single root by appending something like:\n"
+            << "  root straw 0\n"
+            << std::endl;
+      }
     }
     
     if (OSDMap::build_simple_crush_rules(g_ceph_context, crush, root, &cerr))
index 1fac4c10e68404fdfa0656641f8d014789523e0f..0972faf185a9e9dbf9433f4246e7e17283289ee5 100644 (file)
@@ -310,7 +310,7 @@ int main(int argc, const char **argv)
     monmap.created = ceph_clock_now();
     monmap.last_changed = monmap.created;
     srand(getpid() + time(0));
-    if (g_conf->fsid.is_zero()) {
+    if (g_conf->get_val<uuid_d>("fsid").is_zero()) {
       monmap.generate_fsid();
       cout << me << ": generated fsid " << monmap.fsid << std::endl;
     }
@@ -338,8 +338,8 @@ int main(int argc, const char **argv)
     modified = true;
   }
 
-  if (!g_conf->fsid.is_zero()) {
-    monmap.fsid = g_conf->fsid;
+  if (!g_conf->get_val<uuid_d>("fsid").is_zero()) {
+    monmap.fsid = g_conf->get_val<uuid_d>("fsid");
     cout << me << ": set fsid to " << monmap.fsid << std::endl;
     modified = true;
   }
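
The monmaptool hunk only swaps direct g_conf->fsid access for the typed get_val<uuid_d>() getter; behaviour is unchanged: a zero fsid is generated on create, a non-zero one is applied to the map. E.g. (output path invented):

    monmaptool --create --fsid $(uuidgen) /tmp/monmap
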
index e5b75917e5c753b03562fe245eafa78c047440c2..7178bd5ec70f496a7be9b85a67502cd289490e6e 100644 (file)
@@ -35,6 +35,7 @@ void usage()
   cout << "   --test-map-pgs-dump-all [--pool <poolid>] map all pgs to osds" << std::endl;
   cout << "   --health                dump health checks" << std::endl;
   cout << "   --mark-up-in            mark osds up and in (but do not persist)" << std::endl;
+  cout << "   --mark-out <osdid>      mark an osd as out (but do not persist)" << std::endl;
   cout << "   --with-default-pool     include default pool when creating map" << std::endl;
   cout << "   --clear-temp            clear pg_temp and primary_temp" << std::endl;
   cout << "   --test-random           do random placements" << std::endl;
@@ -116,6 +117,7 @@ int main(int argc, const char **argv)
   int range_last = -1;
   int pool = -1;
   bool mark_up_in = false;
+  int marked_out = -1;
   bool clear_temp = false;
   bool test_map_pgs = false;
   bool test_map_pgs_dump = false;
@@ -175,6 +177,8 @@ int main(int argc, const char **argv)
       create_from_conf = true;
     } else if (ceph_argparse_flag(args, i, "--mark-up-in", (char*)NULL)) {
       mark_up_in = true;
+    } else if (ceph_argparse_witharg(args, i, &val, "--mark-out", (char*)NULL)) {
+      marked_out = std::stoi(val);
     } else if (ceph_argparse_flag(args, i, "--clear-temp", (char*)NULL)) {
       clear_temp = true;
     } else if (ceph_argparse_flag(args, i, "--test-map-pgs", (char*)NULL)) {
@@ -317,6 +321,15 @@ int main(int argc, const char **argv)
       osdmap.crush->adjust_item_weightf(g_ceph_context, i, 1.0);
     }
   }
+
+  if (marked_out >= 0 && marked_out < osdmap.get_max_osd()) {
+    cout << "marking OSD@" << marked_out << " as out" << std::endl;
+    int id = marked_out;
+    osdmap.set_state(id, osdmap.get_state(id) | CEPH_OSD_UP);
+    osdmap.set_weight(id, CEPH_OSD_OUT);
+    osdmap.crush->adjust_item_weightf(g_ceph_context, id, 1.0);
+  }
+
   if (clear_temp) {
     cout << "clearing pg/primary temp" << std::endl;
     osdmap.clear_temp();
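
The new --mark-out flag complements --mark-up-in: it sets one OSD's weight to out (the OSD stays up) without persisting the map, so placement experiments can show where that OSD's PGs would move. An illustrative before/after run (map file, pool id and OSD id invented):

    osdmaptool osdmap.bin --mark-up-in --test-map-pgs --pool 1
    osdmaptool osdmap.bin --mark-up-in --mark-out 3 --test-map-pgs --pool 1
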
index 70cd91f394c7d30867a93c9ecb487966649e949c..e0151c9780e90cac38f4c3c6a807bcb19325de6d 100644 (file)
@@ -31,6 +31,25 @@ namespace mirror_image {
 namespace at = argument_types;
 namespace po = boost::program_options;
 
+namespace {
+
+int validate_mirroring_enabled(librbd::Image& image) {
+  librbd::mirror_image_info_t mirror_image;
+  int r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image));
+  if (r < 0) {
+    std::cerr << "rbd: failed to retrieve mirror mode: "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  if (mirror_image.state != RBD_MIRROR_IMAGE_ENABLED) {
+    std::cerr << "rbd: mirroring not enabled on the image" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+} // anonymous namespace
 
 void get_arguments(po::options_description *positional,
                            po::options_description *options) {
@@ -115,6 +134,11 @@ int execute_promote(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(image);
+  if (r < 0) {
+    return r;
+  }
+
   r = image.mirror_image_promote(force);
   if (r < 0) {
     std::cerr << "rbd: error promoting image to primary" << std::endl;
@@ -146,6 +170,11 @@ int execute_demote(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(image);
+  if (r < 0) {
+    return r;
+  }
+
   r = image.mirror_image_demote();
   if (r < 0) {
     std::cerr << "rbd: error demoting image to non-primary" << std::endl;
@@ -177,6 +206,11 @@ int execute_resync(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(image);
+  if (r < 0) {
+    return r;
+  }
+
   r = image.mirror_image_resync();
   if (r < 0) {
     std::cerr << "rbd: error flagging image resync" << std::endl;
@@ -220,6 +254,11 @@ int execute_status(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(image);
+  if (r < 0) {
+    return r;
+  }
+
   librbd::mirror_image_status_t status;
   r = image.mirror_image_get_status(&status, sizeof(status));
   if (r < 0) {
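
Every per-image action now calls validate_mirroring_enabled() before acting, so promote, demote, resync and status fail fast with a clear message instead of operating on an image that was never mirrored. A hypothetical session, assuming the luminous CLI layout (pool and image names invented):

    rbd mirror image promote mypool/myimage
    # rbd: mirroring not enabled on the image
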
index 4314b1ed6b46ddc3c8b8944a359414d24dda0422..ba179d054ce3f2e50fd32a6c4c89511298e5db78 100644 (file)
@@ -36,6 +36,23 @@ namespace po = boost::program_options;
 
 namespace {
 
+int validate_mirroring_enabled(librados::IoCtx& io_ctx) {
+  librbd::RBD rbd;
+  rbd_mirror_mode_t mirror_mode;
+  int r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+  if (r < 0) {
+    std::cerr << "rbd: failed to retrieve mirror mode: "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  if (mirror_mode == RBD_MIRROR_MODE_DISABLED) {
+    std::cerr << "rbd: mirroring not enabled on the pool" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
 int validate_uuid(const std::string &uuid) {
   boost::regex pattern("^[A-F0-9]{8}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{12}$",
                        boost::regex::icase);
@@ -328,7 +345,7 @@ public:
 
 protected:
   bool skip_action(const librbd::mirror_image_info_t &info) const override {
-    return info.primary;
+    return (info.state != RBD_MIRROR_IMAGE_ENABLED || info.primary);
   }
 
   void execute_action(librbd::Image &image,
@@ -340,6 +357,7 @@ protected:
     if (r >= 0) {
       (*m_counter)++;
     }
+    ImageRequestBase::handle_execute_action(r);
   }
 
   std::string get_action_type() const override {
@@ -360,7 +378,7 @@ public:
 
 protected:
   bool skip_action(const librbd::mirror_image_info_t &info) const override {
-    return !info.primary;
+    return (info.state != RBD_MIRROR_IMAGE_ENABLED || !info.primary);
   }
 
   void execute_action(librbd::Image &image,
@@ -403,6 +421,10 @@ protected:
   }
 
   void finalize_action() override {
+    if (m_mirror_image_status.info.global_id.empty()) {
+      return;
+    }
+
     std::string state = utils::mirror_image_status_state(m_mirror_image_status);
     std::string last_update = (
       m_mirror_image_status.last_update == 0 ?
@@ -529,25 +551,15 @@ int execute_peer_add(const po::variables_map &vm) {
   if (r < 0) {
     return r;
   }
-  
-  librbd::RBD rbd;
-  rbd_mirror_mode_t mirror_mode;
-  r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+
+  r = validate_mirroring_enabled(io_ctx);
   if (r < 0) {
-    std::cerr << "rbd: failed to retrieve mirror mode: " 
-              << cpp_strerror(r) << std::endl;
     return r;
   }
-  
-  if (mirror_mode == RBD_MIRROR_MODE_DISABLED) {
-    std::cerr << "rbd: failed to add mirror peer: "
-              << "mirroring must be enabled on the pool " 
-              << pool_name << std::endl;
-    return -EINVAL;
-  }
 
   // TODO: temporary restriction to prevent adding multiple peers
   // until rbd-mirror daemon can properly handle the scenario
+  librbd::RBD rbd;
   std::vector<librbd::mirror_peer_t> mirror_peers;
   r = rbd.mirror_peer_list(io_ctx, &mirror_peers);
   if (r < 0) {
@@ -593,6 +605,11 @@ int execute_peer_remove(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
   librbd::RBD rbd;
   r = rbd.mirror_peer_remove(io_ctx, uuid);
   if (r < 0) {
@@ -639,6 +656,11 @@ int execute_peer_set(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
   librbd::RBD rbd;
   if (key == "client") {
     r = rbd.mirror_peer_set_client(io_ctx, uuid.c_str(), value.c_str());
@@ -839,6 +861,11 @@ int execute_status(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
   librbd::RBD rbd;
 
   std::map<librbd::mirror_image_status_state_t, int> states;
@@ -932,6 +959,11 @@ int execute_promote(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
   std::atomic<unsigned> counter = { 0 };
   ImageRequestGenerator<PromoteImageRequest> generator(io_ctx, &counter,
                                                        vm["force"].as<bool>());
@@ -957,6 +989,11 @@ int execute_demote(const po::variables_map &vm) {
     return r;
   }
 
+  r = validate_mirroring_enabled(io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
   std::atomic<unsigned> counter { 0 };
   ImageRequestGenerator<DemoteImageRequest> generator(io_ctx, &counter);
   r = generator.execute();
index 2bb31b4b11dabc857ae61efbef2968390294cfcf..bf77e9db7bdb1b5832626b1615c1b5a2e2a9852b 100644 (file)
@@ -74,76 +74,83 @@ struct ReplayHandler : public ::journal::ReplayHandler {
   }
 };
 
+template <typename I>
 class ImageReplayerAdminSocketCommand {
 public:
+  ImageReplayerAdminSocketCommand(const std::string &desc,
+                                  ImageReplayer<I> *replayer)
+    : desc(desc), replayer(replayer) {
+  }
   virtual ~ImageReplayerAdminSocketCommand() {}
   virtual bool call(Formatter *f, stringstream *ss) = 0;
+
+  std::string desc;
+  ImageReplayer<I> *replayer;
+  bool registered = false;
 };
 
 template <typename I>
-class StatusCommand : public ImageReplayerAdminSocketCommand {
+class StatusCommand : public ImageReplayerAdminSocketCommand<I> {
 public:
-  explicit StatusCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+  explicit StatusCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
 
   bool call(Formatter *f, stringstream *ss) override {
-    replayer->print_status(f, ss);
+    this->replayer->print_status(f, ss);
     return true;
   }
-
-private:
-  ImageReplayer<I> *replayer;
 };
 
 template <typename I>
-class StartCommand : public ImageReplayerAdminSocketCommand {
+class StartCommand : public ImageReplayerAdminSocketCommand<I> {
 public:
-  explicit StartCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+  explicit StartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
 
   bool call(Formatter *f, stringstream *ss) override {
-    replayer->start(nullptr, true);
+    this->replayer->start(nullptr, true);
     return true;
   }
-
-private:
-  ImageReplayer<I> *replayer;
 };
 
 template <typename I>
-class StopCommand : public ImageReplayerAdminSocketCommand {
+class StopCommand : public ImageReplayerAdminSocketCommand<I> {
 public:
-  explicit StopCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+  explicit StopCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
 
   bool call(Formatter *f, stringstream *ss) override {
-    replayer->stop(nullptr, true);
+    this->replayer->stop(nullptr, true);
     return true;
   }
-
-private:
-  ImageReplayer<I> *replayer;
 };
 
 template <typename I>
-class RestartCommand : public ImageReplayerAdminSocketCommand {
+class RestartCommand : public ImageReplayerAdminSocketCommand<I> {
 public:
-  explicit RestartCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+  explicit RestartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
 
   bool call(Formatter *f, stringstream *ss) override {
-    replayer->restart();
+    this->replayer->restart();
     return true;
   }
-
-private:
-  ImageReplayer<I> *replayer;
 };
 
 template <typename I>
-class FlushCommand : public ImageReplayerAdminSocketCommand {
+class FlushCommand : public ImageReplayerAdminSocketCommand<I> {
 public:
-  explicit FlushCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+  explicit FlushCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
 
   bool call(Formatter *f, stringstream *ss) override {
     C_SaferCond cond;
-    replayer->flush(&cond);
+    this->replayer->flush(&cond);
     int r = cond.wait();
     if (r < 0) {
       *ss << "flush: " << cpp_strerror(r);
@@ -151,9 +158,6 @@ public:
     }
     return true;
   }
-
-private:
-  ImageReplayer<I> *replayer;
 };
 
 template <typename I>
@@ -161,72 +165,44 @@ class ImageReplayerAdminSocketHook : public AdminSocketHook {
 public:
   ImageReplayerAdminSocketHook(CephContext *cct, const std::string &name,
                               ImageReplayer<I> *replayer)
-    : admin_socket(cct->get_admin_socket()), name(name), replayer(replayer),
-      lock("ImageReplayerAdminSocketHook::lock " +
-             replayer->get_global_image_id()) {
+    : admin_socket(cct->get_admin_socket()),
+      commands{{"rbd mirror flush " + name,
+                new FlushCommand<I>("flush rbd mirror " + name, replayer)},
+               {"rbd mirror restart " + name,
+                new RestartCommand<I>("restart rbd mirror " + name, replayer)},
+               {"rbd mirror start " + name,
+                new StartCommand<I>("start rbd mirror " + name, replayer)},
+               {"rbd mirror status " + name,
+                new StatusCommand<I>("get status for rbd mirror " + name, replayer)},
+               {"rbd mirror stop " + name,
+                new StopCommand<I>("stop rbd mirror " + name, replayer)}} {
   }
 
   int register_commands() {
-    std::string command;
-    int r;
-
-    command = "rbd mirror status " + name;
-    r = admin_socket->register_command(command, command, this,
-                                      "get status for rbd mirror " + name);
-    if (r < 0) {
-      return r;
-    }
-    commands[command] = new StatusCommand<I>(replayer);
-
-    command = "rbd mirror start " + name;
-    r = admin_socket->register_command(command, command, this,
-                                      "start rbd mirror " + name);
-    if (r < 0) {
-      return r;
-    }
-    commands[command] = new StartCommand<I>(replayer);
-
-    command = "rbd mirror stop " + name;
-    r = admin_socket->register_command(command, command, this,
-                                      "stop rbd mirror " + name);
-    if (r < 0) {
-      return r;
-    }
-    commands[command] = new StopCommand<I>(replayer);
-
-    command = "rbd mirror restart " + name;
-    r = admin_socket->register_command(command, command, this,
-                                      "restart rbd mirror " + name);
-    if (r < 0) {
-      return r;
-    }
-    commands[command] = new RestartCommand<I>(replayer);
-
-    command = "rbd mirror flush " + name;
-    r = admin_socket->register_command(command, command, this,
-                                      "flush rbd mirror " + name);
-    if (r < 0) {
-      return r;
+    for (auto &it : commands) {
+      int r = admin_socket->register_command(it.first, it.first, this,
+                                             it.second->desc);
+      if (r < 0) {
+        return r;
+      }
+      it.second->registered = true;
     }
-    commands[command] = new FlushCommand<I>(replayer);
-
     return 0;
   }
 
   ~ImageReplayerAdminSocketHook() override {
-    Mutex::Locker locker(lock);
-    for (Commands::const_iterator i = commands.begin(); i != commands.end();
-        ++i) {
-      (void)admin_socket->unregister_command(i->first);
-      delete i->second;
+    for (auto &it : commands) {
+      if (it.second->registered) {
+        admin_socket->unregister_command(it.first);
+      }
+      delete it.second;
     }
     commands.clear();
   }
 
   bool call(std::string command, cmdmap_t& cmdmap, std::string format,
            bufferlist& out) override {
-    Mutex::Locker locker(lock);
-    Commands::const_iterator i = commands.find(command);
+    auto i = commands.find(command);
     assert(i != commands.end());
     Formatter *f = Formatter::create(format);
     stringstream ss;
@@ -237,12 +213,9 @@ public:
   }
 
 private:
-  typedef std::map<std::string, ImageReplayerAdminSocketCommand*> Commands;
+  typedef std::map<std::string, ImageReplayerAdminSocketCommand<I> *> Commands;
 
   AdminSocket *admin_socket;
-  std::string name;
-  ImageReplayer<I> *replayer;
-  Mutex lock;
   Commands commands;
 };
 
@@ -606,20 +579,7 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
     return;
   }
 
-  {
-    Mutex::Locker locker(m_lock);
-    std::string name = m_local_ioctx.get_pool_name() + "/" +
-                       m_local_image_ctx->name;
-    if (m_name != name) {
-      m_name = name;
-      if (m_asok_hook) {
-       // Re-register asok commands using the new name.
-       delete m_asok_hook;
-       m_asok_hook = nullptr;
-      }
-    }
-    register_admin_socket_hook();
-  }
+  on_name_changed();
 
   update_mirror_image_status(false, boost::none);
   init_remote_journaler();
@@ -785,7 +745,6 @@ void ImageReplayer<I>::stop(Context *on_finish, bool manual, int r,
   image_replayer::BootstrapRequest<I> *bootstrap_request = nullptr;
   bool shut_down_replay = false;
   bool running = true;
-  bool canceled_task = false;
   {
     Mutex::Locker locker(m_lock);
 
@@ -808,14 +767,6 @@ void ImageReplayer<I>::stop(Context *on_finish, bool manual, int r,
         std::swap(m_on_stop_finish, on_finish);
         m_stop_requested = true;
         m_manual_stop = manual;
-
-       Mutex::Locker timer_locker(m_threads->timer_lock);
-        if (m_delayed_preprocess_task != nullptr) {
-          canceled_task = m_threads->timer->cancel_event(
-            m_delayed_preprocess_task);
-          assert(canceled_task);
-          m_delayed_preprocess_task = nullptr;
-        }
       }
     }
   }
@@ -826,11 +777,6 @@ void ImageReplayer<I>::stop(Context *on_finish, bool manual, int r,
     bootstrap_request->put();
   }
 
-  if (canceled_task) {
-    m_event_replay_tracker.finish_op();
-    on_replay_interrupted();
-  }
-
   if (!running) {
     dout(20) << "not running" << dendl;
     if (on_finish) {
@@ -1275,6 +1221,8 @@ void ImageReplayer<I>::handle_process_entry_ready(int r) {
   dout(20) << dendl;
   assert(r == 0);
 
+  on_name_changed();
+
   // attempt to process the next event
   handle_replay_ready();
 }
@@ -1554,6 +1502,22 @@ void ImageReplayer<I>::reschedule_update_status_task(int new_interval) {
 template <typename I>
 void ImageReplayer<I>::shut_down(int r) {
   dout(20) << "r=" << r << dendl;
+
+  bool canceled_delayed_preprocess_task = false;
+  {
+    Mutex::Locker timer_locker(m_threads->timer_lock);
+    if (m_delayed_preprocess_task != nullptr) {
+      canceled_delayed_preprocess_task = m_threads->timer->cancel_event(
+        m_delayed_preprocess_task);
+      assert(canceled_delayed_preprocess_task);
+      m_delayed_preprocess_task = nullptr;
+    }
+  }
+  if (canceled_delayed_preprocess_task) {
+    // wake up sleeping replay
+    m_event_replay_tracker.finish_op();
+  }
+
   {
     Mutex::Locker locker(m_lock);
     assert(m_state == STATE_STOPPING);
@@ -1665,6 +1629,7 @@ template <typename I>
 void ImageReplayer<I>::handle_shut_down(int r) {
   reschedule_update_status_task(-1);
 
+  bool unregister_asok_hook = false;
   {
     Mutex::Locker locker(m_lock);
 
@@ -1696,17 +1661,21 @@ void ImageReplayer<I>::handle_shut_down(int r) {
       m_local_image_id = "";
       m_resync_requested = false;
       if (m_delete_requested) {
-        unregister_admin_socket_hook();
+        unregister_asok_hook = true;
         m_delete_requested = false;
       }
     } else if (m_last_r == -ENOENT &&
                m_local_image_id.empty() && m_remote_image.image_id.empty()) {
       dout(0) << "mirror image no longer exists" << dendl;
-      unregister_admin_socket_hook();
+      unregister_asok_hook = true;
       m_finished = true;
     }
   }
 
+  if (unregister_asok_hook) {
+    unregister_admin_socket_hook();
+  }
+
   dout(20) << "stop complete" << dendl;
   m_local_ioctx.close();
 
@@ -1789,30 +1758,51 @@ void ImageReplayer<I>::resync_image(Context *on_finish) {
 
 template <typename I>
 void ImageReplayer<I>::register_admin_socket_hook() {
-  if (m_asok_hook != nullptr) {
-    return;
-  }
+  ImageReplayerAdminSocketHook<I> *asok_hook;
+  {
+    Mutex::Locker locker(m_lock);
+    if (m_asok_hook != nullptr) {
+      return;
+    }
 
-  dout(20) << "registered asok hook: " << m_name << dendl;
-  auto asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name,
-                                                       this);
-  int r = asok_hook->register_commands();
-  if (r < 0) {
+    dout(20) << "registered asok hook: " << m_name << dendl;
+    asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name,
+                                                    this);
+    int r = asok_hook->register_commands();
+    if (r == 0) {
+      m_asok_hook = asok_hook;
+      return;
+    }
     derr << "error registering admin socket commands" << dendl;
-    delete asok_hook;
-    asok_hook = nullptr;
-    return;
   }
-
-  m_asok_hook = asok_hook;
+  delete asok_hook;
 }
 
 template <typename I>
 void ImageReplayer<I>::unregister_admin_socket_hook() {
   dout(20) << dendl;
 
-  delete m_asok_hook;
-  m_asok_hook = nullptr;
+  AdminSocketHook *asok_hook = nullptr;
+  {
+    Mutex::Locker locker(m_lock);
+    std::swap(asok_hook, m_asok_hook);
+  }
+  delete asok_hook;
+}
+
+template <typename I>
+void ImageReplayer<I>::on_name_changed() {
+  {
+    Mutex::Locker locker(m_lock);
+    std::string name = m_local_ioctx.get_pool_name() + "/" +
+      m_local_image_ctx->name;
+    if (m_name == name) {
+      return;
+    }
+    m_name = name;
+  }
+  unregister_admin_socket_hook();
+  register_admin_socket_hook();
 }
 
 template <typename I>
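
The admin-socket hook now builds its command table in the constructor, records which entries actually registered so only those are unregistered, drops the per-call mutex, and is torn down and re-created by on_name_changed() whenever replay observes the image under a new pool/image name. The commands are still driven through the daemon's admin socket, e.g. (socket path and image name are illustrative):

    ceph --admin-daemon /var/run/ceph/ceph-client.rbd-mirror.0.asok \
        rbd mirror status mypool/myimage
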
index 3f2ab2fca74e8985f4ce5871a92ebdca6b566b0f..a66b02a24abc553c82ceaf346548656983367945 100644 (file)
@@ -425,6 +425,8 @@ private:
 
   void register_admin_socket_hook();
   void unregister_admin_socket_hook();
+
+  void on_name_changed();
 };
 
 } // namespace mirror
index 8d03e878f16c80a3421c90742fdf31b363767a69..817d3434c3983a384b138e8091fa41fdb9f6c82f 100644 (file)
@@ -47,6 +47,9 @@ const std::string SERVICE_DAEMON_LEADER_KEY("leader");
 const std::string SERVICE_DAEMON_LOCAL_COUNT_KEY("image_local_count");
 const std::string SERVICE_DAEMON_REMOTE_COUNT_KEY("image_remote_count");
 
+const std::vector<std::string> UNIQUE_PEER_CONFIG_KEYS {
+  {"monmap", "mon_host", "mon_dns_srv_name", "key", "keyfile", "keyring"}};
+
 class PoolReplayerAdminSocketCommand {
 public:
   PoolReplayerAdminSocketCommand(PoolReplayer *pool_replayer)
@@ -260,7 +263,7 @@ void PoolReplayer::init()
   dout(20) << "replaying for " << m_peer << dendl;
   int r = init_rados(g_ceph_context->_conf->cluster,
                      g_ceph_context->_conf->name.to_str(),
-                     "local cluster", &m_local_rados);
+                     "local cluster", &m_local_rados, false);
   if (r < 0) {
     m_callout_id = m_service_daemon->add_or_update_callout(
       m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
@@ -270,7 +273,7 @@ void PoolReplayer::init()
 
   r = init_rados(m_peer.cluster_name, m_peer.client_name,
                  std::string("remote peer ") + stringify(m_peer),
-                 &m_remote_rados);
+                 &m_remote_rados, true);
   if (r < 0) {
     m_callout_id = m_service_daemon->add_or_update_callout(
       m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
@@ -377,7 +380,8 @@ void PoolReplayer::shut_down() {
 int PoolReplayer::init_rados(const std::string &cluster_name,
                             const std::string &client_name,
                             const std::string &description,
-                            RadosRef *rados_ref) {
+                            RadosRef *rados_ref,
+                             bool strip_cluster_overrides) {
   rados_ref->reset(new librados::Rados());
 
   // NOTE: manually bootstrap a CephContext here instead of via
@@ -402,6 +406,18 @@ int PoolReplayer::init_rados(const std::string &cluster_name,
     cct->put();
     return r;
   }
+
+  // preserve cluster-specific config settings before applying environment/cli
+  // overrides
+  std::map<std::string, std::string> config_values;
+  if (strip_cluster_overrides) {
+    // remote peer connections shouldn't apply cluster-specific
+    // configuration settings
+    for (auto& key : UNIQUE_PEER_CONFIG_KEYS) {
+      config_values[key] = cct->_conf->get_val<std::string>(key);
+    }
+  }
+
   cct->_conf->parse_env();
 
   // librados::Rados::conf_parse_env
@@ -427,6 +443,20 @@ int PoolReplayer::init_rados(const std::string &cluster_name,
     }
   }
 
+  if (strip_cluster_overrides) {
+    // remote peer connections shouldn't apply cluster-specific
+    // configuration settings
+    for (auto& pair : config_values) {
+      auto value = cct->_conf->get_val<std::string>(pair.first);
+      if (pair.second != value) {
+        dout(0) << "reverting global config option override: "
+                << pair.first << ": " << value << " -> " << pair.second
+                << dendl;
+        cct->_conf->set_val_or_die(pair.first, pair.second);
+      }
+    }
+  }
+
   if (!g_ceph_context->_conf->admin_socket.empty()) {
     cct->_conf->set_val_or_die("admin_socket",
                                "$run_dir/$name.$pid.$cluster.$cctid.asok");
index ca693ef74808600f948fa132e994712ded8e85b7..49aaee3c4ae764c0a5d7c3c401a4ba100d507aeb 100644 (file)
@@ -89,7 +89,8 @@ private:
 
   int init_rados(const std::string &cluster_name,
                  const std::string &client_name,
-                 const std::string &description, RadosRef *rados_ref);
+                 const std::string &description, RadosRef *rados_ref,
+                 bool strip_cluster_overrides);
 
   void handle_post_acquire_leader(Context *on_finish);
   void handle_pre_release_leader(Context *on_finish);
index 18c6df3840fb948d863d1277db66d7baddd455bb..8d60aa4f47a8c6a440a3b89545381c57a8bbaae3 100644 (file)
@@ -362,10 +362,11 @@ void PoolWatcher<I>::schedule_refresh_images(double interval) {
   }
 
   m_image_ids_invalid = true;
-  m_timer_ctx = new FunctionContext([this](int r) {
-      process_refresh_images();
-    });
-  m_threads->timer->add_event_after(interval, m_timer_ctx);
+  m_timer_ctx = m_threads->timer->add_event_after(
+    interval,
+    new FunctionContext([this](int r) {
+       process_refresh_images();
+      }));
 }
 
 template <typename I>
index 1b2359a7df03199b04f3bb9bd1f4fe1d46dd1231..1c521b274ac813c3021231a50c2e2cc82ab0d463 100644 (file)
@@ -213,9 +213,11 @@ void BootstrapRequest<I>::register_client() {
 
   update_progress("REGISTER_CLIENT");
 
-  // record an place-holder record
-  librbd::journal::ClientData client_data{
-    librbd::journal::MirrorPeerClientMeta{m_local_image_id}};
+  librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
+    m_local_image_id};
+  mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+  librbd::journal::ClientData client_data{mirror_peer_client_meta};
   bufferlist client_data_bl;
   ::encode(client_data, client_data_bl);
 
@@ -239,6 +241,8 @@ void BootstrapRequest<I>::handle_register_client(int r) {
 
   m_client = {};
   *m_client_meta = librbd::journal::MirrorPeerClientMeta(m_local_image_id);
+  m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
   is_primary();
 }
 
@@ -498,10 +502,6 @@ void BootstrapRequest<I>::handle_create_local_image(int r) {
 
 template <typename I>
 void BootstrapRequest<I>::get_remote_tags() {
-  dout(20) << dendl;
-
-  update_progress("GET_REMOTE_TAGS");
-
   if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_SYNCING) {
     // optimization -- no need to compare remote tags if we just created
     // the image locally or sync was interrupted
@@ -510,6 +510,7 @@ void BootstrapRequest<I>::get_remote_tags() {
   }
 
   dout(20) << dendl;
+  update_progress("GET_REMOTE_TAGS");
 
   Context *ctx = create_context_callback<
     BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tags>(this);
index 6278d01015558868bf4dad1ba1b3cc98428cdee1..6768caa005bd49570e1dc2eb87fb1d9177e2e54a 100644 (file)
@@ -161,8 +161,9 @@ void ImageCopyRequest<I>::send_object_copies() {
   {
     Mutex::Locker timer_locker(*m_timer_lock);
     if (m_update_sync_ctx) {
-      m_timer->add_event_after(m_update_sync_point_interval,
-                               m_update_sync_ctx);
+      m_update_sync_ctx = m_timer->add_event_after(
+        m_update_sync_point_interval,
+       m_update_sync_ctx);
     }
   }
 
index 82ec76bf94ee49b1bac36894a54815cbc14f5126..63f2202cb479b9f5b3ac514f9af461873c86ef5e 100755 (executable)
@@ -507,8 +507,10 @@ $DAEMONOPTS
         osd copyfrom max chunk = 524288
         bluestore fsck on mount = true
         bluestore block create = true
+       bluestore block db path = $CEPH_DEV_DIR/osd\$id/block.db.file
         bluestore block db size = 67108864
         bluestore block db create = true
+       bluestore block wal path = $CEPH_DEV_DIR/osd\$id/block.wal.file
         bluestore block wal size = 1048576000
         bluestore block wal create = true
 $COSDDEBUG
@@ -516,7 +518,7 @@ $COSDMEMSTORE
 $COSDSHORT
 $extra_conf
 [mon]
-        mgr initial modules = restful status dashboard
+        mgr initial modules = restful status dashboard balancer
         mon pg warn min per osd = 3
         mon osd allow primary affinity = true
         mon reweight min pgs per osd = 4
@@ -627,9 +629,14 @@ EOF
             echo "add osd$osd $uuid"
             ceph_adm osd create $uuid
             ceph_adm osd crush add osd.$osd 1.0 host=$HOSTNAME root=default
-            $SUDO $CEPH_BIN/ceph-osd -i $osd $ARGS --mkfs --mkkey --osd-uuid $uuid
+           OSD_SECRET=$($CEPH_BIN/ceph-authtool --gen-print-key)
+            $SUDO $CEPH_BIN/ceph-osd -i $osd $ARGS --mkfs --key $OSD_SECRET --osd-uuid $uuid
 
             local key_fn=$CEPH_DEV_DIR/osd$osd/keyring
+           cat > $key_fn<<EOF
+[osd.$osd]
+       key = $OSD_SECRET
+EOF
             echo adding osd$osd key to auth repository
             ceph_adm -i "$key_fn" auth add osd.$osd osd "allow *" mon "allow profile osd" mgr "allow profile osd"
         fi
@@ -776,6 +783,7 @@ else
         debug rocksdb = 10
         debug bdev = 20
         debug rgw = 20
+       debug reserver = 10
         debug objclass = 20'
     CMDSDEBUG='
         debug ms = 1
index d38aec524e8d6811b91378a7aefc38b38dcc1132..17fd7381f888c222e1e9a2b581d375682450419d 100644 (file)
@@ -2,6 +2,7 @@
 Description=Ceph rbd mirror daemon
 After=network-online.target local-fs.target
 Wants=network-online.target local-fs.target
+PartOf=ceph-rbd-mirror.target
 
 [Service]
 LimitNOFILE=1048576
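
Adding PartOf=ceph-rbd-mirror.target ties each rbd-mirror instance to the target unit, so stop and restart operations on the target propagate to every daemon, e.g.:

    systemctl restart ceph-rbd-mirror.target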