From 3efd99882e8c73385040d3f5c48fd014e4247be7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fabian=20Gr=C3=BCnbichler?= Date: Tue, 28 Nov 2017 09:02:46 +0100 Subject: [PATCH] update sources to 12.2.2 --- ceph/CMakeLists.txt | 2 +- ceph/COPYING | 5 - ceph/PendingReleaseNotes | 11 + ceph/admin/doc-requirements.txt | 4 +- ceph/alpine/APKBUILD | 6 +- ceph/ceph.spec | 18 +- ceph/ceph.spec.in | 12 +- ceph/debian/ceph-osd.install | 2 + ceph/debian/ceph-osd.postinst | 1 + ceph/debian/changelog | 6 + ceph/debian/rules | 1 + ceph/doc/ceph-volume/index.rst | 44 +- ceph/doc/ceph-volume/lvm/activate.rst | 20 +- ceph/doc/ceph-volume/lvm/create.rst | 24 + ceph/doc/ceph-volume/lvm/index.rst | 6 +- ceph/doc/ceph-volume/lvm/list.rst | 173 ++++ ceph/doc/ceph-volume/lvm/prepare.rst | 148 ++- ceph/doc/ceph-volume/lvm/systemd.rst | 32 +- ceph/doc/ceph-volume/lvm/zap.rst | 19 + ceph/doc/ceph-volume/simple/activate.rst | 80 ++ ceph/doc/ceph-volume/simple/index.rst | 19 + ceph/doc/ceph-volume/simple/scan.rst | 158 +++ ceph/doc/ceph-volume/simple/systemd.rst | 28 + ceph/doc/ceph-volume/systemd.rst | 49 + ceph/doc/cephfs/mds-config-ref.rst | 14 + ceph/doc/conf.py | 3 +- ceph/doc/man/8/CMakeLists.txt | 3 +- ceph/doc/man/8/ceph-bluestore-tool.rst | 123 +++ ceph/doc/mgr/administrator.rst | 56 ++ ceph/doc/mgr/dashboard.rst | 10 +- ceph/doc/mgr/index.rst | 4 +- ceph/doc/mgr/influx.rst | 162 +++ ceph/doc/mgr/localpool.rst | 35 + ceph/doc/mgr/plugins.rst | 25 + ceph/doc/mgr/prometheus.rst | 185 +++- .../configuration/pool-pg-config-ref.rst | 9 + ceph/doc/rados/operations/health-checks.rst | 16 +- ceph/doc/scripts/gen_state_diagram.py | 28 +- ceph/etc/default/ceph | 8 - ceph/etc/sysconfig/ceph | 8 - ceph/qa/cephfs/clusters/3-mds.yaml | 4 +- ceph/qa/cephfs/clusters/9-mds.yaml | 4 +- .../bluestore-comp-ec-root.yaml | 28 + .../cephfs/objectstore-ec/bluestore-comp.yaml | 23 + .../objectstore-ec/bluestore-ec-root.yaml | 42 + ceph/qa/cephfs/objectstore-ec/bluestore.yaml | 38 + .../cephfs/objectstore-ec/filestore-xfs.yaml | 15 + ceph/qa/distros/all/centos_7.4.yaml | 2 + ceph/qa/distros/supported/centos_latest.yaml | 2 +- ceph/qa/releases/luminous-with-mgr.yaml | 3 +- ceph/qa/releases/luminous.yaml | 1 + ceph/qa/standalone/mon/osd-pool-create.sh | 2 +- .../special}/ceph_objectstore_tool.py | 49 +- .../smoke/basic/1-distros/centos_7.3.yaml | 2 - .../smoke/basic/1-distros/centos_latest.yaml | 1 + .../smoke/basic/1-distros/ubuntu_16.04.yaml | 2 - .../smoke/basic/1-distros/ubuntu_latest.yaml | 1 + .../smoke/basic/2-ceph/ceph_ansible.yaml | 32 + .../smoke/basic/2-config/ceph_ansible.yaml | 22 - .../3-config/bluestore_with_dmcrypt.yaml | 8 + .../smoke/basic/3-config/dmcrypt_off.yaml | 7 + .../smoke/basic/3-config/dmcrypt_on.yaml | 7 + .../ceph-ansible/smoke/basic/3-tasks/cls.yaml | 7 - .../ceph-admin-commands.yaml | 0 .../rbd_import_export.yaml | 0 .../smoke/basic/4-tasks/rest.yaml | 15 + ceph/qa/suites/fs/32bits/objectstore | 1 - ceph/qa/suites/fs/32bits/objectstore-ec | 1 + .../clusters/4-remote-clients.yaml | 4 +- .../objectstore/bluestore-ec-root.yaml | 1 + ceph/qa/suites/fs/basic_workload/objectstore | 1 - .../suites/fs/basic_workload/objectstore-ec | 1 + .../multiclient/clusters/three_clients.yaml | 2 +- .../fs/multiclient/clusters/two_clients.yaml | 2 +- ceph/qa/suites/fs/multiclient/objectstore | 1 - ceph/qa/suites/fs/multiclient/objectstore-ec | 1 + .../fs/multifs/clusters/2-remote-clients.yaml | 4 +- ceph/qa/suites/fs/multifs/objectstore | 1 - ceph/qa/suites/fs/multifs/objectstore-ec | 1 + ceph/qa/suites/fs/permission/objectstore | 1 
- ceph/qa/suites/fs/permission/objectstore-ec | 1 + ceph/qa/suites/fs/snaps/objectstore | 1 - ceph/qa/suites/fs/snaps/objectstore-ec | 1 + ceph/qa/suites/fs/thrash/objectstore | 1 - ceph/qa/suites/fs/thrash/objectstore-ec | 1 + ceph/qa/suites/fs/traceless/objectstore | 1 - ceph/qa/suites/fs/traceless/objectstore-ec | 1 + ceph/qa/suites/fs/verify/objectstore | 1 - ceph/qa/suites/fs/verify/objectstore-ec | 1 + ceph/qa/suites/kcephfs/cephfs/objectstore | 1 - ceph/qa/suites/kcephfs/cephfs/objectstore-ec | 1 + .../suites/kcephfs/mixed-clients/objectstore | 1 - .../kcephfs/mixed-clients/objectstore-ec | 1 + .../recovery/clusters/4-remote-clients.yaml | 4 +- ceph/qa/suites/kcephfs/recovery/objectstore | 1 - .../qa/suites/kcephfs/recovery/objectstore-ec | 1 + ceph/qa/suites/kcephfs/thrash/objectstore | 1 - ceph/qa/suites/kcephfs/thrash/objectstore-ec | 1 + ceph/qa/suites/multimds/basic/objectstore | 1 - ceph/qa/suites/multimds/basic/objectstore-ec | 1 + ceph/qa/suites/multimds/thrash/objectstore | 1 - ceph/qa/suites/multimds/thrash/objectstore-ec | 1 + ceph/qa/suites/multimds/verify/objectstore | 1 - ceph/qa/suites/multimds/verify/objectstore-ec | 1 + ceph/qa/suites/rados/basic/d-require-luminous | 1 - .../basic/d-require-luminous/at-end.yaml | 33 + .../basic/d-require-luminous/at-mkfs.yaml} | 0 .../suites/rados/mgr/clusters/2-node-mgr.yaml | 2 +- ceph/qa/suites/rados/mgr/tasks/dashboard.yaml | 16 + .../rados/mgr/tasks/module_selftest.yaml | 19 + ceph/qa/suites/rados/mgr/tasks/workunits.yaml | 16 + .../suites/rados/monthrash/d-require-luminous | 2 +- ceph/qa/suites/rados/rest/mgr-restful.yaml | 3 + .../basic/tasks => rados/rest}/rest_test.yaml | 13 +- .../all/admin_socket_output.yaml | 1 + .../all/max-pg-per-osd.from-mon.yaml | 26 + .../all/max-pg-per-osd.from-primary.yaml | 31 + .../all/max-pg-per-osd.from-replica.yaml | 31 + .../rados/singleton/all/mon-seesaw.yaml | 4 + .../singleton/all/recovery-preemption.yaml | 51 + .../at-mkfs-balancer-crush-compat.yaml | 11 + .../at-mkfs-balancer-upmap.yaml | 11 + .../qa/suites/rados/verify/d-require-luminous | 2 +- .../suites/rbd/basic/tasks/rbd_cls_tests.yaml | 2 + ceph/qa/suites/rgw/hadoop-s3a/s3a-hadoop.yaml | 17 +- .../suites/rgw/multifs/tasks/rgw_s3tests.yaml | 2 +- .../rgw/thrash/workload/rgw_s3tests.yaml | 2 +- .../suites/rgw/verify/tasks/rgw_s3tests.yaml | 2 +- .../+ => jewel-x/ceph-deploy/%} | 0 .../ceph-deploy/distros/centos_latest.yaml | 1 + .../ceph-deploy/distros/ubuntu_latest.yaml | 1 + .../jewel-x/ceph-deploy/jewel-luminous.yaml | 82 ++ .../jewel-x/parallel/0-cluster/start.yaml | 1 + .../parallel/1-jewel-install/jewel.yaml | 19 + .../parallel/2-workload/blogbench.yaml | 2 +- .../upgrade/jewel-x/parallel/4-luminous.yaml | 24 +- .../upgrade/jewel-x/parallel/5-workload.yaml | 11 + .../parallel/6-luminous-with-mgr.yaml} | 0 .../jewel-x/parallel/6.5-crush-compat.yaml | 8 + .../jewel-x/parallel/7-final-workload/+ | 0 .../parallel/7-final-workload}/blogbench.yaml | 4 +- .../rados-snaps-few-objects.yaml | 2 +- .../7-final-workload}/rados_loadgenmix.yaml | 2 +- .../rados_mon_thrash.yaml | 2 +- .../parallel/7-final-workload}/rbd_cls.yaml | 2 +- .../7-final-workload}/rbd_import_export.yaml | 2 +- .../parallel/7-final-workload}/rgw_swift.yaml | 4 +- .../jewel-x/parallel/8-jewel-workload.yaml | 1 + .../point-to-point-upgrade.yaml | 13 +- .../stress-split/6.5-crush-compat.yaml | 1 + .../kraken-x/ceph-deploy/kraken-luminous.yaml | 61 ++ .../kraken-x/parallel/0-cluster/start.yaml | 1 + .../upgrade/kraken-x/parallel/4-luminous.yaml | 4 + 
.../upgrade/kraken-x/parallel/5-workload.yaml | 11 + .../parallel/6-luminous-with-mgr.yaml | 1 + .../kraken-x/parallel/7-final-workload/+ | 0 .../parallel/7-final-workload}/blogbench.yaml | 0 .../rados-snaps-few-objects.yaml | 0 .../7-final-workload}/rados_loadgenmix.yaml | 0 .../rados_mon_thrash.yaml | 0 .../parallel/7-final-workload}/rbd_cls.yaml | 0 .../7-final-workload}/rbd_import_export.yaml | 0 .../parallel/7-final-workload}/rgw_swift.yaml | 0 .../luminous-x/parallel/0-cluster/start.yaml | 1 + .../rbd_import_export_no_upgrated.yaml | 13 + ...t.yaml => rbd_import_export_upgrated.yaml} | 3 +- .../stress-split-erasure-code/1-ceph-install | 1 + ceph/qa/tasks/ceph.py | 30 +- ceph/qa/tasks/ceph_deploy.py | 158 ++- ceph/qa/tasks/ceph_manager.py | 47 +- ceph/qa/tasks/ceph_objectstore_tool.py | 2 +- ceph/qa/tasks/cephfs/filesystem.py | 22 +- ceph/qa/tasks/cephfs/test_client_limits.py | 38 +- ceph/qa/tasks/cephfs/test_volume_client.py | 119 ++- ceph/qa/tasks/divergent_priors2.py | 8 +- ceph/qa/tasks/mgr/mgr_test_case.py | 93 +- ceph/qa/tasks/mgr/test_dashboard.py | 70 ++ ceph/qa/tasks/mgr/test_module_selftest.py | 74 ++ ceph/qa/tasks/osd_max_pg_per_osd.py | 126 +++ ceph/qa/tasks/reg11184.py | 11 +- ceph/qa/tasks/s3a_hadoop.py | 4 +- ceph/qa/tasks/thrashosds.py | 2 +- ceph/qa/tasks/util/rados.py | 4 +- ceph/qa/workunits/ceph-disk/ceph-disk-test.py | 2 +- ceph/qa/workunits/ceph-disk/ceph-disk.sh | 2 +- ceph/qa/workunits/cephtool/test.sh | 18 +- ceph/qa/workunits/cls/test_cls_journal.sh | 6 + ceph/qa/workunits/mgr/test_localpool.sh | 21 + ceph/qa/workunits/rados/test_rados_tool.sh | 2 +- ceph/qa/workunits/rbd/rbd_mirror.sh | 12 + ceph/qa/workunits/rbd/rbd_mirror_helpers.sh | 10 + ceph/selinux/ceph.te | 2 +- ceph/src/.git_version | 4 +- ceph/src/90-ceph-osd.conf | 1 + ceph/src/CMakeLists.txt | 18 +- ceph/src/arch/arm.c | 5 +- ceph/src/ceph-disk/ceph_disk/main.py | 162 +-- ceph/src/ceph-disk/tox.ini | 2 +- .../ceph-volume/ceph_volume/api/__init__.py | 3 + .../{devices/lvm/api.py => api/lvm.py} | 108 +- .../src/ceph-volume/ceph_volume/decorators.py | 3 + .../ceph_volume/devices/__init__.py | 2 +- .../ceph_volume/devices/lvm/activate.py | 114 ++- .../ceph_volume/devices/lvm/common.py | 24 +- .../ceph_volume/devices/lvm/create.py | 4 + .../ceph_volume/devices/lvm/listing.py | 244 +++++ .../ceph_volume/devices/lvm/main.py | 4 + .../ceph_volume/devices/lvm/prepare.py | 232 +++-- .../ceph_volume/devices/lvm/trigger.py | 2 +- .../ceph_volume/devices/lvm/zap.py | 107 ++ .../ceph_volume/devices/simple/__init__.py | 1 + .../ceph_volume/devices/simple/activate.py | 152 +++ .../ceph_volume/devices/simple/main.py | 41 + .../ceph_volume/devices/simple/scan.py | 206 ++++ .../ceph_volume/devices/simple/trigger.py | 70 ++ ceph/src/ceph-volume/ceph_volume/main.py | 2 +- ceph/src/ceph-volume/ceph_volume/process.py | 47 +- .../ceph_volume/systemd/systemctl.py | 15 + .../lvm/test_api.py => api/test_lvm.py} | 66 +- .../ceph-volume/ceph_volume/tests/conftest.py | 34 +- .../tests/devices/lvm/test_activate.py | 48 +- .../tests/devices/lvm/test_listing.py | 176 ++++ .../tests/devices/lvm/test_prepare.py | 9 +- .../tests/devices/simple/test_activate.py | 23 + .../tests/devices/simple/test_scan.py | 52 + .../tests/devices/simple/test_trigger.py | 45 + .../ceph_volume/tests/devices/test_zap.py | 17 + .../functional/centos7/create/Vagrantfile | 1 - .../lvm/centos7/bluestore/create/Vagrantfile | 1 + .../centos7/bluestore/create/group_vars/all | 28 + .../centos7/bluestore}/create/hosts | 0 
.../lvm/centos7/bluestore/create/setup.yml | 1 + .../bluestore}/create/vagrant_variables.yml | 0 .../lvm/centos7/filestore/create/Vagrantfile | 1 + .../centos7/filestore}/create/group_vars/all | 2 + .../centos7/filestore}/create/hosts | 0 .../lvm/centos7/filestore/create/setup.yml | 1 + .../filestore/create/vagrant_variables.yml | 56 ++ .../lvm/playbooks/setup_partitions.yml | 27 + .../ceph_volume/tests/functional/lvm/tox.ini | 59 ++ .../lvm/xenial/bluestore/create/Vagrantfile | 1 + .../xenial/bluestore/create/group_vars/all | 28 + .../lvm/xenial/bluestore/create/hosts | 5 + .../lvm/xenial/bluestore/create/setup.yml | 1 + .../bluestore/create/vagrant_variables.yml | 56 ++ .../lvm/xenial/filestore/create/Vagrantfile | 1 + .../xenial/filestore}/create/group_vars/all | 2 + .../lvm/xenial/filestore/create/hosts | 5 + .../lvm/xenial/filestore/create/setup.yml | 1 + .../filestore}/create/vagrant_variables.yml | 0 .../centos7/bluestore/activate/Vagrantfile | 1 + .../centos7/bluestore/activate/group_vars/all | 19 + .../bluestore/activate/host_vars/osd0.yml | 7 + .../bluestore/activate/host_vars/osd1.yml | 6 + .../simple/centos7/bluestore/activate/hosts | 9 + .../centos7/bluestore/activate/test.yml | 31 + .../bluestore/activate/vagrant_variables.yml | 73 ++ .../centos7/filestore/activate/Vagrantfile | 1 + .../centos7/filestore/activate/group_vars/all | 19 + .../filestore/activate/host_vars/osd0.yml | 7 + .../filestore/activate/host_vars/osd1.yml | 6 + .../simple/centos7/filestore/activate/hosts | 9 + .../centos7/filestore/activate/test.yml | 31 + .../filestore/activate/vagrant_variables.yml | 73 ++ .../tests/functional/{ => simple}/tox.ini | 24 +- .../xenial/bluestore/activate/Vagrantfile | 1 + .../xenial/bluestore/activate/group_vars/all | 19 + .../bluestore/activate/host_vars/osd0.yml | 7 + .../bluestore/activate/host_vars/osd1.yml | 6 + .../simple/xenial/bluestore/activate/hosts | 9 + .../simple/xenial/bluestore/activate/test.yml | 31 + .../bluestore/activate/vagrant_variables.yml | 73 ++ .../xenial/filestore/activate/Vagrantfile | 1 + .../xenial/filestore/activate/group_vars/all | 19 + .../filestore/activate/host_vars/osd0.yml | 7 + .../filestore/activate/host_vars/osd1.yml | 6 + .../simple/xenial/filestore/activate/hosts | 9 + .../simple/xenial/filestore/activate/test.yml | 31 + .../filestore/activate/vagrant_variables.yml | 73 ++ .../functional/xenial/create/Vagrantfile | 1 - .../tests/util/test_arg_validators.py | 31 +- .../ceph_volume/tests/util/test_system.py | 141 ++- .../ceph_volume/util/arg_validators.py | 44 + ceph/src/ceph-volume/ceph_volume/util/disk.py | 159 +++ .../ceph-volume/ceph_volume/util/prepare.py | 108 +- .../ceph-volume/ceph_volume/util/system.py | 140 ++- ceph/src/ceph.in | 4 +- ceph/src/ceph_mgr.cc | 5 + ceph/src/ceph_mon.cc | 20 +- ceph/src/ceph_osd.cc | 46 +- ceph/src/client/Client.cc | 23 +- ceph/src/client/Client.h | 1 - ceph/src/cls/journal/cls_journal.cc | 2 + ceph/src/cls/rbd/cls_rbd.cc | 91 +- ceph/src/cls/rgw/cls_rgw.cc | 46 +- ceph/src/cls/user/cls_user.cc | 1 - ceph/src/cls/user/cls_user_types.h | 11 +- ceph/src/common/AsyncReserver.h | 162 ++- ceph/src/common/LogClient.cc | 2 +- ceph/src/common/Timer.cc | 8 +- ceph/src/common/Timer.h | 4 +- ceph/src/common/bit_vector.hpp | 189 +++- ceph/src/common/buffer.cc | 79 +- ceph/src/common/ceph_context.cc | 2 +- ceph/src/common/common_init.cc | 4 + ceph/src/common/config.cc | 15 +- ceph/src/common/legacy_config_opts.h | 28 - ceph/src/common/options.cc | 199 +++- ceph/src/common/perf_counters.cc | 11 +- 
ceph/src/common/perf_counters.h | 155 +-- ceph/src/common/pick_address.cc | 102 +- ceph/src/common/pick_address.h | 7 + ceph/src/common/subsys.h | 1 + ceph/src/crush/CrushTreeDumper.h | 6 +- ceph/src/crush/CrushWrapper.cc | 192 ++-- ceph/src/crush/CrushWrapper.h | 78 +- ceph/src/include/buffer.h | 10 +- ceph/src/include/rados/rgw_file.h | 12 +- ceph/src/include/sock_compat.h | 13 + ceph/src/journal/JournalMetadata.cc | 6 +- ceph/src/journal/ObjectPlayer.cc | 13 +- ceph/src/journal/ObjectPlayer.h | 6 - ceph/src/journal/ObjectRecorder.cc | 10 +- ceph/src/journal/ObjectRecorder.h | 10 +- ceph/src/kv/KeyValueDB.h | 10 + ceph/src/kv/LevelDBStore.h | 5 + ceph/src/kv/RocksDBStore.h | 5 + ceph/src/librbd/ObjectMap.cc | 17 +- ceph/src/librbd/ObjectMap.h | 13 +- ceph/src/librbd/api/Mirror.cc | 8 +- ceph/src/librbd/io/ObjectRequest.cc | 10 + ceph/src/librbd/io/ObjectRequest.h | 5 + ceph/src/librbd/object_map/UpdateRequest.cc | 83 +- ceph/src/librbd/object_map/UpdateRequest.h | 26 +- .../librbd/operation/SnapshotCreateRequest.cc | 4 +- .../librbd/operation/SnapshotRemoveRequest.cc | 2 +- ceph/src/librbd/operation/TrimRequest.cc | 227 ++--- ceph/src/librbd/operation/TrimRequest.h | 76 +- ceph/src/mds/Beacon.cc | 22 +- ceph/src/mds/Beacon.h | 3 +- ceph/src/mds/CInode.h | 2 +- ceph/src/mds/FSMap.cc | 16 +- ceph/src/mds/FSMap.h | 1 + ceph/src/mds/MDCache.cc | 4 +- ceph/src/mds/MDSDaemon.cc | 21 +- ceph/src/mds/MDSDaemon.h | 3 +- ceph/src/mds/MDSMap.cc | 23 +- ceph/src/mds/MDSMap.h | 2 +- ceph/src/mds/MDSRank.cc | 51 +- ceph/src/mds/PurgeQueue.cc | 41 +- ceph/src/mds/PurgeQueue.h | 6 +- ceph/src/mds/Server.cc | 29 +- ceph/src/messages/MMgrBeacon.h | 18 +- ceph/src/messages/MMgrConfigure.h | 12 +- ceph/src/messages/MMgrReport.h | 13 +- ceph/src/messages/MOSDMap.h | 11 +- ceph/src/messages/MOSDPGTemp.h | 18 +- ceph/src/mgr/ActivePyModule.cc | 225 +++++ .../mgr/{MgrPyModule.h => ActivePyModule.h} | 50 +- .../mgr/{PyModules.cc => ActivePyModules.cc} | 472 ++++----- .../mgr/{PyModules.h => ActivePyModules.h} | 77 +- ceph/src/mgr/BaseMgrModule.cc | 636 ++++++++++++ ceph/src/mgr/BaseMgrModule.h | 7 + ceph/src/mgr/BaseMgrStandbyModule.cc | 161 +++ ceph/src/mgr/BaseMgrStandbyModule.h | 7 + ceph/src/mgr/DaemonServer.cc | 195 +++- ceph/src/mgr/DaemonServer.h | 21 +- ceph/src/mgr/DaemonState.cc | 33 +- ceph/src/mgr/DaemonState.h | 51 +- ceph/src/mgr/Gil.cc | 79 ++ ceph/src/mgr/Gil.h | 90 +- ceph/src/mgr/Mgr.cc | 51 +- ceph/src/mgr/Mgr.h | 13 +- ceph/src/mgr/MgrClient.cc | 68 +- ceph/src/mgr/MgrClient.h | 1 + ceph/src/mgr/MgrCommands.h | 5 + ceph/src/mgr/MgrPyModule.cc | 371 ------- ceph/src/mgr/MgrSession.h | 2 + ceph/src/mgr/MgrStandby.cc | 73 +- ceph/src/mgr/MgrStandby.h | 2 + ceph/src/mgr/PyModuleRegistry.cc | 450 +++++++++ ceph/src/mgr/PyModuleRegistry.h | 173 ++++ ceph/src/mgr/PyModuleRunner.cc | 97 ++ ceph/src/mgr/PyModuleRunner.h | 76 ++ ceph/src/mgr/PyOSDMap.cc | 589 +++++++++++ ceph/src/mgr/PyOSDMap.h | 20 + ceph/src/mgr/PyState.cc | 490 --------- ceph/src/mgr/PyState.h | 12 - ceph/src/mgr/StandbyPyModules.cc | 200 ++++ ceph/src/mgr/StandbyPyModules.h | 149 +++ ceph/src/mon/AuthMonitor.cc | 12 +- ceph/src/mon/Elector.cc | 10 +- ceph/src/mon/LogMonitor.cc | 17 +- ceph/src/mon/MDSMonitor.cc | 18 +- ceph/src/mon/MDSMonitor.h | 3 +- ceph/src/mon/MgrMap.h | 16 +- ceph/src/mon/MgrMonitor.cc | 103 +- ceph/src/mon/MgrMonitor.h | 6 + ceph/src/mon/MonCommands.h | 12 +- ceph/src/mon/MonMap.cc | 21 +- ceph/src/mon/Monitor.cc | 114 ++- ceph/src/mon/MonitorDBStore.h | 12 + ceph/src/mon/OSDMonitor.cc | 461 +++++++-- 
ceph/src/mon/OSDMonitor.h | 6 +- ceph/src/mon/PGMap.cc | 60 +- ceph/src/mon/Paxos.cc | 72 +- ceph/src/mon/PaxosService.cc | 6 +- ceph/src/msg/Messenger.h | 45 +- ceph/src/msg/async/AsyncConnection.h | 1 - ceph/src/msg/async/PosixStack.cc | 79 +- ceph/src/msg/async/net_handler.cc | 2 +- ceph/src/msg/simple/Pipe.cc | 87 +- ceph/src/msg/simple/Pipe.h | 9 - ceph/src/os/ObjectMap.h | 2 +- ceph/src/os/ObjectStore.h | 3 + .../src/os/bluestore/BitmapFreelistManager.cc | 51 +- ceph/src/os/bluestore/BitmapFreelistManager.h | 5 +- ceph/src/os/bluestore/BlueFS.cc | 22 + ceph/src/os/bluestore/BlueFS.h | 2 + ceph/src/os/bluestore/BlueStore.cc | 604 +++++++----- ceph/src/os/bluestore/BlueStore.h | 34 +- ceph/src/os/bluestore/FreelistManager.h | 5 +- ceph/src/os/bluestore/KernelDevice.cc | 5 + ceph/src/os/bluestore/bluestore_tool.cc | 342 +++++-- ceph/src/os/bluestore/bluestore_types.cc | 20 +- ceph/src/os/bluestore/bluestore_types.h | 4 + ceph/src/os/filestore/DBObjectMap.cc | 139 +-- ceph/src/os/filestore/DBObjectMap.h | 24 +- ceph/src/os/filestore/FileJournal.cc | 3 +- ceph/src/os/filestore/FileStore.cc | 3 +- ceph/src/osd/OSD.cc | 184 +++- ceph/src/osd/OSD.h | 11 +- ceph/src/osd/OSDMap.cc | 145 ++- ceph/src/osd/OSDMap.h | 21 +- ceph/src/osd/PG.cc | 114 ++- ceph/src/osd/PG.h | 99 +- ceph/src/osd/PrimaryLogPG.cc | 36 +- ceph/src/osd/PrimaryLogPG.h | 6 +- ceph/src/osd/ReplicatedBackend.h | 1 - ceph/src/osd/Watch.cc | 12 +- ceph/src/osd/osd_types.cc | 15 +- ceph/src/osd/osd_types.h | 16 +- ceph/src/osdc/ObjectCacher.cc | 23 +- ceph/src/osdc/ObjectCacher.h | 5 +- ceph/src/pybind/ceph_volume_client.py | 72 +- ceph/src/pybind/mgr/balancer/__init__.py | 2 + ceph/src/pybind/mgr/balancer/module.py | 933 ++++++++++++++++++ ceph/src/pybind/mgr/dashboard/base.html | 127 +-- ceph/src/pybind/mgr/dashboard/clients.html | 2 +- ceph/src/pybind/mgr/dashboard/filesystem.html | 9 +- ceph/src/pybind/mgr/dashboard/health.html | 23 +- ceph/src/pybind/mgr/dashboard/module.py | 117 ++- ceph/src/pybind/mgr/dashboard/osd_perf.html | 2 +- ceph/src/pybind/mgr/dashboard/osds.html | 2 +- ceph/src/pybind/mgr/dashboard/rbd_iscsi.html | 2 +- ceph/src/pybind/mgr/dashboard/rbd_ls.py | 4 +- .../pybind/mgr/dashboard/rbd_mirroring.html | 2 +- ceph/src/pybind/mgr/dashboard/rbd_pool.html | 2 +- ceph/src/pybind/mgr/dashboard/servers.html | 2 +- ceph/src/pybind/mgr/dashboard/standby.html | 15 + ceph/src/pybind/mgr/influx/__init__.py | 1 + ceph/src/pybind/mgr/influx/module.py | 162 +++ ceph/src/pybind/mgr/localpool/__init__.py | 2 + ceph/src/pybind/mgr/localpool/module.py | 92 ++ ceph/src/pybind/mgr/mgr_module.py | 338 ++++++- ceph/src/pybind/mgr/prometheus/module.py | 354 +++++-- ceph/src/pybind/mgr/restful/module.py | 22 +- ceph/src/pybind/mgr/selftest/__init__.py | 3 + ceph/src/pybind/mgr/selftest/module.py | 217 ++++ ceph/src/pybind/mgr/status/module.py | 28 +- ceph/src/pybind/mgr/zabbix/module.py | 10 +- ceph/src/rbdmap | 16 +- ceph/src/rgw/rgw_admin.cc | 8 + ceph/src/rgw/rgw_auth_s3.h | 14 +- ceph/src/rgw/rgw_basic_types.h | 2 - ceph/src/rgw/rgw_bucket.cc | 29 +- ceph/src/rgw/rgw_bucket.h | 14 +- ceph/src/rgw/rgw_common.cc | 16 +- ceph/src/rgw/rgw_common.h | 56 +- ceph/src/rgw/rgw_crypt.cc | 187 +++- ceph/src/rgw/rgw_data_sync.cc | 14 +- ceph/src/rgw/rgw_file.cc | 202 +++- ceph/src/rgw/rgw_file.h | 163 ++- ceph/src/rgw/rgw_iam_policy.cc | 12 +- ceph/src/rgw/rgw_iam_policy.h | 3 - ceph/src/rgw/rgw_json_enc.cc | 1 + ceph/src/rgw/rgw_keystone.h | 7 +- ceph/src/rgw/rgw_lc.cc | 12 +- ceph/src/rgw/rgw_op.cc | 290 ++++-- ceph/src/rgw/rgw_op.h | 
144 ++- ceph/src/rgw/rgw_quota.cc | 5 +- ceph/src/rgw/rgw_rados.cc | 44 +- ceph/src/rgw/rgw_rados.h | 10 +- ceph/src/rgw/rgw_reshard.cc | 12 +- ceph/src/rgw/rgw_rest.h | 19 + ceph/src/rgw/rgw_rest_swift.cc | 255 +++-- ceph/src/rgw/rgw_rest_swift.h | 13 +- ceph/src/rgw/rgw_rest_user.cc | 4 + ceph/src/rgw/rgw_swift_auth.cc | 10 +- ceph/src/rgw/rgw_swift_auth.h | 15 +- ceph/src/rgw/rgw_torrent.cc | 15 +- ceph/src/rgw/rgw_torrent.h | 6 +- ceph/src/rgw/rgw_user.cc | 23 +- ceph/src/rgw/rgw_user.h | 5 + ceph/src/test/CMakeLists.txt | 15 + ceph/src/test/cli/crushtool/build.t | 2 +- ceph/src/test/cli/osdmaptool/help.t | 1 + .../test/cli/osdmaptool/missing-argument.t | 1 + ceph/src/test/cli/osdmaptool/upmap-out.t | 23 + ceph/src/test/cls_journal/CMakeLists.txt | 18 + ceph/src/test/cls_journal/test_cls_journal.cc | 20 +- ceph/src/test/common/test_bit_vector.cc | 28 + ceph/src/test/daemon_config.cc | 117 +-- ceph/src/test/librados/misc.cc | 22 + ceph/src/test/librbd/CMakeLists.txt | 1 + ceph/src/test/librbd/mock/MockImageCtx.h | 5 + ceph/src/test/librbd/mock/MockObjectMap.h | 18 +- .../object_map/test_mock_UpdateRequest.cc | 63 +- .../test_mock_SnapshotCreateRequest.cc | 4 +- .../test_mock_SnapshotRemoveRequest.cc | 2 +- .../librbd/operation/test_mock_TrimRequest.cc | 496 ++++++++++ ceph/src/test/librgw_file.cc | 4 +- ceph/src/test/librgw_file_aw.cc | 4 +- ceph/src/test/librgw_file_cd.cc | 4 +- ceph/src/test/librgw_file_gp.cc | 4 +- ceph/src/test/librgw_file_marker.cc | 488 +++++++++ ceph/src/test/librgw_file_nfsns.cc | 4 +- ceph/src/test/mon/PGMap.cc | 2 +- ceph/src/test/perf_counters.cc | 2 +- ceph/src/test/perf_local.cc | 5 +- .../test_mock_BootstrapRequest.cc | 2 + ceph/src/test/rbd_mirror/mock/MockSafeTimer.h | 2 +- .../rbd_mirror/test_mock_ImageReplayer.cc | 8 +- .../rbd_mirror/test_mock_InstanceReplayer.cc | 9 +- .../test/rbd_mirror/test_mock_PoolWatcher.cc | 17 +- ceph/src/test/rgw/rgw_multi/tests.py | 22 + ceph/src/test/test_ipaddr.cc | 51 + ceph/src/tools/CMakeLists.txt | 4 +- ceph/src/tools/ceph_kvstore_tool.cc | 23 +- ceph/src/tools/ceph_monstore_tool.cc | 35 + ceph/src/tools/ceph_objectstore_tool.cc | 82 +- ceph/src/tools/ceph_osdomap_tool.cc | 25 +- ceph/src/tools/crushtool.cc | 17 +- ceph/src/tools/monmaptool.cc | 6 +- ceph/src/tools/osdmaptool.cc | 13 + ceph/src/tools/rbd/action/MirrorImage.cc | 39 + ceph/src/tools/rbd/action/MirrorPool.cc | 67 +- ceph/src/tools/rbd_mirror/ImageReplayer.cc | 252 +++-- ceph/src/tools/rbd_mirror/ImageReplayer.h | 2 + ceph/src/tools/rbd_mirror/PoolReplayer.cc | 36 +- ceph/src/tools/rbd_mirror/PoolReplayer.h | 3 +- ceph/src/tools/rbd_mirror/PoolWatcher.cc | 9 +- .../image_replayer/BootstrapRequest.cc | 15 +- .../rbd_mirror/image_sync/ImageCopyRequest.cc | 5 +- ceph/src/vstart.sh | 12 +- ceph/systemd/ceph-rbd-mirror@.service | 1 + 556 files changed, 18785 insertions(+), 4865 deletions(-) create mode 100644 ceph/doc/ceph-volume/lvm/create.rst create mode 100644 ceph/doc/ceph-volume/lvm/list.rst create mode 100644 ceph/doc/ceph-volume/lvm/zap.rst create mode 100644 ceph/doc/ceph-volume/simple/activate.rst create mode 100644 ceph/doc/ceph-volume/simple/index.rst create mode 100644 ceph/doc/ceph-volume/simple/scan.rst create mode 100644 ceph/doc/ceph-volume/simple/systemd.rst create mode 100644 ceph/doc/ceph-volume/systemd.rst create mode 100644 ceph/doc/man/8/ceph-bluestore-tool.rst create mode 100644 ceph/doc/mgr/influx.rst create mode 100644 ceph/doc/mgr/localpool.rst create mode 100644 ceph/qa/cephfs/objectstore-ec/bluestore-comp-ec-root.yaml create mode 
100644 ceph/qa/cephfs/objectstore-ec/bluestore-comp.yaml create mode 100644 ceph/qa/cephfs/objectstore-ec/bluestore-ec-root.yaml create mode 100644 ceph/qa/cephfs/objectstore-ec/bluestore.yaml create mode 100644 ceph/qa/cephfs/objectstore-ec/filestore-xfs.yaml create mode 100644 ceph/qa/distros/all/centos_7.4.yaml rename ceph/{src/test => qa/standalone/special}/ceph_objectstore_tool.py (97%) delete mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_7.3.yaml create mode 120000 ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_latest.yaml delete mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_16.04.yaml create mode 120000 ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_latest.yaml create mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml delete mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/2-config/ceph_ansible.yaml create mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/3-config/bluestore_with_dmcrypt.yaml create mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_off.yaml create mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_on.yaml delete mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/cls.yaml rename ceph/qa/suites/ceph-ansible/smoke/basic/{3-tasks => 4-tasks}/ceph-admin-commands.yaml (100%) rename ceph/qa/suites/ceph-ansible/smoke/basic/{3-tasks => 4-tasks}/rbd_import_export.yaml (100%) create mode 100644 ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml delete mode 120000 ceph/qa/suites/fs/32bits/objectstore create mode 120000 ceph/qa/suites/fs/32bits/objectstore-ec create mode 120000 ceph/qa/suites/fs/basic_functional/objectstore/bluestore-ec-root.yaml delete mode 120000 ceph/qa/suites/fs/basic_workload/objectstore create mode 120000 ceph/qa/suites/fs/basic_workload/objectstore-ec delete mode 120000 ceph/qa/suites/fs/multiclient/objectstore create mode 120000 ceph/qa/suites/fs/multiclient/objectstore-ec delete mode 120000 ceph/qa/suites/fs/multifs/objectstore create mode 120000 ceph/qa/suites/fs/multifs/objectstore-ec delete mode 120000 ceph/qa/suites/fs/permission/objectstore create mode 120000 ceph/qa/suites/fs/permission/objectstore-ec delete mode 120000 ceph/qa/suites/fs/snaps/objectstore create mode 120000 ceph/qa/suites/fs/snaps/objectstore-ec delete mode 120000 ceph/qa/suites/fs/thrash/objectstore create mode 120000 ceph/qa/suites/fs/thrash/objectstore-ec delete mode 120000 ceph/qa/suites/fs/traceless/objectstore create mode 120000 ceph/qa/suites/fs/traceless/objectstore-ec delete mode 120000 ceph/qa/suites/fs/verify/objectstore create mode 120000 ceph/qa/suites/fs/verify/objectstore-ec delete mode 120000 ceph/qa/suites/kcephfs/cephfs/objectstore create mode 120000 ceph/qa/suites/kcephfs/cephfs/objectstore-ec delete mode 120000 ceph/qa/suites/kcephfs/mixed-clients/objectstore create mode 120000 ceph/qa/suites/kcephfs/mixed-clients/objectstore-ec delete mode 120000 ceph/qa/suites/kcephfs/recovery/objectstore create mode 120000 ceph/qa/suites/kcephfs/recovery/objectstore-ec delete mode 120000 ceph/qa/suites/kcephfs/thrash/objectstore create mode 120000 ceph/qa/suites/kcephfs/thrash/objectstore-ec delete mode 120000 ceph/qa/suites/multimds/basic/objectstore create mode 120000 ceph/qa/suites/multimds/basic/objectstore-ec delete mode 120000 ceph/qa/suites/multimds/thrash/objectstore create mode 120000 ceph/qa/suites/multimds/thrash/objectstore-ec delete mode 120000 ceph/qa/suites/multimds/verify/objectstore create mode 120000 
ceph/qa/suites/multimds/verify/objectstore-ec delete mode 120000 ceph/qa/suites/rados/basic/d-require-luminous create mode 100644 ceph/qa/suites/rados/basic/d-require-luminous/at-end.yaml rename ceph/qa/suites/{upgrade/jewel-x/parallel/5-final-workload/+ => rados/basic/d-require-luminous/at-mkfs.yaml} (100%) create mode 100644 ceph/qa/suites/rados/mgr/tasks/dashboard.yaml create mode 100644 ceph/qa/suites/rados/mgr/tasks/module_selftest.yaml create mode 100644 ceph/qa/suites/rados/mgr/tasks/workunits.yaml rename ceph/qa/suites/{rest/basic/tasks => rados/rest}/rest_test.yaml (62%) create mode 100644 ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml create mode 100644 ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml create mode 100644 ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml create mode 100644 ceph/qa/suites/rados/singleton/all/recovery-preemption.yaml create mode 100644 ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-crush-compat.yaml create mode 100644 ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-upmap.yaml rename ceph/qa/suites/upgrade/{kraken-x/parallel/5-final-workload/+ => jewel-x/ceph-deploy/%} (100%) create mode 120000 ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/centos_latest.yaml create mode 120000 ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/ubuntu_latest.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/ceph-deploy/jewel-luminous.yaml mode change 120000 => 100644 ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/parallel/5-workload.yaml rename ceph/qa/suites/upgrade/{kraken-x/parallel/4-luminous-with-mgr.yaml => jewel-x/parallel/6-luminous-with-mgr.yaml} (100%) create mode 100644 ceph/qa/suites/upgrade/jewel-x/parallel/6.5-crush-compat.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/+ rename ceph/qa/suites/upgrade/{kraken-x/parallel/5-final-workload => jewel-x/parallel/7-final-workload}/blogbench.yaml (74%) rename ceph/qa/suites/upgrade/{kraken-x/parallel/5-final-workload => jewel-x/parallel/7-final-workload}/rados-snaps-few-objects.yaml (88%) rename ceph/qa/suites/upgrade/{kraken-x/parallel/5-final-workload => jewel-x/parallel/7-final-workload}/rados_loadgenmix.yaml (74%) rename ceph/qa/suites/upgrade/jewel-x/parallel/{5-final-workload => 7-final-workload}/rados_mon_thrash.yaml (84%) rename ceph/qa/suites/upgrade/{kraken-x/parallel/5-final-workload => jewel-x/parallel/7-final-workload}/rbd_cls.yaml (69%) rename ceph/qa/suites/upgrade/{kraken-x/parallel/5-final-workload => jewel-x/parallel/7-final-workload}/rbd_import_export.yaml (76%) rename ceph/qa/suites/upgrade/{kraken-x/parallel/5-final-workload => jewel-x/parallel/7-final-workload}/rgw_swift.yaml (64%) create mode 120000 ceph/qa/suites/upgrade/jewel-x/parallel/8-jewel-workload.yaml create mode 120000 ceph/qa/suites/upgrade/jewel-x/stress-split/6.5-crush-compat.yaml create mode 100644 ceph/qa/suites/upgrade/kraken-x/ceph-deploy/kraken-luminous.yaml create mode 100644 ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml create mode 100644 ceph/qa/suites/upgrade/kraken-x/parallel/5-workload.yaml create mode 120000 ceph/qa/suites/upgrade/kraken-x/parallel/6-luminous-with-mgr.yaml create mode 100644 ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/+ rename ceph/qa/suites/upgrade/{jewel-x/parallel/5-final-workload => kraken-x/parallel/7-final-workload}/blogbench.yaml (100%) rename 
ceph/qa/suites/upgrade/{jewel-x/parallel/5-final-workload => kraken-x/parallel/7-final-workload}/rados-snaps-few-objects.yaml (100%) rename ceph/qa/suites/upgrade/{jewel-x/parallel/5-final-workload => kraken-x/parallel/7-final-workload}/rados_loadgenmix.yaml (100%) rename ceph/qa/suites/upgrade/kraken-x/parallel/{5-final-workload => 7-final-workload}/rados_mon_thrash.yaml (100%) rename ceph/qa/suites/upgrade/{jewel-x/parallel/5-final-workload => kraken-x/parallel/7-final-workload}/rbd_cls.yaml (100%) rename ceph/qa/suites/upgrade/{jewel-x/parallel/5-final-workload => kraken-x/parallel/7-final-workload}/rbd_import_export.yaml (100%) rename ceph/qa/suites/upgrade/{jewel-x/parallel/5-final-workload => kraken-x/parallel/7-final-workload}/rgw_swift.yaml (100%) create mode 100644 ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_no_upgrated.yaml rename ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/{rbd_import_export.yaml => rbd_import_export_upgrated.yaml} (65%) create mode 120000 ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/1-ceph-install create mode 100644 ceph/qa/tasks/mgr/test_dashboard.py create mode 100644 ceph/qa/tasks/mgr/test_module_selftest.py create mode 100644 ceph/qa/tasks/osd_max_pg_per_osd.py create mode 100755 ceph/qa/workunits/cls/test_cls_journal.sh create mode 100755 ceph/qa/workunits/mgr/test_localpool.sh create mode 100644 ceph/src/90-ceph-osd.conf mode change 100755 => 100644 ceph/src/ceph-disk/ceph_disk/main.py create mode 100644 ceph/src/ceph-volume/ceph_volume/api/__init__.py rename ceph/src/ceph-volume/ceph_volume/{devices/lvm/api.py => api/lvm.py} (89%) create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/simple/__init__.py create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/simple/main.py create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/simple/trigger.py rename ceph/src/ceph-volume/ceph_volume/tests/{devices/lvm/test_api.py => api/test_lvm.py} (84%) create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_trigger.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/Vagrantfile create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/Vagrantfile create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all rename ceph/src/ceph-volume/ceph_volume/tests/functional/{centos7 => lvm/centos7/bluestore}/create/hosts (100%) create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/setup.yml rename ceph/src/ceph-volume/ceph_volume/tests/functional/{centos7 => lvm/centos7/bluestore}/create/vagrant_variables.yml (100%) create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/Vagrantfile rename 
ceph/src/ceph-volume/ceph_volume/tests/functional/{centos7 => lvm/centos7/filestore}/create/group_vars/all (94%) rename ceph/src/ceph-volume/ceph_volume/tests/functional/{xenial => lvm/centos7/filestore}/create/hosts (100%) create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/setup.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/vagrant_variables.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/setup_partitions.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/Vagrantfile create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/hosts create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/setup.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/vagrant_variables.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/Vagrantfile rename ceph/src/ceph-volume/ceph_volume/tests/functional/{xenial => lvm/xenial/filestore}/create/group_vars/all (94%) create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/hosts create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/setup.yml rename ceph/src/ceph-volume/ceph_volume/tests/functional/{xenial => lvm/xenial/filestore}/create/vagrant_variables.yml (100%) create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/Vagrantfile create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd0.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd1.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/hosts create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/test.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/vagrant_variables.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/Vagrantfile create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd0.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd1.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/hosts create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/vagrant_variables.yml rename ceph/src/ceph-volume/ceph_volume/tests/functional/{ => simple}/tox.ini (68%) create mode 120000 
ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/Vagrantfile create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd0.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd1.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/hosts create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/test.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/vagrant_variables.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/Vagrantfile create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd0.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd1.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/hosts create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/vagrant_variables.yml delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/Vagrantfile create mode 100644 ceph/src/mgr/ActivePyModule.cc rename ceph/src/mgr/{MgrPyModule.h => ActivePyModule.h} (70%) rename ceph/src/mgr/{PyModules.cc => ActivePyModules.cc} (61%) rename ceph/src/mgr/{PyModules.h => ActivePyModules.h} (66%) create mode 100644 ceph/src/mgr/BaseMgrModule.cc create mode 100644 ceph/src/mgr/BaseMgrModule.h create mode 100644 ceph/src/mgr/BaseMgrStandbyModule.cc create mode 100644 ceph/src/mgr/BaseMgrStandbyModule.h create mode 100644 ceph/src/mgr/Gil.cc delete mode 100644 ceph/src/mgr/MgrPyModule.cc create mode 100644 ceph/src/mgr/PyModuleRegistry.cc create mode 100644 ceph/src/mgr/PyModuleRegistry.h create mode 100644 ceph/src/mgr/PyModuleRunner.cc create mode 100644 ceph/src/mgr/PyModuleRunner.h create mode 100644 ceph/src/mgr/PyOSDMap.cc create mode 100644 ceph/src/mgr/PyOSDMap.h delete mode 100644 ceph/src/mgr/PyState.cc delete mode 100644 ceph/src/mgr/PyState.h create mode 100644 ceph/src/mgr/StandbyPyModules.cc create mode 100644 ceph/src/mgr/StandbyPyModules.h create mode 100644 ceph/src/pybind/mgr/balancer/__init__.py create mode 100644 ceph/src/pybind/mgr/balancer/module.py create mode 100644 ceph/src/pybind/mgr/dashboard/standby.html create mode 100644 ceph/src/pybind/mgr/influx/__init__.py create mode 100644 ceph/src/pybind/mgr/influx/module.py create mode 100644 ceph/src/pybind/mgr/localpool/__init__.py create mode 100644 ceph/src/pybind/mgr/localpool/module.py create mode 100644 ceph/src/pybind/mgr/selftest/__init__.py create mode 100644 ceph/src/pybind/mgr/selftest/module.py create mode 100644 ceph/src/test/cli/osdmaptool/upmap-out.t create mode 100644 ceph/src/test/cls_journal/CMakeLists.txt create mode 100644 ceph/src/test/librbd/operation/test_mock_TrimRequest.cc create mode 100644 ceph/src/test/librgw_file_marker.cc diff --git a/ceph/CMakeLists.txt b/ceph/CMakeLists.txt 
index c358e3e97..0d362d849 100644 --- a/ceph/CMakeLists.txt +++ b/ceph/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8.11) project(ceph) -set(VERSION 12.2.1) +set(VERSION 12.2.2) if(POLICY CMP0046) # Tweak policies (this one disables "missing" dependency warning) diff --git a/ceph/COPYING b/ceph/COPYING index b7371e4f9..a87427936 100644 --- a/ceph/COPYING +++ b/ceph/COPYING @@ -145,8 +145,3 @@ Files: src/include/timegm.h Copyright (C) Copyright Howard Hinnant Copyright (C) Copyright 2010-2011 Vicente J. Botet Escriba License: Boost Software License, Version 1.0 - -Files: src/msg/async/AsyncConnection.cc, src/msg/simple/Pipe.cc (sigpipe suppression) - Copyright (C) 2010 Tomash Brechko. All rights reserved. - License: GPL3 - diff --git a/ceph/PendingReleaseNotes b/ceph/PendingReleaseNotes index 9ca48cdab..b46d1dce1 100644 --- a/ceph/PendingReleaseNotes +++ b/ceph/PendingReleaseNotes @@ -27,3 +27,14 @@ limit (5% by default). Limits by inode count are still supported using mds_cache_size. Setting mds_cache_size to 0 (the default) disables the inode limit. + +* The maximum number of PGs per OSD before the monitor issues a + warning has been reduced from 300 to 200 PGs. 200 is still twice + the generally recommended target of 100 PGs per OSD. This limit can + be adjusted via the ``mon_max_pg_per_osd`` option on the + monitors. The older ``mon_pg_warn_max_per_osd`` option has been removed. + +* Creating pools or adjusting pg_num will now fail if the change would + make the number of PGs per OSD exceed the configured + ``mon_max_pg_per_osd`` limit. The option can be adjusted if it + is really necessary to create a pool with more PGs. diff --git a/ceph/admin/doc-requirements.txt b/ceph/admin/doc-requirements.txt index aba92c28b..dc1411303 100644 --- a/ceph/admin/doc-requirements.txt +++ b/ceph/admin/doc-requirements.txt @@ -1,3 +1,3 @@ -Sphinx == 1.1.3 --e git+https://github.com/ceph/sphinx-ditaa.git#egg=sphinx-ditaa +Sphinx == 1.6.3 +-e git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa -e git+https://github.com/michaeljones/breathe#egg=breathe diff --git a/ceph/alpine/APKBUILD b/ceph/alpine/APKBUILD index 5fd0a9a24..0a0d4a472 100644 --- a/ceph/alpine/APKBUILD +++ b/ceph/alpine/APKBUILD @@ -1,7 +1,7 @@ # Contributor: John Coyle # Maintainer: John Coyle pkgname=ceph -pkgver=12.2.1 +pkgver=12.2.2 pkgrel=0 pkgdesc="Ceph is a distributed object store and file system" pkgusers="ceph" @@ -63,7 +63,7 @@ makedepends=" xmlstarlet yasm " -source="ceph-12.2.1.tar.bz2" +source="ceph-12.2.2.tar.bz2" subpackages=" $pkgname-base $pkgname-common @@ -116,7 +116,7 @@ _sysconfdir=/etc _udevrulesdir=/etc/udev/rules.d _python_sitelib=/usr/lib/python2.7/site-packages -builddir=$srcdir/ceph-12.2.1 +builddir=$srcdir/ceph-12.2.2 build() { export CEPH_BUILD_VIRTUALENV=$builddir diff --git a/ceph/ceph.spec b/ceph/ceph.spec index 00d09ee21..a4e5f8c94 100644 --- a/ceph/ceph.spec +++ b/ceph/ceph.spec @@ -61,13 +61,14 @@ # main package definition ################################################################################# Name: ceph -Version: 12.2.1 +Version: 12.2.2 Release: 0%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 %endif -# define %_epoch_prefix macro which will expand to the empty string if %epoch is undefined +# define _epoch_prefix macro which will expand to the empty string if epoch is +# undefined %global _epoch_prefix %{?epoch:%{epoch}:} Summary: User space components of the Ceph file system @@ -76,7 +77,7 @@ License: LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and 
BSD-3-Clause and Group: System/Filesystems %endif URL: http://ceph.com/ -Source0: http://ceph.com/download/ceph-12.2.1.tar.bz2 +Source0: http://ceph.com/download/ceph-12.2.2.tar.bz2 %if 0%{?suse_version} %if 0%{?is_opensuse} ExclusiveArch: x86_64 aarch64 ppc64 ppc64le @@ -109,6 +110,7 @@ BuildRequires: python-werkzeug %if 0%{?suse_version} BuildRequires: python-CherryPy BuildRequires: python-Werkzeug +BuildRequires: python-numpy-devel %endif BuildRequires: python-pecan BuildRequires: socat @@ -773,7 +775,7 @@ python-rbd, python-rgw or python-cephfs instead. # common ################################################################################# %prep -%autosetup -p1 -n ceph-12.2.1 +%autosetup -p1 -n ceph-12.2.2 %build %if 0%{with cephfs_java} @@ -883,6 +885,7 @@ mkdir -p %{buildroot}%{_sbindir} install -m 0644 -D src/logrotate.conf %{buildroot}%{_sysconfdir}/logrotate.d/ceph chmod 0644 %{buildroot}%{_docdir}/ceph/sample.ceph.conf install -m 0644 -D COPYING %{buildroot}%{_docdir}/ceph/COPYING +install -m 0644 -D src/90-ceph-osd.conf %{buildroot}%{_sysctldir}/90-ceph-osd.conf # firewall templates and /sbin/mount.ceph symlink %if 0%{?suse_version} @@ -1412,12 +1415,14 @@ fi %{_udevrulesdir}/95-ceph-osd.rules %{_mandir}/man8/ceph-clsinfo.8* %{_mandir}/man8/ceph-osd.8* +%{_mandir}/man8/ceph-bluestore-tool.8* %if 0%{?rhel} && ! 0%{?centos} %attr(0755,-,-) %{_sysconfdir}/cron.hourly/subman %endif %{_unitdir}/ceph-osd@.service %{_unitdir}/ceph-osd.target %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd +%config(noreplace) %{_sysctldir}/90-ceph-osd.conf %post osd %if 0%{?suse_version} @@ -1431,6 +1436,11 @@ fi if [ $1 -eq 1 ] ; then /usr/bin/systemctl start ceph-osd.target >/dev/null 2>&1 || : fi +%if 0%{?sysctl_apply} + %sysctl_apply 90-ceph-osd.conf +%else + /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || : +%endif %preun osd %if 0%{?suse_version} diff --git a/ceph/ceph.spec.in b/ceph/ceph.spec.in index b45c9feec..cc0830e60 100644 --- a/ceph/ceph.spec.in +++ b/ceph/ceph.spec.in @@ -67,7 +67,8 @@ Release: @RPM_RELEASE@%{?dist} Epoch: 2 %endif -# define %_epoch_prefix macro which will expand to the empty string if %epoch is undefined +# define _epoch_prefix macro which will expand to the empty string if epoch is +# undefined %global _epoch_prefix %{?epoch:%{epoch}:} Summary: User space components of the Ceph file system @@ -109,6 +110,7 @@ BuildRequires: python-werkzeug %if 0%{?suse_version} BuildRequires: python-CherryPy BuildRequires: python-Werkzeug +BuildRequires: python-numpy-devel %endif BuildRequires: python-pecan BuildRequires: socat @@ -883,6 +885,7 @@ mkdir -p %{buildroot}%{_sbindir} install -m 0644 -D src/logrotate.conf %{buildroot}%{_sysconfdir}/logrotate.d/ceph chmod 0644 %{buildroot}%{_docdir}/ceph/sample.ceph.conf install -m 0644 -D COPYING %{buildroot}%{_docdir}/ceph/COPYING +install -m 0644 -D src/90-ceph-osd.conf %{buildroot}%{_sysctldir}/90-ceph-osd.conf # firewall templates and /sbin/mount.ceph symlink %if 0%{?suse_version} @@ -1412,12 +1415,14 @@ fi %{_udevrulesdir}/95-ceph-osd.rules %{_mandir}/man8/ceph-clsinfo.8* %{_mandir}/man8/ceph-osd.8* +%{_mandir}/man8/ceph-bluestore-tool.8* %if 0%{?rhel} && ! 
0%{?centos} %attr(0755,-,-) %{_sysconfdir}/cron.hourly/subman %endif %{_unitdir}/ceph-osd@.service %{_unitdir}/ceph-osd.target %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd +%config(noreplace) %{_sysctldir}/90-ceph-osd.conf %post osd %if 0%{?suse_version} @@ -1431,6 +1436,11 @@ fi if [ $1 -eq 1 ] ; then /usr/bin/systemctl start ceph-osd.target >/dev/null 2>&1 || : fi +%if 0%{?sysctl_apply} + %sysctl_apply 90-ceph-osd.conf +%else + /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || : +%endif %preun osd %if 0%{?suse_version} diff --git a/ceph/debian/ceph-osd.install b/ceph/debian/ceph-osd.install index 262082cfd..87cd5011c 100644 --- a/ceph/debian/ceph-osd.install +++ b/ceph/debian/ceph-osd.install @@ -19,3 +19,5 @@ usr/share/man/man8/ceph-disk.8 usr/share/man/man8/ceph-volume.8 usr/share/man/man8/ceph-volume-systemd.8 usr/share/man/man8/ceph-osd.8 +usr/share/man/man8/ceph-bluestore-tool.8 +etc/sysctl.d/30-ceph-osd.conf diff --git a/ceph/debian/ceph-osd.postinst b/ceph/debian/ceph-osd.postinst index b642dfe34..5e44548fe 100644 --- a/ceph/debian/ceph-osd.postinst +++ b/ceph/debian/ceph-osd.postinst @@ -23,6 +23,7 @@ set -e case "$1" in configure) + [ -x /etc/init.d/procps ] && invoke-rc.d procps restart || : [ -x /sbin/start ] && start ceph-osd-all || : ;; abort-upgrade|abort-remove|abort-deconfigure) diff --git a/ceph/debian/changelog b/ceph/debian/changelog index f05243917..7597cb680 100644 --- a/ceph/debian/changelog +++ b/ceph/debian/changelog @@ -1,3 +1,9 @@ +ceph (12.2.2-1) stable; urgency=medium + + * New upstream release + + -- Ceph Release Team Thu, 30 Nov 2017 14:59:26 +0000 + ceph (12.2.1-1) stable; urgency=medium * New upstream release diff --git a/ceph/debian/rules b/ceph/debian/rules index 92bc0b587..857888f84 100755 --- a/ceph/debian/rules +++ b/ceph/debian/rules @@ -50,6 +50,7 @@ override_dh_auto_install: install -D -m 644 udev/95-ceph-osd.rules $(DESTDIR)/lib/udev/rules.d/95-ceph-osd.rules install -D -m 644 udev/60-ceph-by-parttypeuuid.rules $(DESTDIR)/lib/udev/rules.d/60-ceph-by-parttypeuuid.rules install -D -m 644 src/etc-rbdmap $(DESTDIR)/etc/ceph/rbdmap + install -D -m 644 src/90-ceph-osd.conf $(DESTDIR)/etc/sysctl.d/30-ceph-osd.conf # doc/changelog is a directory, which confuses dh_installchangelogs override_dh_installchangelogs: diff --git a/ceph/doc/ceph-volume/index.rst b/ceph/doc/ceph-volume/index.rst index d34e80294..5cf4778bb 100644 --- a/ceph/doc/ceph-volume/index.rst +++ b/ceph/doc/ceph-volume/index.rst @@ -3,19 +3,46 @@ ceph-volume =========== Deploy OSDs with different device technologies like lvm or physical disks using -pluggable tools (:doc:`lvm/index` itself is treated like a plugin). It tries to -follow the workflow of ``ceph-disk`` for deploying OSDs, with a predictable, -and robust way of preparing, activating, and starting OSDs. +pluggable tools (:doc:`lvm/index` itself is treated like a plugin) and trying to +follow a predictable, and robust way of preparing, activating, and starting OSDs. :ref:`Overview ` | :ref:`Plugin Guide ` | **Command Line Subcommands** -Although currently there is support for ``lvm``, the plan is to support other -technologies, including plain disks. +There is currently support for ``lvm``, and plain disks (with GPT partitions) +that may have been deployed with ``ceph-disk``. * :ref:`ceph-volume-lvm` +* :ref:`ceph-volume-simple` + + +Migrating +--------- +Starting on Ceph version 12.2.2, ``ceph-disk`` is deprecated. Deprecation +warnings will show up that will link to this page. 
It is strongly suggested +that users start consuming ``ceph-volume``. + +New deployments +^^^^^^^^^^^^^^^ +For new deployments, :ref:`ceph-volume-lvm` is recommended, it can use any +logical volume as input for data OSDs, or it can setup a minimal/naive logical +volume from a device. + +Existing OSDs +^^^^^^^^^^^^^ +If the cluster has OSDs that were provisioned with ``ceph-disk``, then +``ceph-volume`` can take over the management of these with +:ref:`ceph-volume-simple`. A scan is done on the data device or OSD directory, +and ``ceph-disk`` is fully disabled. + +Encrypted OSDs +^^^^^^^^^^^^^^ +If using encryption with OSDs, there is currently no support in ``ceph-volume`` +for this scenario (although support for this is coming soon). In this case, it +is OK to continue to use ``ceph-disk`` until ``ceph-volume`` fully supports it. +This page will be updated when that happens. .. toctree:: :hidden: @@ -23,8 +50,15 @@ technologies, including plain disks. :caption: Contents: intro + systemd lvm/index lvm/activate lvm/prepare lvm/scan lvm/systemd + lvm/list + lvm/zap + simple/index + simple/activate + simple/scan + simple/systemd diff --git a/ceph/doc/ceph-volume/lvm/activate.rst b/ceph/doc/ceph-volume/lvm/activate.rst index b9f30d69f..956a62a62 100644 --- a/ceph/doc/ceph-volume/lvm/activate.rst +++ b/ceph/doc/ceph-volume/lvm/activate.rst @@ -17,7 +17,7 @@ New OSDs To activate newly prepared OSDs both the :term:`OSD id` and :term:`OSD uuid` need to be supplied. For example:: - ceph-volume activate --filestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8 + ceph-volume lvm activate --bluestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8 .. note:: The UUID is stored in the ``osd_fsid`` file in the OSD path, which is generated when :ref:`ceph-volume-lvm-prepare` is used. @@ -46,7 +46,7 @@ For example:: Would start the discovery process for the OSD with an id of ``0`` and a UUID of ``8715BEB4-15C5-49DE-BA6F-401086EC7B41``. -.. note:: for more details on the systemd workflow see :ref:`ceph-volume-systemd` +.. note:: for more details on the systemd workflow see :ref:`ceph-volume-lvm-systemd` The systemd unit will look for the matching OSD device, and by looking at its :term:`LVM tags` will proceed to: @@ -58,6 +58,9 @@ The systemd unit will look for the matching OSD device, and by looking at its # start the ``ceph-osd@0`` systemd unit +.. note:: The system infers the objectstore type (filestore or bluestore) by + inspecting the LVM tags applied to the OSD devices + Existing OSDs ------------- For exsiting OSDs that have been deployed with different tooling, the only way @@ -66,7 +69,18 @@ See :ref:`ceph-volume-lvm-existing-osds` for details on how to proceed. Summary ------- -To recap the ``activate`` process: +To recap the ``activate`` process for :term:`bluestore`: + +#. require both :term:`OSD id` and :term:`OSD uuid` +#. enable the system unit with matching id and uuid +#. Create the ``tmpfs`` mount at the OSD directory in + ``/var/lib/ceph/osd/$cluster-$id/`` +#. Recreate all the files needed with ``ceph-bluestore-tool prime-osd-dir`` by + pointing it to the OSD ``block`` device. +#. the systemd unit will ensure all devices are ready and linked +#. the matching ``ceph-osd`` systemd unit will get started + +And for :term:`filestore`: #. require both :term:`OSD id` and :term:`OSD uuid` #. 
enable the system unit with matching id and uuid diff --git a/ceph/doc/ceph-volume/lvm/create.rst b/ceph/doc/ceph-volume/lvm/create.rst new file mode 100644 index 000000000..c90d1f6fa --- /dev/null +++ b/ceph/doc/ceph-volume/lvm/create.rst @@ -0,0 +1,24 @@ +.. _ceph-volume-lvm-create: + +``create`` +=========== +This subcommand wraps the two-step process to provision a new osd (calling +``prepare`` first and then ``activate``) into a single +one. The reason to prefer ``prepare`` and then ``activate`` is to gradually +introduce new OSDs into a cluster, and avoiding large amounts of data being +rebalanced. + +The single-call process unifies exactly what :ref:`ceph-volume-lvm-prepare` and +:ref:`ceph-volume-lvm-activate` do, with the convenience of doing it all at +once. + +There is nothing different to the process except the OSD will become up and in +immediately after completion. + +The backing objectstore can be specified with: + +* :ref:`--filestore ` +* :ref:`--bluestore ` + +All command line flags and options are the same as ``ceph-volume lvm prepare``. +Please refer to :ref:`ceph-volume-lvm-prepare` for details. diff --git a/ceph/doc/ceph-volume/lvm/index.rst b/ceph/doc/ceph-volume/lvm/index.rst index 5c1ef0117..9a2191fb5 100644 --- a/ceph/doc/ceph-volume/lvm/index.rst +++ b/ceph/doc/ceph-volume/lvm/index.rst @@ -11,6 +11,10 @@ Implements the functionality needed to deploy OSDs from the ``lvm`` subcommand: * :ref:`ceph-volume-lvm-activate` +* :ref:`ceph-volume-lvm-create` + +* :ref:`ceph-volume-lvm-list` + .. not yet implemented .. * :ref:`ceph-volume-lvm-scan` @@ -20,5 +24,5 @@ There are other aspects of the ``lvm`` subcommand that are internal and not exposed to the user, these sections explain how these pieces work together, clarifying the workflows of the tool. -:ref:`Systemd Units ` | +:ref:`Systemd Units ` | :ref:`lvm ` diff --git a/ceph/doc/ceph-volume/lvm/list.rst b/ceph/doc/ceph-volume/lvm/list.rst new file mode 100644 index 000000000..19e06000b --- /dev/null +++ b/ceph/doc/ceph-volume/lvm/list.rst @@ -0,0 +1,173 @@ +.. _ceph-volume-lvm-list: + +``list`` +======== +This subcommand will list any devices (logical and physical) that may be +associated with a Ceph cluster, as long as they contain enough metadata to +allow for that discovery. + +Output is grouped by the OSD ID associated with the devices, and unlike +``ceph-disk`` it does not provide any information for devices that aren't +associated with Ceph. + +Command line options: + +* ``--format`` Allows a ``json`` or ``pretty`` value. Defaults to ``pretty`` + which will group the device information in a human-readable format. + +Full Reporting +-------------- +When no positional arguments are used, a full reporting will be presented. This +means that all devices and logical volumes found in the system will be +displayed. 
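As a cross-check (not a ``ceph-volume`` command, only an illustration), the metadata that drives this report is stored as plain :term:`LVM tags`, so it can also be inspected with stock LVM tooling; ``lv_name``, ``vg_name`` and ``lv_tags`` are standard ``lvs`` output fields::

    # lvs -o lv_name,vg_name,lv_tags

The ``ceph.*`` tags shown by that command are the same values rendered in readable form in the sample reports below.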
+ +Full ``pretty`` reporting for two OSDs, one with a lv as a journal, and another +one with a physical device may look similar to:: + + # ceph-volume lvm list + + + ====== osd.1 ======= + + [journal] /dev/journals/journal1 + + journal uuid C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs + osd id 1 + cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd + type journal + osd fsid 661b24f8-e062-482b-8110-826ffe7f13fa + data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ + journal device /dev/journals/journal1 + data device /dev/test_group/data-lv2 + + [data] /dev/test_group/data-lv2 + + journal uuid C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs + osd id 1 + cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd + type data + osd fsid 661b24f8-e062-482b-8110-826ffe7f13fa + data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ + journal device /dev/journals/journal1 + data device /dev/test_group/data-lv2 + + ====== osd.0 ======= + + [data] /dev/test_group/data-lv1 + + journal uuid cd72bd28-002a-48da-bdf6-d5b993e84f3f + osd id 0 + cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd + type data + osd fsid 943949f0-ce37-47ca-a33c-3413d46ee9ec + data uuid TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00 + journal device /dev/sdd1 + data device /dev/test_group/data-lv1 + + [journal] /dev/sdd1 + + PARTUUID cd72bd28-002a-48da-bdf6-d5b993e84f3f + +.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored + as a ``ceph.osd_id`` tag. For more information on lvm tag conventions + see :ref:`ceph-volume-lvm-tag-api` + +Single Reporting +---------------- +Single reporting can consume both devices and logical volumes as input +(positional parameters). For logical volumes, it is required to use the group +name as well as the logical volume name. + +For example the ``data-lv2`` logical volume, in the ``test_group`` volume group +can be listed in the following way:: + + # ceph-volume lvm list test_group/data-lv2 + + + ====== osd.1 ======= + + [data] /dev/test_group/data-lv2 + + journal uuid C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs + osd id 1 + cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd + type data + osd fsid 661b24f8-e062-482b-8110-826ffe7f13fa + data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ + journal device /dev/journals/journal1 + data device /dev/test_group/data-lv2 + + +.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored + as a ``ceph.osd_id`` tag. For more information on lvm tag conventions + see :ref:`ceph-volume-lvm-tag-api` + + +For plain disks, the full path to the device is required. For example, for +a device like ``/dev/sdd1`` it can look like:: + + + # ceph-volume lvm list /dev/sdd1 + + + ====== osd.0 ======= + + [journal] /dev/sdd1 + + PARTUUID cd72bd28-002a-48da-bdf6-d5b993e84f3f + + + +``json`` output +--------------- +All output using ``--format=json`` will show everything the system has stored +as metadata for the devices, including tags. + +No changes for readability are done with ``json`` reporting, and all +information is presented as-is. Full output as well as single devices can be +listed. 
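+
+For example, both of the following produce ``json`` reports, for the whole
+system and for a single logical volume respectively::
+
+    ceph-volume lvm list --format=json
+    ceph-volume lvm list --format=json test_group/data-lv1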
+ +For brevity, this is how a single logical volume would look with ``json`` +output (note how tags aren't modified):: + + # ceph-volume lvm list --format=json test_group/data-lv1 + { + "0": [ + { + "lv_name": "data-lv1", + "lv_path": "/dev/test_group/data-lv1", + "lv_tags": "ceph.cluster_fsid=ce454d91-d748-4751-a318-ff7f7aa18ffd,ceph.data_device=/dev/test_group/data-lv1,ceph.data_uuid=TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00,ceph.journal_device=/dev/sdd1,ceph.journal_uuid=cd72bd28-002a-48da-bdf6-d5b993e84f3f,ceph.osd_fsid=943949f0-ce37-47ca-a33c-3413d46ee9ec,ceph.osd_id=0,ceph.type=data", + "lv_uuid": "TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00", + "name": "data-lv1", + "path": "/dev/test_group/data-lv1", + "tags": { + "ceph.cluster_fsid": "ce454d91-d748-4751-a318-ff7f7aa18ffd", + "ceph.data_device": "/dev/test_group/data-lv1", + "ceph.data_uuid": "TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00", + "ceph.journal_device": "/dev/sdd1", + "ceph.journal_uuid": "cd72bd28-002a-48da-bdf6-d5b993e84f3f", + "ceph.osd_fsid": "943949f0-ce37-47ca-a33c-3413d46ee9ec", + "ceph.osd_id": "0", + "ceph.type": "data" + }, + "type": "data", + "vg_name": "test_group" + } + ] + } + + +Synchronized information +------------------------ +Before any listing type, the lvm API is queried to ensure that physical devices +that may be in use haven't changed naming. It is possible that non-persistent +devices like ``/dev/sda1`` could change to ``/dev/sdb1``. + +The detection is possible because the ``PARTUUID`` is stored as part of the +metadata in the logical volume for the data lv. Even in the case of a journal +that is a physical device, this information is still stored on the data logical +volume associated with it. + +If the name is no longer the same (as reported by ``blkid`` when using the +``PARTUUID``), the tag will get updated and the report will use the newly +refreshed information. diff --git a/ceph/doc/ceph-volume/lvm/prepare.rst b/ceph/doc/ceph-volume/lvm/prepare.rst index add0f185d..27ebb55d7 100644 --- a/ceph/doc/ceph-volume/lvm/prepare.rst +++ b/ceph/doc/ceph-volume/lvm/prepare.rst @@ -2,10 +2,11 @@ ``prepare`` =========== -This subcommand allows a :term:`filestore` setup (:term:`bluestore` support is -planned) and currently consumes only logical volumes for both the data and -journal. It will not create or modify the logical volumes except for adding -extra metadata. +This subcommand allows a :term:`filestore` or :term:`bluestore` setup. It is +recommended to pre-provision a logical volume before using it with +``ceph-volume lvm``. + +Logical volumes are not altered except for adding extra metadata. .. note:: This is part of a two step process to deploy an OSD. If looking for a single-call way, please see :ref:`ceph-volume-lvm-create` @@ -23,28 +24,46 @@ the back end can be specified with: * :ref:`--filestore ` -* ``--bluestore`` - -.. when available, this will need to be updated to: -.. * :ref:`--bluestore ` +* :ref:`--bluestore ` .. _ceph-volume-lvm-prepare_filestore: ``filestore`` ------------- -This is the default OSD backend and allows preparation of logical volumes for -a :term:`filestore` OSD. +This is the OSD backend that allows preparation of logical volumes for +a :term:`filestore` objectstore OSD. -The process is *very* strict, it requires two logical volumes that are ready to -be used. No special preparation is needed for these volumes other than -following the minimum size requirements for data and journal. 
+It can use a logical volume for the OSD data and a partitioned physical device +or logical volume for the journal. No special preparation is needed for these +volumes other than following the minimum size requirements for data and +journal. The API call looks like:: ceph-volume prepare --filestore --data data --journal journal -The journal *must* be a logical volume, just like the data volume, and that -argument is always required even if both live under the same group. +There is flexibility to use a raw device or partition as well for ``--data`` +that will be converted to a logical volume. This is not ideal in all situations +since ``ceph-volume`` is just going to create a unique volume group and +a logical volume from that device. + +When using logical volumes for ``--data``, the value *must* be a volume group +name and a logical volume name separated by a ``/``. Since logical volume names +are not enforced for uniqueness, this prevents using the wrong volume. The +``--journal`` can be either a logical volume *or* a partition. + +When using a partition, it *must* contain a ``PARTUUID`` discoverable by +``blkid``, so that it can later be identified correctly regardless of the +device name (or path). + +When using a partition, this is how it would look for ``/dev/sdc1``:: + + ceph-volume prepare --filestore --data volume_group/lv_name --journal /dev/sdc1 + +For a logical volume, just like for ``--data``, a volume group and logical +volume name are required:: + + ceph-volume prepare --filestore --data volume_group/lv_name --journal volume_group/journal_lv A generated uuid is used to ask the cluster for a new OSD. These two pieces are crucial for identifying an OSD and will later be used throughout the @@ -108,32 +127,109 @@ later be started (for detailed metadata description see :ref:`ceph-volume-lvm-ta ``bluestore`` ------------- -This subcommand is planned but not currently implemented. +The :term:`bluestore` objectstore is the default for new OSDs. It offers a bit +more flexibility for devices. Bluestore supports the following configurations: + +* A block device, a block.wal, and a block.db device +* A block device and a block.wal device +* A block device and a block.db device +* A single block device + +It can accept a whole device (or partition), or a logical volume for ``block``. +If a physical device is provided it will then be turned into a logical volume. +This allows a simpler approach at using LVM but at the cost of flexibility: +there are no options or configurations to change how the LV is created. + +The ``block`` is specified with the ``--data`` flag, and in its simplest use +case it looks like:: + + ceph-volume lvm prepare --bluestore --data vg/lv + +A raw device can be specified in the same way:: + + ceph-volume lvm prepare --bluestore --data /path/to/device + + +If a ``block.db`` or a ``block.wal`` is needed (they are optional for +bluestore) they can be specified with ``--block.db`` and ``--block.wal`` +accordingly. These can be a physical device (they **must** be a partition) or +a logical volume. + +For both ``block.db`` and ``block.wal`` partitions aren't made logical volumes +because they can be used as-is. Logical Volumes are also allowed. + +While creating the OSD directory, the process will use a ``tmpfs`` mount to +place all the files needed for the OSD. These files are initially created by +``ceph-osd --mkfs`` and are fully ephemeral. + +A symlink is always created for the ``block`` device, and optionally for +``block.db`` and ``block.wal``. 
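+
+As a sketch (the device names here simply mirror the example listing below and
+are not prescriptive), a ``prepare`` call that uses all three devices might
+look like::
+
+    ceph-volume lvm prepare --bluestore --data vg/lv --block.db /dev/sda1 --block.wal ceph/osd-wal-0
+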
For a cluster with a default name, and an OSD +id of 0, the directory could look like:: + + # ls -l /var/lib/ceph/osd/ceph-0 + lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block -> /dev/ceph-be2b6fbd-bcf2-4c51-b35d-a35a162a02f0/osd-block-25cf0a05-2bc6-44ef-9137-79d65bd7ad62 + lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block.db -> /dev/sda1 + lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block.wal -> /dev/ceph/osd-wal-0 + -rw-------. 1 ceph ceph 37 Oct 20 13:05 ceph_fsid + -rw-------. 1 ceph ceph 37 Oct 20 13:05 fsid + -rw-------. 1 ceph ceph 55 Oct 20 13:05 keyring + -rw-------. 1 ceph ceph 6 Oct 20 13:05 ready + -rw-------. 1 ceph ceph 10 Oct 20 13:05 type + -rw-------. 1 ceph ceph 2 Oct 20 13:05 whoami + +In the above case, a device was used for ``block`` so ``ceph-volume`` create +a volume group and a logical volume using the following convention: + +* volume group name: ``ceph-{cluster fsid}`` or if the vg exists already + ``ceph-{random uuid}`` + +* logical volume name: ``osd-block-{osd_fsid}`` Storing metadata ---------------- -The following tags will get applied as part of the prepartion process -regardless of the type of volume (journal or data) and also regardless of the -OSD backend: +The following tags will get applied as part of the preparation process +regardless of the type of volume (journal or data) or OSD objectstore: * ``cluster_fsid`` -* ``data_device`` -* ``journal_device`` * ``encrypted`` * ``osd_fsid`` * ``osd_id`` -* ``block`` -* ``db`` -* ``wal`` -* ``lockbox_device`` + +For :term:`filestore` these tags will be added: + +* ``journal_device`` +* ``journal_uuid`` + +For :term:`bluestore` these tags will be added: + +* ``block_device`` +* ``block_uuid`` +* ``db_device`` +* ``db_uuid`` +* ``wal_device`` +* ``wal_uuid`` .. note:: For the complete lvm tag conventions see :ref:`ceph-volume-lvm-tag-api` Summary ------- -To recap the ``prepare`` process: +To recap the ``prepare`` process for :term:`bluestore`: + +#. Accept a logical volume for block or a raw device (that will get converted + to an lv) +#. Accept partitions or logical volumes for ``block.wal`` or ``block.db`` +#. Generate a UUID for the OSD +#. Ask the monitor get an OSD ID reusing the generated UUID +#. OSD data directory is created on a tmpfs mount. +#. ``block``, ``block.wal``, and ``block.db`` are symlinked if defined. +#. monmap is fetched for activation +#. Data directory is populated by ``ceph-osd`` +#. Logical Volumes are are assigned all the Ceph metadata using lvm tags + + +And the ``prepare`` process for :term:`filestore`: #. Accept only logical volumes for data and journal (both required) #. Generate a UUID for the OSD diff --git a/ceph/doc/ceph-volume/lvm/systemd.rst b/ceph/doc/ceph-volume/lvm/systemd.rst index 7162e0433..30260de7e 100644 --- a/ceph/doc/ceph-volume/lvm/systemd.rst +++ b/ceph/doc/ceph-volume/lvm/systemd.rst @@ -1,31 +1,7 @@ -.. _ceph-volume-systemd: +.. _ceph-volume-lvm-systemd: systemd ======= -As part of the :ref:`ceph-volume-lvm-activate` process, a few systemd units will get enabled -that will use the OSD id and uuid as part of their name. These units will be -run when the system boots, and will proceed to activate their corresponding -volumes. - -The API for activation requires both the :term:`OSD id` and :term:`OSD uuid`, -which get persisted by systemd. 
Internally, the activation process enables the -systemd unit using the following convention:: - - ceph-volume@- - -Where ``type`` is the sub-command used to parse the extra metadata, and ``extra -metadata`` is any additional information needed by the sub-command to be able -to activate the OSD. For example an OSD with an ID of 0, for the ``lvm`` -sub-command would look like:: - - systemctl enable ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6 - - -Process -------- -The systemd unit is a :term:`systemd oneshot` service, meant to start at boot after the -local filesystem is ready to be used. - Upon startup, it will identify the logical volume using :term:`LVM tags`, finding a matching ID and later ensuring it is the right one with the :term:`OSD uuid`. @@ -41,6 +17,12 @@ be mounted at:: /var/lib/ceph/osd/ceph-0 + Once that process is complete, a call will be made to start the OSD:: systemctl start ceph-osd@0 + +The systemd portion of this process is handled by the ``ceph-volume lvm +trigger`` sub-command, which is only in charge of parsing metadata coming from +systemd and startup, and then dispatching to ``ceph-volume lvm activate`` which +would proceed with activation. diff --git a/ceph/doc/ceph-volume/lvm/zap.rst b/ceph/doc/ceph-volume/lvm/zap.rst new file mode 100644 index 000000000..8d42a9089 --- /dev/null +++ b/ceph/doc/ceph-volume/lvm/zap.rst @@ -0,0 +1,19 @@ +.. _ceph-volume-lvm-zap: + +``zap`` +======= + +This subcommand is used to zap lvs or partitions that have been used +by ceph OSDs so that they may be reused. If given a path to a logical +volume it must be in the format of vg/lv. Any filesystems present +on the given lv or partition will be removed and all data will be purged. + +.. note:: The lv or partition will be kept intact. + +Zapping a logical volume:: + + ceph-volume lvm zap {vg name/lv name} + +Zapping a partition:: + + ceph-volume lvm zap /dev/sdc1 diff --git a/ceph/doc/ceph-volume/simple/activate.rst b/ceph/doc/ceph-volume/simple/activate.rst new file mode 100644 index 000000000..edbb1e3f8 --- /dev/null +++ b/ceph/doc/ceph-volume/simple/activate.rst @@ -0,0 +1,80 @@ +.. _ceph-volume-simple-activate: + +``activate`` +============ +Once :ref:`ceph-volume-simple-scan` has been completed, and all the metadata +captured for an OSD has been persisted to ``/etc/ceph/osd/{id}-{uuid}.json`` +the OSD is now ready to get "activated". + +This activation process **disables** all ``ceph-disk`` systemd units by masking +them, to prevent the UDEV/ceph-disk interaction that will attempt to start them +up at boot time. + +The disabling of ``ceph-disk`` units is done only when calling ``ceph-volume +simple activate`` directly, but is is avoided when being called by systemd when +the system is booting up. + +The activation process requires using both the :term:`OSD id` and :term:`OSD uuid` +To activate parsed OSDs:: + + ceph-volume simple activate 0 6cc43680-4f6e-4feb-92ff-9c7ba204120e + +The above command will assume that a JSON configuration will be found in:: + + /etc/ceph/osd/0-6cc43680-4f6e-4feb-92ff-9c7ba204120e.json + +Alternatively, using a path to a JSON file directly is also possible:: + + ceph-volume simple activate --file /etc/ceph/osd/0-6cc43680-4f6e-4feb-92ff-9c7ba204120e.json + +requiring uuids +^^^^^^^^^^^^^^^ +The :term:`OSD uuid` is being required as an extra step to ensure that the +right OSD is being activated. It is entirely possible that a previous OSD with +the same id exists and would end up activating the incorrect one. 
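+
+Since the scanned metadata lives in ``/etc/ceph/osd/{id}-{uuid}.json``, one
+simple way to confirm the pair before activating is to list that directory
+(the file name below matches the example above)::
+
+    ls /etc/ceph/osd/
+    0-6cc43680-4f6e-4feb-92ff-9c7ba204120e.json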
+ + +Discovery +--------- +With OSDs previously scanned by ``ceph-volume``, a *discovery* process is +performed using ``blkid`` and ``lvm``. There is currently support only for +devices with GPT partitions and LVM logical volumes. + +The GPT partitions will have a ``PARTUUID`` that can be queried by calling out +to ``blkid``, and the logical volumes will have a ``lv_uuid`` that can be +queried against ``lvs`` (the LVM tool to list logical volumes). + +This discovery process ensures that devices can be correctly detected even if +they are repurposed into another system or if their name changes (as in the +case of non-persisting names like ``/dev/sda1``) + +The JSON configuration file used to map what devices go to what OSD will then +coordinate the mounting and symlinking as part of activation. + +To ensure that the symlinks are always correct, if they exist in the OSD +directory, the symlinks will be re-done. + +A systemd unit will capture the :term:`OSD id` and :term:`OSD uuid` and +persist it. Internally, the activation will enable it like:: + + systemctl enable ceph-volume@simple-$id-$uuid + +For example:: + + systemctl enable ceph-volume@simple-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41 + +Would start the discovery process for the OSD with an id of ``0`` and a UUID of +``8715BEB4-15C5-49DE-BA6F-401086EC7B41``. + + +The systemd process will call out to activate passing the information needed to +identify the OSD and its devices, and it will proceed to: + +# mount the device in the corresponding location (by convention this is + ``/var/lib/ceph/osd/-/``) + +# ensure that all required devices are ready for that OSD and properly linked, +regardless of objectstore used (filestore or bluestore). The symbolic link will +**always** be re-done to ensure that the correct device is linked. + +# start the ``ceph-osd@0`` systemd unit diff --git a/ceph/doc/ceph-volume/simple/index.rst b/ceph/doc/ceph-volume/simple/index.rst new file mode 100644 index 000000000..6f2534a73 --- /dev/null +++ b/ceph/doc/ceph-volume/simple/index.rst @@ -0,0 +1,19 @@ +.. _ceph-volume-simple: + +``simple`` +========== +Implements the functionality needed to manage OSDs from the ``simple`` subcommand: +``ceph-volume simple`` + +**Command Line Subcommands** + +* :ref:`ceph-volume-simple-scan` + +* :ref:`ceph-volume-simple-activate` + +* :ref:`ceph-volume-simple-systemd` + + +By *taking over* management, it disables all ``ceph-disk`` systemd units used +to trigger devices at startup, relying on basic (customizable) JSON +configuration and systemd for starting up OSDs. diff --git a/ceph/doc/ceph-volume/simple/scan.rst b/ceph/doc/ceph-volume/simple/scan.rst new file mode 100644 index 000000000..afeddabb8 --- /dev/null +++ b/ceph/doc/ceph-volume/simple/scan.rst @@ -0,0 +1,158 @@ +.. _ceph-volume-simple-scan: + +``scan`` +======== +Scanning allows to capture any important details from an already-deployed OSD +so that ``ceph-volume`` can manage it without the need of any other startup +workflows or tools (like ``udev`` or ``ceph-disk``). + +The command has the ability to inspect a running OSD, by inspecting the +directory where the OSD data is stored, or by consuming the data partition. + +Once scanned, information will (by default) persist the metadata as JSON in +a file in ``/etc/ceph/osd``. This ``JSON`` file will use the naming convention +of: ``{OSD ID}-{OSD FSID}.json``. 
An OSD with an id of 1, and an FSID like +``86ebd829-1405-43d3-8fd6-4cbc9b6ecf96`` the absolute path of the file would +be:: + + /etc/ceph/osd/1-86ebd829-1405-43d3-8fd6-4cbc9b6ecf96.json + +The ``scan`` subcommand will refuse to write to this file if it already exists. +If overwriting the contents is needed, the ``--force`` flag must be used:: + + ceph-volume simple scan --force {path} + +If there is no need to persist the ``JSON`` metadata, there is support to send +the contents to ``stdout`` (no file will be written):: + + ceph-volume simple scan --stdout {path} + + +.. _ceph-volume-simple-scan-directory: + +Directory scan +-------------- +The directory scan will capture OSD file contents from interesting files. There +are a few files that must exist in order to have a successful scan: + +* ``ceph_fsid`` +* ``fsid`` +* ``keyring`` +* ``ready`` +* ``type`` +* ``whoami`` + +In the case of any other file, as long as it is not a binary or a directory, it +will also get captured and persisted as part of the JSON object. + +The convention for the keys in the JSON object is that any file name will be +a key, and its contents will be its value. If the contents are a single line +(like in the case of the ``whoami``) the contents are trimmed, and the newline +is dropped. For example with an OSD with an id of 1, this is how the JSON entry +would look like:: + + "whoami": "1", + +For files that may have more than one line, the contents are left as-is, for +example, a ``keyring`` could look like this:: + + "keyring": "[osd.1]\n\tkey = AQBBJ/dZp57NIBAAtnuQS9WOS0hnLVe0rZnE6Q==\n", + +For a directory like ``/var/lib/ceph/osd/ceph-1``, the command could look +like:: + + ceph-volume simple scan /var/lib/ceph/osd/ceph1 + + +.. note:: There is no support for encrypted OSDs + + +.. _ceph-volume-simple-scan-device: + +Device scan +----------- +When an OSD directory is not available (OSD is not running, or device is not +mounted) the ``scan`` command is able to introspect the device to capture +required data. Just like :ref:`ceph-volume-simple-scan-directory`, it would +still require a few files present. This means that the device to be scanned +**must be** the data partition of the OSD. + +As long as the data partition of the OSD is being passed in as an argument, the +sub-command can scan its contents. + +In the case where the device is already mounted, the tool can detect this +scenario and capture file contents from that directory. + +If the device is not mounted, a temporary directory will be created, and the +device will be mounted temporarily just for scanning the contents. Once +contents are scanned, the device will be unmounted. + +For a device like ``/dev/sda1`` which **must** be a data partition, the command +could look like:: + + ceph-volume simple scan /dev/sda1 + + +.. note:: There is no support for encrypted OSDs + + +.. _ceph-volume-simple-scan-json: + +``JSON`` contents +----------------- +The contents of the JSON object is very simple. The scan not only will persist +information from the special OSD files and their contents, but will also +validate paths and device UUIDs. Unlike what ``ceph-disk`` would do, by storing +them in ``{device type}_uuid`` files, the tool will persist them as part of the +device type key. 
+ +For example, a ``block.db`` device would look something like:: + + "block.db": { + "path": "/dev/disk/by-partuuid/6cc43680-4f6e-4feb-92ff-9c7ba204120e", + "uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e" + }, + +But it will also persist the ``ceph-disk`` special file generated, like so:: + + "block.db_uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e", + +This duplication is in place because the tool is trying to ensure the +following: + +# Support OSDs that may not have ceph-disk special files +# Check the most up-to-date information on the device, by querying against LVM +and ``blkid`` +# Support both logical volumes and GPT devices + +This is a sample ``JSON`` metadata, from an OSD that is using ``bluestore``:: + + { + "active": "ok", + "block": { + "path": "/dev/disk/by-partuuid/40fd0a64-caa5-43a3-9717-1836ac661a12", + "uuid": "40fd0a64-caa5-43a3-9717-1836ac661a12" + }, + "block.db": { + "path": "/dev/disk/by-partuuid/6cc43680-4f6e-4feb-92ff-9c7ba204120e", + "uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e" + }, + "block.db_uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e", + "block_uuid": "40fd0a64-caa5-43a3-9717-1836ac661a12", + "bluefs": "1", + "ceph_fsid": "c92fc9eb-0610-4363-aafc-81ddf70aaf1b", + "cluster_name": "ceph", + "data": { + "path": "/dev/sdr1", + "uuid": "86ebd829-1405-43d3-8fd6-4cbc9b6ecf96" + }, + "fsid": "86ebd829-1405-43d3-8fd6-4cbc9b6ecf96", + "keyring": "[osd.3]\n\tkey = AQBBJ/dZp57NIBAAtnuQS9WOS0hnLVe0rZnE6Q==\n", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "mkfs_done": "yes", + "ready": "ready", + "systemd": "", + "type": "bluestore", + "whoami": "3" + } diff --git a/ceph/doc/ceph-volume/simple/systemd.rst b/ceph/doc/ceph-volume/simple/systemd.rst new file mode 100644 index 000000000..aa5bebffe --- /dev/null +++ b/ceph/doc/ceph-volume/simple/systemd.rst @@ -0,0 +1,28 @@ +.. _ceph-volume-simple-systemd: + +systemd +======= +Upon startup, it will identify the logical volume by loading the JSON file in +``/etc/ceph/osd/{id}-{uuid}.json`` corresponding to the instance name of the +systemd unit. + +After identifying the correct volume it will then proceed to mount it by using +the OSD destination conventions, that is:: + + /var/lib/ceph/osd/{cluster name}-{osd id} + +For our example OSD with an id of ``0``, that means the identified device will +be mounted at:: + + + /var/lib/ceph/osd/ceph-0 + + +Once that process is complete, a call will be made to start the OSD:: + + systemctl start ceph-osd@0 + +The systemd portion of this process is handled by the ``ceph-volume simple +trigger`` sub-command, which is only in charge of parsing metadata coming from +systemd and startup, and then dispatching to ``ceph-volume simple activate`` which +would proceed with activation. diff --git a/ceph/doc/ceph-volume/systemd.rst b/ceph/doc/ceph-volume/systemd.rst new file mode 100644 index 000000000..6cbc11218 --- /dev/null +++ b/ceph/doc/ceph-volume/systemd.rst @@ -0,0 +1,49 @@ +.. _ceph-volume-systemd: + +systemd +======= +As part of the activation process (either with :ref:`ceph-volume-lvm-activate` +or :ref:`ceph-volume-simple-activate`), systemd units will get enabled that +will use the OSD id and uuid as part of their name. These units will be run +when the system boots, and will proceed to activate their corresponding +volumes via their sub-command implementation. + +The API for activation is a bit loose, it only requires two parts: the +subcommand to use and any extra meta information separated by a dash. 
This +convention makes the units look like:: + + ceph-volume@{command}-{extra metadata} + +The *extra metadata* can be anything needed that the subcommand implementing +the processing might need. In the case of :ref:`ceph-volume-lvm` and +:ref:`ceph-volume-simple`, both look to consume the :term:`OSD id` and :term:`OSD uuid`, +but this is not a hard requirement, it is just how the sub-commands are +implemented. + +Both the command and extra metadata gets persisted by systemd as part of the +*"instance name"* of the unit. For example an OSD with an ID of 0, for the +``lvm`` sub-command would look like:: + + systemctl enable ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6 + +The enabled unit is a :term:`systemd oneshot` service, meant to start at boot +after the local filesystem is ready to be used. + + +Failure and Retries +------------------- +It is common to have failures when a system is coming up online. The devices +are sometimes not fully available and this unpredictable behavior may cause an +OSD to not be ready to be used. + +There are two configurable environment variables used to set the retry +behavior: + +* ``CEPH_VOLUME_SYSTEMD_TRIES``: Defaults to 30 +* ``CEPH_VOLUME_SYSTEMD_INTERVAL``: Defaults to 5 + +The *"tries"* is a number that sets the maximum amount of times the unit will +attempt to activate an OSD before giving up. + +The *"interval"* is a value in seconds that determines the waiting time before +initiating another try at activating the OSD. diff --git a/ceph/doc/cephfs/mds-config-ref.rst b/ceph/doc/cephfs/mds-config-ref.rst index 4f7bea3ef..2fd47ae33 100644 --- a/ceph/doc/cephfs/mds-config-ref.rst +++ b/ceph/doc/cephfs/mds-config-ref.rst @@ -613,3 +613,17 @@ :Type: Boolean :Default: ``false`` + + +``mds min caps per client`` + +:Description: Set the minimum number of capabilities a client may hold. +:Type: Integer +:Default: ``100`` + + +``mds max ratio caps per client`` + +:Description: Set the maximum ratio of current caps that may be recalled during MDS cache pressure. +:Type: Float +:Default: ``0.8`` diff --git a/ceph/doc/conf.py b/ceph/doc/conf.py index 49b6ecde2..a1968bb4c 100644 --- a/ceph/doc/conf.py +++ b/ceph/doc/conf.py @@ -41,9 +41,10 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.graphviz', 'sphinx.ext.todo', - 'sphinx_ditaa', + 'sphinxcontrib.ditaa', 'breathe', ] +ditaa = 'ditaa' todo_include_todos = True top_level = os.path.dirname( diff --git a/ceph/doc/man/8/CMakeLists.txt b/ceph/doc/man/8/CMakeLists.txt index fd6bbae58..f819dede2 100644 --- a/ceph/doc/man/8/CMakeLists.txt +++ b/ceph/doc/man/8/CMakeLists.txt @@ -26,7 +26,8 @@ set(osd_srcs ceph-volume.rst ceph-volume-systemd.rst ceph-osd.rst - osdmaptool.rst) + osdmaptool.rst + ceph-bluestore-tool.rst) set(mon_srcs ceph-mon.rst diff --git a/ceph/doc/man/8/ceph-bluestore-tool.rst b/ceph/doc/man/8/ceph-bluestore-tool.rst new file mode 100644 index 000000000..7a7b0ea6a --- /dev/null +++ b/ceph/doc/man/8/ceph-bluestore-tool.rst @@ -0,0 +1,123 @@ +:orphan: + +====================================================== + ceph-bluestore-tool -- bluestore administrative tool +====================================================== + +.. program:: ceph-bluestore-tool + +Synopsis +======== + +| **ceph-bluestore-tool** *command* + [ --dev *device* ... ] + [ --path *osd path* ] + [ --out-dir *dir* ] + [ --log-file | -l *filename* ] + [ --deep ] +| **ceph-bluestore-tool** fsck|repair --path *osd path* [ --deep ] +| **ceph-bluestore-tool** show-label --dev *device* ... 
+| **ceph-bluestore-tool** prime-osd-dir --dev *device* --path *osd path* +| **ceph-bluestore-tool** bluefs-export --path *osd path* --out-dir *dir* +| **ceph-bluestore-tool** bluefs-export --path *osd path* --out-dir *dir* + + +Description +=========== + +**ceph-bluestore-tool** is a utility to perform low-level administrative +operations on a BlueStore instance. + +Commands +======== + +.. option:: help + + show help + +.. option:: fsck + + run consistency check on BlueStore metadata. If *--deep* is specified, also read all object data and verify checksums. + +.. option:: repair + + Run a consistency check *and* repair any errors we can. + +.. option:: bluefs-export + + Export the contents of BlueFS (i.e., rocksdb files) to an output directory. + +.. option:: bluefs-bdev-sizes --path *osd path* + + Print the device sizes, as understood by BlueFS, to stdout. + +.. option:: bluefs-bdev-expand --path *osd path* + + Instruct BlueFS to check the size of its block devices and, if they have expanded, make use of the additional space. + +.. option:: show-label --dev *device* [...] + + Show device label(s). + +Options +======= + +.. option:: --dev *device* + + Add *device* to the list of devices to consider + +.. option:: --path *osd path* + + Specify an osd path. In most cases, the device list is inferred from the symlinks present in *osd path*. This is usually simpler than explicitly specifying the device(s) with --dev. + +.. option:: --out-dir *dir* + + Output directory for bluefs-export + +.. option:: -l, --log-file *log file* + + file to log to + +.. option:: --log-level *num* + + debug log level. Default is 30 (extremely verbose), 20 is very + verbose, 10 is verbose, and 1 is not very verbose. + +.. option:: --deep + + deep scrub/repair (read and validate object data, not just metadata) + +Device labels +============= + +Every BlueStore block device has a single block label at the beginning of the +device. You can dump the contents of the label with:: + + ceph-bluestore-tool show-label --dev *device* + +The main device will have a lot of metadata, including information +that used to be stored in small files in the OSD data directory. The +auxilliary devices (db and wal) will only have the minimum required +fields (OSD UUID, size, device type, birth time). + +OSD directory priming +===================== + +You can generate the content for an OSD data directory that can start up a +BlueStore OSD with the *prime-osd-dir* command:: + + ceph-bluestore-tool prime-osd-dir --dev *main device* --path /var/lib/ceph/osd/ceph-*id* + + +Availability +============ + +**ceph-bluestore-tool** is part of Ceph, a massively scalable, +open-source, distributed storage system. Please refer to the Ceph +documentation at http://ceph.com/docs for more information. + + +See also +======== + +:doc:`ceph-osd `\(8) diff --git a/ceph/doc/mgr/administrator.rst b/ceph/doc/mgr/administrator.rst index 1e8d189da..11daf3e42 100644 --- a/ceph/doc/mgr/administrator.rst +++ b/ceph/doc/mgr/administrator.rst @@ -53,6 +53,62 @@ by a standby. If you want to pre-empt failover, you can explicitly mark a ceph-mgr daemon as failed using ``ceph mgr fail ``. +Using modules +------------- + +Use the command ``ceph mgr module ls`` to see which modules are +available, and which are currently enabled. Enable or disable modules +using the commands ``ceph mgr module enable `` and +``ceph mgr module disable `` respectively. + +If a module is *enabled* then the active ceph-mgr daemon will load +and execute it. 
In the case of modules that provide a service, +such as an HTTP server, the module may publish its address when it +is loaded. To see the addresses of such modules, use the command +``ceph mgr services``. + +Some modules may also implement a special standby mode which runs on +standby ceph-mgr daemons as well as the active daemon. This enables +modules that provide services to redirect their clients to the active +daemon, if the client tries to connect to a standby. + +Consult the documentation pages for individual manager modules for more +information about what functionality each module provides. + +Here is an example of enabling the ``dashboard`` module: + +:: + + $ ceph mgr module ls + { + "enabled_modules": [ + "restful", + "status" + ], + "disabled_modules": [ + "dashboard" + ] + } + + $ ceph mgr module enable dashboard + $ ceph mgr module ls + { + "enabled_modules": [ + "restful", + "status", + "dashboard" + ], + "disabled_modules": [ + ] + } + + $ ceph mgr services + { + "dashboard": "http://myserver.com:7789/", + "restful": "https://myserver.com:8789/" + } + + Calling module commands ----------------------- diff --git a/ceph/doc/mgr/dashboard.rst b/ceph/doc/mgr/dashboard.rst index 890849700..4c2116b13 100644 --- a/ceph/doc/mgr/dashboard.rst +++ b/ceph/doc/mgr/dashboard.rst @@ -39,6 +39,13 @@ If the port is not configured, the web app will bind to port ``7000``. If the address it not configured, the web app will bind to ``::``, which corresponds to all available IPv4 and IPv6 addresses. +You can configure a prefix for all URLs:: + + ceph config-key set mgr/dashboard/url_prefix $PREFIX + +so you can access the dashboard at ``http://$IP:$PORT/$PREFIX/``. + + Load balancer ------------- @@ -48,4 +55,5 @@ manager is active (e.g., ``ceph mgr dump``). In order to make the dashboard available via a consistent URL regardless of which manager daemon is currently active, you may want to set up a load balancer front-end to direct traffic to whichever manager endpoint is -available. +available. If you use a reverse http proxy that forwards a subpath to +the dashboard, you need to configure ``url_prefix`` (see above). diff --git a/ceph/doc/mgr/index.rst b/ceph/doc/mgr/index.rst index 29a221661..53844ba24 100644 --- a/ceph/doc/mgr/index.rst +++ b/ceph/doc/mgr/index.rst @@ -26,9 +26,11 @@ sensible. :maxdepth: 1 Installation and Configuration + Writing plugins Dashboard plugin + Local pool plugin RESTful plugin Zabbix plugin Prometheus plugin - Writing plugins + Influx plugin diff --git a/ceph/doc/mgr/influx.rst b/ceph/doc/mgr/influx.rst new file mode 100644 index 000000000..37aa5cd63 --- /dev/null +++ b/ceph/doc/mgr/influx.rst @@ -0,0 +1,162 @@ +============= +Influx Plugin +============= + +The influx plugin continuously collects and sends time series data to an +influxdb database. + +The influx plugin was introduced in the 13.x *Mimic* release. + +-------- +Enabling +-------- + +To enable the module, use the following command: + +:: + + ceph mgr module enable influx + +If you wish to subsequently disable the module, you can use the equivalent +*disable* command: + +:: + + ceph mgr module disable influx + +------------- +Configuration +------------- + +For the influx module to send statistics to an InfluxDB server, it +is necessary to configure the servers address and some authentication +credentials. + +Set configuration values using the following command: + +:: + + ceph config-key set mgr/influx/ + + +The most important settings are ``hostname``, ``username`` and ``password``. 
+For example, a typical configuration might look like this: + +:: + + ceph config-key set mgr/influx/hostname influx.mydomain.com + ceph config-key set mgr/influx/username admin123 + ceph config-key set mgr/influx/password p4ssw0rd + +Additional optional configuration settings are: + +:interval: Time between reports to InfluxDB. Default 5 seconds. +:database: InfluxDB database name. Default "ceph". You will need to create this database and grant write privileges to the configured username or the username must have admin privileges to create it. +:port: InfluxDB server port. Default 8086 + + +--------- +Debugging +--------- + +By default, a few debugging statments as well as error statements have been set to print in the log files. Users can add more if necessary. +To make use of the debugging option in the module: + +- Add this to the ceph.conf file.:: + + [mgr] + debug_mgr = 20 + +- Use this command ``ceph tell mgr. influx self-test``. +- Check the log files. Users may find it easier to filter the log files using *mgr[influx]*. + +-------------------- +Interesting counters +-------------------- + +The following tables describe a subset of the values output by +this module. + +^^^^^ +Pools +^^^^^ + ++---------------+-----------------------------------------------------+ +|Counter | Description | ++===============+=====================================================+ +|bytes_used | Bytes used in the pool not including copies | ++---------------+-----------------------------------------------------+ +|max_avail | Max available number of bytes in the pool | ++---------------+-----------------------------------------------------+ +|objects | Number of objects in the pool | ++---------------+-----------------------------------------------------+ +|wr_bytes | Number of bytes written in the pool | ++---------------+-----------------------------------------------------+ +|dirty | Number of bytes dirty in the pool | ++---------------+-----------------------------------------------------+ +|rd_bytes | Number of bytes read in the pool | ++---------------+-----------------------------------------------------+ +|raw_bytes_used | Bytes used in pool including copies made | ++---------------+-----------------------------------------------------+ + +^^^^ +OSDs +^^^^ + ++------------+------------------------------------+ +|Counter | Description | ++============+====================================+ +|op_w | Client write operations | ++------------+------------------------------------+ +|op_in_bytes | Client operations total write size | ++------------+------------------------------------+ +|op_r | Client read operations | ++------------+------------------------------------+ +|op_out_bytes| Client operations total read size | ++------------+------------------------------------+ + + ++------------------------+--------------------------------------------------------------------------+ +|Counter | Description | ++========================+==========================================================================+ +|op_wip | Replication operations currently being processed (primary) | ++------------------------+--------------------------------------------------------------------------+ +|op_latency | Latency of client operations (including queue time) | ++------------------------+--------------------------------------------------------------------------+ +|op_process_latency | Latency of client operations (excluding queue time) | 
++------------------------+--------------------------------------------------------------------------+ +|op_prepare_latency | Latency of client operations (excluding queue time and wait for finished)| ++------------------------+--------------------------------------------------------------------------+ +|op_r_latency | Latency of read operation (including queue time) | ++------------------------+--------------------------------------------------------------------------+ +|op_r_process_latency | Latency of read operation (excluding queue time) | ++------------------------+--------------------------------------------------------------------------+ +|op_w_in_bytes | Client data written | ++------------------------+--------------------------------------------------------------------------+ +|op_w_latency | Latency of write operation (including queue time) | ++------------------------+--------------------------------------------------------------------------+ +|op_w_process_latency | Latency of write operation (excluding queue time) | ++------------------------+--------------------------------------------------------------------------+ +|op_w_prepare_latency | Latency of write operations (excluding queue time and wait for finished) | ++------------------------+--------------------------------------------------------------------------+ +|op_rw | Client read-modify-write operations | ++------------------------+--------------------------------------------------------------------------+ +|op_rw_in_bytes | Client read-modify-write operations write in | ++------------------------+--------------------------------------------------------------------------+ +|op_rw_out_bytes | Client read-modify-write operations read out | ++------------------------+--------------------------------------------------------------------------+ +|op_rw_latency | Latency of read-modify-write operation (including queue time) | ++------------------------+--------------------------------------------------------------------------+ +|op_rw_process_latency | Latency of read-modify-write operation (excluding queue time) | ++------------------------+--------------------------------------------------------------------------+ +|op_rw_prepare_latency | Latency of read-modify-write operations (excluding queue time | +| | and wait for finished) | ++------------------------+--------------------------------------------------------------------------+ +|op_before_queue_op_lat | Latency of IO before calling queue (before really queue into ShardedOpWq)| +| | op_before_dequeue_op_lat | ++------------------------+--------------------------------------------------------------------------+ +|op_before_dequeue_op_lat| Latency of IO before calling dequeue_op(already dequeued and get PG lock)| ++------------------------+--------------------------------------------------------------------------+ + +Latency counters are measured in microseconds unless otherwise specified in the description. + diff --git a/ceph/doc/mgr/localpool.rst b/ceph/doc/mgr/localpool.rst new file mode 100644 index 000000000..5779b7cf1 --- /dev/null +++ b/ceph/doc/mgr/localpool.rst @@ -0,0 +1,35 @@ +Local pool plugin +================= + +The *localpool* plugin can automatically create RADOS pools that are +localized to a subset of the overall cluster. For example, by default, it will +create a pool for each distinct rack in the cluster. This can be useful for some +deployments that want to distribute some data locally as well as globally across the cluster . 
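+
+As a sketch, on a cluster whose CRUSH map contains racks named ``rack1`` and
+``rack2`` (hypothetical names), enabling the module with its default settings
+would be expected to produce pools with names similar to::
+
+    ceph osd pool ls
+    by-rack-rack1
+    by-rack-rack2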
+ +Enabling +-------- + +The *localpool* module is enabled with:: + + ceph mgr module enable localpool + +Configuring +----------- + +The *localpool* module understands the following options: + +* **subtree** (default: `rack`): which CRUSH subtree type the module + should create a pool for. +* **failure_domain** (default: `host`): what failure domain we should + separate data replicas across. +* **pg_num** (default: `128`): number of PGs to create for each pool +* **num_rep** (default: `3`): number of replicas for each pool. + (Currently, pools are always replicated.) +* **min_size** (default: none): value to set min_size to (unchanged from Ceph's default if this option is not set) +* **prefix** (default: `by-$subtreetype-`): prefix for the pool name. + +These options are set via the config-key interface. For example, to +change the replication level to 2x with only 64 PGs, :: + + ceph config-key set mgr/localpool/num_rep 2 + ceph config-key set mgr/localpool/pg_num 64 diff --git a/ceph/doc/mgr/plugins.rst b/ceph/doc/mgr/plugins.rst index b5a13cc3e..a75c14c84 100644 --- a/ceph/doc/mgr/plugins.rst +++ b/ceph/doc/mgr/plugins.rst @@ -157,6 +157,31 @@ a command completes, the ``notify()`` callback on the MgrModule instance is triggered, with notify_type set to "command", and notify_id set to the tag of the command. +Implementing standby mode +------------------------- + +For some modules, it is useful to run on standby manager daemons as well +as on the active daemon. For example, an HTTP server can usefully +serve HTTP redirect responses from the standby managers so that +the user can point his browser at any of the manager daemons without +having to worry about which one is active. + +Standby manager daemons look for a class called ``StandbyModule`` +in each module. If the class is not found then the module is not +used at all on standby daemons. If the class is found, then +its ``serve`` method is called. Implementations of ``StandbyModule`` +must inherit from ``mgr_module.MgrStandbyModule``. + +The interface of ``MgrStandbyModule`` is much restricted compared to +``MgrModule`` -- none of the Ceph cluster state is available to +the module. ``serve`` and ``shutdown`` methods are used in the same +way as a normal module class. The ``get_active_uri`` method enables +the standby module to discover the address of its active peer in +order to make redirects. See the ``MgrStandbyModule`` definition +in the Ceph source code for the full list of methods. + +For an example of how to use this interface, look at the source code +of the ``dashboard`` module. Logging ------- diff --git a/ceph/doc/mgr/prometheus.rst b/ceph/doc/mgr/prometheus.rst index fc84afee4..5bae6a984 100644 --- a/ceph/doc/mgr/prometheus.rst +++ b/ceph/doc/mgr/prometheus.rst @@ -1,3 +1,4 @@ +================= Prometheus plugin ================= @@ -12,8 +13,8 @@ The HTTP path and query parameters are ignored; all extant counters for all reporting entities are returned in text exposition format. (See the Prometheus `documentation `_.) -Enabling --------- +Enabling prometheus output +========================== The *prometheus* module is enabled with:: @@ -28,19 +29,187 @@ configurable with ``ceph config-key set``, with keys ``mgr/prometheus/server_addr`` and ``mgr/prometheus/server_port``. This port is registered with Prometheus's `registry `_. 
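+
+For example, the listening port can be set explicitly and the endpoint checked
+with ``curl`` from the host running the active mgr (port 9283 is the one used
+in the examples later in this document; any HTTP path works since the path is
+ignored)::
+
+    ceph config-key set mgr/prometheus/server_port 9283
+    curl http://localhost:9283/metrics
+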
+Statistic names and labels +========================== + +The names of the stats are exactly as Ceph names them, with +illegal characters ``.``, ``-`` and ``::`` translated to ``_``, +and ``ceph_`` prefixed to all names. + + +All *daemon* statistics have a ``ceph_daemon`` label such as "osd.123" +that identifies the type and ID of the daemon they come from. Some +statistics can come from different types of daemon, so when querying +e.g. an OSD's RocksDB stats, you would probably want to filter +on ceph_daemon starting with "osd" to avoid mixing in the monitor +rocksdb stats. + + +The *cluster* statistics (i.e. those global to the Ceph cluster) +have labels appropriate to what they report on. For example, +metrics relating to pools have a ``pool_id`` label. + +Pool and OSD metadata series +---------------------------- + +Special series are output to enable displaying and querying on +certain metadata fields. + +Pools have a ``ceph_pool_metadata`` field like this: + +:: + + ceph_pool_metadata{pool_id="2",name="cephfs_metadata_a"} 0.0 + +OSDs have a ``ceph_osd_metadata`` field like this: + +:: + + ceph_osd_metadata{cluster_addr="172.21.9.34:6802/19096",device_class="ssd",id="0",public_addr="172.21.9.34:6801/19096",weight="1.0"} 0.0 + + +Correlating drive statistics with node_exporter +----------------------------------------------- + +The prometheus output from Ceph is designed to be used in conjunction +with the generic host monitoring from the Prometheus node_exporter. + +To enable correlation of Ceph OSD statistics with node_exporter's +drive statistics, special series are output like this: + +:: + + ceph_disk_occupation{ceph_daemon="osd.0",device="sdd",instance="myhost",job="ceph"} + +To use this to get disk statistics by OSD ID, use the ``and on`` syntax +in your prometheus query like this: + +:: + + rate(node_disk_bytes_written[30s]) and on (device,instance) ceph_disk_occupation{ceph_daemon="osd.0"} + +See the prometheus documentation for more information about constructing +queries. + +Note that for this mechanism to work, Ceph and node_exporter must agree +about the values of the ``instance`` label. See the following section +for guidance about to to set up Prometheus in a way that sets +``instance`` properly. + +Configuring Prometheus server +============================= + +See the prometheus documentation for full details of how to add +scrape endpoints: the notes +in this section are tips on how to configure Prometheus to capture +the Ceph statistics in the most usefully-labelled form. + +This configuration is necessary because Ceph is reporting metrics +from many hosts and services via a single endpoint, and some +metrics that relate to no physical host (such as pool statistics). + +honor_labels +------------ + +To enable Ceph to output properly-labelled data relating to any host, +use the ``honor_labels`` setting when adding the ceph-mgr endpoints +to your prometheus configuration. + +Without this setting, any ``instance`` labels that Ceph outputs, such +as those in ``ceph_disk_occupation`` series, will be overridden +by Prometheus. + +Ceph instance label +------------------- + +By default, Prometheus applies an ``instance`` label that includes +the hostname and port of the endpoint that the series game from. Because +Ceph clusters have multiple manager daemons, this results in an ``instance`` +label that changes spuriously when the active manager daemon changes. 
+ +Set a custom ``instance`` label in your Prometheus target configuration: +you might wish to set it to the hostname of your first monitor, or something +completely arbitrary like "ceph_cluster". + +node_exporter instance labels +----------------------------- + +Set your ``instance`` labels to match what appears in Ceph's OSD metadata +in the ``hostname`` field. This is generally the short hostname of the node. + +This is only necessary if you want to correlate Ceph stats with host stats, +but you may find it useful to do it in all cases in case you want to do +the correlation in the future. + +Example configuration +--------------------- + +This example shows a single node configuration running ceph-mgr and +node_exporter on a server called ``senta04``. + +This is just an example: there are other ways to configure prometheus +scrape targets and label rewrite rules. + +prometheus.yml +~~~~~~~~~~~~~~ + +:: + + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: 'node' + file_sd_configs: + - files: + - node_targets.yml + - job_name: 'ceph' + honor_labels: true + file_sd_configs: + - files: + - ceph_targets.yml + + +ceph_targets.yml +~~~~~~~~~~~~~~~~ + + +:: + + [ + { + "targets": [ "senta04.mydomain.com:9283" ], + "labels": { + "instance": "ceph_cluster" + } + } + ] + + +node_targets.yml +~~~~~~~~~~~~~~~~ + +:: + + [ + { + "targets": [ "senta04.mydomain.com:9100" ], + "labels": { + "instance": "senta04" + } + } + ] + + Notes ------ +===== Counters and gauges are exported; currently histograms and long-running averages are not. It's possible that Ceph's 2-D histograms could be reduced to two separate 1-D histograms, and that long-running averages could be exported as Prometheus' Summary type. -The names of the stats are exactly as Ceph names them, with -illegal characters ``.`` and ``-`` translated to ``_``. There is one -label applied, ``daemon``, and its value is the daemon.id for the -daemon in question (e.g. ``{daemon=mon.hosta}`` or ``{daemon=osd.11}``). - Timestamps, as with many Prometheus exporters, are established by the server's scrape time (Prometheus expects that it is polling the actual counter process synchronously). It is possible to supply a diff --git a/ceph/doc/rados/configuration/pool-pg-config-ref.rst b/ceph/doc/rados/configuration/pool-pg-config-ref.rst index dd416edfa..89a3707cc 100644 --- a/ceph/doc/rados/configuration/pool-pg-config-ref.rst +++ b/ceph/doc/rados/configuration/pool-pg-config-ref.rst @@ -255,6 +255,15 @@ Ceph configuration file. :Type: 32-bit Integer :Default: ``45`` +``osd max pg per osd hard ratio`` + +:Description: The ratio of number of PGs per OSD allowed by the cluster before + OSD refuses to create new PGs. OSD stops creating new PGs if the number + of PGs it serves exceeds + ``osd max pg per osd hard ratio`` \* ``mon max pg per osd``. + +:Type: Float +:Default: ``2`` .. _pool: ../../operations/pools .. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering diff --git a/ceph/doc/rados/operations/health-checks.rst b/ceph/doc/rados/operations/health-checks.rst index 616435579..c1e22004a 100644 --- a/ceph/doc/rados/operations/health-checks.rst +++ b/ceph/doc/rados/operations/health-checks.rst @@ -336,17 +336,20 @@ TOO_MANY_PGS ____________ The number of PGs in use in the cluster is above the configurable -threshold of ``mon_pg_warn_max_per_osd`` PGs per OSD. This can lead +threshold of ``mon_max_pg_per_osd`` PGs per OSD. 
If this threshold is +exceed the cluster will not allow new pools to be created, pool `pg_num` to +be increased, or pool replication to be increased (any of which would lead to +more PGs in the cluster). A large number of PGs can lead to higher memory utilization for OSD daemons, slower peering after cluster state changes (like OSD restarts, additions, or removals), and higher load on the Manager and Monitor daemons. -The ``pg_num`` value for existing pools cannot currently be reduced. -However, the ``pgp_num`` value can, which effectively collocates some -PGs on the same sets of OSDs, mitigating some of the negative impacts -described above. The ``pgp_num`` value can be adjusted with:: +The simplest way to mitigate the problem is to increase the number of +OSDs in the cluster by adding more hardware. Note that the OSD count +used for the purposes of this health check is the number of "in" OSDs, +so marking "out" OSDs "in" (if there are any) can also help:: - ceph osd pool set pgp_num + ceph osd in Please refer to :doc:`placement-groups#Choosing-the-number-of-Placement-Groups` for @@ -368,7 +371,6 @@ triggering the data migration, with:: ceph osd pool set pgp_num - MANY_OBJECTS_PER_PG ___________________ diff --git a/ceph/doc/scripts/gen_state_diagram.py b/ceph/doc/scripts/gen_state_diagram.py index f8414368c..fccde2629 100755 --- a/ceph/doc/scripts/gen_state_diagram.py +++ b/ceph/doc/scripts/gen_state_diagram.py @@ -82,14 +82,22 @@ class StateMachineRenderer(object): ) def read_input(self, input_lines): + previous_line = None for line in input_lines: self.get_state(line) self.get_event(line) - self.get_context(line) - - def get_context(self, line): - match = re.search(r"(\w+::)*::(?P\w+)::\w+\(const (?P\w+)", - line) + # pass two lines at a time to get the context so that regexes can + # match on split signatures + self.get_context(line, previous_line) + previous_line = line + + def get_context(self, line, previous_line): + match = re.search(r"(\w+::)*::(?P\w+)::\w+\(const (?P\w+)", line) + if match is None and previous_line is not None: + # it is possible that we need to match on the previous line as well, so join + # them to make them one line and try and get this matching + joined_line = ' '.join([previous_line, line]) + match = re.search(r"(\w+::)*::(?P\w+)::\w+\(\s*const (?P\w+)", joined_line) if match is not None: self.context.append((match.group('tag'), self.context_depth, match.group('event'))) if '{' in line: @@ -105,7 +113,7 @@ class StateMachineRenderer(object): r"boost::statechart::state_machine<\s*(\w*),\s*(\w*)\s*>", line) if tokens is None: - raise "Error: malformed state_machine line: " + line + raise Exception("Error: malformed state_machine line: " + line) self.machines[tokens.group(1)] = tokens.group(2) self.context.append((tokens.group(1), self.context_depth, "")) return @@ -114,7 +122,7 @@ class StateMachineRenderer(object): r"boost::statechart::state<\s*(\w*),\s*(\w*)\s*,?\s*(\w*)\s*>", line) if tokens is None: - raise "Error: malformed state line: " + line + raise Exception("Error: malformed state line: " + line) self.states[tokens.group(1)] = tokens.group(2) if tokens.group(2) not in self.state_contents.keys(): self.state_contents[tokens.group(2)] = [] @@ -131,14 +139,14 @@ class StateMachineRenderer(object): if i.group(1) not in self.edges.keys(): self.edges[i.group(1)] = [] if len(self.context) is 0: - raise "no context at line: " + line + raise Exception("no context at line: " + line) self.edges[i.group(1)].append((self.context[-1][0], i.group(2))) i = 
re.search("return\s+transit<\s*(\w*)\s*>()", line) if i is not None: if len(self.context) is 0: - raise "no context at line: " + line + raise Exception("no context at line: " + line) if self.context[-1][2] is "": - raise "no event in context at line: " + line + raise Exception("no event in context at line: " + line) if self.context[-1][2] not in self.edges.keys(): self.edges[self.context[-1][2]] = [] self.edges[self.context[-1][2]].append((self.context[-1][0], i.group(1))) diff --git a/ceph/etc/default/ceph b/ceph/etc/default/ceph index f2722073b..4542838f4 100644 --- a/ceph/etc/default/ceph +++ b/ceph/etc/default/ceph @@ -5,11 +5,3 @@ # Increase tcmalloc cache size TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728 - -## use jemalloc instead of tcmalloc -# -# jemalloc is generally faster for small IO workloads and when -# ceph-osd is backed by SSDs. However, memory usage is usually -# higher by 200-300mb. -# -#LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.1 diff --git a/ceph/etc/sysconfig/ceph b/ceph/etc/sysconfig/ceph index 61e941ded..c7f4bc45a 100644 --- a/ceph/etc/sysconfig/ceph +++ b/ceph/etc/sysconfig/ceph @@ -6,14 +6,6 @@ # Increase tcmalloc cache size TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728 -## use jemalloc instead of tcmalloc -# -# jemalloc is generally faster for small IO workloads and when -# ceph-osd is backed by SSDs. However, memory usage is usually -# higher by 200-300mb. -# -#LD_PRELOAD=/usr/lib64/libjemalloc.so.1 - ## automatically restart systemd units on upgrade # # By default, it is left to the administrator to restart diff --git a/ceph/qa/cephfs/clusters/3-mds.yaml b/ceph/qa/cephfs/clusters/3-mds.yaml index ff35ed1a6..05c6142f8 100644 --- a/ceph/qa/cephfs/clusters/3-mds.yaml +++ b/ceph/qa/cephfs/clusters/3-mds.yaml @@ -1,4 +1,4 @@ roles: -- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2] -- [mon.b, mgr.x, mds.b, mds.c, osd.3, osd.4, osd.5] +- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3] +- [mon.b, mgr.x, mds.b, mds.c, osd.4, osd.5, osd.6, osd.7] - [client.0] diff --git a/ceph/qa/cephfs/clusters/9-mds.yaml b/ceph/qa/cephfs/clusters/9-mds.yaml index c1228b3a1..a6342dc06 100644 --- a/ceph/qa/cephfs/clusters/9-mds.yaml +++ b/ceph/qa/cephfs/clusters/9-mds.yaml @@ -1,4 +1,4 @@ roles: -- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2] -- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5] +- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2, osd.3] +- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.4, osd.5, osd.6, osd.7] - [client.0] diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore-comp-ec-root.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore-comp-ec-root.yaml new file mode 100644 index 000000000..9bc487cfc --- /dev/null +++ b/ceph/qa/cephfs/objectstore-ec/bluestore-comp-ec-root.yaml @@ -0,0 +1,28 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + cephfs_ec_profile: + - m=2 + - k=2 + - crush-failure-domain=osd + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore compression mode: aggressive + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env 
mirror: true + diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore-comp.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore-comp.yaml new file mode 100644 index 000000000..b408032fd --- /dev/null +++ b/ceph/qa/cephfs/objectstore-ec/bluestore-comp.yaml @@ -0,0 +1,23 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore compression mode: aggressive + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore-ec-root.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore-ec-root.yaml new file mode 100644 index 000000000..726ad3d56 --- /dev/null +++ b/ceph/qa/cephfs/objectstore-ec/bluestore-ec-root.yaml @@ -0,0 +1,42 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + cephfs_ec_profile: + - m=2 + - k=2 + - crush-failure-domain=osd + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore.yaml new file mode 100644 index 000000000..19dfeb036 --- /dev/null +++ b/ceph/qa/cephfs/objectstore-ec/bluestore.yaml @@ -0,0 +1,38 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff --git a/ceph/qa/cephfs/objectstore-ec/filestore-xfs.yaml 
b/ceph/qa/cephfs/objectstore-ec/filestore-xfs.yaml new file mode 100644 index 000000000..f7aa0dd79 --- /dev/null +++ b/ceph/qa/cephfs/objectstore-ec/filestore-xfs.yaml @@ -0,0 +1,15 @@ +overrides: + ceph: + fs: xfs + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff --git a/ceph/qa/distros/all/centos_7.4.yaml b/ceph/qa/distros/all/centos_7.4.yaml new file mode 100644 index 000000000..d06bc384b --- /dev/null +++ b/ceph/qa/distros/all/centos_7.4.yaml @@ -0,0 +1,2 @@ +os_type: centos +os_version: "7.4" diff --git a/ceph/qa/distros/supported/centos_latest.yaml b/ceph/qa/distros/supported/centos_latest.yaml index 34d81be44..4cc59dad1 120000 --- a/ceph/qa/distros/supported/centos_latest.yaml +++ b/ceph/qa/distros/supported/centos_latest.yaml @@ -1 +1 @@ -../all/centos_7.3.yaml \ No newline at end of file +../all/centos_7.4.yaml \ No newline at end of file diff --git a/ceph/qa/releases/luminous-with-mgr.yaml b/ceph/qa/releases/luminous-with-mgr.yaml index ea3130768..391a5e181 100644 --- a/ceph/qa/releases/luminous-with-mgr.yaml +++ b/ceph/qa/releases/luminous-with-mgr.yaml @@ -2,10 +2,11 @@ tasks: - exec: osd.0: - ceph osd require-osd-release luminous - - ceph osd set-require-min-compat-client luminous - ceph.healthy: overrides: ceph: conf: mon: mon warn on osd down out interval zero: false + log-whitelist: + - ruleset- diff --git a/ceph/qa/releases/luminous.yaml b/ceph/qa/releases/luminous.yaml index 9ed76715a..5bd666ca0 100644 --- a/ceph/qa/releases/luminous.yaml +++ b/ceph/qa/releases/luminous.yaml @@ -19,3 +19,4 @@ overrides: mon warn on osd down out interval zero: false log-whitelist: - no active mgr + - ruleset- diff --git a/ceph/qa/standalone/mon/osd-pool-create.sh b/ceph/qa/standalone/mon/osd-pool-create.sh index 99a6064c0..693165d89 100755 --- a/ceph/qa/standalone/mon/osd-pool-create.sh +++ b/ceph/qa/standalone/mon/osd-pool-create.sh @@ -200,7 +200,7 @@ function TEST_utf8_cli() { # the fix for http://tracker.ceph.com/issues/7387. 
If it turns out # to not be OK (when is the default encoding *not* UTF-8?), maybe # the character '黄' can be replaced with the escape $'\xe9\xbb\x84' - ceph osd pool create 黄 1024 || return 1 + ceph osd pool create 黄 16 || return 1 ceph osd lspools 2>&1 | \ grep "黄" || return 1 ceph -f json-pretty osd dump | \ diff --git a/ceph/src/test/ceph_objectstore_tool.py b/ceph/qa/standalone/special/ceph_objectstore_tool.py similarity index 97% rename from ceph/src/test/ceph_objectstore_tool.py rename to ceph/qa/standalone/special/ceph_objectstore_tool.py index bae12f4d8..7c52101e4 100755 --- a/ceph/src/test/ceph_objectstore_tool.py +++ b/ceph/qa/standalone/special/ceph_objectstore_tool.py @@ -152,7 +152,7 @@ def cat_file(level, filename): def vstart(new, opt=""): print("vstarting....", end="") NEW = new and "-n" or "-N" - call("MON=1 OSD=4 MDS=0 MGR=1 CEPH_PORT=7400 {path}/src/vstart.sh --short -l {new} -d {opt} > /dev/null 2>&1".format(new=NEW, opt=opt, path=CEPH_ROOT), shell=True) + call("MON=1 OSD=4 MDS=0 MGR=1 CEPH_PORT=7400 {path}/src/vstart.sh --filestore --short -l {new} -d {opt} > /dev/null 2>&1".format(new=NEW, opt=opt, path=CEPH_ROOT), shell=True) print("DONE") @@ -388,14 +388,18 @@ CEPH_ROOT = os.environ.get('CEPH_ROOT') if not CEPH_BUILD_DIR: CEPH_BUILD_DIR=os.getcwd() os.putenv('CEPH_BUILD_DIR', CEPH_BUILD_DIR) - CEPH_BIN=CEPH_BUILD_DIR + CEPH_BIN=os.path.join(CEPH_BUILD_DIR, 'bin') os.putenv('CEPH_BIN', CEPH_BIN) CEPH_ROOT=os.path.dirname(CEPH_BUILD_DIR) os.putenv('CEPH_ROOT', CEPH_ROOT) - CEPH_LIB=os.path.join(CEPH_BIN, '.libs') + CEPH_LIB=os.path.join(CEPH_BUILD_DIR, 'lib') os.putenv('CEPH_LIB', CEPH_LIB) -CEPH_DIR = CEPH_BUILD_DIR + "/cot_dir" +try: + os.mkdir("td") +except: + pass # ok if this is already there +CEPH_DIR = os.path.join(CEPH_BUILD_DIR, os.path.join("td", "cot_dir")) CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf') def kill_daemons(): @@ -518,7 +522,7 @@ def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path): if linev[0] is '': linev.pop(0) print('linev %s' % linev) - weights.append(float(linev[1])) + weights.append(float(linev[2])) return weights @@ -672,9 +676,10 @@ def main(argv): else: nullfd = DEVNULL - call("rm -fr {dir}; mkdir {dir}".format(dir=CEPH_DIR), shell=True) + call("rm -fr {dir}; mkdir -p {dir}".format(dir=CEPH_DIR), shell=True) + os.chdir(CEPH_DIR) os.environ["CEPH_DIR"] = CEPH_DIR - OSDDIR = os.path.join(CEPH_DIR, "dev") + OSDDIR = "dev" REP_POOL = "rep_pool" REP_NAME = "REPobject" EC_POOL = "ec_pool" @@ -713,6 +718,7 @@ def main(argv): cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=REP_POOL, pg=PG_COUNT, path=CEPH_BIN) logging.debug(cmd) call(cmd, shell=True, stdout=nullfd, stderr=nullfd) + time.sleep(2) REPID = get_pool_id(REP_POOL, nullfd) print("Created Replicated pool #{repid}".format(repid=REPID)) @@ -989,6 +995,12 @@ def main(argv): cmd = "{path}/ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal".format(path=CEPH_BIN) ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory") + cmd = (CFSD_PREFIX + "--journal-path BAD_JOURNAL_PATH --op list").format(osd=ONEOSD) + ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: No such file or directory") + + cmd = (CFSD_PREFIX + "--journal-path /bin --op list").format(osd=ONEOSD) + ERRORS += test_failure(cmd, "journal-path: /bin: (21) Is a directory") + # On import can't use stdin from a terminal cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG) ERRORS += test_failure(cmd, "stdin is a tty 
and no --file filename specified", tty=True) @@ -1006,7 +1018,10 @@ def main(argv): cmd = "{path}/ceph-objectstore-tool --type memstore --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG, path=CEPH_BIN) ERRORS += test_failure(cmd, "Must provide --data-path") - cmd = (CFSD_PREFIX + "--op remove").format(osd=ONEOSD) + cmd = (CFSD_PREFIX + "--op remove --pgid 2.0").format(osd=ONEOSD) + ERRORS += test_failure(cmd, "Please use export-remove or you must use --force option") + + cmd = (CFSD_PREFIX + "--force --op remove").format(osd=ONEOSD) ERRORS += test_failure(cmd, "Must provide pgid") # Don't secify a --op nor object command @@ -1015,7 +1030,7 @@ def main(argv): # Specify a bad --op command cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD) - ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, export, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)") + ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)") # Provide just the object param not a command cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD) @@ -1720,7 +1735,7 @@ def main(argv): if ret != 0: logging.error("Removing --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret)) RM_ERRORS += 1 - cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd) + cmd = (CFSD_PREFIX + "--force --op remove --pgid {pg}").format(pg=pg, osd=osd) logging.debug(cmd) ret = call(cmd, shell=True, stdout=nullfd) if ret != 0: @@ -1923,7 +1938,7 @@ def main(argv): which = 0 for osd in get_osds(pg, OSDDIR): - cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd) + cmd = (CFSD_PREFIX + "--force --op remove --pgid {pg}").format(pg=pg, osd=osd) logging.debug(cmd) ret = call(cmd, shell=True, stdout=nullfd) @@ -1961,6 +1976,17 @@ def main(argv): # vstart() starts 4 OSDs ERRORS += test_get_set_osdmap(CFSD_PREFIX, list(range(4)), ALLOSDS) ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0]) + + kill_daemons() + CORES = [f for f in os.listdir(CEPH_DIR) if f.startswith("core.")] + if CORES: + CORE_DIR = os.path.join("/tmp", "cores.{pid}".format(pid=os.getpid())) + os.mkdir(CORE_DIR) + call("/bin/mv {ceph_dir}/core.* {core_dir}".format(ceph_dir=CEPH_DIR, core_dir=CORE_DIR), shell=True) + logging.error("Failure due to cores found") + logging.error("See {core_dir} for cores".format(core_dir=CORE_DIR)) + ERRORS += len(CORES) + if ERRORS == 0: print("TEST PASSED") return 0 @@ -1992,6 +2018,7 @@ if __name__ == "__main__": status = main(sys.argv[1:]) finally: kill_daemons() + os.chdir(CEPH_BUILD_DIR) remove_btrfs_subvolumes(CEPH_DIR) call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True) sys.exit(status) diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_7.3.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_7.3.yaml deleted file mode 100644 index 9dfcc7f6b..000000000 --- a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_7.3.yaml +++ /dev/null @@ -1,2 +0,0 @@ -os_type: centos -os_version: "7.3" diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_latest.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_latest.yaml new file mode 120000 index 000000000..b5973b952 --- /dev/null +++ 
b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/centos_latest.yaml @@ -0,0 +1 @@ +../../../../../distros/supported/centos_latest.yaml \ No newline at end of file diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_16.04.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_16.04.yaml deleted file mode 100644 index a459fddff..000000000 --- a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_16.04.yaml +++ /dev/null @@ -1,2 +0,0 @@ -os_type: ubuntu -os_version: "16.04" diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_latest.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_latest.yaml new file mode 120000 index 000000000..cc5b15bcc --- /dev/null +++ b/ceph/qa/suites/ceph-ansible/smoke/basic/1-distros/ubuntu_latest.yaml @@ -0,0 +1 @@ +../../../../../distros/supported/ubuntu_latest.yaml \ No newline at end of file diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml new file mode 100644 index 000000000..36d0a07d9 --- /dev/null +++ b/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml @@ -0,0 +1,32 @@ +meta: +- desc: "Build the ceph cluster using ceph-ansible" + +overrides: + ceph_ansible: + vars: + ceph_conf_overrides: + global: + osd default pool size: 2 + mon pg warn min per osd: 2 + osd pool default pg num: 64 + osd pool default pgp num: 64 + mon_max_pg_per_osd: 1024 + ceph_test: true + ceph_stable_release: luminous + osd_scenario: collocated + journal_size: 1024 + osd_auto_discovery: false + ceph_origin: repository + ceph_repository: dev + ceph_mgr_modules: + - status + - restful + cephfs_pools: + - name: "cephfs_data" + pgs: "64" + - name: "cephfs_metadata" + pgs: "64" +tasks: +- ssh-keys: +- ceph_ansible: +- install.ship_utilities: diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/2-config/ceph_ansible.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/2-config/ceph_ansible.yaml deleted file mode 100644 index 5750d52bc..000000000 --- a/ceph/qa/suites/ceph-ansible/smoke/basic/2-config/ceph_ansible.yaml +++ /dev/null @@ -1,22 +0,0 @@ -meta: -- desc: "Build the ceph cluster using ceph-ansible" - -overrides: - ceph_ansible: - vars: - ceph_conf_overrides: - global: - osd default pool size: 2 - mon pg warn min per osd: 2 - ceph_dev: true - ceph_dev_key: https://download.ceph.com/keys/autobuild.asc - ceph_origin: upstream - ceph_test: true - journal_collocation: true - journal_size: 1024 - osd_auto_discovery: false - -tasks: -- ssh-keys: -- ceph_ansible: -- install.ship_utilities: diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/bluestore_with_dmcrypt.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/bluestore_with_dmcrypt.yaml new file mode 100644 index 000000000..604e757ad --- /dev/null +++ b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/bluestore_with_dmcrypt.yaml @@ -0,0 +1,8 @@ +meta: +- desc: "use bluestore + dmcrypt option" + +overrides: + ceph_ansible: + vars: + osd_objectstore: bluestore + dmcrypt: True diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_off.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_off.yaml new file mode 100644 index 000000000..4bbd1c7c5 --- /dev/null +++ b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_off.yaml @@ -0,0 +1,7 @@ +meta: +- desc: "without dmcrypt" + +overrides: + ceph_ansible: + vars: + dmcrypt: False diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_on.yaml 
b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_on.yaml new file mode 100644 index 000000000..12d63d325 --- /dev/null +++ b/ceph/qa/suites/ceph-ansible/smoke/basic/3-config/dmcrypt_on.yaml @@ -0,0 +1,7 @@ +meta: +- desc: "use dmcrypt option" + +overrides: + ceph_ansible: + vars: + dmcrypt: True diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/cls.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/cls.yaml deleted file mode 100644 index 781a4d4f8..000000000 --- a/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/cls.yaml +++ /dev/null @@ -1,7 +0,0 @@ -meta: -- desc: "Run the rados cls tests" -tasks: -- workunit: - clients: - client.0: - - cls diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/ceph-admin-commands.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/ceph-admin-commands.yaml similarity index 100% rename from ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/ceph-admin-commands.yaml rename to ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/ceph-admin-commands.yaml diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/rbd_import_export.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rbd_import_export.yaml similarity index 100% rename from ceph/qa/suites/ceph-ansible/smoke/basic/3-tasks/rbd_import_export.yaml rename to ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rbd_import_export.yaml diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml new file mode 100644 index 000000000..8e389134b --- /dev/null +++ b/ceph/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml @@ -0,0 +1,15 @@ +tasks: +- exec: + mgr.x: + - systemctl stop ceph-mgr.target + - sleep 5 + - ceph -s +- exec: + mon.a: + - ceph restful create-key admin + - ceph restful create-self-signed-cert + - ceph restful restart +- workunit: + clients: + client.0: + - rest/test-restful.sh diff --git a/ceph/qa/suites/fs/32bits/objectstore b/ceph/qa/suites/fs/32bits/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/32bits/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/32bits/objectstore-ec b/ceph/qa/suites/fs/32bits/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ b/ceph/qa/suites/fs/32bits/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml b/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml index f9e423e2f..1c540a4ef 100644 --- a/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml +++ b/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml @@ -1,6 +1,6 @@ roles: -- [mon.a, mgr.x, osd.0, mds.a, mds.b, client.1, client.2, client.3] -- [client.0, osd.1, osd.2] +- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mds.a, mds.b, client.1, client.2, client.3] +- [client.0, osd.4, osd.5, osd.6, osd.7] openstack: - volumes: # attached to each instance count: 2 diff --git a/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-ec-root.yaml b/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-ec-root.yaml new file mode 120000 index 000000000..36a4d69cd --- /dev/null +++ b/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-ec-root.yaml @@ -0,0 +1 @@ +../../../../cephfs/objectstore-ec/bluestore-ec-root.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/basic_workload/objectstore 
b/ceph/qa/suites/fs/basic_workload/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/basic_workload/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/basic_workload/objectstore-ec b/ceph/qa/suites/fs/basic_workload/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/basic_workload/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml b/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml index 78b912f72..a533af5c6 100644 --- a/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml +++ b/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml @@ -1,5 +1,5 @@ roles: -- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2] +- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3] - [client.2] - [client.1] - [client.0] diff --git a/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml b/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml index 9586e6c8f..00f3815cb 100644 --- a/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml +++ b/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml @@ -1,5 +1,5 @@ roles: -- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2] +- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3] - [client.1] - [client.0] diff --git a/ceph/qa/suites/fs/multiclient/objectstore b/ceph/qa/suites/fs/multiclient/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/multiclient/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/multiclient/objectstore-ec b/ceph/qa/suites/fs/multiclient/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/multiclient/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml b/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml index 52c5d7e01..2ae772c3f 100644 --- a/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml +++ b/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml @@ -1,6 +1,6 @@ roles: -- [mon.a, mgr.x, osd.0, mon.b, mds.a, mds.b, client.1] -- [mds.c, mds.d, mon.c, client.0, osd.1, osd.2] +- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mon.b, mds.a, mds.b, client.1] +- [mds.c, mds.d, mon.c, client.0, osd.4, osd.5, osd.6, osd.7] openstack: - volumes: # attached to each instance count: 2 diff --git a/ceph/qa/suites/fs/multifs/objectstore b/ceph/qa/suites/fs/multifs/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/multifs/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/multifs/objectstore-ec b/ceph/qa/suites/fs/multifs/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/multifs/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/fs/permission/objectstore b/ceph/qa/suites/fs/permission/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/permission/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git 
a/ceph/qa/suites/fs/permission/objectstore-ec b/ceph/qa/suites/fs/permission/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/permission/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/fs/snaps/objectstore b/ceph/qa/suites/fs/snaps/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/snaps/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/snaps/objectstore-ec b/ceph/qa/suites/fs/snaps/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/snaps/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/fs/thrash/objectstore b/ceph/qa/suites/fs/thrash/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/thrash/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/thrash/objectstore-ec b/ceph/qa/suites/fs/thrash/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/thrash/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/fs/traceless/objectstore b/ceph/qa/suites/fs/traceless/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/traceless/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/traceless/objectstore-ec b/ceph/qa/suites/fs/traceless/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/traceless/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/fs/verify/objectstore b/ceph/qa/suites/fs/verify/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/fs/verify/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/fs/verify/objectstore-ec b/ceph/qa/suites/fs/verify/objectstore-ec new file mode 120000 index 000000000..a330d661a --- /dev/null +++ b/ceph/qa/suites/fs/verify/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec/ \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/cephfs/objectstore b/ceph/qa/suites/kcephfs/cephfs/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/kcephfs/cephfs/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/cephfs/objectstore-ec b/ceph/qa/suites/kcephfs/cephfs/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ b/ceph/qa/suites/kcephfs/cephfs/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/mixed-clients/objectstore b/ceph/qa/suites/kcephfs/mixed-clients/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/kcephfs/mixed-clients/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/mixed-clients/objectstore-ec b/ceph/qa/suites/kcephfs/mixed-clients/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ 
b/ceph/qa/suites/kcephfs/mixed-clients/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/recovery/clusters/4-remote-clients.yaml b/ceph/qa/suites/kcephfs/recovery/clusters/4-remote-clients.yaml index 1d432addb..b1072e3be 100644 --- a/ceph/qa/suites/kcephfs/recovery/clusters/4-remote-clients.yaml +++ b/ceph/qa/suites/kcephfs/recovery/clusters/4-remote-clients.yaml @@ -1,6 +1,6 @@ roles: -- [mon.a, osd.0, mds.a, mds.c, client.2] -- [mgr.x, osd.1, osd.2, mds.b, mds.d, client.3] +- [mon.a, osd.0, osd.1, osd.2, osd.3, mds.a, mds.c, client.2] +- [mgr.x, osd.4, osd.5, osd.6, osd.7, mds.b, mds.d, client.3] - [client.0] - [client.1] openstack: diff --git a/ceph/qa/suites/kcephfs/recovery/objectstore b/ceph/qa/suites/kcephfs/recovery/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/kcephfs/recovery/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/recovery/objectstore-ec b/ceph/qa/suites/kcephfs/recovery/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ b/ceph/qa/suites/kcephfs/recovery/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/thrash/objectstore b/ceph/qa/suites/kcephfs/thrash/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/kcephfs/thrash/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/kcephfs/thrash/objectstore-ec b/ceph/qa/suites/kcephfs/thrash/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ b/ceph/qa/suites/kcephfs/thrash/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/multimds/basic/objectstore b/ceph/qa/suites/multimds/basic/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/multimds/basic/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/multimds/basic/objectstore-ec b/ceph/qa/suites/multimds/basic/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ b/ceph/qa/suites/multimds/basic/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/multimds/thrash/objectstore b/ceph/qa/suites/multimds/thrash/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/multimds/thrash/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/multimds/thrash/objectstore-ec b/ceph/qa/suites/multimds/thrash/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ b/ceph/qa/suites/multimds/thrash/objectstore-ec @@ -0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/multimds/verify/objectstore b/ceph/qa/suites/multimds/verify/objectstore deleted file mode 120000 index c72da2f82..000000000 --- a/ceph/qa/suites/multimds/verify/objectstore +++ /dev/null @@ -1 +0,0 @@ -../../../objectstore_cephfs \ No newline at end of file diff --git a/ceph/qa/suites/multimds/verify/objectstore-ec b/ceph/qa/suites/multimds/verify/objectstore-ec new file mode 120000 index 000000000..15dc98f23 --- /dev/null +++ b/ceph/qa/suites/multimds/verify/objectstore-ec @@ 
-0,0 +1 @@ +../../../cephfs/objectstore-ec \ No newline at end of file diff --git a/ceph/qa/suites/rados/basic/d-require-luminous b/ceph/qa/suites/rados/basic/d-require-luminous deleted file mode 120000 index 737aee824..000000000 --- a/ceph/qa/suites/rados/basic/d-require-luminous +++ /dev/null @@ -1 +0,0 @@ -../thrash/d-require-luminous/ \ No newline at end of file diff --git a/ceph/qa/suites/rados/basic/d-require-luminous/at-end.yaml b/ceph/qa/suites/rados/basic/d-require-luminous/at-end.yaml new file mode 100644 index 000000000..ef998cc89 --- /dev/null +++ b/ceph/qa/suites/rados/basic/d-require-luminous/at-end.yaml @@ -0,0 +1,33 @@ +# do not require luminous osds at mkfs time; only set flag at +# the end of the test run, then do a final scrub (to convert any +# legacy snapsets), and verify we are healthy. +tasks: +- full_sequential_finally: + - exec: + mon.a: + - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true +# make sure osds have latest map + - rados -p rbd bench 5 write -b 4096 + - ceph.healthy: + - ceph.osd_scrub_pgs: + cluster: ceph + - exec: + mon.a: + - sleep 15 + - ceph osd dump | grep purged_snapdirs + - ceph pg dump -f json-pretty + - "ceph pg dump sum -f json-pretty | grep num_legacy_snapsets | head -1 | grep ': 0'" +overrides: + ceph: + conf: + global: + mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - \(PG_AVAILABILITY\) + - \(PG_DEGRADED\) + thrashosds: + chance_thrash_cluster_full: 0 diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/+ b/ceph/qa/suites/rados/basic/d-require-luminous/at-mkfs.yaml similarity index 100% rename from ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/+ rename to ceph/qa/suites/rados/basic/d-require-luminous/at-mkfs.yaml diff --git a/ceph/qa/suites/rados/mgr/clusters/2-node-mgr.yaml b/ceph/qa/suites/rados/mgr/clusters/2-node-mgr.yaml index bc950e5af..abc90e22d 100644 --- a/ceph/qa/suites/rados/mgr/clusters/2-node-mgr.yaml +++ b/ceph/qa/suites/rados/mgr/clusters/2-node-mgr.yaml @@ -1,6 +1,6 @@ roles: - [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0] -- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1] +- [mgr.y, mgr.z, mon.b, mds.b, osd.1, osd.2, client.1] log-rotate: ceph-mds: 10G ceph-osd: 10G diff --git a/ceph/qa/suites/rados/mgr/tasks/dashboard.yaml b/ceph/qa/suites/rados/mgr/tasks/dashboard.yaml new file mode 100644 index 000000000..3065e11bc --- /dev/null +++ b/ceph/qa/suites/rados/mgr/tasks/dashboard.yaml @@ -0,0 +1,16 @@ + +tasks: + - install: + - ceph: + # tests may leave mgrs broken, so don't try and call into them + # to invoke e.g. pg dump during teardown. + wait-for-scrub: false + log-whitelist: + - overall HEALTH_ + - \(MGR_DOWN\) + - \(PG_ + - replacing it with standby + - No standby daemons available + - cephfs_test_runner: + modules: + - tasks.mgr.test_dashboard diff --git a/ceph/qa/suites/rados/mgr/tasks/module_selftest.yaml b/ceph/qa/suites/rados/mgr/tasks/module_selftest.yaml new file mode 100644 index 000000000..ffdfe8be2 --- /dev/null +++ b/ceph/qa/suites/rados/mgr/tasks/module_selftest.yaml @@ -0,0 +1,19 @@ + +tasks: + - install: + - ceph: + # tests may leave mgrs broken, so don't try and call into them + # to invoke e.g. pg dump during teardown. 
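Roughly speaking, the ``log-whitelist`` entries in these mgr task fragments are regular expressions matched against cluster log lines, which is why the parentheses around health codes such as ``MGR_DOWN`` are escaped. A minimal, illustrative sketch of that matching (the log line below is made up for the example)::

    import re

    whitelist = [r'overall HEALTH_', r'\(MGR_DOWN\)', r'\(PG_']
    line = ('cluster [WRN] overall HEALTH_WARN 1 mgrs down; '
            'Degraded data redundancy (PG_DEGRADED)')

    # A log line is ignored if any whitelist pattern matches it.
    ignored = any(re.search(pattern, line) for pattern in whitelist)
    print(ignored)  # True -- the escaped parens match the literal "(PG_"
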
+ wait-for-scrub: false + log-whitelist: + - overall HEALTH_ + - \(MGR_DOWN\) + - \(PG_ + - replacing it with standby + - No standby daemons available + - Reduced data availability + - Degraded data redundancy + - objects misplaced + - cephfs_test_runner: + modules: + - tasks.mgr.test_module_selftest diff --git a/ceph/qa/suites/rados/mgr/tasks/workunits.yaml b/ceph/qa/suites/rados/mgr/tasks/workunits.yaml new file mode 100644 index 000000000..d7261f44b --- /dev/null +++ b/ceph/qa/suites/rados/mgr/tasks/workunits.yaml @@ -0,0 +1,16 @@ +tasks: + - install: + - ceph: + # tests may leave mgrs broken, so don't try and call into them + # to invoke e.g. pg dump during teardown. + wait-for-scrub: false + log-whitelist: + - overall HEALTH_ + - \(MGR_DOWN\) + - \(PG_ + - replacing it with standby + - No standby daemons available + - workunit: + clients: + client.0: + - mgr \ No newline at end of file diff --git a/ceph/qa/suites/rados/monthrash/d-require-luminous b/ceph/qa/suites/rados/monthrash/d-require-luminous index 737aee824..82036c67f 120000 --- a/ceph/qa/suites/rados/monthrash/d-require-luminous +++ b/ceph/qa/suites/rados/monthrash/d-require-luminous @@ -1 +1 @@ -../thrash/d-require-luminous/ \ No newline at end of file +../basic/d-require-luminous \ No newline at end of file diff --git a/ceph/qa/suites/rados/rest/mgr-restful.yaml b/ceph/qa/suites/rados/rest/mgr-restful.yaml index 90906d66e..049532e34 100644 --- a/ceph/qa/suites/rados/rest/mgr-restful.yaml +++ b/ceph/qa/suites/rados/rest/mgr-restful.yaml @@ -6,6 +6,9 @@ tasks: log-whitelist: - overall HEALTH_ - \(MGR_DOWN\) + - \(PG_ + - \(OSD_ + - \(OBJECT_ - exec: mon.a: - ceph restful create-key admin diff --git a/ceph/qa/suites/rest/basic/tasks/rest_test.yaml b/ceph/qa/suites/rados/rest/rest_test.yaml similarity index 62% rename from ceph/qa/suites/rest/basic/tasks/rest_test.yaml rename to ceph/qa/suites/rados/rest/rest_test.yaml index 948545623..0fdb9dc6a 100644 --- a/ceph/qa/suites/rest/basic/tasks/rest_test.yaml +++ b/ceph/qa/suites/rados/rest/rest_test.yaml @@ -20,7 +20,18 @@ tasks: - ceph: fs: xfs log-whitelist: - - but it is still running + - overall HEALTH + - \(OSDMAP_FLAGS\) + - \(OSD_ + - \(PG_ + - \(POOL_ + - \(CACHE_POOL_ + - \(SMALLER_PGP_NUM\) + - \(OBJECT_ + - \(REQUEST_SLOW\) + - \(SLOW_OPS\) + - \(TOO_FEW_PGS\) + - but it is still running conf: client.rest0: debug ms: 1 diff --git a/ceph/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml b/ceph/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml index 3aaca8759..bbf330b0b 100644 --- a/ceph/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml +++ b/ceph/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml @@ -9,6 +9,7 @@ overrides: - (OSDMAP_FLAGS) - (OSD_FULL) - (MDS_READ_ONLY) + - (POOL_FULL) tasks: - install: - ceph: diff --git a/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml new file mode 100644 index 000000000..accdd964f --- /dev/null +++ b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml @@ -0,0 +1,26 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 +openstack: + - volumes: # attached to each instance + count: 2 + size: 10 # GB +overrides: + ceph: + create_rbd_pool: False + conf: + mon: + osd pool default size: 2 + osd: + mon max pg per osd : 2 + osd max pg per osd hard ratio : 1 + log-whitelist: + - \(TOO_FEW_PGS\) +tasks: +- install: +- ceph: +- osd_max_pg_per_osd: + test_create_from_mon: True + pg_num: 2 diff --git 
a/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml new file mode 100644 index 000000000..1c48ada75 --- /dev/null +++ b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml @@ -0,0 +1,31 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - osd.3 +openstack: + - volumes: # attached to each instance + count: 4 + size: 10 # GB +overrides: + ceph: + create_rbd_pool: False + conf: + mon: + osd pool default size: 2 + osd: + mon max pg per osd : 1 + osd max pg per osd hard ratio : 1 + log-whitelist: + - \(TOO_FEW_PGS\) + - \(PG_ +tasks: +- install: +- ceph: +- osd_max_pg_per_osd: + test_create_from_mon: False + pg_num: 1 + pool_size: 2 + from_primary: True diff --git a/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml new file mode 100644 index 000000000..0cf37fd8e --- /dev/null +++ b/ceph/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml @@ -0,0 +1,31 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - osd.3 +openstack: + - volumes: # attached to each instance + count: 4 + size: 10 # GB +overrides: + ceph: + create_rbd_pool: False + conf: + mon: + osd pool default size: 2 + osd: + mon max pg per osd : 1 + osd max pg per osd hard ratio : 1 + log-whitelist: + - \(TOO_FEW_PGS\) + - \(PG_ +tasks: +- install: +- ceph: +- osd_max_pg_per_osd: + test_create_from_mon: False + pg_num: 1 + pool_size: 2 + from_primary: False diff --git a/ceph/qa/suites/rados/singleton/all/mon-seesaw.yaml b/ceph/qa/suites/rados/singleton/all/mon-seesaw.yaml index ccd980fde..815c518ee 100644 --- a/ceph/qa/suites/rados/singleton/all/mon-seesaw.yaml +++ b/ceph/qa/suites/rados/singleton/all/mon-seesaw.yaml @@ -17,6 +17,10 @@ tasks: osd: debug monc: 1 debug ms: 1 + log-whitelist: + - overall HEALTH + - Manager daemon + - \(MGR_DOWN\) - mon_seesaw: - ceph_manager.create_pool: kwargs: diff --git a/ceph/qa/suites/rados/singleton/all/recovery-preemption.yaml b/ceph/qa/suites/rados/singleton/all/recovery-preemption.yaml new file mode 100644 index 000000000..7507bf635 --- /dev/null +++ b/ceph/qa/suites/rados/singleton/all/recovery-preemption.yaml @@ -0,0 +1,51 @@ +roles: +- - mon.a + - mon.b + - mon.c + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - osd.3 +openstack: + - volumes: # attached to each instance + count: 3 + size: 20 # GB +tasks: +- install: +- ceph: + conf: + osd: + osd recovery sleep: .1 + osd min pg log entries: 100 + osd max pg log entries: 1000 + log-whitelist: + - \(POOL_APP_NOT_ENABLED\) + - \(OSDMAP_FLAGS\) + - \(OSD_ + - \(OBJECT_ + - \(PG_ + - overall HEALTH +- exec: + osd.0: + - ceph osd pool create foo 128 + - ceph osd pool application enable foo foo + - rados -p foo bench 30 write -b 4096 --no-cleanup + - ceph osd out 0 + - sleep 5 + - ceph osd set noup +- ceph.restart: + daemons: [osd.1] + wait-for-up: false + wait-for-healthy: false +- exec: + osd.0: + - rados -p foo bench 3 write -b 4096 --no-cleanup + - ceph osd unset noup + - sleep 10 + - ceph tell osd.* config set osd_recovery_sleep 0 + - ceph tell osd.* config set osd_recovery_max_active 20 +- ceph.healthy: +- exec: + osd.0: + - egrep '(defer backfill|defer recovery)' /var/log/ceph/ceph-osd.*.log diff --git a/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-crush-compat.yaml b/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-crush-compat.yaml new file mode 100644 index 000000000..9eb7143de --- /dev/null 
+++ b/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-crush-compat.yaml @@ -0,0 +1,11 @@ +overrides: + ceph: + conf: + mgr: + debug osd: 20 +tasks: +- exec: + mon.a: + - while ! ceph balancer status ; do sleep 1 ; done + - ceph balancer mode crush-compat + - ceph balancer on diff --git a/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-upmap.yaml b/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-upmap.yaml new file mode 100644 index 000000000..a1e0afea0 --- /dev/null +++ b/ceph/qa/suites/rados/thrash/d-require-luminous/at-mkfs-balancer-upmap.yaml @@ -0,0 +1,11 @@ +overrides: + ceph: + conf: + mgr: + debug osd: 20 +tasks: +- exec: + mon.a: + - while ! ceph balancer status ; do sleep 1 ; done + - ceph balancer mode upmap + - ceph balancer on diff --git a/ceph/qa/suites/rados/verify/d-require-luminous b/ceph/qa/suites/rados/verify/d-require-luminous index 737aee824..82036c67f 120000 --- a/ceph/qa/suites/rados/verify/d-require-luminous +++ b/ceph/qa/suites/rados/verify/d-require-luminous @@ -1 +1 @@ -../thrash/d-require-luminous/ \ No newline at end of file +../basic/d-require-luminous \ No newline at end of file diff --git a/ceph/qa/suites/rbd/basic/tasks/rbd_cls_tests.yaml b/ceph/qa/suites/rbd/basic/tasks/rbd_cls_tests.yaml index 9ccd57c4a..51b35e2e1 100644 --- a/ceph/qa/suites/rbd/basic/tasks/rbd_cls_tests.yaml +++ b/ceph/qa/suites/rbd/basic/tasks/rbd_cls_tests.yaml @@ -3,3 +3,5 @@ tasks: clients: client.0: - cls/test_cls_rbd.sh + - cls/test_cls_lock.sh + - cls/test_cls_journal.sh diff --git a/ceph/qa/suites/rgw/hadoop-s3a/s3a-hadoop.yaml b/ceph/qa/suites/rgw/hadoop-s3a/s3a-hadoop.yaml index 171cc66e2..1c17a69f7 100644 --- a/ceph/qa/suites/rgw/hadoop-s3a/s3a-hadoop.yaml +++ b/ceph/qa/suites/rgw/hadoop-s3a/s3a-hadoop.yaml @@ -1,6 +1,8 @@ -os_type: centos -os_version: "7.3" -machine_type: vps +machine_type: ovh +openstack: +- volumes: # attached to each instance + count: 3 + size: 10 # GB overrides: ceph_ansible: vars: @@ -9,15 +11,16 @@ overrides: osd default pool size: 2 osd pool default pg num: 128 osd pool default pgp num: 128 - debug rgw: 20 + debug rgw: 20 debug ms: 1 ceph_test: true - ceph_dev: true - ceph_dev_key: https://download.ceph.com/keys/autobuild.asc - ceph_origin: upstream journal_collocation: true osd_auto_discovery: false journal_size: 1024 + ceph_stable_release: luminous + osd_scenario: collocated + ceph_origin: repository + ceph_repository: dev roles: - [mon.a, osd.0, osd.1, osd.2, rgw.0] - [osd.3, osd.4, osd.5] diff --git a/ceph/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml b/ceph/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml index 4cdded04e..da05a5ea1 100644 --- a/ceph/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml +++ b/ceph/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml @@ -4,7 +4,7 @@ tasks: - rgw: [client.0] - s3tests: client.0: - force-branch: ceph-master + force-branch: ceph-luminous rgw_server: client.0 overrides: ceph: diff --git a/ceph/qa/suites/rgw/thrash/workload/rgw_s3tests.yaml b/ceph/qa/suites/rgw/thrash/workload/rgw_s3tests.yaml index 45047ea41..82ac7c197 100644 --- a/ceph/qa/suites/rgw/thrash/workload/rgw_s3tests.yaml +++ b/ceph/qa/suites/rgw/thrash/workload/rgw_s3tests.yaml @@ -1,7 +1,7 @@ tasks: - s3tests: client.0: - force-branch: ceph-master + force-branch: ceph-luminous rgw_server: client.0 overrides: ceph: diff --git a/ceph/qa/suites/rgw/verify/tasks/rgw_s3tests.yaml b/ceph/qa/suites/rgw/verify/tasks/rgw_s3tests.yaml index bed9f0e19..cf413389b 100644 --- a/ceph/qa/suites/rgw/verify/tasks/rgw_s3tests.yaml +++ 
b/ceph/qa/suites/rgw/verify/tasks/rgw_s3tests.yaml @@ -10,7 +10,7 @@ tasks: valgrind: [--tool=memcheck] - s3tests: client.0: - force-branch: ceph-master + force-branch: ceph-luminous rgw_server: client.0 overrides: ceph: diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/+ b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/% similarity index 100% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/+ rename to ceph/qa/suites/upgrade/jewel-x/ceph-deploy/% diff --git a/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/centos_latest.yaml b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/centos_latest.yaml new file mode 120000 index 000000000..b5973b952 --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/centos_latest.yaml @@ -0,0 +1 @@ +../../../../../distros/supported/centos_latest.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/ubuntu_latest.yaml b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/ubuntu_latest.yaml new file mode 120000 index 000000000..cc5b15bcc --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/distros/ubuntu_latest.yaml @@ -0,0 +1 @@ +../../../../../distros/supported/ubuntu_latest.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/jewel-luminous.yaml b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/jewel-luminous.yaml new file mode 100644 index 000000000..9adede74f --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/ceph-deploy/jewel-luminous.yaml @@ -0,0 +1,82 @@ +meta: +- desc: | + Setup 4 node ceph cluster using ceph-deploy, use latest + stable jewel as initial release, upgrade to luminous and + also setup mgr nodes along after upgrade, check for + cluster to reach healthy state, After upgrade run kernel tar/untar + task and systemd task. This test will detect any + ceph upgrade issue and systemd issues. 
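The "check for cluster to reach healthy state" step described above amounts to polling cluster health; a purely illustrative sketch, assuming the ``ceph`` CLI is available on the node::

    import subprocess
    import time

    def wait_for_healthy(timeout=300, interval=10):
        """Poll `ceph health` until the cluster reports HEALTH_OK."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            out = subprocess.check_output(['ceph', 'health']).decode()
            if out.startswith('HEALTH_OK'):
                return True
            time.sleep(interval)
        return False
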
+overrides: + ceph-deploy: + fs: xfs + conf: + global: + mon pg warn min per osd: 2 + osd: + osd pool default size: 2 + osd objectstore: filestore + osd sloppy crc: true + client: + rbd default features: 5 +openstack: +- machine: + disk: 100 +- volumes: + count: 3 + size: 30 +# reluctantely :( hard-coded machine type +# it will override command line args with teuthology-suite +machine_type: vps +roles: +- - mon.a + - mds.a + - osd.0 + - osd.1 + - osd.2 + - mgr.x +- - mon.b + - mgr.y +- - mon.c + - osd.3 + - osd.4 + - osd.5 +- - osd.6 + - osd.7 + - osd.8 + - client.0 +tasks: +- ssh-keys: +- print: "**** done ssh-keys" +- ceph-deploy: + branch: + stable: jewel + skip-mgr: True +- print: "**** done initial ceph-deploy" +- ceph-deploy.upgrade: + branch: + dev: luminous + setup-mgr-node: True + check-for-healthy: True + roles: + - mon.a + - mon.b + - mon.c + - osd.6 +- print: "**** done ceph-deploy upgrade" +- exec: + osd.0: + - ceph osd require-osd-release luminous + - ceph osd set-require-min-compat-client luminous +- print: "**** done `ceph osd require-osd-release luminous`" +- workunit: + clients: + all: + - kernel_untar_build.sh +- print: "**** done kernel_untar_build.sh" +- systemd: +- print: "**** done systemd" +- workunit: + clients: + all: + - rados/load-gen-mix.sh +- print: "**** done rados/load-gen-mix.sh" diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/0-cluster/start.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/0-cluster/start.yaml index 314562632..d1f1e1070 100644 --- a/ceph/qa/suites/upgrade/jewel-x/parallel/0-cluster/start.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/0-cluster/start.yaml @@ -18,6 +18,7 @@ roles: - client.1 - client.2 - client.3 +- - client.4 overrides: ceph: log-whitelist: diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml index a367ef37c..c64b2cded 100644 --- a/ceph/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml @@ -1,3 +1,22 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug client: 10 + debug monc: 10 + client.1: + debug ms: 1 + debug client: 10 + debug monc: 10 + client.2: + debug ms: 1 + debug client: 10 + debug monc: 10 + client.3: + debug ms: 1 + debug client: 10 + debug monc: 10 meta: - desc: | install ceph/jewel latest diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml index a8e28c52c..56eedbd6b 100644 --- a/ceph/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml @@ -5,7 +5,7 @@ meta: workload: full_sequential: - sequential: - - ceph-fuse: + - ceph-fuse: [client.2] - print: "**** done ceph-fuse 2-workload" - workunit: clients: diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml deleted file mode 120000 index 5283ac73e..000000000 --- a/ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../releases/luminous.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml new file mode 100644 index 000000000..e57b37753 --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/4-luminous.yaml @@ -0,0 +1,23 @@ +# this is the same fragment as 
../../../../releases/luminous.yaml +# but without line "ceph osd set-require-min-compat-client luminous" + +tasks: +- exec: + mgr.x: + - mkdir -p /var/lib/ceph/mgr/ceph-x + - ceph auth get-or-create-key mgr.x mon 'allow profile mgr' + - ceph auth export mgr.x > /var/lib/ceph/mgr/ceph-x/keyring +- ceph.restart: + daemons: [mgr.x] + wait-for-healthy: false +- exec: + osd.0: + - ceph osd require-osd-release luminous +- ceph.healthy: +overrides: + ceph: + conf: + mon: + mon warn on osd down out interval zero: false + log-whitelist: + - no active mgr diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-workload.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/5-workload.yaml new file mode 100644 index 000000000..f7e9de46a --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/5-workload.yaml @@ -0,0 +1,11 @@ +meta: +- desc: | + run basic import/export cli tests for rbd on not upgrated client.4 + (covers issue http://tracker.ceph.com/issues/21660) +tasks: + - workunit: + branch: jewel + clients: + client.4: + - rbd/import_export.sh + - print: "**** done rbd/import_export.sh 5-workload" diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/6-luminous-with-mgr.yaml similarity index 100% rename from ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/6-luminous-with-mgr.yaml diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/6.5-crush-compat.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/6.5-crush-compat.yaml new file mode 100644 index 000000000..20c0ffd9f --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/6.5-crush-compat.yaml @@ -0,0 +1,8 @@ +tasks: +- exec: + mon.a: + - ceph osd set-require-min-compat-client jewel + - ceph osd crush set-all-straw-buckets-to-straw2 + - ceph osd crush weight-set create-compat + - ceph osd crush weight-set reweight-compat osd.0 .9 + - ceph osd crush weight-set reweight-compat osd.1 1.2 diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/+ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/+ new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/blogbench.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/blogbench.yaml similarity index 74% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/blogbench.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/blogbench.yaml index d2629c03f..d73459e43 100644 --- a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/blogbench.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/blogbench.yaml @@ -4,10 +4,10 @@ meta: mount ceph-fuse on client.3 before running workunit tasks: - sequential: - - ceph-fuse: + - ceph-fuse: [client.3] - print: "**** done ceph-fuse 5-final-workload" - workunit: clients: client.3: - suites/blogbench.sh - - print: "**** done suites/blogbench.sh 5-final-workload" + - print: "**** done suites/blogbench.sh 7-final-workload" diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados-snaps-few-objects.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados-snaps-few-objects.yaml similarity index 88% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados-snaps-few-objects.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados-snaps-few-objects.yaml index d8b3dcb38..7dd61c5fc 100644 --- 
a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados-snaps-few-objects.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados-snaps-few-objects.yaml @@ -14,4 +14,4 @@ tasks: snap_create: 50 snap_remove: 50 rollback: 50 - - print: "**** done rados 4-final-workload" + - print: "**** done rados 7-final-workload" diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_loadgenmix.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_loadgenmix.yaml similarity index 74% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_loadgenmix.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_loadgenmix.yaml index 922a9da4f..b218b9226 100644 --- a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_loadgenmix.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_loadgenmix.yaml @@ -6,4 +6,4 @@ tasks: clients: client.1: - rados/load-gen-mix.sh - - print: "**** done rados/load-gen-mix.sh 4-final-workload" + - print: "**** done rados/load-gen-mix.sh 7-final-workload" diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_mon_thrash.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_mon_thrash.yaml similarity index 84% rename from ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_mon_thrash.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_mon_thrash.yaml index 9b60d2ebc..c835a659b 100644 --- a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_mon_thrash.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rados_mon_thrash.yaml @@ -15,4 +15,4 @@ tasks: clients: client.1: - rados/test-upgrade-v11.0.0.sh - - print: "**** done rados/test-upgrade-v11.0.0.sh 4-final-workload" + - print: "**** done rados/test-upgrade-v11.0.0.sh 7-final-workload" diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_cls.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_cls.yaml similarity index 69% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_cls.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_cls.yaml index aaf0a3779..46bbf7610 100644 --- a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_cls.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_cls.yaml @@ -6,4 +6,4 @@ tasks: clients: client.1: - cls/test_cls_rbd.sh - - print: "**** done cls/test_cls_rbd.sh 4-final-workload" + - print: "**** done cls/test_cls_rbd.sh 7-final-workload" diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_import_export.yaml similarity index 76% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_import_export.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_import_export.yaml index 46e135506..5ae749188 100644 --- a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rbd_import_export.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rbd_import_export.yaml @@ -8,4 +8,4 @@ tasks: - rbd/import_export.sh env: RBD_CREATE_ARGS: --new-format - - print: "**** done rbd/import_export.sh 4-final-workload" + - print: "**** done rbd/import_export.sh 7-final-workload" diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rgw_swift.yaml 
b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rgw_swift.yaml similarity index 64% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rgw_swift.yaml rename to ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rgw_swift.yaml index 7a7659ff4..780c4ad70 100644 --- a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rgw_swift.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/7-final-workload/rgw_swift.yaml @@ -6,8 +6,8 @@ overrides: frontend: civetweb tasks: - rgw: [client.1] - - print: "**** done rgw 4-final-workload" + - print: "**** done rgw 7-final-workload" - swift: client.1: rgw_server: client.1 - - print: "**** done swift 4-final-workload" + - print: "**** done swift 7-final-workload" diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/8-jewel-workload.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/8-jewel-workload.yaml new file mode 120000 index 000000000..81df389c3 --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/8-jewel-workload.yaml @@ -0,0 +1 @@ +5-workload.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml b/ceph/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml index 3033f14be..d68c258c0 100644 --- a/ceph/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml @@ -16,13 +16,20 @@ overrides: - scrub - osd_map_max_advance - wrongly marked + - overall HEALTH_ + - \(MGR_DOWN\) + - \(OSD_ + - \(PG_ + - \(CACHE_ fs: xfs conf: + global: + mon warn on pool no app: false mon: mon debug unsafe allow tier with nonempty snaps: true - mon warn on pool no app: false osd: osd map max advance: 1000 + osd map cache size: 1100 roles: - - mon.a - mds.a @@ -161,7 +168,7 @@ workload_x: branch: jewel clients: client.1: - - rados/test-upgrade-v11.0.0.sh + - rados/test-upgrade-v11.0.0-noec.sh - cls env: CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image' @@ -170,7 +177,7 @@ workload_x: branch: jewel clients: client.0: - - rados/test-upgrade-v11.0.0.sh + - rados/test-upgrade-v11.0.0-noec.sh - cls - print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x upgraded client" - rgw: [client.1] diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/6.5-crush-compat.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split/6.5-crush-compat.yaml new file mode 120000 index 000000000..02263d105 --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/stress-split/6.5-crush-compat.yaml @@ -0,0 +1 @@ +../parallel/6.5-crush-compat.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/kraken-x/ceph-deploy/kraken-luminous.yaml b/ceph/qa/suites/upgrade/kraken-x/ceph-deploy/kraken-luminous.yaml new file mode 100644 index 000000000..4a5536275 --- /dev/null +++ b/ceph/qa/suites/upgrade/kraken-x/ceph-deploy/kraken-luminous.yaml @@ -0,0 +1,61 @@ +meta: +- desc: | + Setup 4 node ceph cluster using ceph-deploy, use latest + stable kraken as initial release, upgrade to luminous and + also setup mgr nodes along after upgrade, check for + cluster to reach healthy state, After upgrade run kernel tar/untar + task and systemd task. This test will detect any + ceph upgrade issue and systemd issues. 
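The description above, and the ceph-deploy.upgrade task it relies on (implemented in qa/tasks/ceph_deploy.py later in this patch), boil down to a serial per-role package reinstall plus a service restart. A condensed, illustrative sketch of that loop — the helper name and hard-coded branch flag are assumptions for illustration, not the real task:

    from teuthology.orchestra import run

    def upgrade_serially(ceph_admin, cluster, roles, testdir, branch_flag='--dev=luminous'):
        for role in roles:
            # mon/mds roles are re-mapped by ceph-deploy, e.g. mon.a -> mon.<short hostname>
            role = cluster.mapped_role.get(role, role)
            for remote, _ in cluster.only(role).remotes.iteritems():
                ceph_admin.run(args=[
                    'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'),
                    run.Raw('./ceph-deploy install %s %s' % (branch_flag, remote.shortname)),
                ])
                # the package upgrade alone does not restart daemons, so restart them explicitly
                remote.run(args=['sudo', 'systemctl', 'restart', 'ceph.target'])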
+overrides: + ceph-deploy: + fs: xfs + conf: + global: + mon pg warn min per osd: 2 + osd: + osd pool default size: 2 + osd objectstore: filestore + osd sloppy crc: true + client: + rbd default features: 5 +roles: +- - mon.a + - mds.a + - osd.0 + - osd.1 + - osd.2 + - mgr.x +- - mon.b + - mgr.y +- - mon.c + - osd.3 + - osd.4 + - osd.5 +- - osd.6 + - osd.7 + - osd.8 + - client.0 +tasks: +- ssh-keys: +- ceph-deploy: + branch: + stable: kraken + skip-mgr: True +- ceph-deploy.upgrade: + branch: + dev: luminous + setup-mgr-node: True + check-for-healthy: True + roles: + - mon.a + - mon.b + - mon.c +- workunit: + clients: + all: + - kernel_untar_build.sh +- systemd: +- workunit: + clients: + all: + - rados/load-gen-mix.sh diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml index 0dc9dd2bc..f5a883a39 100644 --- a/ceph/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml +++ b/ceph/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml @@ -18,6 +18,7 @@ roles: - client.1 - client.2 - client.3 +- - client.4 overrides: ceph: log-whitelist: diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml new file mode 100644 index 000000000..80c2b9dbd --- /dev/null +++ b/ceph/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml @@ -0,0 +1,4 @@ +tasks: +- exec: + osd.0: + - ceph osd require-osd-release luminous diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-workload.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/5-workload.yaml new file mode 100644 index 000000000..851c5c8cb --- /dev/null +++ b/ceph/qa/suites/upgrade/kraken-x/parallel/5-workload.yaml @@ -0,0 +1,11 @@ +meta: +- desc: | + run basic import/export cli tests for rbd on not upgrated client.4 + (covers issue http://tracker.ceph.com/issues/21660) +tasks: + - workunit: + branch: kraken + clients: + client.4: + - rbd/import_export.sh + - print: "**** done rbd/import_export.sh 5-workload" diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/6-luminous-with-mgr.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/6-luminous-with-mgr.yaml new file mode 120000 index 000000000..5c72153e1 --- /dev/null +++ b/ceph/qa/suites/upgrade/kraken-x/parallel/6-luminous-with-mgr.yaml @@ -0,0 +1 @@ +../../../../releases/luminous-with-mgr.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/+ b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/+ new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/blogbench.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/blogbench.yaml similarity index 100% rename from ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/blogbench.yaml rename to ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/blogbench.yaml diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados-snaps-few-objects.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados-snaps-few-objects.yaml similarity index 100% rename from ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados-snaps-few-objects.yaml rename to ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados-snaps-few-objects.yaml diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_loadgenmix.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_loadgenmix.yaml similarity index 100% rename from 
ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rados_loadgenmix.yaml rename to ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_loadgenmix.yaml diff --git a/ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_mon_thrash.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_mon_thrash.yaml similarity index 100% rename from ceph/qa/suites/upgrade/kraken-x/parallel/5-final-workload/rados_mon_thrash.yaml rename to ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rados_mon_thrash.yaml diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_cls.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_cls.yaml similarity index 100% rename from ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_cls.yaml rename to ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_cls.yaml diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_import_export.yaml similarity index 100% rename from ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rbd_import_export.yaml rename to ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rbd_import_export.yaml diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rgw_swift.yaml b/ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rgw_swift.yaml similarity index 100% rename from ceph/qa/suites/upgrade/jewel-x/parallel/5-final-workload/rgw_swift.yaml rename to ceph/qa/suites/upgrade/kraken-x/parallel/7-final-workload/rgw_swift.yaml diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml index 1e8d5a58d..3684b1e0a 100644 --- a/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml +++ b/ceph/qa/suites/upgrade/luminous-x/parallel/0-cluster/start.yaml @@ -18,6 +18,7 @@ roles: - client.1 - client.2 - client.3 +- - client.4 overrides: ceph: log-whitelist: diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_no_upgrated.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_no_upgrated.yaml new file mode 100644 index 000000000..5de8a2361 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_no_upgrated.yaml @@ -0,0 +1,13 @@ +meta: +- desc: | + run basic import/export cli tests for rbd + on NO upgrated client +tasks: + - workunit: + branch: luminous + clients: + client.4: + - rbd/import_export.sh + env: + RBD_CREATE_ARGS: --new-format + - print: "**** done rbd/import_export.sh 4-final-workload on NO upgrated client" diff --git a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_upgrated.yaml similarity index 65% rename from ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml rename to ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_upgrated.yaml index 46e135506..2c7c484e1 100644 --- a/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export.yaml +++ b/ceph/qa/suites/upgrade/luminous-x/parallel/5-final-workload/rbd_import_export_upgrated.yaml @@ -1,6 +1,7 @@ meta: - desc: | run basic import/export cli tests for rbd + on upgrated client tasks: - workunit: clients: @@ -8,4 +9,4 @@ tasks: - rbd/import_export.sh env: RBD_CREATE_ARGS: --new-format 
- - print: "**** done rbd/import_export.sh 4-final-workload" + - print: "**** done rbd/import_export.sh 4-final-workload on upgrated client" diff --git a/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/1-ceph-install b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/1-ceph-install new file mode 120000 index 000000000..0479ac542 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-x/stress-split-erasure-code/1-ceph-install @@ -0,0 +1 @@ +../stress-split/1-ceph-install/ \ No newline at end of file diff --git a/ceph/qa/tasks/ceph.py b/ceph/qa/tasks/ceph.py index a37fec1c0..72f265375 100644 --- a/ceph/qa/tasks/ceph.py +++ b/ceph/qa/tasks/ceph.py @@ -43,7 +43,9 @@ def generate_caps(type_): osd='allow *', ), mgr=dict( - mon='allow *', + mon='allow profile mgr', + osd='allow *', + mds='allow *', ), mds=dict( mon='allow *', @@ -338,17 +340,18 @@ def create_rbd_pool(ctx, config): remote=mon_remote, ceph_cluster=cluster_name, ) - log.info('Creating RBD pool') - mon_remote.run( - args=['sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'pool', 'create', 'rbd', '8']) - mon_remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'pool', 'application', 'enable', - 'rbd', 'rbd', '--yes-i-really-mean-it' - ], - check_status=False) + if config.get('create_rbd_pool', True): + log.info('Creating RBD pool') + mon_remote.run( + args=['sudo', 'ceph', '--cluster', cluster_name, + 'osd', 'pool', 'create', 'rbd', '8']) + mon_remote.run( + args=[ + 'sudo', 'ceph', '--cluster', cluster_name, + 'osd', 'pool', 'application', 'enable', + 'rbd', 'rbd', '--yes-i-really-mean-it' + ], + check_status=False) yield @contextlib.contextmanager @@ -365,7 +368,8 @@ def cephfs_setup(ctx, config): if mdss.remotes: log.info('Setting up CephFS filesystem...') - fs = Filesystem(ctx, name='cephfs', create=True) + fs = Filesystem(ctx, name='cephfs', create=True, + ec_profile=config.get('cephfs_ec_profile', None)) is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles] diff --git a/ceph/qa/tasks/ceph_deploy.py b/ceph/qa/tasks/ceph_deploy.py index b22c32113..38fbe43c2 100644 --- a/ceph/qa/tasks/ceph_deploy.py +++ b/ceph/qa/tasks/ceph_deploy.py @@ -15,6 +15,7 @@ from teuthology.config import config as teuth_config from teuthology.task import install as install_fn from teuthology.orchestra import run from tasks.cephfs.filesystem import Filesystem +from teuthology.misc import wait_until_healthy log = logging.getLogger(__name__) @@ -27,7 +28,8 @@ def download_ceph_deploy(ctx, config): will use that instead. The `bootstrap` script is ran, with the argument obtained from `python_version`, if specified. 
""" - ceph_admin = ctx.cluster.only(teuthology.get_first_mon(ctx, config)) + # use mon.a for ceph_admin + (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys() try: py_ver = str(config['python_version']) @@ -41,8 +43,7 @@ def download_ceph_deploy(ctx, config): )) log.info("Installing Python") - for admin in ceph_admin.remotes: - system_type = teuthology.get_system_type(admin) + system_type = teuthology.get_system_type(ceph_admin) if system_type == 'rpm': package = 'python34' if py_ver == '3' else 'python' @@ -145,7 +146,7 @@ def get_nodes_using_role(ctx, target_role): # Prepare a modified version of cluster.remotes with ceph-deploy-ized names modified_remotes = {} - + ceph_deploy_mapped = dict() for _remote, roles_for_host in ctx.cluster.remotes.iteritems(): modified_remotes[_remote] = [] for svc_id in roles_for_host: @@ -156,13 +157,16 @@ def get_nodes_using_role(ctx, target_role): nodes_of_interest.append(fqdn) else: nodes_of_interest.append(nodename) - - modified_remotes[_remote].append( - "{0}.{1}".format(target_role, nodename)) + mapped_role = "{0}.{1}".format(target_role, nodename) + modified_remotes[_remote].append(mapped_role) + # keep dict of mapped role for later use by tasks + # eg. mon.a => mon.node1 + ceph_deploy_mapped[svc_id] = mapped_role else: modified_remotes[_remote].append(svc_id) ctx.cluster.remotes = modified_remotes + ctx.cluster.mapped_role = ceph_deploy_mapped return nodes_of_interest @@ -213,8 +217,8 @@ def build_ceph_cluster(ctx, config): # Expect to find ceph_admin on the first mon by ID, same place that the download task # puts it. Remember this here, because subsequently IDs will change from those in # the test config to those that ceph-deploy invents. - (ceph_admin,) = ctx.cluster.only( - teuthology.get_first_mon(ctx, config)).remotes.iterkeys() + + (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys() def execute_ceph_deploy(cmd): """Remotely execute a ceph_deploy command""" @@ -241,10 +245,16 @@ def build_ceph_cluster(ctx, config): mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_role(ctx, 'mon') mon_nodes = " ".join(mon_node) - mgr_nodes = get_nodes_using_role(ctx, 'mgr') - mgr_nodes = " ".join(mgr_nodes) + # skip mgr based on config item + # this is needed when test uses latest code to install old ceph + # versions + skip_mgr = config.get('skip-mgr', False) + if not skip_mgr: + mgr_nodes = get_nodes_using_role(ctx, 'mgr') + mgr_nodes = " ".join(mgr_nodes) new_mon = './ceph-deploy new' + " " + mon_nodes - mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes + if not skip_mgr: + mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname @@ -307,7 +317,8 @@ def build_ceph_cluster(ctx, config): estatus_gather = execute_ceph_deploy(gather_keys) - execute_ceph_deploy(mgr_create) + if not skip_mgr: + execute_ceph_deploy(mgr_create) if mds_nodes: estatus_mds = execute_ceph_deploy(deploy_mds) @@ -334,7 +345,7 @@ def build_ceph_cluster(ctx, config): # first check for filestore, default is bluestore with ceph-deploy if config.get('filestore') is not None: osd_create_cmd += '--filestore ' - else: + elif config.get('bluestore') is not None: osd_create_cmd += '--bluestore ' if config.get('dmcrypt') is not None: osd_create_cmd += '--dmcrypt ' @@ -414,7 +425,7 @@ def build_ceph_cluster(ctx, config): if mds_nodes: log.info('Configuring CephFS...') - ceph_fs = Filesystem(ctx, create=True) + Filesystem(ctx, 
create=True) elif not config.get('only_mon'): raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") @@ -524,7 +535,7 @@ def cli_test(ctx, config): """Either use git path or repo path """ args = ['cd', conf_dir, run.Raw(';')] if path: - args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path)); + args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path)) else: args.append('ceph-deploy') args.append(run.Raw(cmd)) @@ -608,11 +619,11 @@ def cli_test(ctx, config): log.info("Waiting for cluster to become healthy") with contextutil.safe_while(sleep=10, tries=6, action='check health') as proceed: - while proceed(): - r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) - out = r.stdout.getvalue() - if (out.split(None,1)[0] == 'HEALTH_OK'): - break + while proceed(): + r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) + out = r.stdout.getvalue() + if (out.split(None, 1)[0] == 'HEALTH_OK'): + break rgw_install = 'install {branch} --rgw {node}'.format( branch=test_branch, node=nodename, @@ -679,6 +690,108 @@ def single_node_test(ctx, config): yield +@contextlib.contextmanager +def upgrade(ctx, config): + """ + Upgrade using ceph-deploy + eg: + ceph-deploy.upgrade: + # to upgrade to specific branch, use + branch: + stable: jewel + # to setup mgr node, use + setup-mgr-node: True + # to wait for cluster to be healthy after all upgrade, use + wait-for-healthy: True + role: (upgrades the below roles serially) + mon.a + mon.b + osd.0 + """ + roles = config.get('roles') + # get the roles that are mapped as per ceph-deploy + # roles are mapped for mon/mds eg: mon.a => mon.host_short_name + mapped_role = ctx.cluster.mapped_role + if config.get('branch'): + branch = config.get('branch') + (var, val) = branch.items()[0] + ceph_branch = '--{var}={val}'.format(var=var, val=val) + else: + # default to master + ceph_branch = '--dev=master' + # get the node used for initial deployment which is mon.a + mon_a = mapped_role.get('mon.a') + (ceph_admin,) = ctx.cluster.only(mon_a).remotes.iterkeys() + testdir = teuthology.get_testdir(ctx) + cmd = './ceph-deploy install ' + ceph_branch + for role in roles: + # check if this role is mapped (mon or mds) + if mapped_role.get(role): + role = mapped_role.get(role) + remotes_and_roles = ctx.cluster.only(role).remotes + for remote, roles in remotes_and_roles.iteritems(): + nodename = remote.shortname + cmd = cmd + ' ' + nodename + log.info("Upgrading ceph on %s", nodename) + ceph_admin.run( + args=[ + 'cd', + '{tdir}/ceph-deploy'.format(tdir=testdir), + run.Raw('&&'), + run.Raw(cmd), + ], + ) + # restart all ceph services, ideally upgrade should but it does not + remote.run( + args=[ + 'sudo', 'systemctl', 'restart', 'ceph.target' + ] + ) + ceph_admin.run(args=['sudo', 'ceph', '-s']) + + # workaround for http://tracker.ceph.com/issues/20950 + # write the correct mgr key to disk + if config.get('setup-mgr-node', None): + mons = ctx.cluster.only(teuthology.is_type('mon')) + for remote, roles in mons.remotes.iteritems(): + remote.run( + args=[ + run.Raw('sudo ceph auth get client.bootstrap-mgr'), + run.Raw('|'), + run.Raw('sudo tee'), + run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring') + ] + ) + + if config.get('setup-mgr-node', None): + mgr_nodes = get_nodes_using_role(ctx, 'mgr') + mgr_nodes = " ".join(mgr_nodes) + mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes + mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes + # install mgr + ceph_admin.run( + args=[ + 'cd', + 
'{tdir}/ceph-deploy'.format(tdir=testdir), + run.Raw('&&'), + run.Raw(mgr_install), + ], + ) + # create mgr + ceph_admin.run( + args=[ + 'cd', + '{tdir}/ceph-deploy'.format(tdir=testdir), + run.Raw('&&'), + run.Raw(mgr_create), + ], + ) + ceph_admin.run(args=['sudo', 'ceph', '-s']) + if config.get('wait-for-healthy', None): + wait_until_healthy(ctx, ceph_admin, use_sudo=True) + yield + + @contextlib.contextmanager def task(ctx, config): """ @@ -694,12 +807,15 @@ def task(ctx, config): branch: stable: bobtail mon_initial_members: 1 + ceph-deploy-branch: my-ceph-deploy-branch only_mon: true keep_running: true # either choose bluestore or filestore, default is bluestore bluestore: True # or filestore: True + # skip install of mgr for old release using below flag + skip-mgr: True ( default is False ) tasks: - install: diff --git a/ceph/qa/tasks/ceph_manager.py b/ceph/qa/tasks/ceph_manager.py index 9da03bdd9..5a89f235f 100644 --- a/ceph/qa/tasks/ceph_manager.py +++ b/ceph/qa/tasks/ceph_manager.py @@ -111,12 +111,12 @@ class Thrasher: self.stopping = False self.logger = logger self.config = config - self.revive_timeout = self.config.get("revive_timeout", 150) + self.revive_timeout = self.config.get("revive_timeout", 360) self.pools_to_fix_pgp_num = set() if self.config.get('powercycle'): self.revive_timeout += 120 self.clean_wait = self.config.get('clean_wait', 0) - self.minin = self.config.get("min_in", 3) + self.minin = self.config.get("min_in", 4) self.chance_move_pg = self.config.get('chance_move_pg', 1.0) self.sighup_delay = self.config.get('sighup_delay') self.optrack_toggle_delay = self.config.get('optrack_toggle_delay') @@ -286,6 +286,7 @@ class Thrasher: pg=pg, id=exp_osd)) # export + # Can't use new export-remove op since this is part of upgrade testing cmd = prefix + "--op export --pgid {pg} --file {file}" cmd = cmd.format(id=exp_osd, pg=pg, file=exp_path) proc = exp_remote.run(args=cmd) @@ -294,7 +295,7 @@ class Thrasher: "export failure with status {ret}". format(ret=proc.exitstatus)) # remove - cmd = prefix + "--op remove --pgid {pg}" + cmd = prefix + "--force --op remove --pgid {pg}" cmd = cmd.format(id=exp_osd, pg=pg) proc = exp_remote.run(args=cmd) if proc.exitstatus: @@ -767,7 +768,7 @@ class Thrasher: osd_debug_skip_full_check_in_backfill_reservation to force the more complicated check in do_scan to be exercised. - Then, verify that all backfills stop. + Then, verify that all backfillings stop. 
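For context on the backfill/backfilling rename in this hunk: luminous reports the PG state token as 'backfilling', so the status keys checked below change accordingly. A rough sketch of how per-state counts can be tallied from get_pg_stats() output — the real compile_pg_status helper may differ:

    from collections import defaultdict

    def tally_pg_states(pg_stats):
        # each entry carries a '+'-joined state string, e.g. 'active+remapped+backfilling'
        counts = defaultdict(int)
        for pg in pg_stats:
            for token in pg['state'].split('+'):
                counts[token] += 1
        return dict(counts)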
""" self.log("injecting backfill full") for i in self.live_osds: @@ -779,13 +780,13 @@ class Thrasher: check_status=True, timeout=30, stdout=DEVNULL) for i in range(30): status = self.ceph_manager.compile_pg_status() - if 'backfill' not in status.keys(): + if 'backfilling' not in status.keys(): break self.log( - "waiting for {still_going} backfills".format( - still_going=status.get('backfill'))) + "waiting for {still_going} backfillings".format( + still_going=status.get('backfilling'))) time.sleep(1) - assert('backfill' not in self.ceph_manager.compile_pg_status().keys()) + assert('backfilling' not in self.ceph_manager.compile_pg_status().keys()) for i in self.live_osds: self.ceph_manager.set_config( i, @@ -2043,7 +2044,7 @@ class CephManager: for pg in pgs: if (pg['state'].count('active') and not pg['state'].count('recover') and - not pg['state'].count('backfill') and + not pg['state'].count('backfilling') and not pg['state'].count('stale')): num += 1 return num @@ -2217,6 +2218,8 @@ class CephManager: else: self.log("no progress seen, keeping timeout for now") if now - start >= timeout: + if self.is_recovered(): + break self.log('dumping pgs') out = self.raw_cluster_cmd('pg', 'dump') self.log(out) @@ -2317,6 +2320,30 @@ class CephManager: time.sleep(3) self.log("active!") + def wait_till_pg_convergence(self, timeout=None): + start = time.time() + old_stats = None + active_osds = [osd['osd'] for osd in self.get_osd_dump() + if osd['in'] and osd['up']] + while True: + # strictly speaking, no need to wait for mon. but due to the + # "ms inject socket failures" setting, the osdmap could be delayed, + # so mgr is likely to ignore the pg-stat messages with pgs serving + # newly created pools which is not yet known by mgr. so, to make sure + # the mgr is updated with the latest pg-stats, waiting for mon/mgr is + # necessary. + self.flush_pg_stats(active_osds) + new_stats = dict((stat['pgid'], stat['state']) + for stat in self.get_pg_stats()) + if old_stats == new_stats: + return old_stats + if timeout is not None: + assert time.time() - start < timeout, \ + 'failed to reach convergence before %d secs' % timeout + old_stats = new_stats + # longer than mgr_stats_period + time.sleep(5 + 1) + def mark_out_osd(self, osd): """ Wrapper to mark osd out. @@ -2368,7 +2395,7 @@ class CephManager: time.sleep(2) self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop() - def revive_osd(self, osd, timeout=150, skip_admin_check=False): + def revive_osd(self, osd, timeout=360, skip_admin_check=False): """ Revive osds by either power cycling (if indicated by the config) or by restarting. diff --git a/ceph/qa/tasks/ceph_objectstore_tool.py b/ceph/qa/tasks/ceph_objectstore_tool.py index 3dc49624c..912577317 100644 --- a/ceph/qa/tasks/ceph_objectstore_tool.py +++ b/ceph/qa/tasks/ceph_objectstore_tool.py @@ -591,7 +591,7 @@ def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False): continue for pg in pgs[osdid]: - cmd = ((prefix + "--op remove --pgid {pg}"). + cmd = ((prefix + "--force --op remove --pgid {pg}"). format(pg=pg, id=osdid)) proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) diff --git a/ceph/qa/tasks/cephfs/filesystem.py b/ceph/qa/tasks/cephfs/filesystem.py index 44f6cbaf1..9638fd55c 100644 --- a/ceph/qa/tasks/cephfs/filesystem.py +++ b/ceph/qa/tasks/cephfs/filesystem.py @@ -374,10 +374,12 @@ class Filesystem(MDSCluster): This object is for driving a CephFS filesystem. The MDS daemons driven by MDSCluster may be shared with other Filesystems. 
""" - def __init__(self, ctx, fscid=None, name=None, create=False): + def __init__(self, ctx, fscid=None, name=None, create=False, + ec_profile=None): super(Filesystem, self).__init__(ctx) self.name = name + self.ec_profile = ec_profile self.id = None self.metadata_pool_name = None self.metadata_overlay = False @@ -473,8 +475,22 @@ class Filesystem(MDSCluster): self.name, self.metadata_pool_name, data_pool_name, '--allow-dangerous-metadata-overlay') else: - self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', - data_pool_name, pgs_per_fs_pool.__str__()) + if self.ec_profile: + log.info("EC profile is %s", self.ec_profile) + cmd = ['osd', 'erasure-code-profile', 'set', data_pool_name] + cmd.extend(self.ec_profile) + self.mon_manager.raw_cluster_cmd(*cmd) + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', + data_pool_name, pgs_per_fs_pool.__str__(), 'erasure', + data_pool_name) + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'set', + data_pool_name, 'allow_ec_overwrites', 'true') + else: + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', + data_pool_name, pgs_per_fs_pool.__str__()) self.mon_manager.raw_cluster_cmd('fs', 'new', self.name, self.metadata_pool_name, data_pool_name) self.check_pool_application(self.metadata_pool_name) diff --git a/ceph/qa/tasks/cephfs/test_client_limits.py b/ceph/qa/tasks/cephfs/test_client_limits.py index b06d2a1d2..cb5e3a462 100644 --- a/ceph/qa/tasks/cephfs/test_client_limits.py +++ b/ceph/qa/tasks/cephfs/test_client_limits.py @@ -29,7 +29,7 @@ class TestClientLimits(CephFSTestCase): REQUIRE_KCLIENT_REMOTE = True CLIENTS_REQUIRED = 2 - def _test_client_pin(self, use_subdir): + def _test_client_pin(self, use_subdir, open_files): """ When a client pins an inode in its cache, for example because the file is held open, it should reject requests from the MDS to trim these caps. The MDS should complain @@ -39,13 +39,16 @@ class TestClientLimits(CephFSTestCase): :param use_subdir: whether to put test files in a subdir or use root """ - cache_size = 100 - open_files = 200 + cache_size = open_files/2 self.set_conf('mds', 'mds cache size', cache_size) self.fs.mds_fail_restart() self.fs.wait_for_daemons() + mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + self.assertTrue(open_files >= mds_min_caps_per_client) + mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client")) + mount_a_client_id = self.mount_a.get_global_id() path = "subdir/mount_a" if use_subdir else "mount_a" open_proc = self.mount_a.open_n_background(path, open_files) @@ -62,8 +65,7 @@ class TestClientLimits(CephFSTestCase): # MDS should not be happy about that, as the client is failing to comply # with the SESSION_RECALL messages it is being sent mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout")) - self.wait_for_health("MDS_CLIENT_RECALL", - mds_recall_state_timeout + 10) + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10) # We can also test that the MDS health warning for oversized # cache is functioning as intended. 
@@ -82,19 +84,31 @@ class TestClientLimits(CephFSTestCase): # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message, # which depend on the caps outstanding, cache size and overall ratio - self.wait_until_equal( - lambda: self.get_session(mount_a_client_id)['num_caps'], - int(open_files * 0.2), - timeout=30, - reject_fn=lambda x: x < int(open_files*0.2)) + recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2)) + def expected_caps(): + num_caps = self.get_session(mount_a_client_id)['num_caps'] + if num_caps < mds_min_caps_per_client: + raise RuntimeError("client caps fell below min!") + elif num_caps == mds_min_caps_per_client: + return True + elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05: + return True + else: + return False + + self.wait_until_true(expected_caps, timeout=60) @needs_trimming def test_client_pin_root(self): - self._test_client_pin(False) + self._test_client_pin(False, 400) @needs_trimming def test_client_pin(self): - self._test_client_pin(True) + self._test_client_pin(True, 800) + + @needs_trimming + def test_client_pin_mincaps(self): + self._test_client_pin(True, 200) def test_client_release_bug(self): """ diff --git a/ceph/qa/tasks/cephfs/test_volume_client.py b/ceph/qa/tasks/cephfs/test_volume_client.py index 65dc9a9eb..0876af96e 100644 --- a/ceph/qa/tasks/cephfs/test_volume_client.py +++ b/ceph/qa/tasks/cephfs/test_volume_client.py @@ -355,11 +355,11 @@ vc.disconnect() :return: """ - # Because the teuthology config template sets mon_pg_warn_max_per_osd to + # Because the teuthology config template sets mon_max_pg_per_osd to # 10000 (i.e. it just tries to ignore health warnings), reset it to something # sane before using volume_client, to avoid creating pools with absurdly large # numbers of PGs. - self.set_conf("global", "mon pg warn max per osd", "300") + self.set_conf("global", "mon max pg per osd", "300") for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'): mon_daemon_state.restart() @@ -368,7 +368,7 @@ vc.disconnect() # Calculate how many PGs we'll expect the new volume pool to have osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty')) - max_per_osd = int(self.fs.get_config('mon_pg_warn_max_per_osd')) + max_per_osd = int(self.fs.get_config('mon_max_pg_per_osd')) osd_count = len(osd_map['osds']) max_overall = osd_count * max_per_osd @@ -764,7 +764,7 @@ vc.disconnect() # auth ID belongs to, the auth ID's authorized access levels # for different volumes, versioning details, etc. expected_auth_metadata = { - u"version": 1, + u"version": 2, u"compat_version": 1, u"dirty": False, u"tenant_id": u"tenant1", @@ -791,7 +791,7 @@ vc.disconnect() # Verify that the volume metadata file stores info about auth IDs # and their access levels to the volume, versioning details, etc. 
expected_vol_metadata = { - u"version": 1, + u"version": 2, u"compat_version": 1, u"auths": { u"guest": { @@ -905,3 +905,112 @@ vc.disconnect() volume_id=volume_id, auth_id=guestclient["auth_id"], ))) + + def test_put_object(self): + vc_mount = self.mounts[1] + vc_mount.umount_wait() + self._configure_vc_auth(vc_mount, "manila") + + obj_data = 'test data' + obj_name = 'test_vc_obj_1' + pool_name = self.fs.get_data_pool_names()[0] + + self._volume_client_python(vc_mount, dedent(""" + vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}") + """.format( + pool_name = pool_name, + obj_name = obj_name, + obj_data = obj_data + ))) + + read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name) + self.assertEqual(obj_data, read_data) + + def test_get_object(self): + vc_mount = self.mounts[1] + vc_mount.umount_wait() + self._configure_vc_auth(vc_mount, "manila") + + obj_data = 'test_data' + obj_name = 'test_vc_ob_2' + pool_name = self.fs.get_data_pool_names()[0] + + self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data) + + self._volume_client_python(vc_mount, dedent(""" + data_read = vc.get_object("{pool_name}", "{obj_name}") + assert data_read == b"{obj_data}" + """.format( + pool_name = pool_name, + obj_name = obj_name, + obj_data = obj_data + ))) + + def test_delete_object(self): + vc_mount = self.mounts[1] + vc_mount.umount_wait() + self._configure_vc_auth(vc_mount, "manila") + + obj_data = 'test data' + obj_name = 'test_vc_obj_3' + pool_name = self.fs.get_data_pool_names()[0] + + self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data) + + self._volume_client_python(vc_mount, dedent(""" + data_read = vc.delete_object("{pool_name}", "{obj_name}") + """.format( + pool_name = pool_name, + obj_name = obj_name, + ))) + + with self.assertRaises(CommandFailedError): + self.fs.rados(['stat', obj_name], pool=pool_name) + + # Check idempotency -- no error raised trying to delete non-existent + # object + self._volume_client_python(vc_mount, dedent(""" + data_read = vc.delete_object("{pool_name}", "{obj_name}") + """.format( + pool_name = pool_name, + obj_name = obj_name, + ))) + + def test_21501(self): + """ + Reproducer for #21501 "ceph_volume_client: sets invalid caps for + existing IDs with no caps" (http://tracker.ceph.com/issues/21501) + """ + + vc_mount = self.mounts[1] + vc_mount.umount_wait() + + # Configure vc_mount as the handle for driving volumeclient + self._configure_vc_auth(vc_mount, "manila") + + # Create a volume + group_id = "grpid" + volume_id = "volid" + mount_path = self._volume_client_python(vc_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + create_result = vc.create_volume(vp, 1024*1024*10) + print create_result['mount_path'] + """.format( + group_id=group_id, + volume_id=volume_id + ))) + + # Create an auth ID with no caps + guest_id = '21501' + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'get-or-create', 'client.{0}'.format(guest_id)) + + guest_mount = self.mounts[2] + guest_mount.umount_wait() + + # Set auth caps for the auth ID using the volumeclient + self._configure_guest_auth(vc_mount, guest_mount, guest_id, mount_path) + + # Mount the volume in the guest using the auth ID to assert that the + # auth caps are valid + guest_mount.mount(mount_path=mount_path) diff --git a/ceph/qa/tasks/divergent_priors2.py b/ceph/qa/tasks/divergent_priors2.py index 0e645c7c4..0ed753278 100644 --- a/ceph/qa/tasks/divergent_priors2.py +++ b/ceph/qa/tasks/divergent_priors2.py @@ -156,13 +156,7 @@ def task(ctx, config): 
format(fpath=FSPATH, jpath=JPATH)) pid = os.getpid() expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid)) - cmd = ((prefix + "--op export --pgid 2.0 --file {file}"). - format(id=divergent, file=expfile)) - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - assert proc.exitstatus == 0 - - cmd = ((prefix + "--op remove --pgid 2.0"). + cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}"). format(id=divergent, file=expfile)) proc = exp_remote.run(args=cmd, wait=True, check_status=False, stdout=StringIO()) diff --git a/ceph/qa/tasks/mgr/mgr_test_case.py b/ceph/qa/tasks/mgr/mgr_test_case.py index a5531d33e..ec3f98d28 100644 --- a/ceph/qa/tasks/mgr/mgr_test_case.py +++ b/ceph/qa/tasks/mgr/mgr_test_case.py @@ -1,14 +1,18 @@ from unittest import case import json +import logging from teuthology import misc from tasks.ceph_test_case import CephTestCase -# TODO move definition of CephCluster +# TODO move definition of CephCluster away from the CephFS stuff from tasks.cephfs.filesystem import CephCluster +log = logging.getLogger(__name__) + + class MgrCluster(CephCluster): def __init__(self, ctx): super(MgrCluster, self).__init__(ctx) @@ -43,6 +47,12 @@ class MgrCluster(CephCluster): def get_standby_ids(self): return [s['name'] for s in self.get_mgr_map()["standbys"]] + def set_module_localized_conf(self, module, mgr_id, key, val): + self.mon_manager.raw_cluster_cmd("config-key", "set", + "mgr/{0}/{1}/{2}".format( + module, mgr_id, key + ), val) + class MgrTestCase(CephTestCase): MGRS_REQUIRED = 1 @@ -77,3 +87,84 @@ class MgrTestCase(CephTestCase): self.wait_until_true( lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys, timeout=20) + + def _load_module(self, module_name): + loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd( + "mgr", "module", "ls"))['enabled_modules'] + if module_name in loaded: + # The enable command is idempotent, but our wait for a restart + # isn't, so let's return now if it's already loaded + return + + initial_gid = self.mgr_cluster.get_mgr_map()['active_gid'] + self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", + module_name) + + # Wait for the module to load + def has_restarted(): + mgr_map = self.mgr_cluster.get_mgr_map() + done = mgr_map['active_gid'] != initial_gid and mgr_map['available'] + if done: + log.info("Restarted after module load (new active {0}/{1})".format( + mgr_map['active_name'] , mgr_map['active_gid'])) + return done + self.wait_until_true(has_restarted, timeout=30) + + + def _get_uri(self, service_name): + # Little dict hack so that I can assign into this from + # the get_or_none function + mgr_map = {'x': None} + + def _get_or_none(): + mgr_map['x'] = self.mgr_cluster.get_mgr_map() + result = mgr_map['x']['services'].get(service_name, None) + return result + + self.wait_until_true(lambda: _get_or_none() is not None, 30) + + uri = mgr_map['x']['services'][service_name] + + log.info("Found {0} at {1} (daemon {2}/{3})".format( + service_name, uri, mgr_map['x']['active_name'], + mgr_map['x']['active_gid'])) + + return uri + + + def _assign_ports(self, module_name, config_name, min_port=7789): + """ + To avoid the need to run lots of hosts in teuthology tests to + get different URLs per mgr, we will hand out different ports + to each mgr here. + + This is already taken care of for us when running in a vstart + environment. + """ + # Start handing out ports well above Ceph's range. 
+ assign_port = min_port + + for mgr_id in self.mgr_cluster.mgr_ids: + self.mgr_cluster.mgr_stop(mgr_id) + self.mgr_cluster.mgr_fail(mgr_id) + + for mgr_id in self.mgr_cluster.mgr_ids: + log.info("Using port {0} for {1} on mgr.{2}".format( + assign_port, module_name, mgr_id + )) + self.mgr_cluster.set_module_localized_conf(module_name, mgr_id, + config_name, + str(assign_port)) + assign_port += 1 + + for mgr_id in self.mgr_cluster.mgr_ids: + self.mgr_cluster.mgr_restart(mgr_id) + + def is_available(): + mgr_map = self.mgr_cluster.get_mgr_map() + done = mgr_map['available'] + if done: + log.info("Available after assign ports (new active {0}/{1})".format( + mgr_map['active_name'] , mgr_map['active_gid'])) + return done + self.wait_until_true(is_available, timeout=30) diff --git a/ceph/qa/tasks/mgr/test_dashboard.py b/ceph/qa/tasks/mgr/test_dashboard.py new file mode 100644 index 000000000..3b8a2cc80 --- /dev/null +++ b/ceph/qa/tasks/mgr/test_dashboard.py @@ -0,0 +1,70 @@ + + +from mgr_test_case import MgrTestCase + +import logging +import requests + + +log = logging.getLogger(__name__) + + +class TestDashboard(MgrTestCase): + MGRS_REQUIRED = 3 + + def test_standby(self): + self._assign_ports("dashboard", "server_port") + self._load_module("dashboard") + + original_active = self.mgr_cluster.get_active_id() + + original_uri = self._get_uri("dashboard") + log.info("Originally running at {0}".format(original_uri)) + + self.mgr_cluster.mgr_fail(original_active) + + failed_over_uri = self._get_uri("dashboard") + log.info("After failover running at {0}".format(original_uri)) + + self.assertNotEqual(original_uri, failed_over_uri) + + # The original active daemon should have come back up as a standby + # and be doing redirects to the new active daemon + r = requests.get(original_uri, allow_redirects=False) + self.assertEqual(r.status_code, 303) + self.assertEqual(r.headers['Location'], failed_over_uri) + + def test_urls(self): + self._assign_ports("dashboard", "server_port") + self._load_module("dashboard") + + base_uri = self._get_uri("dashboard") + + # This is a very simple smoke test to check that the dashboard can + # give us a 200 response to requests. We're not testing that + # the content is correct or even renders! + + urls = [ + "/health", + "/servers", + "/osd/", + "/osd/perf/0", + "/rbd_mirroring", + "/rbd_iscsi" + ] + + failures = [] + + for url in urls: + r = requests.get(base_uri + url, allow_redirects=False) + if r.status_code >= 300 and r.status_code < 400: + log.error("Unexpected redirect to: {0} (from {1})".format( + r.headers['Location'], base_uri)) + if r.status_code != 200: + failures.append(url) + + log.info("{0}: {1} ({2} bytes)".format( + url, r.status_code, len(r.content) + )) + + self.assertListEqual(failures, []) diff --git a/ceph/qa/tasks/mgr/test_module_selftest.py b/ceph/qa/tasks/mgr/test_module_selftest.py new file mode 100644 index 000000000..2776fb872 --- /dev/null +++ b/ceph/qa/tasks/mgr/test_module_selftest.py @@ -0,0 +1,74 @@ + +import time +import requests + +from tasks.mgr.mgr_test_case import MgrTestCase + + +class TestModuleSelftest(MgrTestCase): + """ + That modules with a self-test command can be loaded and execute it + without errors. + + This is not a substitute for really testing the modules, but it + is quick and is designed to catch regressions that could occur + if data structures change in a way that breaks how the modules + touch them. 
+ """ + MGRS_REQUIRED = 1 + + def _selftest_plugin(self, module_name): + self._load_module(module_name) + + # Execute the module's self-test routine + self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test") + + def test_zabbix(self): + self._selftest_plugin("zabbix") + + def test_prometheus(self): + self._selftest_plugin("prometheus") + + def test_influx(self): + self._selftest_plugin("influx") + + def test_selftest_run(self): + self._load_module("selftest") + self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run") + + def test_selftest_command_spam(self): + # Use the selftest module to stress the mgr daemon + self._load_module("selftest") + + # Use the dashboard to test that the mgr is still able to do its job + self._assign_ports("dashboard", "server_port") + self._load_module("dashboard") + + original_active = self.mgr_cluster.get_active_id() + original_standbys = self.mgr_cluster.get_standby_ids() + + self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", + "background", "start", + "command_spam") + + dashboard_uri = self._get_uri("dashboard") + + delay = 10 + periods = 10 + for i in range(0, periods): + t1 = time.time() + # Check that an HTTP module remains responsive + r = requests.get(dashboard_uri) + self.assertEqual(r.status_code, 200) + + # Check that a native non-module command remains responsive + self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df") + + time.sleep(delay - (time.time() - t1)) + + self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", + "background", "stop") + + # Check that all mgr daemons are still running + self.assertEqual(original_active, self.mgr_cluster.get_active_id()) + self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids()) diff --git a/ceph/qa/tasks/osd_max_pg_per_osd.py b/ceph/qa/tasks/osd_max_pg_per_osd.py new file mode 100644 index 000000000..b4e2aa4de --- /dev/null +++ b/ceph/qa/tasks/osd_max_pg_per_osd.py @@ -0,0 +1,126 @@ +import logging +import random + + +log = logging.getLogger(__name__) + + +def pg_num_in_all_states(pgs, *states): + return sum(1 for state in pgs.itervalues() + if all(s in state for s in states)) + + +def pg_num_in_any_state(pgs, *states): + return sum(1 for state in pgs.itervalues() + if any(s in state for s in states)) + + +def test_create_from_mon(ctx, config): + """ + osd should stop creating new pools if the number of pg it servers + exceeds the max-pg-per-osd setting, and it should resume the previously + suspended pg creations once the its pg number drops down below the setting + How it works:: + 1. set the hard limit of pg-per-osd to "2" + 2. create pool.a with pg_num=2 + # all pgs should be active+clean + 2. create pool.b with pg_num=2 + # new pgs belonging to this pool should be unknown (the primary osd + reaches the limit) or creating (replica osd reaches the limit) + 3. remove pool.a + 4. all pg belonging to pool.b should be active+clean + """ + pg_num = config.get('pg_num', 2) + manager = ctx.managers['ceph'] + log.info('1. creating pool.a') + pool_a = manager.create_pool_with_unique_name(pg_num) + manager.wait_for_clean() + assert manager.get_num_active_clean() == pg_num + + log.info('2. creating pool.b') + pool_b = manager.create_pool_with_unique_name(pg_num) + pg_states = manager.wait_till_pg_convergence(300) + pg_created = pg_num_in_all_states(pg_states, 'active', 'clean') + assert pg_created == pg_num + pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating') + assert pg_pending == pg_num + + log.info('3. 
removing pool.a') + manager.remove_pool(pool_a) + pg_states = manager.wait_till_pg_convergence(300) + assert len(pg_states) == pg_num + pg_created = pg_num_in_all_states(pg_states, 'active', 'clean') + assert pg_created == pg_num + + # cleanup + manager.remove_pool(pool_b) + + +def test_create_from_peer(ctx, config): + """ + osd should stop creating new pools if the number of pg it servers + exceeds the max-pg-per-osd setting, and it should resume the previously + suspended pg creations once the its pg number drops down below the setting + + How it works:: + 0. create 4 OSDs. + 1. create pool.a with pg_num=1, size=2 + pg will be mapped to osd.0, and osd.1, and it should be active+clean + 2. create pool.b with pg_num=1, size=2. + if the pgs stuck in creating, delete the pool since the pool and try + again, eventually we'll get the pool to land on the other 2 osds that + aren't occupied by pool.a. (this will also verify that pgs for deleted + pools get cleaned out of the creating wait list.) + 3. mark an osd out. verify that some pgs get stuck stale or peering. + 4. delete a pool, verify pgs go active. + """ + pg_num = config.get('pg_num', 1) + pool_size = config.get('pool_size', 2) + from_primary = config.get('from_primary', True) + + manager = ctx.managers['ceph'] + log.info('1. creating pool.a') + pool_a = manager.create_pool_with_unique_name(pg_num) + manager.wait_for_clean() + assert manager.get_num_active_clean() == pg_num + + log.info('2. creating pool.b') + while True: + pool_b = manager.create_pool_with_unique_name(pg_num) + pg_states = manager.wait_till_pg_convergence(300) + pg_created = pg_num_in_all_states(pg_states, 'active', 'clean') + assert pg_created >= pg_num + pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating') + assert pg_pending == pg_num * 2 - pg_created + if pg_created == pg_num * 2: + break + manager.remove_pool(pool_b) + + log.info('3. mark an osd out') + pg_stats = manager.get_pg_stats() + pg = random.choice(pg_stats) + if from_primary: + victim = pg['acting'][-1] + else: + victim = pg['acting'][0] + manager.mark_out_osd(victim) + pg_states = manager.wait_till_pg_convergence(300) + pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering') + assert pg_stuck > 0 + + log.info('4. removing pool.b') + manager.remove_pool(pool_b) + manager.wait_for_clean(30) + + # cleanup + manager.remove_pool(pool_a) + + +def task(ctx, config): + assert isinstance(config, dict), \ + 'osd_max_pg_per_osd task only accepts a dict for config' + manager = ctx.managers['ceph'] + if config.get('test_create_from_mon', True): + test_create_from_mon(ctx, config) + else: + test_create_from_peer(ctx, config) diff --git a/ceph/qa/tasks/reg11184.py b/ceph/qa/tasks/reg11184.py index 50e3a8b33..f24862384 100644 --- a/ceph/qa/tasks/reg11184.py +++ b/ceph/qa/tasks/reg11184.py @@ -174,19 +174,12 @@ def task(ctx, config): format(fpath=FSPATH, jpath=JPATH)) pid = os.getpid() expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid)) - cmd = ((prefix + "--op export --pgid 2.0 --file {file}"). + cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}"). format(id=divergent, file=expfile)) proc = exp_remote.run(args=cmd, wait=True, check_status=False, stdout=StringIO()) assert proc.exitstatus == 0 - # Remove the same pg that was exported - cmd = ((prefix + "--op remove --pgid 2.0"). 
- format(id=divergent)) - proc = exp_remote.run(args=cmd, wait=True, - check_status=False, stdout=StringIO()) - assert proc.exitstatus == 0 - # Kill one of non-divergent OSDs log.info('killing osd.%d' % non_divergent[0]) manager.kill_osd(non_divergent[0]) @@ -194,7 +187,7 @@ def task(ctx, config): # manager.mark_out_osd(non_divergent[0]) # An empty collection for pg 2.0 might need to be cleaned up - cmd = ((prefix + "--op remove --pgid 2.0"). + cmd = ((prefix + "--force --op remove --pgid 2.0"). format(id=non_divergent[0])) proc = exp_remote.run(args=cmd, wait=True, check_status=False, stdout=StringIO()) diff --git a/ceph/qa/tasks/s3a_hadoop.py b/ceph/qa/tasks/s3a_hadoop.py index b969a36a8..c01fe1dda 100644 --- a/ceph/qa/tasks/s3a_hadoop.py +++ b/ceph/qa/tasks/s3a_hadoop.py @@ -82,7 +82,9 @@ def task(ctx, config): fix_rgw_config(rgw_node, dnsmasq_name) setup_user_bucket(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir) if hadoop_ver.startswith('2.8'): - test_options = '-Dit.test=ITestS3A* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify' + # test all ITtests but skip AWS test using public bucket landsat-pds + # which is not available from within this test + test_options = '-Dit.test=ITestS3A* -Dit.test=\!ITestS3AAWSCredentialsProvider* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify' else: test_options = 'test -Dtest=S3a*,TestS3A*' try: diff --git a/ceph/qa/tasks/thrashosds.py b/ceph/qa/tasks/thrashosds.py index dbca056a0..420b73559 100644 --- a/ceph/qa/tasks/thrashosds.py +++ b/ceph/qa/tasks/thrashosds.py @@ -24,7 +24,7 @@ def task(ctx, config): cluster: (default 'ceph') the name of the cluster to thrash - min_in: (default 3) the minimum number of OSDs to keep in the + min_in: (default 4) the minimum number of OSDs to keep in the cluster min_out: (default 0) the minimum number of OSDs to keep out of the diff --git a/ceph/qa/tasks/util/rados.py b/ceph/qa/tasks/util/rados.py index 86c4b5389..a83f9e190 100644 --- a/ceph/qa/tasks/util/rados.py +++ b/ceph/qa/tasks/util/rados.py @@ -34,7 +34,7 @@ def create_ec_pool(remote, name, profile_name, pgnum, profile={}, cluster_name=" if application: remote.run(args=[ 'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name - ]) + ], check_status=False) # may fail as EINVAL when run in jewel upgrade test def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application=None): remote.run(args=[ @@ -43,7 +43,7 @@ def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application if application: remote.run(args=[ 'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name - ]) + ], check_status=False) def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"): remote.run(args=[ diff --git a/ceph/qa/workunits/ceph-disk/ceph-disk-test.py b/ceph/qa/workunits/ceph-disk/ceph-disk-test.py index efc080dc0..637fa90eb 100644 --- a/ceph/qa/workunits/ceph-disk/ceph-disk-test.py +++ b/ceph/qa/workunits/ceph-disk/ceph-disk-test.py @@ -113,7 +113,7 @@ class CephDisk: LOG.debug(self.unused_disks('sd.')) if self.unused_disks('sd.'): return - modprobe = "modprobe scsi_debug vpd_use_hostno=0 add_host=1 dev_size_mb=200 ; udevadm settle" + modprobe = "modprobe scsi_debug vpd_use_hostno=0 add_host=1 dev_size_mb=300 ; udevadm settle" try: self.sh(modprobe) except: diff --git a/ceph/qa/workunits/ceph-disk/ceph-disk.sh b/ceph/qa/workunits/ceph-disk/ceph-disk.sh index 
7a795b925..7102efba1 100755 --- a/ceph/qa/workunits/ceph-disk/ceph-disk.sh +++ b/ceph/qa/workunits/ceph-disk/ceph-disk.sh @@ -35,7 +35,7 @@ if ! ${PYTHON} -m pytest --version > /dev/null 2>&1; then exit 1 fi -sudo env PATH=$(dirname $0):$(dirname $0)/..:$PATH ${PYTHON} -m pytest -s -v $(dirname $0)/ceph-disk-test.py +sudo env PATH=$(dirname $0):$(dirname $0)/..:$PATH PYTHONWARNINGS=ignore ${PYTHON} -m pytest -s -v $(dirname $0)/ceph-disk-test.py result=$? sudo rm -f /lib/udev/rules.d/60-ceph-by-partuuid.rules diff --git a/ceph/qa/workunits/cephtool/test.sh b/ceph/qa/workunits/cephtool/test.sh index f5a313ea2..15344172a 100755 --- a/ceph/qa/workunits/cephtool/test.sh +++ b/ceph/qa/workunits/cephtool/test.sh @@ -1593,16 +1593,7 @@ function test_mon_osd() # When CEPH_CLI_TEST_DUP_COMMAND is set, osd create # is repeated and consumes two osd id, not just one. # - local next_osd - if test "$CEPH_CLI_TEST_DUP_COMMAND" ; then - next_osd=$((gap_start + 1)) - else - next_osd=$gap_start - fi - id=`ceph osd create` - [ "$id" = "$next_osd" ] - - next_osd=$((id + 1)) + local next_osd=$gap_start id=`ceph osd create $(uuidgen)` [ "$id" = "$next_osd" ] @@ -2162,9 +2153,12 @@ function test_mon_osd_erasure_code() ceph osd erasure-code-profile set fooprofile a=b c=d e=f --force ceph osd erasure-code-profile set fooprofile a=b c=d e=f expect_false ceph osd erasure-code-profile set fooprofile a=b c=d e=f g=h - # - # cleanup by removing profile 'fooprofile' + # ruleset-foo will work for luminous only + ceph osd erasure-code-profile set barprofile ruleset-failure-domain=host + ceph osd erasure-code-profile set barprofile crush-failure-domain=host + # clean up ceph osd erasure-code-profile rm fooprofile + ceph osd erasure-code-profile rm barprofile } function test_mon_osd_misc() diff --git a/ceph/qa/workunits/cls/test_cls_journal.sh b/ceph/qa/workunits/cls/test_cls_journal.sh new file mode 100755 index 000000000..9aa7450a9 --- /dev/null +++ b/ceph/qa/workunits/cls/test_cls_journal.sh @@ -0,0 +1,6 @@ +#!/bin/sh -e + +GTEST_FILTER=${CLS_JOURNAL_GTEST_FILTER:-*} +ceph_test_cls_journal --gtest_filter=${GTEST_FILTER} + +exit 0 diff --git a/ceph/qa/workunits/mgr/test_localpool.sh b/ceph/qa/workunits/mgr/test_localpool.sh new file mode 100755 index 000000000..c5a56a6d5 --- /dev/null +++ b/ceph/qa/workunits/mgr/test_localpool.sh @@ -0,0 +1,21 @@ +#!/bin/sh -ex + +ceph config-key set mgr/localpool/subtree host +ceph config-key set mgr/localpool/failure_domain osd +ceph mgr module enable localpool + +while ! ceph osd pool ls | grep '^by-host-' +do + sleep 5 +done + +ceph mgr module disable localpool +for p in `ceph osd pool ls | grep '^by-host-'` +do + ceph osd pool rm $p $p --yes-i-really-really-mean-it +done + +ceph config-key rm mgr/localpool/subtree +ceph config-key rm mgr/localpool/failure_domain + +echo OK diff --git a/ceph/qa/workunits/rados/test_rados_tool.sh b/ceph/qa/workunits/rados/test_rados_tool.sh index 6a3ebe0b2..87c86ee69 100755 --- a/ceph/qa/workunits/rados/test_rados_tool.sh +++ b/ceph/qa/workunits/rados/test_rados_tool.sh @@ -346,7 +346,7 @@ test_rmobj() { $CEPH_TOOL osd pool set-quota $p max_objects 1 V1=`mktemp fooattrXXXXXXX` $RADOS_TOOL put $OBJ $V1 -p $p - while ! $CEPH_TOOL osd dump | grep 'full max_objects' + while ! 
$CEPH_TOOL osd dump | grep 'full_no_quota max_objects' do sleep 2 done diff --git a/ceph/qa/workunits/rbd/rbd_mirror.sh b/ceph/qa/workunits/rbd/rbd_mirror.sh index 04a03a66e..5195e6cf3 100755 --- a/ceph/qa/workunits/rbd/rbd_mirror.sh +++ b/ceph/qa/workunits/rbd/rbd_mirror.sh @@ -111,6 +111,18 @@ wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image1} 'up+replaying' admin_daemon ${CLUSTER1} rbd mirror flush admin_daemon ${CLUSTER1} rbd mirror status +testlog "TEST: test image rename" +new_name="${image}_RENAMED" +rename_image ${CLUSTER2} ${POOL} ${image} ${new_name} +wait_for_image_replay_started ${CLUSTER1} ${POOL} ${new_name} +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${new_name} 'up+replaying' +admin_daemon ${CLUSTER1} rbd mirror status ${POOL}/${new_name} +admin_daemon ${CLUSTER1} rbd mirror restart ${POOL}/${new_name} +wait_for_image_replay_started ${CLUSTER1} ${POOL} ${new_name} +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${new_name} 'up+replaying' +rename_image ${CLUSTER2} ${POOL} ${new_name} ${image} +wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} + testlog "TEST: failover and failback" start_mirror ${CLUSTER2} diff --git a/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh b/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh index 23216711e..325353b91 100755 --- a/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh +++ b/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh @@ -593,6 +593,16 @@ set_image_meta() rbd --cluster ${cluster} -p ${pool} image-meta set ${image} $key $val } +rename_image() +{ + local cluster=$1 + local pool=$2 + local image=$3 + local new_name=$4 + + rbd --cluster=${cluster} -p ${pool} rename ${image} ${new_name} +} + remove_image() { local cluster=$1 diff --git a/ceph/selinux/ceph.te b/ceph/selinux/ceph.te index 552f73601..0a9349803 100644 --- a/ceph/selinux/ceph.te +++ b/ceph/selinux/ceph.te @@ -106,7 +106,7 @@ files_manage_generic_locks(ceph_t) allow ceph_t sysfs_t:dir read; allow ceph_t sysfs_t:file { read getattr open }; -allow ceph_t sysfs_t:lnk_file read; +allow ceph_t sysfs_t:lnk_file { read getattr }; allow ceph_t random_device_t:chr_file getattr; allow ceph_t urandom_device_t:chr_file getattr; diff --git a/ceph/src/.git_version b/ceph/src/.git_version index 4af702226..9b4bb5c8b 100644 --- a/ceph/src/.git_version +++ b/ceph/src/.git_version @@ -1,2 +1,2 @@ -3e7492b9ada8bdc9a5cd0feafd42fbca27f9c38e -v12.2.1 +cf0baeeeeba3b47f9427c6c97e2144b094b7e5ba +v12.2.2 diff --git a/ceph/src/90-ceph-osd.conf b/ceph/src/90-ceph-osd.conf new file mode 100644 index 000000000..c5c64bb70 --- /dev/null +++ b/ceph/src/90-ceph-osd.conf @@ -0,0 +1 @@ +fs.aio-max-nr = 1048576 diff --git a/ceph/src/CMakeLists.txt b/ceph/src/CMakeLists.txt index 3cdcb95be..3d3d2f7af 100644 --- a/ceph/src/CMakeLists.txt +++ b/ceph/src/CMakeLists.txt @@ -540,8 +540,11 @@ set(libcommon_files ${auth_files} ${mds_files}) +CHECK_C_COMPILER_FLAG("-fvar-tracking-assignments" HAS_VTA) if(HAS_VTA) - set_source_files_properties(common/config.cc + set_source_files_properties( + common/config.cc + common/options.cc PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) endif() @@ -691,12 +694,18 @@ if (WITH_MGR) mgr/DaemonState.cc mgr/DaemonServer.cc mgr/ClusterState.cc - mgr/PyModules.cc + mgr/ActivePyModules.cc + mgr/StandbyPyModules.cc + mgr/PyModuleRegistry.cc + mgr/PyModuleRunner.cc mgr/PyFormatter.cc - mgr/PyState.cc - mgr/MgrPyModule.cc + mgr/PyOSDMap.cc + mgr/BaseMgrModule.cc + mgr/BaseMgrStandbyModule.cc + mgr/ActivePyModule.cc mgr/MgrStandby.cc mgr/Mgr.cc + mgr/Gil.cc mgr/mgr_commands.cc) 
add_executable(ceph-mgr ${mgr_srcs} $) @@ -726,7 +735,6 @@ add_subdirectory(ceph-volume) add_subdirectory(ceph-detect-init) ## dencoder -CHECK_C_COMPILER_FLAG("-fvar-tracking-assignments" HAS_VTA) if(HAS_VTA) set_source_files_properties(test/encoding/ceph_dencoder.cc PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) diff --git a/ceph/src/arch/arm.c b/ceph/src/arch/arm.c index e3cfb5c59..a8562f596 100644 --- a/ceph/src/arch/arm.c +++ b/ceph/src/arch/arm.c @@ -1,3 +1,4 @@ +#include "acconfig.h" #include "arch/probe.h" /* flags we export */ @@ -45,10 +46,8 @@ int ceph_arch_arm_probe(void) ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON; #elif __aarch64__ && __linux__ ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD; -# ifdef HWCAP_CRC32 +# if defined(HAVE_ARMV8_CRC) && defined(HWCAP_CRC32) ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32; -# else - ceph_arch_aarch64_crc32 = 0; // sorry! # endif #else if (0) diff --git a/ceph/src/ceph-disk/ceph_disk/main.py b/ceph/src/ceph-disk/ceph_disk/main.py old mode 100755 new mode 100644 index 8b0c5dbc3..6516750d2 --- a/ceph/src/ceph-disk/ceph_disk/main.py +++ b/ceph/src/ceph-disk/ceph_disk/main.py @@ -24,6 +24,7 @@ import argparse import base64 import errno import fcntl +import functools import json import logging import os @@ -41,12 +42,23 @@ import pwd import grp import textwrap import glob +import warnings CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' CEPH_LOCKBOX_ONDISK_MAGIC = 'ceph lockbox volume v001' KEY_MANAGEMENT_MODE_V1 = 'ceph-mon v1' +DEPRECATION_WARNING = """ +******************************************************************************* +This tool is now deprecated in favor of ceph-volume. +It is recommended to use ceph-volume for OSD deployments. For details see: + + http://docs.ceph.com/docs/master/ceph-volume/#migrating + +******************************************************************************* +""" + PTYPE = { 'regular': { 'journal': { @@ -721,6 +733,21 @@ def get_partition_mpath(dev, pnum): return None +def retry(on_error=Exception, max_tries=10, wait=0.2, backoff=0): + def wrapper(func): + @functools.wraps(func) + def repeat(*args, **kwargs): + for tries in range(max_tries - 1): + try: + return func(*args, **kwargs) + except on_error: + time.sleep(wait + backoff * tries) + return func(*args, **kwargs) + return repeat + return wrapper + + +@retry(Error) def get_partition_dev(dev, pnum): """ get the device name for a partition @@ -732,36 +759,25 @@ def get_partition_dev(dev, pnum): sda 1 -> sda1 cciss/c0d1 1 -> cciss!c0d1p1 """ - max_retry = 10 - for retry in range(0, max_retry + 1): - partname = None - error_msg = "" - if is_mpath(dev): - partname = get_partition_mpath(dev, pnum) - else: - name = get_dev_name(os.path.realpath(dev)) - sys_entry = os.path.join(BLOCKDIR, name) - error_msg = " in %s" % sys_entry - for f in os.listdir(sys_entry): - if f.startswith(name) and f.endswith(str(pnum)): - # we want the shortest name that starts with the base name - # and ends with the partition number - if not partname or len(f) < len(partname): - partname = f - if partname: - if retry: - LOG.info('Found partition %d for %s after %d tries' % - (pnum, dev, retry)) - return get_dev_path(partname) - else: - if retry < max_retry: - LOG.info('Try %d/%d : partition %d for %s does not exist%s' % - (retry + 1, max_retry, pnum, dev, error_msg)) - time.sleep(.2) - continue - else: - raise Error('partition %d for %s does not appear to exist%s' % - (pnum, dev, error_msg)) + partname = None + error_msg 
= "" + if is_mpath(dev): + partname = get_partition_mpath(dev, pnum) + else: + name = get_dev_name(os.path.realpath(dev)) + sys_entry = os.path.join(BLOCKDIR, name) + error_msg = " in %s" % sys_entry + for f in os.listdir(sys_entry): + if f.startswith(name) and f.endswith(str(pnum)): + # we want the shortest name that starts with the base name + # and ends with the partition number + if not partname or len(f) < len(partname): + partname = f + if partname: + return get_dev_path(partname) + else: + raise Error('partition %d for %s does not appear to exist%s' % + (pnum, dev, error_msg)) def list_all_partitions(): @@ -1374,22 +1390,14 @@ def _dmcrypt_map( raise Error('unable to map device', rawdev, e) -def dmcrypt_unmap( - _uuid -): +@retry(Error, max_tries=10, wait=0.5, backoff=1.0) +def dmcrypt_unmap(_uuid): if not os.path.exists('/dev/mapper/' + _uuid): return - retries = 0 - while True: - try: - command_check_call(['cryptsetup', 'remove', _uuid]) - break - except subprocess.CalledProcessError as e: - if retries == 10: - raise Error('unable to unmap device', _uuid, e) - else: - time.sleep(0.5 + retries * 1.0) - retries += 1 + try: + command_check_call(['cryptsetup', 'remove', _uuid]) + except subprocess.CalledProcessError as e: + raise Error('unable to unmap device', _uuid, e) def mount( @@ -1451,6 +1459,7 @@ def mount( return path +@retry(UnmountError, max_tries=3, wait=0.5, backoff=1.0) def unmount( path, do_rm=True, @@ -1458,25 +1467,17 @@ def unmount( """ Unmount and removes the given mount point. """ - retries = 0 - while True: - try: - LOG.debug('Unmounting %s', path) - command_check_call( - [ - '/bin/umount', - '--', - path, - ], - ) - break - except subprocess.CalledProcessError as e: - # on failure, retry 3 times with incremental backoff - if retries == 3: - raise UnmountError(e) - else: - time.sleep(0.5 + retries * 1.0) - retries += 1 + try: + LOG.debug('Unmounting %s', path) + command_check_call( + [ + '/bin/umount', + '--', + path, + ], + ) + except subprocess.CalledProcessError as e: + raise UnmountError(e) if not do_rm: return os.rmdir(path) @@ -1855,6 +1856,7 @@ class DevicePartition(object): return self.ptype_map[name]['ready'] @staticmethod + @retry(OSError) def factory(path, dev, args): dmcrypt_type = CryptHelpers.get_dmcrypt_type(args) if ((path is not None and is_mpath(path)) or @@ -3248,7 +3250,7 @@ def systemd_start( osd_id, ): systemd_disable(path, osd_id) - if is_mounted(path): + if os.path.ismount(path): style = ['--runtime'] else: style = [] @@ -3760,6 +3762,20 @@ def main_activate(args): ) osd_data = get_mount_point(cluster, osd_id) + args.cluster = cluster + if args.dmcrypt: + for name in Space.NAMES: + # Check if encrypted device in journal + dev_path = os.path.join(osd_data, name + '_dmcrypt') + if not os.path.exists(dev_path): + continue + partition = DevicePartition.factory( + path=None, + dev=dev_path, + args=args) + partition.rawdev = args.path + partition.map() + elif stat.S_ISDIR(mode): (cluster, osd_id) = activate_dir( path=args.path, @@ -5633,6 +5649,8 @@ def make_zap_parser(subparsers): def main(argv): + # Deprecate from the very beginning + warnings.warn(DEPRECATION_WARNING) args = parse_args(argv) setup_logging(args.verbose, args.log_stdout) @@ -5652,10 +5670,20 @@ def main(argv): CEPH_PREF_GROUP = args.setgroup if args.verbose: - args.func(args) + try: + args.func(args) + except Exception: + # warn on any exception when running with verbosity + warnings.warn(DEPRECATION_WARNING) + # but still raise the original issue + raise + else: 
main_catch(args.func, args) + # if there aren't any errors, still log again at the very bottom + warnings.warn(DEPRECATION_WARNING) + def setup_logging(verbose, log_stdout): loglevel = logging.WARNING @@ -5682,6 +5710,8 @@ def main_catch(func, args): func(args) except Error as e: + # warn on generic 'error' exceptions + warnings.warn(DEPRECATION_WARNING) raise SystemExit( '{prog}: {msg}'.format( prog=args.prog, @@ -5690,6 +5720,8 @@ def main_catch(func, args): ) except CephDiskException as error: + # warn on ceph-disk exceptions + warnings.warn(DEPRECATION_WARNING) exc_name = error.__class__.__name__ raise SystemExit( '{prog} {exc_name}: {msg}'.format( diff --git a/ceph/src/ceph-disk/tox.ini b/ceph/src/ceph-disk/tox.ini index a2bc483a2..bbf1e21c6 100644 --- a/ceph/src/ceph-disk/tox.ini +++ b/ceph/src/ceph-disk/tox.ini @@ -25,4 +25,4 @@ commands = coverage run --append --source=ceph_disk {envbindir}/py.test -vv {tox coverage report --show-missing [testenv:flake8] -commands = flake8 --ignore=H105,H405,E127 ceph_disk tests +commands = flake8 --ignore=H105,H405,E127,E722 ceph_disk tests diff --git a/ceph/src/ceph-volume/ceph_volume/api/__init__.py b/ceph/src/ceph-volume/ceph_volume/api/__init__.py new file mode 100644 index 000000000..ecc971299 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/api/__init__.py @@ -0,0 +1,3 @@ +""" +Device API that can be shared among other implementations. +""" diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/api.py b/ceph/src/ceph-volume/ceph_volume/api/lvm.py similarity index 89% rename from ceph/src/ceph-volume/ceph_volume/devices/lvm/api.py rename to ceph/src/ceph-volume/ceph_volume/api/lvm.py index e5bc26234..d82aee685 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/api.py +++ b/ceph/src/ceph-volume/ceph_volume/api/lvm.py @@ -131,6 +131,22 @@ def get_api_pvs(): return _output_parser(stdout, fields) +def get_lv_from_argument(argument): + """ + Helper proxy function that consumes a possible logical volume passed in from the CLI + in the form of `vg/lv`, but with some validation so that an argument that is a full + path to a device can be ignored + """ + if argument.startswith('/'): + lv = get_lv(lv_path=argument) + return lv + try: + vg_name, lv_name = argument.split('/') + except (ValueError, AttributeError): + return None + return get_lv(lv_name=lv_name, vg_name=vg_name) + + def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None): """ Return a matching lv for the current system, requiring ``lv_name``, @@ -177,24 +193,69 @@ def create_pv(device): ]) -def create_lv(name, group, size=None, **tags): +def create_vg(name, *devices): + """ + Create a Volume Group. Command looks like:: + + vgcreate --force --yes group_name device + + Once created the volume group is returned as a ``VolumeGroup`` object + """ + process.run([ + 'sudo', + 'vgcreate', + '--force', + '--yes', + name] + list(devices) + ) + + vg = get_vg(vg_name=name) + return vg + + +def remove_lv(path): + """ + Removes a logical volume given it's absolute path. + + Will return True if the lv is successfully removed or + raises a RuntimeError if the removal fails. + """ + stdout, stderr, returncode = process.call( + [ + 'sudo', + 'lvremove', + '-v', # verbose + '-f', # force it + path + ], + show_command=True, + terminal_verbose=True, + ) + if returncode != 0: + raise RuntimeError("Unable to remove %s".format(path)) + return True + + +def create_lv(name, group, size=None, tags=None): """ Create a Logical Volume in a Volume Group. 
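# Hedged usage sketch for get_lv_from_argument(), added above in
# ceph_volume/api/lvm.py: a "vg/lv" value is split into volume group and lv
# name, while an absolute path is matched against lv_path (a plain device that
# is not an lv resolves to None). "vg0", "osd-data" and "/dev/sdb1" are
# made-up example values.
from ceph_volume.api import lvm as api

lv = api.get_lv_from_argument('vg0/osd-data')  # lookup by vg_name/lv_name
dev = api.get_lv_from_argument('/dev/sdb1')    # lookup by lv_path, None if no match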
Command looks like:: lvcreate -L 50G -n gfslv vg0 - ``name``, ``group``, and ``size`` are required. Tags are optional and are "translated" to include - the prefixes for the Ceph LVM tag API. + ``name``, ``group``, are required. If ``size`` is provided it must follow + lvm's size notation (like 1G, or 20M). Tags are an optional dictionary and is expected to + conform to the convention of prefixing them with "ceph." like:: + {"ceph.block_device": "/dev/ceph/osd-1"} """ # XXX add CEPH_VOLUME_LVM_DEBUG to enable -vvvv on lv operations type_path_tag = { 'journal': 'ceph.journal_device', 'data': 'ceph.data_device', - 'block': 'ceph.block', - 'wal': 'ceph.wal', - 'db': 'ceph.db', - 'lockbox': 'ceph.lockbox_device', + 'block': 'ceph.block_device', + 'wal': 'ceph.wal_device', + 'db': 'ceph.db_device', + 'lockbox': 'ceph.lockbox_device', # XXX might not ever need this lockbox sorcery } if size: process.run([ @@ -202,7 +263,7 @@ def create_lv(name, group, size=None, **tags): 'lvcreate', '--yes', '-L', - '%sG' % size, + '%s' % size, '-n', name, group ]) # create the lv with all the space available, this is needed because the @@ -218,17 +279,15 @@ def create_lv(name, group, size=None, **tags): ]) lv = get_lv(lv_name=name, vg_name=group) - ceph_tags = {} - for k, v in tags.items(): - ceph_tags['ceph.%s' % k] = v - lv.set_tags(ceph_tags) + lv.set_tags(tags) # when creating a distinct type, the caller doesn't know what the path will # be so this function will set it after creation using the mapping - path_tag = type_path_tag[tags['type']] - lv.set_tags( - {path_tag: lv.lv_path} - ) + path_tag = type_path_tag.get(tags.get('ceph.type')) + if path_tag: + lv.set_tags( + {path_tag: lv.lv_path} + ) return lv @@ -584,6 +643,23 @@ class Volume(object): def __repr__(self): return self.__str__() + def as_dict(self): + obj = {} + obj.update(self.lv_api) + obj['tags'] = self.tags + obj['name'] = self.name + obj['type'] = self.tags['ceph.type'] + obj['path'] = self.lv_path + return obj + + def clear_tags(self): + """ + Removes all tags from the Logical Volume. + """ + for k, v in self.tags.items(): + tag = "%s=%s" % (k, v) + process.run(['sudo', 'lvchange', '--deltag', tag, self.lv_path]) + def set_tags(self, tags): """ :param tags: A dictionary of tag names and values, like:: diff --git a/ceph/src/ceph-volume/ceph_volume/decorators.py b/ceph/src/ceph-volume/ceph_volume/decorators.py index c1e14bc79..d0be93817 100644 --- a/ceph/src/ceph-volume/ceph_volume/decorators.py +++ b/ceph/src/ceph-volume/ceph_volume/decorators.py @@ -58,6 +58,9 @@ def catches(catch=None, handler=None, exit=True): try: return f(*a, **kw) except catch as e: + import logging + logger = logging.getLogger('ceph_volume') + logger.exception('exception caught by decorator') if os.environ.get('CEPH_VOLUME_DEBUG'): raise if handler: diff --git a/ceph/src/ceph-volume/ceph_volume/devices/__init__.py b/ceph/src/ceph-volume/ceph_volume/devices/__init__.py index c77c344d6..8af5d1e74 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/__init__.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/__init__.py @@ -1 +1 @@ -from . import lvm # noqa +from . 
import lvm, simple # noqa diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py index 5a755672a..0a50e7a33 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py @@ -1,16 +1,25 @@ from __future__ import print_function import argparse +import logging +import os from textwrap import dedent from ceph_volume import process, conf, decorators from ceph_volume.util import system, disk +from ceph_volume.util import prepare as prepare_utils from ceph_volume.systemd import systemctl -from . import api +from ceph_volume.api import lvm as api + + +logger = logging.getLogger(__name__) def activate_filestore(lvs): # find the osd osd_lv = lvs.get(lv_tags={'ceph.type': 'data'}) + if not osd_lv: + raise RuntimeError('Unable to find a data LV for filestore activation') osd_id = osd_lv.tags['ceph.osd_id'] + conf.cluster = osd_lv.tags['ceph.cluster_name'] # it may have a volume with a journal osd_journal_lv = lvs.get(lv_tags={'ceph.type': 'journal'}) # TODO: add sensible error reporting if this is ever the case @@ -29,7 +38,7 @@ def activate_filestore(lvs): # mount the osd source = osd_lv.lv_path destination = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id) - if not system.is_mounted(source, destination=destination): + if not system.device_is_mounted(source, destination=destination): process.run(['sudo', 'mount', '-v', source, destination]) # always re-do the symlink regardless if it exists, so that the journal @@ -47,9 +56,76 @@ def activate_filestore(lvs): systemctl.start_osd(osd_id) +def get_osd_device_path(osd_lv, lvs, device_type): + """ + ``device_type`` can be one of ``db``, ``wal`` or ``block`` so that + we can query ``lvs`` (a ``Volumes`` object) and fallback to querying the uuid + if that is not present. + + Return a path if possible, failing to do that a ``None``, since some of these devices + are optional + """ + osd_lv = lvs.get(lv_tags={'ceph.type': 'block'}) + uuid_tag = 'ceph.%s_uuid' % device_type + device_uuid = osd_lv.tags.get(uuid_tag) + if not device_uuid: + return None + + device_lv = lvs.get(lv_uuid=device_uuid) + if device_lv: + return device_lv.lv_path + else: + # this could be a regular device, so query it with blkid + physical_device = disk.get_device_from_partuuid(device_uuid) + return physical_device or None + return None + + def activate_bluestore(lvs): - # TODO - pass + # find the osd + osd_lv = lvs.get(lv_tags={'ceph.type': 'block'}) + osd_id = osd_lv.tags['ceph.osd_id'] + conf.cluster = osd_lv.tags['ceph.cluster_name'] + osd_fsid = osd_lv.tags['ceph.osd_fsid'] + db_device_path = get_osd_device_path(osd_lv, lvs, 'db') + wal_device_path = get_osd_device_path(osd_lv, lvs, 'wal') + + # mount on tmpfs the osd directory + osd_path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id) + if not system.path_is_mounted(osd_path): + # mkdir -p and mount as tmpfs + prepare_utils.create_osd_path(osd_id, tmpfs=True) + # XXX This needs to be removed once ceph-bluestore-tool can deal with + # symlinks that exist in the osd dir + for link_name in ['block', 'block.db', 'block.wal']: + link_path = os.path.join(osd_path, link_name) + if os.path.exists(link_path): + os.unlink(os.path.join(osd_path, link_name)) + # Once symlinks are removed, the osd dir can be 'primed again. 
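# Condensed paraphrase of get_osd_device_path() from this activate module,
# showing the lookup order for the optional bluestore db/wal devices: the
# block lv carries a ceph.db_uuid / ceph.wal_uuid tag, which is matched
# against known lvs first and only then against blkid's PARTUUID index.
from ceph_volume.util import disk

def resolve_optional_device(osd_lv, lvs, device_type):
    device_uuid = osd_lv.tags.get('ceph.%s_uuid' % device_type)
    if not device_uuid:
        return None                     # e.g. no block.db was ever set up
    device_lv = lvs.get(lv_uuid=device_uuid)
    if device_lv:
        return device_lv.lv_path        # the db/wal lives on a logical volume
    return disk.get_device_from_partuuid(device_uuid)  # plain partition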
+ process.run([ + 'sudo', 'ceph-bluestore-tool', '--cluster=%s' % conf.cluster, + 'prime-osd-dir', '--dev', osd_lv.lv_path, + '--path', osd_path]) + # always re-do the symlink regardless if it exists, so that the block, + # block.wal, and block.db devices that may have changed can be mapped + # correctly every time + process.run(['sudo', 'ln', '-snf', osd_lv.lv_path, os.path.join(osd_path, 'block')]) + system.chown(os.path.join(osd_path, 'block')) + system.chown(osd_path) + if db_device_path: + destination = os.path.join(osd_path, 'block.db') + process.run(['sudo', 'ln', '-snf', db_device_path, destination]) + system.chown(db_device_path) + if wal_device_path: + destination = os.path.join(osd_path, 'block.wal') + process.run(['sudo', 'ln', '-snf', wal_device_path, destination]) + system.chown(wal_device_path) + + # enable the ceph-volume unit for this OSD + systemctl.enable_volume(osd_id, osd_fsid, 'lvm') + + # start the OSD + systemctl.start_osd(osd_id) class Activate(object): @@ -69,7 +145,22 @@ class Activate(object): lvs.filter(lv_tags={'ceph.osd_fsid': args.osd_fsid}) if not lvs: raise RuntimeError('could not find osd.%s with fsid %s' % (args.osd_id, args.osd_fsid)) - activate_filestore(lvs) + # This argument is only available when passed in directly or via + # systemd, not when ``create`` is being used + if getattr(args, 'auto_detect_objectstore', False): + logger.info('auto detecting objectstore') + # may get multiple lvs, so can't do lvs.get() calls here + for lv in lvs: + has_journal = lv.tags.get('ceph.journal_uuid') + if has_journal: + logger.info('found a journal associated with the OSD, assuming filestore') + return activate_filestore(lvs) + logger.info('unable to find a journal associated with the OSD, assuming bluestore') + return activate_bluestore(lvs) + if args.bluestore: + activate_bluestore(lvs) + elif args.filestore: + activate_filestore(lvs) def main(self): sub_command_help = dedent(""" @@ -100,18 +191,27 @@ class Activate(object): nargs='?', help='The FSID of the OSD, similar to a SHA1' ) + parser.add_argument( + '--auto-detect-objectstore', + action='store_true', + help='Autodetect the objectstore by inspecting the OSD', + ) parser.add_argument( '--bluestore', - action='store_true', default=False, + action='store_true', help='filestore objectstore (not yet implemented)', ) parser.add_argument( '--filestore', - action='store_true', default=True, + action='store_true', help='filestore objectstore (current default)', ) if len(self.argv) == 0: print(sub_command_help) return args = parser.parse_args(self.argv) + # Default to bluestore here since defaulting it in add_argument may + # cause both to be True + if not args.bluestore and not args.filestore: + args.bluestore = True self.activate(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/common.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/common.py index b4e4ee3ad..b2fbbf991 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/common.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/common.py @@ -15,30 +15,30 @@ def common_parser(prog, description): required_args = parser.add_argument_group('required arguments') parser.add_argument( '--journal', - help='A logical volume (vg_name/lv_name), or path to a device', + help='(filestore) A logical volume (vg_name/lv_name), or path to a device', ) required_args.add_argument( '--data', required=True, type=arg_validators.LVPath(), - help='A logical volume (vg_name/lv_name) for OSD data', + help='OSD data path. 
A physical device or logical volume', ) parser.add_argument( '--journal-size', default=5, metavar='GB', type=int, - help='Size (in GB) A logical group name or a path to a logical volume', + help='(filestore) Size (in GB) for the journal', ) parser.add_argument( '--bluestore', - action='store_true', default=False, - help='Use the bluestore objectstore (not currently supported)', + action='store_true', + help='Use the bluestore objectstore', ) parser.add_argument( '--filestore', - action='store_true', default=True, - help='Use the filestore objectstore (currently the only supported object store)', + action='store_true', + help='Use the filestore objectstore', ) parser.add_argument( '--osd-id', @@ -48,6 +48,16 @@ def common_parser(prog, description): '--osd-fsid', help='Reuse an existing OSD fsid', ) + parser.add_argument( + '--block.db', + dest='block_db', + help='(bluestore) Path to bluestore block.db logical volume or device', + ) + parser.add_argument( + '--block.wal', + dest='block_wal', + help='(bluestore) Path to bluestore block.wal logical volume or device', + ) # Do not parse args, so that consumers can do something before the args get # parsed triggering argparse behavior return parser diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/create.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/create.py index 8c747f342..353b26ab4 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/create.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/create.py @@ -50,4 +50,8 @@ class Create(object): print(sub_command_help) return args = parser.parse_args(self.argv) + # Default to bluestore here since defaulting it in add_argument may + # cause both to be True + if args.bluestore is None and args.filestore is None: + args.bluestore = True self.create(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py new file mode 100644 index 000000000..6982f91bc --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/listing.py @@ -0,0 +1,244 @@ +from __future__ import print_function +import argparse +import json +import logging +from textwrap import dedent +from ceph_volume import decorators +from ceph_volume.util import disk +from ceph_volume.api import lvm as api + +logger = logging.getLogger(__name__) + + +osd_list_header_template = """\n +{osd_id:=^20}""" + + +osd_device_header_template = """ + + [{type: >4}] {path} +""" + +device_metadata_item_template = """ + {tag_name: <25} {value}""" + + +def readable_tag(tag): + actual_name = tag.split('.')[-1] + return actual_name.replace('_', ' ') + + +def pretty_report(report): + output = [] + for _id, devices in report.items(): + output.append( + osd_list_header_template.format(osd_id=" osd.%s " % _id) + ) + for device in devices: + output.append( + osd_device_header_template.format( + type=device['type'], + path=device['path'] + ) + ) + for tag_name, value in device.get('tags', {}).items(): + output.append( + device_metadata_item_template.format( + tag_name=readable_tag(tag_name), + value=value + ) + ) + print(''.join(output)) + + +class List(object): + + help = 'list logical volumes and devices associated with Ceph' + + def __init__(self, argv): + self.argv = argv + + @decorators.needs_root + def list(self, args): + # ensure everything is up to date before calling out + # to list lv's + self.update() + report = self.generate(args) + if args.format == 'json': + # If the report is empty, we don't return a non-zero exit status + # because it is assumed this is 
going to be consumed by automated + # systems like ceph-ansible which would be forced to ignore the + # non-zero exit status if all they need is the information in the + # JSON object + print(json.dumps(report, indent=4, sort_keys=True)) + else: + if not report: + raise SystemExit('No valid Ceph devices found') + pretty_report(report) + + def update(self): + """ + Ensure all journal devices are up to date if they aren't a logical + volume + """ + lvs = api.Volumes() + for lv in lvs: + try: + lv.tags['ceph.osd_id'] + except KeyError: + # only consider ceph-based logical volumes, everything else + # will get ignored + continue + + for device_type in ['journal', 'block', 'wal', 'db']: + device_name = 'ceph.%s_device' % device_type + device_uuid = lv.tags.get('ceph.%s_uuid' % device_type) + if not device_uuid: + # bluestore will not have a journal, filestore will not have + # a block/wal/db, so we must skip if not present + continue + disk_device = disk.get_device_from_partuuid(device_uuid) + if disk_device: + if lv.tags[device_name] != disk_device: + # this means that the device has changed, so it must be updated + # on the API to reflect this + lv.set_tags({device_name: disk_device}) + + def generate(self, args): + """ + Generate reports for an individual device or for all Ceph-related + devices, logical or physical, as long as they have been prepared by + this tool before and contain enough metadata. + """ + if args.device: + return self.single_report(args.device) + else: + return self.full_report() + + def single_report(self, device): + """ + Generate a report for a single device. This can be either a logical + volume in the form of vg/lv or a device with an absolute path like + /dev/sda1 + """ + lvs = api.Volumes() + report = {} + lv = api.get_lv_from_argument(device) + if lv: + try: + _id = lv.tags['ceph.osd_id'] + except KeyError: + logger.warning('device is not part of ceph: %s', device) + return report + + report.setdefault(_id, []) + report[_id].append( + lv.as_dict() + ) + + else: + # this has to be a journal/wal/db device (not a logical volume) so try + # to find the PARTUUID that should be stored in the OSD logical + # volume + for device_type in ['journal', 'block', 'wal', 'db']: + device_tag_name = 'ceph.%s_device' % device_type + device_tag_uuid = 'ceph.%s_uuid' % device_type + associated_lv = lvs.get(lv_tags={device_tag_name: device}) + if associated_lv: + _id = associated_lv.tags['ceph.osd_id'] + uuid = associated_lv.tags[device_tag_uuid] + + report.setdefault(_id, []) + report[_id].append( + { + 'tags': {'PARTUUID': uuid}, + 'type': device_type, + 'path': device, + } + ) + return report + + def full_report(self): + """ + Generate a report for all the logical volumes and associated devices + that have been previously prepared by Ceph + """ + lvs = api.Volumes() + report = {} + for lv in lvs: + try: + _id = lv.tags['ceph.osd_id'] + except KeyError: + # only consider ceph-based logical volumes, everything else + # will get ignored + continue + + report.setdefault(_id, []) + report[_id].append( + lv.as_dict() + ) + + for device_type in ['journal', 'block', 'wal', 'db']: + device_uuid = lv.tags.get('ceph.%s_uuid' % device_type) + if not device_uuid: + # bluestore will not have a journal, filestore will not have + # a block/wal/db, so we must skip if not present + continue + if not api.get_lv(lv_uuid=device_uuid): + # means we have a regular device, so query blkid + disk_device = disk.get_device_from_partuuid(device_uuid) + if disk_device: + report[_id].append( + { + 'tags': 
{'PARTUUID': device_uuid}, + 'type': device_type, + 'path': disk_device, + } + ) + + return report + + def main(self): + sub_command_help = dedent(""" + List devices or logical volumes associated with Ceph. An association is + determined if a device has information relating to an OSD. This is + verified by querying LVM's metadata and correlating it with devices. + + The lvs associated with the OSD need to have been prepared previously, + so that all needed tags and metadata exist. + + Full listing of all system devices associated with a cluster:: + + ceph-volume lvm list + + List a particular device, reporting all metadata about it:: + + ceph-volume lvm list /dev/sda1 + + List a logical volume, along with all its metadata (vg is a volume + group, and lv the logical volume name):: + + ceph-volume lvm list {vg/lv} + """) + parser = argparse.ArgumentParser( + prog='ceph-volume lvm list', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'device', + metavar='DEVICE', + nargs='?', + help='Path to an lv (as vg/lv) or to a device like /dev/sda1' + ) + + parser.add_argument( + '--format', + help='output format, defaults to "pretty"', + default='pretty', + choices=['json', 'pretty'], + ) + + args = parser.parse_args(self.argv) + self.list(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/main.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/main.py index 59e69329b..8b698a03f 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/main.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/main.py @@ -5,6 +5,8 @@ from . import activate from . import prepare from . import create from . import trigger +from . import listing +from . import zap class LVM(object): @@ -22,6 +24,8 @@ class LVM(object): 'prepare': prepare.Prepare, 'create': create.Create, 'trigger': trigger.Trigger, + 'list': listing.List, + 'zap': zap.Zap, } def __init__(self, argv): diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/prepare.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/prepare.py index 1ca5b0d88..5a7daa3dc 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/prepare.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/prepare.py @@ -1,17 +1,17 @@ from __future__ import print_function import json -import os +import uuid from textwrap import dedent from ceph_volume.util import prepare as prepare_utils from ceph_volume.util import system, disk from ceph_volume import conf, decorators, terminal -from . import api +from ceph_volume.api import lvm as api from .common import prepare_parser def prepare_filestore(device, journal, secrets, id_=None, fsid=None): """ - :param device: The name of the volume group or lvm to work with + :param device: The name of the logical volume to work with :param journal: similar to device but can also be a regular/plain disk :param secrets: A dict with the secrets needed to create the osd (e.g. 
cephx) :param id_: The OSD id @@ -25,7 +25,7 @@ def prepare_filestore(device, journal, secrets, id_=None, fsid=None): # allow re-using an id, in case a prepare failed osd_id = id_ or prepare_utils.create_id(fsid, json_secrets) # create the directory - prepare_utils.create_path(osd_id) + prepare_utils.create_osd_path(osd_id) # format the device prepare_utils.format_device(device) # mount the data device @@ -35,13 +35,42 @@ def prepare_filestore(device, journal, secrets, id_=None, fsid=None): # get the latest monmap prepare_utils.get_monmap(osd_id) # prepare the osd filesystem - prepare_utils.osd_mkfs(osd_id, fsid) + prepare_utils.osd_mkfs_filestore(osd_id, fsid) # write the OSD keyring if it doesn't exist already prepare_utils.write_keyring(osd_id, cephx_secret) -def prepare_bluestore(): - raise NotImplemented() +def prepare_bluestore(block, wal, db, secrets, id_=None, fsid=None): + """ + :param block: The name of the logical volume for the bluestore data + :param wal: a regular/plain disk or logical volume, to be used for block.wal + :param db: a regular/plain disk or logical volume, to be used for block.db + :param secrets: A dict with the secrets needed to create the osd (e.g. cephx) + :param id_: The OSD id + :param fsid: The OSD fsid, also known as the OSD UUID + """ + cephx_secret = secrets.get('cephx_secret', prepare_utils.create_key()) + json_secrets = json.dumps(secrets) + + # allow re-using an existing fsid, in case prepare failed + fsid = fsid or system.generate_uuid() + # allow re-using an id, in case a prepare failed + osd_id = id_ or prepare_utils.create_id(fsid, json_secrets) + # create the directory + prepare_utils.create_osd_path(osd_id, tmpfs=True) + # symlink the block + prepare_utils.link_block(block, osd_id) + # get the latest monmap + prepare_utils.get_monmap(osd_id) + # write the OSD keyring if it doesn't exist already + prepare_utils.write_keyring(osd_id, cephx_secret) + # prepare the osd filesystem + prepare_utils.osd_mkfs_bluestore( + osd_id, fsid, + keyring=cephx_secret, + wal=wal, + db=db + ) class Prepare(object): @@ -51,19 +80,20 @@ class Prepare(object): def __init__(self, argv): self.argv = argv - def get_journal_ptuuid(self, argument): + def get_ptuuid(self, argument): uuid = disk.get_partuuid(argument) if not uuid: terminal.error('blkid could not detect a PARTUUID for device: %s' % argument) - raise RuntimeError('unable to use device for a journal') + raise RuntimeError('unable to use device') return uuid - def get_journal_lv(self, argument): + def get_lv(self, argument): """ - Perform some parsing of the value of ``--journal`` so that the process - can determine correctly if it got a device path or an lv - :param argument: The value of ``--journal``, that will need to be split - to retrieve the actual lv + Perform some parsing of the command-line value so that the process + can determine correctly if it got a device path or an lv. + + :param argument: The command-line value that will need to be split to + retrieve the actual lv """ try: vg_name, lv_name = argument.split('/') @@ -71,6 +101,66 @@ class Prepare(object): return None return api.get_lv(lv_name=lv_name, vg_name=vg_name) + def setup_device(self, device_type, device_name, tags): + """ + Check if ``device`` is an lv, if so, set the tags, making sure to + update the tags with the lv_uuid and lv_path which the incoming tags + will not have. 
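# Illustration of the "ceph." lvm tag convention that prepare() assembles and
# setup_device() completes. The key names are the ones used in this module;
# every value below is a hypothetical example.
example_filestore_tags = {
    'ceph.type': 'data',
    'ceph.osd_id': '0',
    'ceph.osd_fsid': 'a9d50838-e823-43d6-b01f-2f8d0a77afc2',
    'ceph.cluster_fsid': 'fsid-from-ceph-conf',      # hypothetical
    'ceph.cluster_name': 'ceph',
    'ceph.data_device': '/dev/vg0/osd-data',         # lv_path of the data lv
    'ceph.data_uuid': 'lv-uuid-of-data-lv',
    'ceph.journal_device': '/dev/sdc1',              # lv_path or partition path
    'ceph.journal_uuid': 'partuuid-or-lv-uuid',
}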
+ + If the device is not a logical volume, then retrieve the partition UUID + by querying ``blkid`` + """ + if device_name is None: + return '', '', tags + tags['ceph.type'] = device_type + lv = self.get_lv(device_name) + if lv: + uuid = lv.lv_uuid + path = lv.lv_path + tags['ceph.%s_uuid' % device_type] = uuid + tags['ceph.%s_device' % device_type] = path + lv.set_tags(tags) + else: + # otherwise assume this is a regular disk partition + uuid = self.get_ptuuid(device_name) + path = device_name + tags['ceph.%s_uuid' % device_type] = uuid + tags['ceph.%s_device' % device_type] = path + return path, uuid, tags + + def prepare_device(self, arg, device_type, cluster_fsid, osd_fsid): + """ + Check if ``arg`` is a device or partition to create an LV out of it + with a distinct volume group name, assigning LV tags on it and + ultimately, returning the logical volume object. Failing to detect + a device or partition will result in error. + + :param arg: The value of ``--data`` when parsing args + :param device_type: Usually, either ``data`` or ``block`` (filestore vs. bluestore) + :param cluster_fsid: The cluster fsid/uuid + :param osd_fsid: The OSD fsid/uuid + """ + if disk.is_partition(arg) or disk.is_device(arg): + # we must create a vg, and then a single lv + vg_name = "ceph-%s" % cluster_fsid + if api.get_vg(vg_name=vg_name): + # means we already have a group for this, make a different one + # XXX this could end up being annoying for an operator, maybe? + vg_name = "ceph-%s" % str(uuid.uuid4()) + api.create_vg(vg_name, arg) + lv_name = "osd-%s-%s" % (device_type, osd_fsid) + return api.create_lv( + lv_name, + vg_name, # the volume group + tags={'ceph.type': device_type}) + else: + error = [ + 'Cannot use device (%s).', + 'A vg/lv path or an existing device is needed' % arg] + raise RuntimeError(' '.join(error)) + + raise RuntimeError('no data logical volume found with: %s' % arg) + @decorators.needs_root def prepare(self, args): # FIXME we don't allow re-using a keyring, we always generate one for the @@ -80,69 +170,66 @@ class Prepare(object): secrets = {'cephx_secret': prepare_utils.create_key()} cluster_fsid = conf.ceph.get('global', 'fsid') - fsid = args.osd_fsid or system.generate_uuid() - #osd_id = args.osd_id or prepare_utils.create_id(fsid) + osd_fsid = args.osd_fsid or system.generate_uuid() # allow re-using an id, in case a prepare failed - osd_id = args.osd_id or prepare_utils.create_id(fsid, json.dumps(secrets)) - vg_name, lv_name = args.data.split('/') + osd_id = args.osd_id or prepare_utils.create_id(osd_fsid, json.dumps(secrets)) if args.filestore: - data_lv = api.get_lv(lv_name=lv_name, vg_name=vg_name) - - # we must have either an existing data_lv or a newly created, so lets make - # sure that the tags are correct - if not data_lv: - raise RuntimeError('no data logical volume found with: %s' % args.data) - if not args.journal: raise RuntimeError('--journal is required when using --filestore') - journal_lv = self.get_journal_lv(args.journal) - if journal_lv: - journal_device = journal_lv.lv_path - journal_uuid = journal_lv.lv_uuid - # we can only set tags on an lv, the pv (if any) can't as we - # aren't making it part of an lvm group (vg) - journal_lv.set_tags({ - 'ceph.type': 'journal', - 'ceph.osd_fsid': fsid, - 'ceph.osd_id': osd_id, - 'ceph.cluster_fsid': cluster_fsid, - 'ceph.journal_device': journal_device, - 'ceph.journal_uuid': journal_uuid, - 'ceph.data_device': data_lv.lv_path, - 'ceph.data_uuid': data_lv.lv_uuid, - }) - - # allow a file - elif 
os.path.isfile(args.journal): - journal_uuid = '' - journal_device = args.journal - - # otherwise assume this is a regular disk partition - else: - journal_uuid = self.get_journal_ptuuid(args.journal) - journal_device = args.journal + data_lv = self.get_lv(args.data) + if not data_lv: + data_lv = self.prepare_device(args.data, 'data', cluster_fsid, osd_fsid) - data_lv.set_tags({ - 'ceph.type': 'data', - 'ceph.osd_fsid': fsid, + tags = { + 'ceph.osd_fsid': osd_fsid, 'ceph.osd_id': osd_id, 'ceph.cluster_fsid': cluster_fsid, - 'ceph.journal_device': journal_device, - 'ceph.journal_uuid': journal_uuid, + 'ceph.cluster_name': conf.cluster, 'ceph.data_device': data_lv.lv_path, 'ceph.data_uuid': data_lv.lv_uuid, - }) + } + + journal_device, journal_uuid, tags = self.setup_device('journal', args.journal, tags) + + tags['ceph.type'] = 'data' + data_lv.set_tags(tags) prepare_filestore( data_lv.lv_path, journal_device, secrets, id_=osd_id, - fsid=fsid, + fsid=osd_fsid, ) elif args.bluestore: - prepare_bluestore(args) + block_lv = self.get_lv(args.data) + if not block_lv: + block_lv = self.prepare_device(args.data, 'block', cluster_fsid, osd_fsid) + + tags = { + 'ceph.osd_fsid': osd_fsid, + 'ceph.osd_id': osd_id, + 'ceph.cluster_fsid': cluster_fsid, + 'ceph.cluster_name': conf.cluster, + 'ceph.block_device': block_lv.lv_path, + 'ceph.block_uuid': block_lv.lv_uuid, + } + + wal_device, wal_uuid, tags = self.setup_device('wal', args.block_wal, tags) + db_device, db_uuid, tags = self.setup_device('db', args.block_db, tags) + + tags['ceph.type'] = 'block' + block_lv.set_tags(tags) + + prepare_bluestore( + block_lv.lv_path, + wal_device, + db_device, + secrets, + id_=osd_id, + fsid=osd_fsid, + ) def main(self): sub_command_help = dedent(""" @@ -166,17 +253,30 @@ class Prepare(object): Existing logical volume (lv) or device: - ceph-volume lvm prepare --data {logical volume} --journal /path/to/{lv}|{device} + ceph-volume lvm prepare --filestore --data {vg/lv} --journal /path/to/device Or: - ceph-volume lvm prepare --data {data volume group} --journal {journal volume group} + ceph-volume lvm prepare --filestore --data {vg/lv} --journal {vg/lv} + + Existing block device, that will be made a group and logical volume: + + ceph-volume lvm prepare --filestore --data /path/to/device --journal {vg/lv} + + Bluestore + --------- + + Existing logical volume (lv): + + ceph-volume lvm prepare --bluestore --data {vg/lv} + + Existing block device, that will be made a group and logical volume: - Collocated (same group) for data and journal - -------------------------------------------- + ceph-volume lvm prepare --bluestore --data /path/to/device - ceph-volume lvm prepare --data {volume group} + Optionally, can consume db and wal devices or logical volumes: + ceph-volume lvm prepare --bluestore --data {vg/lv} --block.wal {device} --block-db {vg/lv} """) parser = prepare_parser( prog='ceph-volume lvm prepare', @@ -186,4 +286,8 @@ class Prepare(object): print(sub_command_help) return args = parser.parse_args(self.argv) + # Default to bluestore here since defaulting it in add_argument may + # cause both to be True + if args.bluestore is None and args.filestore is None: + args.bluestore = True self.prepare(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/trigger.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/trigger.py index 911162072..dc57011df 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/trigger.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/trigger.py @@ -67,4 +67,4 @@ class 
Trigger(object): args = parser.parse_args(self.argv) osd_id = parse_osd_id(args.systemd_data) osd_uuid = parse_osd_uuid(args.systemd_data) - Activate([osd_id, osd_uuid]).main() + Activate(['--auto-detect-objectstore', osd_id, osd_uuid]).main() diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py new file mode 100644 index 000000000..df19686ff --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -0,0 +1,107 @@ +import argparse +import logging + +from textwrap import dedent + +from ceph_volume import decorators, terminal, process +from ceph_volume.api import lvm as api + +logger = logging.getLogger(__name__) + + +def wipefs(path): + """ + Removes the filesystem from an lv or partition. + """ + process.run([ + 'sudo', + 'wipefs', + '--all', + path + ]) + + +def zap_data(path): + """ + Clears all data from the given path. Path should be + an absolute path to an lv or partition. + + 10M of data is written to the path to make sure that + there is no trace left of any previous Filesystem. + """ + process.run([ + 'dd', + 'if=/dev/zero', + 'of={path}'.format(path=path), + 'bs=1M', + 'count=10', + ]) + + +class Zap(object): + + help = 'Removes all data and filesystems from a logical volume or partition.' + + def __init__(self, argv): + self.argv = argv + + @decorators.needs_root + def zap(self, args): + device = args.device + lv = api.get_lv_from_argument(device) + if lv: + # we are zapping a logical volume + path = lv.lv_path + else: + # we are zapping a partition + #TODO: ensure device is a partition + path = device + + logger.info("Zapping: %s", path) + terminal.write("Zapping: %s" % path) + + wipefs(path) + zap_data(path) + + if lv: + # remove all lvm metadata + lv.clear_tags() + + terminal.success("Zapping successful for: %s" % path) + + def main(self): + sub_command_help = dedent(""" + Zaps the given logical volume or partition. If given a path to a logical + volume it must be in the format of vg/lv. Any filesystems present + on the given lv or partition will be removed and all data will be purged. + + However, the lv or partition will be kept intact. 
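# Condensed sketch of what "ceph-volume lvm zap" ends up doing with the
# helpers defined above: wipe filesystem signatures, overwrite the first 10M
# with zeros and, when the target is a logical volume, drop its ceph.* lvm
# tags. The device name is a hypothetical example; the module paths follow
# the layout introduced by this patch.
from ceph_volume.api import lvm as api
from ceph_volume.devices.lvm import zap

def zap_target(device):                 # e.g. zap_target('/dev/sdc1')
    lv = api.get_lv_from_argument(device)
    path = lv.lv_path if lv else device
    zap.wipefs(path)                    # wipefs --all <path>
    zap.zap_data(path)                  # dd if=/dev/zero of=<path> bs=1M count=10
    if lv:
        lv.clear_tags()                 # lvchange --deltag for each ceph.* tag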
+ + Example calls for supported scenarios: + + Zapping a logical volume: + + ceph-volume lvm zap {vg name/lv name} + + Zapping a partition: + + ceph-volume lvm zap /dev/sdc1 + + """) + parser = argparse.ArgumentParser( + prog='ceph-volume lvm zap', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'device', + metavar='DEVICE', + nargs='?', + help='Path to an lv (as vg/lv) or to a partition like /dev/sda1' + ) + if len(self.argv) == 0: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + self.zap(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/__init__.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/__init__.py new file mode 100644 index 000000000..280e130ed --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/simple/__init__.py @@ -0,0 +1 @@ +from .main import Simple # noqa diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py new file mode 100644 index 000000000..fdc50f0fa --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py @@ -0,0 +1,152 @@ +from __future__ import print_function +import argparse +import json +import logging +import os +from textwrap import dedent +from ceph_volume import process, decorators, terminal +from ceph_volume.util import system, disk +from ceph_volume.systemd import systemctl + + +logger = logging.getLogger(__name__) + + +class Activate(object): + + help = 'Enable systemd units to mount configured devices and start a Ceph OSD' + + def __init__(self, argv, systemd=False): + self.argv = argv + self.systemd = systemd + + @decorators.needs_root + def activate(self, args): + with open(args.json_config, 'r') as fp: + osd_metadata = json.load(fp) + + osd_id = osd_metadata.get('whoami', args.osd_id) + osd_fsid = osd_metadata.get('fsid', args.osd_fsid) + + cluster_name = osd_metadata.get('cluster_name', 'ceph') + osd_dir = '/var/lib/ceph/osd/%s-%s' % (cluster_name, osd_id) + data_uuid = osd_metadata.get('data', {}).get('uuid') + if not data_uuid: + raise RuntimeError( + 'Unable to activate OSD %s - no "uuid" key found for data' % args.osd_id + ) + data_device = disk.get_device_from_partuuid(data_uuid) + journal_device = disk.get_device_from_partuuid(osd_metadata.get('journal', {}).get('uuid')) + block_device = disk.get_device_from_partuuid(osd_metadata.get('block', {}).get('uuid')) + block_db_device = disk.get_device_from_partuuid(osd_metadata.get('block.db', {}).get('uuid')) + block_wal_device = disk.get_device_from_partuuid( + osd_metadata.get('block.wal', {}).get('uuid') + ) + + if not system.device_is_mounted(data_device, destination=osd_dir): + process.run(['sudo', 'mount', '-v', data_device, osd_dir]) + + device_map = { + 'journal': journal_device, + 'block': block_device, + 'block.db': block_db_device, + 'block.wal': block_wal_device + } + + for name, device in device_map.items(): + if not device: + continue + # always re-do the symlink regardless if it exists, so that the journal + # device path that may have changed can be mapped correctly every time + destination = os.path.join(osd_dir, name) + process.run(['sudo', 'ln', '-snf', device, destination]) + + # make sure that the journal has proper permissions + system.chown(device) + + if not self.systemd: + # enable the ceph-volume unit for this OSD + systemctl.enable_volume(osd_id, osd_fsid, 'simple') + + # disable any/all ceph-disk units + systemctl.mask_ceph_disk() + + # enable the 
OSD + systemctl.enable_osd(osd_id) + + # start the OSD + systemctl.start_osd(osd_id) + + if not self.systemd: + terminal.success('Successfully activated OSD %s with FSID %s' % (osd_id, osd_fsid)) + terminal.warning( + ('All ceph-disk systemd units have been disabled to ' + 'prevent OSDs getting triggered by UDEV events') + ) + + def main(self): + sub_command_help = dedent(""" + Activate OSDs by mounting devices previously configured to their + appropriate destination:: + + ceph-volume simple activate {ID} {FSID} + + Or using a JSON file directly:: + + ceph-volume simple activate --file /etc/ceph/osd/{ID}-{FSID}.json + + The OSD must have been "scanned" previously (see ``ceph-volume simple + scan``), so that all needed OSD device information and metadata exist. + + A previously scanned OSD would exist like:: + + /etc/ceph/osd/{ID}-{FSID}.json + + + Environment variables supported: + + CEPH_VOLUME_SIMPLE_JSON_DIR: Directory location for scanned OSD JSON configs + """) + parser = argparse.ArgumentParser( + prog='ceph-volume simple activate', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + parser.add_argument( + 'osd_id', + metavar='ID', + nargs='?', + help='The ID of the OSD, usually an integer, like 0' + ) + parser.add_argument( + 'osd_fsid', + metavar='FSID', + nargs='?', + help='The FSID of the OSD, similar to a SHA1' + ) + parser.add_argument( + '--file', + help='The path to a JSON file, from a scanned OSD' + ) + if len(self.argv) == 0: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + if not args.file: + if not args.osd_id and not args.osd_fsid: + terminal.error('ID and FSID are required to find the right OSD to activate') + terminal.error('from a scanned OSD location in /etc/ceph/osd/') + raise RuntimeError('Unable to activate without both ID and FSID') + # don't allow a CLI flag to specify the JSON dir, because that might + # implicitly indicate that it would be possible to activate a json file + # at a non-default location which would not work at boot time if the + # custom location is not exposed through an ENV var + json_dir = os.environ.get('CEPH_VOLUME_SIMPLE_JSON_DIR', '/etc/ceph/osd/') + if args.file: + json_config = args.file + else: + json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid)) + if not os.path.exists(json_config): + raise RuntimeError('Expected JSON config path not found: %s' % json_config) + args.json_config = json_config + self.activate(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/main.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/main.py new file mode 100644 index 000000000..2119963f8 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/simple/main.py @@ -0,0 +1,41 @@ +import argparse +from textwrap import dedent +from ceph_volume import terminal +from . import scan +from . import activate +from . import trigger + + +class Simple(object): + + help = 'Manage already deployed OSDs with ceph-volume' + + _help = dedent(""" + Take over a deployed OSD, persisting its metadata in /etc/ceph/osd/ so that it can be managed + with ceph-volume directly. Avoids UDEV and ceph-disk handling. 
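# Hedged sketch of the JSON metadata that "ceph-volume simple activate" reads
# from /etc/ceph/osd/{ID}-{FSID}.json. Only the keys activate() consumes are
# shown; paths and uuids are hypothetical, a filestore OSD simply lacks the
# block/block.db/block.wal entries, and a bluestore OSD lacks journal.
example_osd_json = {
    "whoami": "0",
    "fsid": "a9d50838-e823-43d6-b01f-2f8d0a77afc2",
    "cluster_name": "ceph",
    "data":      {"path": "/dev/sda1", "uuid": "data-partuuid-or-lv-uuid"},
    "journal":   {"path": "/dev/sdb1", "uuid": "journal-uuid"},
    "block":     {"path": "/dev/sdc1", "uuid": "block-uuid"},
    "block.db":  {"path": "/dev/sdd1", "uuid": "db-uuid"},
    "block.wal": {"path": "/dev/sdd2", "uuid": "wal-uuid"},
}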
+ + {sub_help} + """) + + mapper = { + 'scan': scan.Scan, + 'activate': activate.Activate, + 'trigger': trigger.Trigger, + } + + def __init__(self, argv): + self.argv = argv + + def print_help(self, sub_help): + return self._help.format(sub_help=sub_help) + + def main(self): + terminal.dispatch(self.mapper, self.argv) + parser = argparse.ArgumentParser( + prog='ceph-volume simple', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=self.print_help(terminal.subhelp(self.mapper)), + ) + parser.parse_args(self.argv) + if len(self.argv) <= 1: + return parser.print_help() diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py new file mode 100644 index 000000000..905baf415 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py @@ -0,0 +1,206 @@ +from __future__ import print_function +import argparse +import json +import logging +import os +from textwrap import dedent +from ceph_volume import decorators, terminal, conf +from ceph_volume.api import lvm +from ceph_volume.util import arg_validators, system, disk + + +logger = logging.getLogger(__name__) + + +class Scan(object): + + help = 'Capture metadata from an OSD data partition or directory' + + def __init__(self, argv): + self.argv = argv + self._etc_path = '/etc/ceph/osd/' + + @property + def etc_path(self): + if os.path.isdir(self._etc_path): + return self._etc_path + + if not os.path.exists(self._etc_path): + os.mkdir(self._etc_path) + return self._etc_path + + error = "OSD Configuration path (%s) needs to be a directory" % self._etc_path + raise RuntimeError(error) + + def get_contents(self, path): + with open(path, 'r') as fp: + contents = fp.readlines() + if len(contents) > 1: + return ''.join(contents) + return ''.join(contents).strip().strip('\n') + + def scan_device(self, path): + device_metadata = {'path': None, 'uuid': None} + if not path: + return device_metadata + # cannot read the symlink if this is tmpfs + if os.path.islink(path): + device = os.readlink(path) + else: + device = path + lvm_device = lvm.get_lv_from_argument(device) + if lvm_device: + device_uuid = lvm_device.lv_uuid + else: + device_uuid = disk.get_partuuid(device) + + device_metadata['uuid'] = device_uuid + device_metadata['path'] = device + + return device_metadata + + def scan_directory(self, path): + osd_metadata = {'cluster_name': conf.cluster} + path_mounts = system.get_mounts(paths=True) + for _file in os.listdir(path): + file_path = os.path.join(path, _file) + if os.path.islink(file_path): + osd_metadata[_file] = self.scan_device(file_path) + if os.path.isdir(file_path): + continue + # the check for binary needs to go before the file, to avoid + # capturing data from binary files but still be able to capture + # contents from actual files later + if system.is_binary(file_path): + continue + if os.path.isfile(file_path): + osd_metadata[_file] = self.get_contents(file_path) + + device = path_mounts.get(path) + # it is possible to have more than one device, pick the first one, and + # warn that it is possible that more than one device is 'data' + if not device: + terminal.error('Unable to detect device mounted for path: %s' % path) + raise RuntimeError('Cannot activate OSD') + osd_metadata['data'] = self.scan_device(device[0] if len(device) else None) + + return osd_metadata + + @decorators.needs_root + def scan(self, args): + osd_metadata = {'cluster_name': conf.cluster} + device_mounts = system.get_mounts(devices=True) + osd_path = None + 
logger.info('detecting if argument is a device or a directory: %s', args.osd_path) + if os.path.isdir(args.osd_path): + logger.info('will scan directly, path is a directory') + osd_path = args.osd_path + else: + # assume this is a device, check if it is mounted and use that path + logger.info('path is not a directory, will check if mounted') + if system.device_is_mounted(args.osd_path): + logger.info('argument is a device, which is mounted') + mounted_osd_paths = device_mounts.get(args.osd_path) + osd_path = mounted_osd_paths[0] if len(mounted_osd_paths) else None + + # argument is not a directory, and it is not a device that is mounted + # somewhere so temporarily mount it to poke inside, otherwise, scan + # directly + if not osd_path: + logger.info('device is not mounted, will mount it temporarily to scan') + with system.tmp_mount(args.osd_path) as osd_path: + osd_metadata = self.scan_directory(osd_path) + else: + logger.info('will scan OSD directory at path: %s', osd_path) + osd_metadata = self.scan_directory(osd_path) + + osd_id = osd_metadata['whoami'] + osd_fsid = osd_metadata['fsid'] + filename = '%s-%s.json' % (osd_id, osd_fsid) + json_path = os.path.join(self.etc_path, filename) + if os.path.exists(json_path) and not args.stdout: + if not args.force: + raise RuntimeError( + '--force was not used and OSD metadata file exists: %s' % json_path + ) + + if args.stdout: + print(json.dumps(osd_metadata, indent=4, sort_keys=True, ensure_ascii=False)) + else: + with open(json_path, 'w') as fp: + json.dump(osd_metadata, fp, indent=4, sort_keys=True, ensure_ascii=False) + terminal.success( + 'OSD %s got scanned and metadata persisted to file: %s' % ( + osd_id, + json_path + ) + ) + terminal.success( + 'To take over managment of this scanned OSD, and disable ceph-disk and udev, run:' + ) + terminal.success(' ceph-volume simple activate %s %s' % (osd_id, osd_fsid)) + + if not osd_metadata.get('data'): + msg = 'Unable to determine device mounted on %s' % args.osd_path + logger.warning(msg) + terminal.warning(msg) + terminal.warning('OSD will not be able to start without this information:') + terminal.warning(' "data": "/path/to/device",') + logger.warning('Unable to determine device mounted on %s' % args.osd_path) + + def main(self): + sub_command_help = dedent(""" + Scan an OSD directory for files and configurations that will allow to + take over the management of the OSD. 
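[Editor's note] The scan routine above derives the destination file name from the scanned metadata itself (the ``whoami`` and ``fsid`` entries), refuses to overwrite an existing file unless ``--force`` is given, and prints to stdout instead of writing when ``--stdout`` is set. A simplified sketch of that persistence step; ``persist_metadata`` is an illustrative name, not a ceph-volume function::

    import json
    import os

    def persist_metadata(osd_metadata, etc_path='/etc/ceph/osd/', force=False, stdout=False):
        # file name comes from the scanned OSD itself: <whoami>-<fsid>.json
        filename = '%s-%s.json' % (osd_metadata['whoami'], osd_metadata['fsid'])
        json_path = os.path.join(etc_path, filename)
        if stdout:
            # --stdout short-circuits any file handling
            print(json.dumps(osd_metadata, indent=4, sort_keys=True))
            return None
        if os.path.exists(json_path) and not force:
            raise RuntimeError('--force was not used and OSD metadata file exists: %s' % json_path)
        with open(json_path, 'w') as fp:
            json.dump(osd_metadata, fp, indent=4, sort_keys=True)
        return json_path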
+ + Scanned OSDs will get their configurations stored in + /etc/ceph/osd/-.json + + For an OSD ID of 0 with fsid of ``a9d50838-e823-43d6-b01f-2f8d0a77afc2`` + that could mean a scan command that looks like:: + + ceph-volume lvm scan /var/lib/ceph/osd/ceph-0 + + Which would store the metadata in a JSON file at:: + + /etc/ceph/osd/0-a9d50838-e823-43d6-b01f-2f8d0a77afc2.json + + To a scan an existing, running, OSD: + + ceph-volume simple scan /var/lib/ceph/osd/{cluster}-{osd id} + + And to scan a device (mounted or unmounted) that has OSD data in it, for example /dev/sda1 + + ceph-volume simple scan /dev/sda1 + """) + parser = argparse.ArgumentParser( + prog='ceph-volume simple scan', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + '-f', '--force', + action='store_true', + help='If OSD has already been scanned, the JSON file will be overwritten' + ) + + parser.add_argument( + '--stdout', + action='store_true', + help='Do not save to a file, output metadata to stdout' + ) + + parser.add_argument( + 'osd_path', + metavar='OSD_PATH', + type=arg_validators.OSDPath(), + nargs='?', + help='Path to an existing OSD directory or OSD data partition' + ) + + if len(self.argv) == 0: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + self.scan(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/trigger.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/trigger.py new file mode 100644 index 000000000..aeb5cf1aa --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/simple/trigger.py @@ -0,0 +1,70 @@ +from __future__ import print_function +import argparse +from textwrap import dedent +from ceph_volume.exceptions import SuffixParsingError +from ceph_volume import decorators +from .activate import Activate + + +def parse_osd_id(string): + osd_id = string.split('-', 1)[0] + if not osd_id: + raise SuffixParsingError('OSD id', string) + if osd_id.isdigit(): + return osd_id + raise SuffixParsingError('OSD id', string) + + +def parse_osd_uuid(string): + osd_id = '%s-' % parse_osd_id(string) + # remove the id first + osd_uuid = string.split(osd_id, 1)[-1] + if not osd_uuid: + raise SuffixParsingError('OSD uuid', string) + return osd_uuid + + +class Trigger(object): + + help = 'systemd helper to activate an OSD' + + def __init__(self, argv): + self.argv = argv + + @decorators.needs_root + def main(self): + sub_command_help = dedent(""" + ** DO NOT USE DIRECTLY ** + This tool is meant to help the systemd unit that knows about OSDs. + + Proxy OSD activation to ``ceph-volume simple activate`` by parsing the + input from systemd, detecting the UUID and ID associated with an OSD:: + + ceph-volume simple trigger {SYSTEMD-DATA} + + The systemd "data" is expected to be in the format of:: + + {OSD ID}-{OSD UUID} + + The devices associated with the OSD need to have been scanned previously, + so that all needed metadata can be used for starting the OSD process. 
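[Editor's note] The trigger helper defined above splits the systemd instance data on the first dash: everything before it must be a numeric OSD id, everything after it is kept whole as the OSD uuid, so uuids containing dashes survive intact (this is exactly what the trigger tests later in this patch exercise). A condensed sketch of the same parsing, assuming a plain ``ValueError`` instead of ceph-volume's ``SuffixParsingError``::

    def parse_systemd_data(string):
        # '0-abc959fd-1ec9-...' -> ('0', 'abc959fd-1ec9-...')
        osd_id, _, osd_uuid = string.partition('-')
        if not osd_id.isdigit() or not osd_uuid:
            raise ValueError('Unable to parse OSD id/uuid from: %s' % string)
        return osd_id, osd_uuid

    # parse_systemd_data('1-abc959fd-1ec9-4864-b141-3154f9b9f8ed')
    # -> ('1', 'abc959fd-1ec9-4864-b141-3154f9b9f8ed')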
+ """) + parser = argparse.ArgumentParser( + prog='ceph-volume simple trigger', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'systemd_data', + metavar='SYSTEMD_DATA', + nargs='?', + help='Data from a systemd unit containing ID and UUID of the OSD, like 0-asdf-lkjh' + ) + if len(self.argv) == 0: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + osd_id = parse_osd_id(args.systemd_data) + osd_uuid = parse_osd_uuid(args.systemd_data) + Activate([osd_id, osd_uuid], systemd=True).main() diff --git a/ceph/src/ceph-volume/ceph_volume/main.py b/ceph/src/ceph-volume/ceph_volume/main.py index d4bee154d..e7ed5d88c 100644 --- a/ceph/src/ceph-volume/ceph_volume/main.py +++ b/ceph/src/ceph-volume/ceph_volume/main.py @@ -27,7 +27,7 @@ Ceph Conf: {ceph_path} """ def __init__(self, argv=None, parse=True): - self.mapper = {'lvm': devices.lvm.LVM} + self.mapper = {'lvm': devices.lvm.LVM, 'simple': devices.simple.Simple} self.plugin_help = "No plugins found/loaded" if argv is None: self.argv = sys.argv diff --git a/ceph/src/ceph-volume/ceph_volume/process.py b/ceph/src/ceph-volume/ceph_volume/process.py index bc5047a17..4b6a9c284 100644 --- a/ceph/src/ceph-volume/ceph_volume/process.py +++ b/ceph/src/ceph-volume/ceph_volume/process.py @@ -48,6 +48,47 @@ def log_descriptors(reads, process, terminal_logging): pass +def obfuscate(command_, on=None): + """ + Certain commands that are useful to log might contain information that + should be replaced by '*' like when creating OSDs and the keyryings are + being passed, which should not be logged. + + :param on: A string (will match a flag) or an integer (will match an index) + + If matching on a flag (when ``on`` is a string) it will obfuscate on the + value for that flag. That is a command like ['ls', '-l', '/'] that calls + `obfuscate(command, on='-l')` will obfustace '/' which is the value for + `-l`. + + The reason for `on` to allow either a string or an integer, altering + behavior for both is because it is easier for ``run`` and ``call`` to just + pop a value to obfuscate (vs. 
allowing an index or a flag) + """ + command = command_[:] + msg = "Running command: %s" % ' '.join(command) + if on in [None, False]: + return msg + + if isinstance(on, int): + index = on + + else: + try: + index = command.index(on) + 1 + except ValueError: + # if the flag just doesn't exist then it doesn't matter just return + # the base msg + return msg + + try: + command[index] = '*' * len(command[index]) + except IndexError: # the index was completely out of range + return msg + + return "Running command: %s" % ' '.join(command) + + def run(command, **kw): """ A real-time-logging implementation of a remote subprocess.Popen call where @@ -57,7 +98,7 @@ def run(command, **kw): :param stop_on_error: If a nonzero exit status is return, it raises a ``RuntimeError`` """ stop_on_error = kw.pop('stop_on_error', True) - command_msg = "Running command: %s" % ' '.join(command) + command_msg = obfuscate(command, kw.pop('obfuscate', None)) stdin = kw.pop('stdin', None) logger.info(command_msg) terminal.write(command_msg) @@ -115,10 +156,12 @@ def call(command, **kw): it is forcefully set to True if a return code is non-zero """ terminal_verbose = kw.pop('terminal_verbose', False) + show_command = kw.pop('show_command', False) command_msg = "Running command: %s" % ' '.join(command) stdin = kw.pop('stdin', None) logger.info(command_msg) - terminal.write(command_msg) + if show_command: + terminal.write(command_msg) process = subprocess.Popen( command, diff --git a/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py b/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py index 9bb4d7d3a..ab8f3e70a 100644 --- a/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py +++ b/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py @@ -20,6 +20,10 @@ def disable(unit): process.run(['sudo', 'systemctl', 'disable', unit]) +def mask(unit): + process.run(['sudo', 'systemctl', 'mask', unit]) + + def start_osd(id_): return start(osd_unit % id_) @@ -40,9 +44,20 @@ def enable_volume(id_, fsid, device_type='lvm'): return enable(volume_unit % (device_type, id_, fsid)) +def mask_ceph_disk(): + # systemctl allows using a glob like '*' for masking, but there was a bug + # in that it wouldn't allow this for service templates. This means that + # masking ceph-disk@* will not work, so we must link the service directly. 
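[Editor's note] The ``obfuscate`` helper introduced above masks a single argument in the logged command line, selected either by flag name or by position, so secrets such as keyrings never reach the log. A rough standalone equivalent showing both modes; ``masked_command`` is an illustrative name, and unlike ceph-volume's version it does not fall back to the unmasked message when the flag or index is missing::

    def masked_command(command, on=None):
        # mask the value following a flag (on='--keyring') or at an index (on=2)
        command = list(command)
        if on is None:
            return ' '.join(command)
        index = on if isinstance(on, int) else command.index(on) + 1
        command[index] = '*' * len(command[index])
        return ' '.join(command)

    # masked_command(['ceph-osd', '--mkfs', '--keyring', 'secretkey'], on='--keyring')
    # -> 'ceph-osd --mkfs --keyring *********'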
+ # /etc/systemd takes precendence regardless of the location of the unit + process.run( + ['sudo', 'ln', '-sf', '/dev/null', '/etc/systemd/system/ceph-disk@.service'] + ) + + # # templates # osd_unit = "ceph-osd@%s" +ceph_disk_unit = "ceph-disk@%s" volume_unit = "ceph-volume@%s-%s-%s" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_api.py b/ceph/src/ceph-volume/ceph_volume/tests/api/test_lvm.py similarity index 84% rename from ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_api.py rename to ceph/src/ceph-volume/ceph_volume/tests/api/test_lvm.py index d6aa54904..3639f01e5 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_api.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/api/test_lvm.py @@ -1,6 +1,6 @@ import pytest from ceph_volume import process, exceptions -from ceph_volume.devices.lvm import api +from ceph_volume.api import lvm as api class TestParseTags(object): @@ -73,6 +73,9 @@ def volumes(monkeypatch): monkeypatch.setattr(process, 'call', lambda x: ('', '', 0)) volumes = api.Volumes() volumes._purge() + # also patch api.Volumes so that when it is called, it will use the newly + # created fixture, with whatever the test method wants to append to it + monkeypatch.setattr(api, 'Volumes', lambda: volumes) return volumes @@ -189,6 +192,20 @@ class TestVolumes(object): with pytest.raises(exceptions.MultipleLVsError): volumes.get(lv_name='foo') + def test_as_dict_infers_type_from_tags(self, volumes): + lv_tags = "ceph.type=data,ceph.fsid=000-aaa" + osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags) + volumes.append(osd) + result = volumes.get(lv_tags={'ceph.type': 'data'}).as_dict() + assert result['type'] == 'data' + + def test_as_dict_populates_path_from_lv_api(self, volumes): + lv_tags = "ceph.type=data,ceph.fsid=000-aaa" + osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags) + volumes.append(osd) + result = volumes.get(lv_tags={'ceph.type': 'data'}).as_dict() + assert result['path'] == '/dev/vg/lv' + def test_find_the_correct_one(self, volumes): volume1 = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags='') volume2 = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='') @@ -311,6 +328,47 @@ class TestVolumeGroups(object): volume_groups.filter() +class TestGetLVFromArgument(object): + + def setup(self): + self.foo_volume = api.Volume( + lv_name='foo', lv_path='/path/to/lv', + vg_name='foo_group', lv_tags='' + ) + + def test_non_absolute_path_is_not_valid(self, volumes): + volumes.append(self.foo_volume) + assert api.get_lv_from_argument('foo') is None + + def test_too_many_slashes_is_invalid(self, volumes): + volumes.append(self.foo_volume) + assert api.get_lv_from_argument('path/to/lv') is None + + def test_absolute_path_is_not_lv(self, volumes): + volumes.append(self.foo_volume) + assert api.get_lv_from_argument('/path') is None + + def test_absolute_path_is_lv(self, volumes): + volumes.append(self.foo_volume) + assert api.get_lv_from_argument('/path/to/lv') == self.foo_volume + + +class TestRemoveLV(object): + + def test_removes_lv(self, monkeypatch): + def mock_call(cmd, **kw): + return ('', '', 0) + monkeypatch.setattr(process, 'call', mock_call) + assert api.remove_lv("vg/lv") + + def test_fails_to_remove_lv(self, monkeypatch): + def mock_call(cmd, **kw): + return ('', '', 1) + monkeypatch.setattr(process, 'call', mock_call) + with pytest.raises(RuntimeError): + api.remove_lv("vg/lv") + + class TestCreateLV(object): def setup(self): @@ -320,7 +378,7 @@ class 
TestCreateLV(object): monkeypatch.setattr(process, 'run', capture) monkeypatch.setattr(process, 'call', capture) monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume) - api.create_lv('foo', 'foo_group', size=5, type='data') + api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'}) expected = ['sudo', 'lvcreate', '--yes', '-L', '5G', '-n', 'foo', 'foo_group'] assert capture.calls[0]['args'][0] == expected @@ -328,7 +386,7 @@ class TestCreateLV(object): monkeypatch.setattr(process, 'run', capture) monkeypatch.setattr(process, 'call', capture) monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume) - api.create_lv('foo', 'foo_group', size=5, type='data') + api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'}) ceph_tag = ['sudo', 'lvchange', '--addtag', 'ceph.type=data', '/path'] assert capture.calls[1]['args'][0] == ceph_tag @@ -336,6 +394,6 @@ class TestCreateLV(object): monkeypatch.setattr(process, 'run', capture) monkeypatch.setattr(process, 'call', capture) monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume) - api.create_lv('foo', 'foo_group', size=5, type='data') + api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'}) data_tag = ['sudo', 'lvchange', '--addtag', 'ceph.data_device=/path', '/path'] assert capture.calls[2]['args'][0] == data_tag diff --git a/ceph/src/ceph-volume/ceph_volume/tests/conftest.py b/ceph/src/ceph-volume/ceph_volume/tests/conftest.py index 7a580e57c..f58033461 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/conftest.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/conftest.py @@ -1,5 +1,7 @@ +import os import pytest -from ceph_volume.devices.lvm import api +from ceph_volume.api import lvm as lvm_api + class Capture(object): @@ -12,6 +14,18 @@ class Capture(object): self.calls.append({'args': a, 'kwargs': kw}) +class Factory(object): + + def __init__(self, **kw): + for k, v in kw.items(): + setattr(self, k, v) + + +@pytest.fixture +def factory(): + return Factory + + @pytest.fixture def capture(): return Capture() @@ -20,7 +34,7 @@ def capture(): @pytest.fixture def volumes(monkeypatch): monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0)) - volumes = api.Volumes() + volumes = lvm_api.Volumes() volumes._purge() return volumes @@ -28,7 +42,7 @@ def volumes(monkeypatch): @pytest.fixture def volume_groups(monkeypatch): monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0)) - vgs = api.VolumeGroups() + vgs = lvm_api.VolumeGroups() vgs._purge() return vgs @@ -40,3 +54,17 @@ def is_root(monkeypatch): is root (or is sudoing to superuser) can continue as-is """ monkeypatch.setattr('os.getuid', lambda: 0) + + +@pytest.fixture +def tmpfile(tmpdir): + """ + Create a temporary file, optionally filling it with contents, returns an + absolute path to the file when called + """ + def generate_file(name='file', contents=''): + path = os.path.join(str(tmpdir), name) + with open(path, 'w') as fp: + fp.write(contents) + return path + return generate_file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py index 40df77576..ce623aac9 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_activate.py @@ -1,10 +1,15 @@ import pytest -from ceph_volume.devices.lvm import activate, api +from ceph_volume.devices.lvm import activate +from ceph_volume.api import lvm as api 
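[Editor's note] The ``factory`` fixture added to conftest.py above and the ``Args`` helper defined just below follow the same pattern: a tiny object whose attributes stand in for an ``argparse.Namespace``, so methods like ``activate()`` can be unit tested without going through the CLI parser. A minimal sketch of that pattern in use; ``FakeArgs`` and the test body are illustrative, only the flag names mirror the real ones::

    class FakeArgs(object):
        # stand-in for argparse.Namespace: defaults first, overrides via kwargs
        def __init__(self, **kw):
            self.bluestore = False
            self.filestore = False
            self.auto_detect_objectstore = None
            for k, v in kw.items():
                setattr(self, k, v)

    def test_filestore_flag_is_honoured():
        args = FakeArgs(osd_id=None, osd_fsid='1234', filestore=True)
        assert args.filestore is True
        assert args.bluestore is False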
class Args(object): def __init__(self, **kw): + # default flags + self.bluestore = False + self.filestore = False + self.auto_detect_objectstore = None for k, v in kw.items(): setattr(self, k, v) @@ -20,7 +25,16 @@ class TestActivate(object): volumes.append(FooVolume) monkeypatch.setattr(api, 'Volumes', lambda: volumes) monkeypatch.setattr(activate, 'activate_filestore', capture) - args = Args(osd_id=None, osd_fsid='1234') + args = Args(osd_id=None, osd_fsid='1234', filestore=True) + activate.Activate([]).activate(args) + assert capture.calls[0]['args'][0] == [FooVolume] + + def test_no_osd_id_matches_fsid_bluestore(self, is_root, volumes, monkeypatch, capture): + FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.osd_fsid=1234") + volumes.append(FooVolume) + monkeypatch.setattr(api, 'Volumes', lambda: volumes) + monkeypatch.setattr(activate, 'activate_bluestore', capture) + args = Args(osd_id=None, osd_fsid='1234', bluestore=True) activate.Activate([]).activate(args) assert capture.calls[0]['args'][0] == [FooVolume] @@ -32,3 +46,33 @@ class TestActivate(object): args = Args(osd_id=None, osd_fsid='1234') with pytest.raises(RuntimeError): activate.Activate([]).activate(args) + + +class TestActivateFlags(object): + + def test_default_objectstore(self, capture): + args = ['0', 'asdf-ljh-asdf'] + activation = activate.Activate(args) + activation.activate = capture + activation.main() + parsed_args = capture.calls[0]['args'][0] + assert parsed_args.filestore is False + assert parsed_args.bluestore is True + + def test_uses_filestore(self, capture): + args = ['--filestore', '0', 'asdf-ljh-asdf'] + activation = activate.Activate(args) + activation.activate = capture + activation.main() + parsed_args = capture.calls[0]['args'][0] + assert parsed_args.filestore is True + assert parsed_args.bluestore is False + + def test_uses_bluestore(self, capture): + args = ['--bluestore', '0', 'asdf-ljh-asdf'] + activation = activate.Activate(args) + activation.activate = capture + activation.main() + parsed_args = capture.calls[0]['args'][0] + assert parsed_args.filestore is False + assert parsed_args.bluestore is True diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py new file mode 100644 index 000000000..b780ea2e9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_listing.py @@ -0,0 +1,176 @@ +import pytest +from ceph_volume.devices import lvm +from ceph_volume.api import lvm as api + + +class TestReadableTag(object): + + def test_dots_get_replaced(self): + result = lvm.listing.readable_tag('ceph.foo') + assert result == 'foo' + + def test_underscores_are_replaced_with_spaces(self): + result = lvm.listing.readable_tag('ceph.long_tag') + assert result == 'long tag' + + +class TestPrettyReport(object): + + def test_is_empty(self, capsys): + lvm.listing.pretty_report({}) + stdout, stderr = capsys.readouterr() + assert stdout == '\n' + + def test_type_and_path_are_reported(self, capsys): + lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]}) + stdout, stderr = capsys.readouterr() + assert '[data] /dev/sda1' in stdout + + def test_osd_id_header_is_reported(self, capsys): + lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]}) + stdout, stderr = capsys.readouterr() + assert '====== osd.0 =======' in stdout + + def test_tags_are_included(self, capsys): + lvm.listing.pretty_report( + {0: [{ + 'type': 'data', + 'path': '/dev/sda1', + 
'tags': {'ceph.osd_id': '0'} + }]} + ) + stdout, stderr = capsys.readouterr() + assert 'osd id' in stdout + + +class TestList(object): + + def test_empty_full_json_zero_exit_status(self, is_root, volumes, factory, capsys): + args = factory(format='json', device=None) + lvm.listing.List([]).list(args) + stdout, stderr = capsys.readouterr() + assert stdout == '{}\n' + + def test_empty_device_json_zero_exit_status(self, is_root, volumes, factory, capsys): + args = factory(format='json', device='/dev/sda1') + lvm.listing.List([]).list(args) + stdout, stderr = capsys.readouterr() + assert stdout == '{}\n' + + def test_empty_full_zero_exit_status(self, is_root, volumes, factory): + args = factory(format='pretty', device=None) + with pytest.raises(SystemExit): + lvm.listing.List([]).list(args) + + def test_empty_device_zero_exit_status(self, is_root, volumes, factory): + args = factory(format='pretty', device='/dev/sda1') + with pytest.raises(SystemExit): + lvm.listing.List([]).list(args) + + +class TestFullReport(object): + + def test_no_ceph_lvs(self, volumes, monkeypatch): + # ceph lvs are detected by looking into its tags + osd = api.Volume(lv_name='volume1', lv_path='/dev/VolGroup/lv', lv_tags={}) + volumes.append(osd) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + result = lvm.listing.List([]).full_report() + assert result == {} + + def test_ceph_data_lv_reported(self, volumes, monkeypatch): + tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(osd) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + result = lvm.listing.List([]).full_report() + assert result['0'][0]['name'] == 'volume1' + + def test_ceph_journal_lv_reported(self, volumes, monkeypatch): + tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data' + journal_tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=journal' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags) + journal = api.Volume( + lv_name='journal', lv_uuid='x', lv_path='/dev/VolGroup/journal', lv_tags=journal_tags) + volumes.append(osd) + volumes.append(journal) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + result = lvm.listing.List([]).full_report() + assert result['0'][0]['name'] == 'volume1' + assert result['0'][1]['name'] == 'journal' + + def test_ceph_wal_lv_reported(self, volumes, monkeypatch): + tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=data' + wal_tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=wal' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags) + wal = api.Volume( + lv_name='wal', lv_uuid='x', lv_path='/dev/VolGroup/wal', lv_tags=wal_tags) + volumes.append(osd) + volumes.append(wal) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + result = lvm.listing.List([]).full_report() + assert result['0'][0]['name'] == 'volume1' + assert result['0'][1]['name'] == 'wal' + + def test_physical_journal_gets_reported(self, volumes, monkeypatch): + tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(osd) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + monkeypatch.setattr(lvm.listing.disk, 'get_device_from_partuuid', lambda x: '/dev/sda1') + result = lvm.listing.List([]).full_report() + assert result['0'][1]['path'] == '/dev/sda1' + assert 
result['0'][1]['tags'] == {'PARTUUID': 'x'} + assert result['0'][1]['type'] == 'journal' + + def test_physical_wal_gets_reported(self, volumes, monkeypatch): + tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(osd) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + monkeypatch.setattr(lvm.listing.disk, 'get_device_from_partuuid', lambda x: '/dev/sda1') + result = lvm.listing.List([]).full_report() + assert result['0'][1]['path'] == '/dev/sda1' + assert result['0'][1]['tags'] == {'PARTUUID': 'x'} + assert result['0'][1]['type'] == 'wal' + + +class TestSingleReport(object): + + def test_not_a_ceph_lv(self, volumes, monkeypatch): + # ceph lvs are detected by looking into its tags + lv = api.Volume( + lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags={}) + volumes.append(lv) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + result = lvm.listing.List([]).single_report('VolGroup/lv') + assert result == {} + + def test_report_a_ceph_lv(self, volumes, monkeypatch): + # ceph lvs are detected by looking into its tags + tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data' + lv = api.Volume( + lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(lv) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + result = lvm.listing.List([]).single_report('VolGroup/lv') + assert result['0'][0]['name'] == 'lv' + assert result['0'][0]['lv_tags'] == tags + assert result['0'][0]['path'] == '/dev/VolGroup/lv' + + def test_report_a_ceph_journal_device(self, volumes, monkeypatch): + # ceph lvs are detected by looking into its tags + tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data,ceph.journal_device=/dev/sda1' + lv = api.Volume( + lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(lv) + monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes) + result = lvm.listing.List([]).single_report('/dev/sda1') + assert result['0'][0]['tags'] == {'PARTUUID': 'x'} + assert result['0'][0]['type'] == 'journal' + assert result['0'][0]['path'] == '/dev/sda1' diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_prepare.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_prepare.py index fabae296a..c69394fd6 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_prepare.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_prepare.py @@ -33,8 +33,9 @@ class TestPrepare(object): with pytest.raises(SystemExit): lvm.prepare.Prepare(argv=['--help']).main() stdout, stderr = capsys.readouterr() - assert 'required arguments:' in stdout - assert 'A logical group name or a path' in stdout + assert 'Use the filestore objectstore' in stdout + assert 'Use the bluestore objectstore' in stdout + assert 'A physical device or logical' in stdout class TestGetJournalLV(object): @@ -43,13 +44,13 @@ class TestGetJournalLV(object): def test_no_journal_on_invalid_path(self, monkeypatch, arg): monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: False) prepare = lvm.prepare.Prepare([]) - assert prepare.get_journal_lv(arg) is None + assert prepare.get_lv(arg) is None def test_no_journal_lv_found(self, monkeypatch): # patch it with 0 so we know we are getting to get_lv monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: 0) prepare = lvm.prepare.Prepare([]) - assert prepare.get_journal_lv('vg/lv') == 0 + 
assert prepare.get_lv('vg/lv') == 0 class TestActivate(object): diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py new file mode 100644 index 000000000..bae3276a9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py @@ -0,0 +1,23 @@ +import os +import pytest +from ceph_volume.devices.simple import activate + + +class TestActivate(object): + + def test_no_data_uuid(self, factory, tmpfile, is_root, monkeypatch, capture): + json_config = tmpfile(contents='{}') + args = factory(osd_id='0', osd_fsid='1234', json_config=json_config) + with pytest.raises(RuntimeError): + activate.Activate([]).activate(args) + + def test_invalid_json_path(self): + os.environ['CEPH_VOLUME_SIMPLE_JSON_DIR'] = '/non/existing/path' + with pytest.raises(RuntimeError) as error: + activate.Activate(['1', 'asdf']).main() + assert 'RuntimeError: Expected JSON config path not found' in str(error) + + def test_main_spits_help_with_no_arguments(self, capsys): + activate.Activate([]).main() + stdout, stderr = capsys.readouterr() + assert 'Activate OSDs by mounting devices previously configured' in stdout diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py new file mode 100644 index 000000000..d68fe63cb --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py @@ -0,0 +1,52 @@ +import os +import pytest +from ceph_volume.devices.simple import scan + + +class TestScan(object): + + def test_main_spits_help_with_no_arguments(self, capsys): + scan.Scan([]).main() + stdout, stderr = capsys.readouterr() + assert 'Scan an OSD directory for files' in stdout + + +class TestGetContents(object): + + def test_multiple_lines_are_left_as_is(self, tmpfile): + magic_file = tmpfile(contents='first\nsecond\n') + scanner = scan.Scan([]) + assert scanner.get_contents(magic_file) == 'first\nsecond\n' + + def test_extra_whitespace_gets_removed(self, tmpfile): + magic_file = tmpfile(contents='first ') + scanner = scan.Scan([]) + assert scanner.get_contents(magic_file) == 'first' + + def test_single_newline_values_are_trimmed(self, tmpfile): + magic_file = tmpfile(contents='first\n') + scanner = scan.Scan([]) + assert scanner.get_contents(magic_file) == 'first' + + +class TestEtcPath(object): + + def test_directory_is_valid(self, tmpdir): + path = str(tmpdir) + scanner = scan.Scan([]) + scanner._etc_path = path + assert scanner.etc_path == path + + def test_directory_does_not_exist_gets_created(self, tmpdir): + path = os.path.join(str(tmpdir), 'subdir') + scanner = scan.Scan([]) + scanner._etc_path = path + assert scanner.etc_path == path + assert os.path.isdir(path) + + def test_complains_when_file(self, tmpfile): + path = tmpfile() + scanner = scan.Scan([]) + scanner._etc_path = path + with pytest.raises(RuntimeError): + scanner.etc_path diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_trigger.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_trigger.py new file mode 100644 index 000000000..d3220f2b0 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_trigger.py @@ -0,0 +1,45 @@ +import pytest +from ceph_volume import exceptions +from ceph_volume.devices.simple import trigger + + +class TestParseOSDid(object): + + def test_no_id_found_if_no_digit(self): + with pytest.raises(exceptions.SuffixParsingError): + 
trigger.parse_osd_id('asdlj-ljahsdfaslkjhdfa') + + def test_no_id_found(self): + with pytest.raises(exceptions.SuffixParsingError): + trigger.parse_osd_id('ljahsdfaslkjhdfa') + + def test_id_found(self): + result = trigger.parse_osd_id('1-ljahsdfaslkjhdfa') + assert result == '1' + + +class TestParseOSDUUID(object): + + def test_uuid_is_parsed(self): + result = trigger.parse_osd_uuid('1-asdf-ljkh-asdf-ljkh-asdf') + assert result == 'asdf-ljkh-asdf-ljkh-asdf' + + def test_uuid_is_parsed_longer_sha1(self): + result = trigger.parse_osd_uuid('1-foo-bar-asdf-ljkh-asdf-ljkh-asdf') + assert result == 'foo-bar-asdf-ljkh-asdf-ljkh-asdf' + + def test_uuid_is_not_found(self): + with pytest.raises(exceptions.SuffixParsingError): + trigger.parse_osd_uuid('ljahsdfaslkjhdfa') + + def test_uuid_is_not_found_missing_id(self): + with pytest.raises(exceptions.SuffixParsingError): + trigger.parse_osd_uuid('ljahs-dfa-slkjhdfa-foo') + + def test_robust_double_id_in_uuid(self): + # it is possible to have the id in the SHA1, this should + # be fine parsing that + result = trigger.parse_osd_uuid("1-abc959fd-1ec9-4864-b141-3154f9b9f8ed") + assert result == 'abc959fd-1ec9-4864-b141-3154f9b9f8ed' + + diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py new file mode 100644 index 000000000..bc26e33da --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py @@ -0,0 +1,17 @@ +import pytest +from ceph_volume.devices import lvm + + +class TestZap(object): + + def test_main_spits_help_with_no_arguments(self, capsys): + lvm.zap.Zap([]).main() + stdout, stderr = capsys.readouterr() + assert 'Zaps the given logical volume or partition' in stdout + + def test_main_shows_full_help(self, capsys): + with pytest.raises(SystemExit): + lvm.zap.Zap(argv=['--help']).main() + stdout, stderr = capsys.readouterr() + assert 'optional arguments' in stdout + assert 'positional arguments' in stdout diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/Vagrantfile deleted file mode 120000 index 2572fa2c9..000000000 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/Vagrantfile +++ /dev/null @@ -1 +0,0 @@ -../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all new file mode 100644 index 000000000..17e9044e1 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/group_vars/all @@ -0,0 +1,28 @@ +--- + +ceph_dev: True +cluster: ceph +public_network: "192.168.3.0/24" +cluster_network: "192.168.4.0/24" +monitor_interface: eth1 +journal_size: 100 +osd_objectstore: "bluestore" +osd_scenario: lvm +ceph_origin: 'repository' +ceph_repository: 'dev' +copy_admin_key: true +lvm_volumes: + - data: data-lv1 + data_vg: test_group + - data: data-lv2 + data_vg: test_group + db: journal1 + 
db_vg: journals + - data: /dev/sdd1 +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } +ceph_conf_overrides: + global: + osd_pool_default_pg_num: 8 + osd_pool_default_size: 1 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/hosts similarity index 100% rename from ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/hosts rename to ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/hosts diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/setup.yml new file mode 120000 index 000000000..1c1a3ce8d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/setup.yml @@ -0,0 +1 @@ +../../../playbooks/setup_partitions.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/vagrant_variables.yml similarity index 100% rename from ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/vagrant_variables.yml rename to ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/vagrant_variables.yml diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/group_vars/all similarity index 94% rename from ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/group_vars/all rename to ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/group_vars/all index e7c1f7230..e7ff18ed1 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/centos7/create/group_vars/all +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/group_vars/all @@ -20,6 +20,8 @@ lvm_volumes: journal: journal1 data_vg: test_group journal_vg: journals + - data: /dev/sdd1 + journal: /dev/sdd2 os_tuning_params: - { name: kernel.pid_max, value: 4194303 } - { name: fs.file-max, value: 26234859 } diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/hosts similarity index 100% rename from ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/hosts rename to ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/hosts diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/setup.yml new file mode 120000 index 000000000..1c1a3ce8d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/setup.yml @@ -0,0 +1 @@ +../../../playbooks/setup_partitions.yml \ No newline at end of file 
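[Editor's note] The ``lvm_volumes`` entries in the group_vars above deliberately mix two forms: an LV name paired with a ``data_vg`` (consumed as ``vg/lv``) and a raw partition path such as ``/dev/sdd1``; ceph-volume accepts both forms for its data argument, and the journal/db entries follow the same convention. A small illustrative helper (not part of ceph-ansible or ceph-volume) showing how the two forms differ::

    def data_argument(entry):
        # 'vg/lv' when a volume group is given, otherwise the raw device/partition path
        if entry.get('data_vg'):
            return '%s/%s' % (entry['data_vg'], entry['data'])
        return entry['data']

    # data_argument({'data': 'data-lv1', 'data_vg': 'test_group'}) -> 'test_group/data-lv1'
    # data_argument({'data': '/dev/sdd1'})                         -> '/dev/sdd1'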
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/vagrant_variables.yml new file mode 100644 index 000000000..7d1a4449a --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/vagrant_variables.yml @@ -0,0 +1,56 @@ +--- + +# DEFINE THE NUMBER OF VMS TO RUN +mon_vms: 1 +osd_vms: 1 +mds_vms: 0 +rgw_vms: 0 +nfs_vms: 0 +rbd_mirror_vms: 0 +client_vms: 0 +iscsi_gw_vms: 0 +mgr_vms: 0 + +# SUBNETS TO USE FOR THE VMS +public_subnet: 192.168.3 +cluster_subnet: 192.168.4 + +# MEMORY +# set 1024 for CentOS +memory: 512 + +# Ethernet interface name +# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial +eth: 'eth1' + + +# VAGRANT BOX +# Ceph boxes are *strongly* suggested. They are under better control and will +# not get updated frequently unless required for build systems. These are (for +# now): +# +# * ceph/ubuntu-xenial +# +# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64 +# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet +# libvirt CentOS: centos/7 +# parallels Ubuntu: parallels/ubuntu-14.04 +# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller' +# For more boxes have a look at: +# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q= +# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/ +vagrant_box: centos/7 +#ssh_private_key_path: "~/.ssh/id_rsa" +# The sync directory changes based on vagrant box +# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant +#vagrant_sync_dir: /home/vagrant/sync +#vagrant_sync_dir: / +# Disables synced folder creation. Not needed for testing, will skip mounting +# the vagrant directory on the remote box regardless of the provider. +vagrant_disable_synced_folder: true +# VAGRANT URL +# This is a URL to download an image from an alternate location. vagrant_box +# above should be set to the filename of the image. 
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box +# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box +# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/setup_partitions.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/setup_partitions.yml new file mode 100644 index 000000000..37a48949b --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/setup_partitions.yml @@ -0,0 +1,27 @@ +--- + +- hosts: osds + gather_facts: false + become: yes + tasks: + + - name: partition /dev/sdd for lvm data usage + parted: + device: /dev/sdd + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + label: gpt + state: present + + - name: partition /dev/sdd lvm journals + parted: + device: /dev/sdd + number: 2 + part_start: 50% + part_end: 100% + unit: '%' + state: present + label: gpt + diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini new file mode 100644 index 000000000..797138f7d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini @@ -0,0 +1,59 @@ +[tox] +envlist = {centos7,xenial}-{filestore,bluestore}-{create,prepare_activate} +skipsdist = True + +[testenv] +whitelist_externals = + vagrant + bash + git +passenv=* +setenv= + ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config + ANSIBLE_STDOUT_CALLBACK = debug + ANSIBLE_RETRY_FILES_ENABLED = False + VAGRANT_CWD = {changedir} + CEPH_VOLUME_DEBUG = 1 +deps= + ansible==2.4.1 + testinfra==1.7.1 + pytest-xdist +changedir= + centos7-filestore-create: {toxinidir}/centos7/filestore/create + centos7-bluestore-create: {toxinidir}/centos7/bluestore/create + xenial-filestore-create: {toxinidir}/xenial/filestore/create + xenial-bluestore-create: {toxinidir}/xenial/bluestore/create + # TODO: these are placeholders for now, eventually we want to + # test the prepare/activate workflow of ceph-volume as well + xenial-filestore-prepare_activate: {toxinidir}/xenial/filestore/prepare_activate + xenial-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate + centos7-filestore-prepare_activate: {toxinidir}/xenial/filestore/prepare_activate + centos7-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate +commands= + git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible + + vagrant up --no-provision {posargs:--provider=virtualbox} + bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir} + + # create logical volumes to test with on the vms + ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/lvm_setup.yml + + # ad-hoc/local test setup for lvm + ansible-playbook -vv -i {changedir}/hosts {changedir}/setup.yml + + # use ceph-ansible to deploy a ceph cluster on the vms + ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}" + + # prepare nodes for testing with testinfra + 
ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml + + # test cluster state using ceph-ansible tests + testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + + # reboot all vms + vagrant reload --no-provision + + # retest to ensure cluster came back up correctly after rebooting + testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + + vagrant destroy --force diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all new file mode 100644 index 000000000..17e9044e1 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/group_vars/all @@ -0,0 +1,28 @@ +--- + +ceph_dev: True +cluster: ceph +public_network: "192.168.3.0/24" +cluster_network: "192.168.4.0/24" +monitor_interface: eth1 +journal_size: 100 +osd_objectstore: "bluestore" +osd_scenario: lvm +ceph_origin: 'repository' +ceph_repository: 'dev' +copy_admin_key: true +lvm_volumes: + - data: data-lv1 + data_vg: test_group + - data: data-lv2 + data_vg: test_group + db: journal1 + db_vg: journals + - data: /dev/sdd1 +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } +ceph_conf_overrides: + global: + osd_pool_default_pg_num: 8 + osd_pool_default_size: 1 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/hosts new file mode 100644 index 000000000..f6a265ab3 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/hosts @@ -0,0 +1,5 @@ +[mons] +mon0 + +[osds] +osd0 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/setup.yml new file mode 120000 index 000000000..1c1a3ce8d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/setup.yml @@ -0,0 +1 @@ +../../../playbooks/setup_partitions.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/vagrant_variables.yml new file mode 100644 index 000000000..7252344dd --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/vagrant_variables.yml @@ -0,0 +1,56 @@ +--- + +# DEFINE THE NUMBER OF VMS TO RUN +mon_vms: 1 +osd_vms: 1 +mds_vms: 0 +rgw_vms: 0 +nfs_vms: 0 +rbd_mirror_vms: 0 +client_vms: 0 +iscsi_gw_vms: 0 +mgr_vms: 0 + +# SUBNETS TO USE FOR THE VMS +public_subnet: 192.168.3 +cluster_subnet: 192.168.4 + +# MEMORY +# set 1024 for CentOS +memory: 512 + +# Ethernet interface name +# use eth1 for libvirt and ubuntu 
precise, enp0s8 for CentOS and ubuntu xenial +eth: 'eth1' + + +# VAGRANT BOX +# Ceph boxes are *strongly* suggested. They are under better control and will +# not get updated frequently unless required for build systems. These are (for +# now): +# +# * ceph/ubuntu-xenial +# +# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64 +# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet +# libvirt CentOS: centos/7 +# parallels Ubuntu: parallels/ubuntu-14.04 +# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller' +# For more boxes have a look at: +# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q= +# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/ +vagrant_box: ceph/ubuntu-xenial +#ssh_private_key_path: "~/.ssh/id_rsa" +# The sync directory changes based on vagrant box +# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant +#vagrant_sync_dir: /home/vagrant/sync +#vagrant_sync_dir: / +# Disables synced folder creation. Not needed for testing, will skip mounting +# the vagrant directory on the remote box regardless of the provider. +vagrant_disable_synced_folder: true +# VAGRANT URL +# This is a URL to download an image from an alternate location. vagrant_box +# above should be set to the filename of the image. +# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box +# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box +# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/group_vars/all similarity index 94% rename from ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/group_vars/all rename to ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/group_vars/all index e7c1f7230..e7ff18ed1 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/group_vars/all +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/group_vars/all @@ -20,6 +20,8 @@ lvm_volumes: journal: journal1 data_vg: test_group journal_vg: journals + - data: /dev/sdd1 + journal: /dev/sdd2 os_tuning_params: - { name: kernel.pid_max, value: 4194303 } - { name: fs.file-max, value: 26234859 } diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/hosts new file mode 100644 index 000000000..f6a265ab3 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/hosts @@ -0,0 +1,5 @@ +[mons] +mon0 + +[osds] 
+osd0 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/setup.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/setup.yml new file mode 120000 index 000000000..1c1a3ce8d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/setup.yml @@ -0,0 +1 @@ +../../../playbooks/setup_partitions.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/vagrant_variables.yml similarity index 100% rename from ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/vagrant_variables.yml rename to ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/vagrant_variables.yml diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all new file mode 100644 index 000000000..560c8b03b --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/group_vars/all @@ -0,0 +1,19 @@ +--- + +ceph_dev: True +cluster: test +public_network: "192.168.1.0/24" +cluster_network: "192.168.2.0/24" +monitor_interface: eth1 +journal_size: 100 +osd_objectstore: "bluestore" +ceph_origin: 'repository' +ceph_repository: 'dev' +copy_admin_key: true +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } +ceph_conf_overrides: + global: + osd_pool_default_pg_num: 8 + osd_pool_default_size: 1 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd0.yml new file mode 100644 index 000000000..2e1c7ee9e --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd0.yml @@ -0,0 +1,7 @@ +--- + +devices: + - '/dev/sdb' +dedicated_devices: + - '/dev/sdc' +osd_scenario: "non-collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd1.yml new file mode 100644 index 000000000..7e90071c9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/host_vars/osd1.yml @@ -0,0 +1,6 @@ +--- + +devices: + - '/dev/sdb' + - '/dev/sdc' +osd_scenario: "collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/hosts new file mode 100644 index 000000000..e0c08b946 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/hosts @@ -0,0 +1,9 @@ +[mons] +mon0 monitor_interface=eth1 + 
+[osds] +osd0 +osd1 + +[mgrs] +mon0 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/test.yml new file mode 100644 index 000000000..24e2c0353 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/test.yml @@ -0,0 +1,31 @@ +--- + +- hosts: osds + become: yes + tasks: + + - name: list all OSD directories + find: + paths: /var/lib/ceph/osd + file_type: directory + register: osd_paths + + - name: scan all OSD directories + command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_paths.files }}" + + - name: list all OSD JSON files + find: + paths: /etc/ceph/osd + file_type: file + register: osd_configs + + - name: activate all scanned OSDs + command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_configs.files }}" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/vagrant_variables.yml new file mode 100644 index 000000000..63700c3c9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/activate/vagrant_variables.yml @@ -0,0 +1,73 @@ +--- + +# DEPLOY CONTAINERIZED DAEMONS +docker: false + +# DEFINE THE NUMBER OF VMS TO RUN +mon_vms: 1 +osd_vms: 2 +mds_vms: 0 +rgw_vms: 0 +nfs_vms: 0 +rbd_mirror_vms: 0 +client_vms: 0 +iscsi_gw_vms: 0 +mgr_vms: 0 + + +# INSTALL SOURCE OF CEPH +# valid values are 'stable' and 'dev' +ceph_install_source: stable + +# SUBNETS TO USE FOR THE VMS +public_subnet: 192.168.1 +cluster_subnet: 192.168.2 + +# MEMORY +# set 1024 for CentOS +memory: 512 + +# Ethernet interface name +# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial +eth: 'eth1' + +# Disks +# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]" +# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]" +disks: "[ '/dev/sdb', '/dev/sdc' ]" + +# VAGRANT BOX +# Ceph boxes are *strongly* suggested. They are under better control and will +# not get updated frequently unless required for build systems. These are (for +# now): +# +# * ceph/ubuntu-xenial +# +# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64 +# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet +# libvirt CentOS: centos/7 +# parallels Ubuntu: parallels/ubuntu-14.04 +# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller' +# For more boxes have a look at: +# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q= +# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/ +vagrant_box: centos/7 +#ssh_private_key_path: "~/.ssh/id_rsa" +# The sync directory changes based on vagrant box +# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant +#vagrant_sync_dir: /home/vagrant/sync +#vagrant_sync_dir: / +# Disables synced folder creation. Not needed for testing, will skip mounting +# the vagrant directory on the remote box regardless of the provider. +vagrant_disable_synced_folder: true +# VAGRANT URL +# This is a URL to download an image from an alternate location. vagrant_box +# above should be set to the filename of the image. 
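[Editor's note] The ``test.yml`` playbook above drives the takeover flow end to end: every directory under ``/var/lib/ceph/osd`` is scanned, then every resulting JSON file under ``/etc/ceph/osd`` is activated, with ``CEPH_VOLUME_DEBUG=1`` exported for both steps. A roughly equivalent Python loop, shown only to make the flow explicit (the cluster name ``test`` comes from the group_vars above; the script itself is illustrative)::

    import glob
    import os
    import subprocess

    env = dict(os.environ, CEPH_VOLUME_DEBUG='1')

    # scan every deployed OSD directory
    for osd_dir in glob.glob('/var/lib/ceph/osd/*'):
        if os.path.isdir(osd_dir):
            subprocess.check_call(
                ['ceph-volume', '--cluster=test', 'simple', 'scan', osd_dir], env=env)

    # activate every scanned OSD from its persisted JSON file
    for json_file in glob.glob('/etc/ceph/osd/*.json'):
        subprocess.check_call(
            ['ceph-volume', '--cluster=test', 'simple', 'activate', '--file', json_file], env=env)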
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box +# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box +# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box + +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } + diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all new file mode 100644 index 000000000..8902bdda3 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/group_vars/all @@ -0,0 +1,19 @@ +--- + +ceph_dev: True +cluster: test +public_network: "192.168.1.0/24" +cluster_network: "192.168.2.0/24" +monitor_interface: eth1 +journal_size: 100 +osd_objectstore: "filestore" +ceph_origin: 'repository' +ceph_repository: 'dev' +copy_admin_key: true +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } +ceph_conf_overrides: + global: + osd_pool_default_pg_num: 8 + osd_pool_default_size: 1 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd0.yml new file mode 100644 index 000000000..2e1c7ee9e --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd0.yml @@ -0,0 +1,7 @@ +--- + +devices: + - '/dev/sdb' +dedicated_devices: + - '/dev/sdc' +osd_scenario: "non-collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd1.yml new file mode 100644 index 000000000..7e90071c9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/host_vars/osd1.yml @@ -0,0 +1,6 @@ +--- + +devices: + - '/dev/sdb' + - '/dev/sdc' +osd_scenario: "collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/hosts new file mode 100644 index 000000000..e0c08b946 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/hosts @@ -0,0 +1,9 @@ +[mons] +mon0 monitor_interface=eth1 + +[osds] +osd0 +osd1 + +[mgrs] +mon0 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml new file 
mode 100644 index 000000000..24e2c0353 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml @@ -0,0 +1,31 @@ +--- + +- hosts: osds + become: yes + tasks: + + - name: list all OSD directories + find: + paths: /var/lib/ceph/osd + file_type: directory + register: osd_paths + + - name: scan all OSD directories + command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_paths.files }}" + + - name: list all OSD JSON files + find: + paths: /etc/ceph/osd + file_type: file + register: osd_configs + + - name: activate all scanned OSDs + command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_configs.files }}" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/vagrant_variables.yml new file mode 100644 index 000000000..63700c3c9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/vagrant_variables.yml @@ -0,0 +1,73 @@ +--- + +# DEPLOY CONTAINERIZED DAEMONS +docker: false + +# DEFINE THE NUMBER OF VMS TO RUN +mon_vms: 1 +osd_vms: 2 +mds_vms: 0 +rgw_vms: 0 +nfs_vms: 0 +rbd_mirror_vms: 0 +client_vms: 0 +iscsi_gw_vms: 0 +mgr_vms: 0 + + +# INSTALL SOURCE OF CEPH +# valid values are 'stable' and 'dev' +ceph_install_source: stable + +# SUBNETS TO USE FOR THE VMS +public_subnet: 192.168.1 +cluster_subnet: 192.168.2 + +# MEMORY +# set 1024 for CentOS +memory: 512 + +# Ethernet interface name +# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial +eth: 'eth1' + +# Disks +# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]" +# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]" +disks: "[ '/dev/sdb', '/dev/sdc' ]" + +# VAGRANT BOX +# Ceph boxes are *strongly* suggested. They are under better control and will +# not get updated frequently unless required for build systems. These are (for +# now): +# +# * ceph/ubuntu-xenial +# +# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64 +# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet +# libvirt CentOS: centos/7 +# parallels Ubuntu: parallels/ubuntu-14.04 +# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller' +# For more boxes have a look at: +# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q= +# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/ +vagrant_box: centos/7 +#ssh_private_key_path: "~/.ssh/id_rsa" +# The sync directory changes based on vagrant box +# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant +#vagrant_sync_dir: /home/vagrant/sync +#vagrant_sync_dir: / +# Disables synced folder creation. Not needed for testing, will skip mounting +# the vagrant directory on the remote box regardless of the provider. +vagrant_disable_synced_folder: true +# VAGRANT URL +# This is a URL to download an image from an alternate location. vagrant_box +# above should be set to the filename of the image. 
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box +# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box +# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box + +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } + diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini similarity index 68% rename from ceph/src/ceph-volume/ceph_volume/tests/functional/tox.ini rename to ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini index 6e0dfbf2d..0d2e68adc 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/tox.ini +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = {centos7,xenial}-{create,prepare_activate} +envlist = {centos7,xenial}-{filestore,bluestore}-{activate} skipsdist = True [testenv] @@ -15,24 +15,19 @@ setenv= VAGRANT_CWD = {changedir} CEPH_VOLUME_DEBUG = 1 deps= - ansible==2.3.1 - testinfra==1.6.0 + ansible==2.4.1 + testinfra==1.7.1 pytest-xdist changedir= - centos7-create: {toxinidir}/centos7/create - xenial-create: {toxinidir}/xenial/create - # TODO: these are placeholders for now, eventually we want to - # test the prepare/activate workflow of ceph-volume as well - xenial-prepare_activate: {toxinidir}/xenial/prepare_activate - centos7-prepare_activate: {toxinidir}/xenial/prepare_activate + centos7-filestore-activate: {toxinidir}/centos7/filestore/activate + centos7-bluestore-activate: {toxinidir}/centos7/bluestore/activate + xenial-filestore-activate: {toxinidir}/xenial/filestore/activate + xenial-bluestore-activate: {toxinidir}/xenial/bluestore/activate commands= git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible vagrant up --no-provision {posargs:--provider=virtualbox} - bash {toxinidir}/scripts/generate_ssh_config.sh {changedir} - - # create logical volumes to test with on the vms - ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/lvm_setup.yml + bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir} # use ceph-ansible to deploy a ceph cluster on the vms ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}" @@ -43,6 +38,9 @@ commands= # test cluster state using ceph-ansible tests testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # make ceph-volume simple take over all the OSDs that got deployed, disabling ceph-disk + ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml + # reboot all vms vagrant reload --no-provision diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ 
b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all new file mode 100644 index 000000000..560c8b03b --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/group_vars/all @@ -0,0 +1,19 @@ +--- + +ceph_dev: True +cluster: test +public_network: "192.168.1.0/24" +cluster_network: "192.168.2.0/24" +monitor_interface: eth1 +journal_size: 100 +osd_objectstore: "bluestore" +ceph_origin: 'repository' +ceph_repository: 'dev' +copy_admin_key: true +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } +ceph_conf_overrides: + global: + osd_pool_default_pg_num: 8 + osd_pool_default_size: 1 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd0.yml new file mode 100644 index 000000000..2e1c7ee9e --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd0.yml @@ -0,0 +1,7 @@ +--- + +devices: + - '/dev/sdb' +dedicated_devices: + - '/dev/sdc' +osd_scenario: "non-collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd1.yml new file mode 100644 index 000000000..7e90071c9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/host_vars/osd1.yml @@ -0,0 +1,6 @@ +--- + +devices: + - '/dev/sdb' + - '/dev/sdc' +osd_scenario: "collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/hosts new file mode 100644 index 000000000..e0c08b946 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/hosts @@ -0,0 +1,9 @@ +[mons] +mon0 monitor_interface=eth1 + +[osds] +osd0 +osd1 + +[mgrs] +mon0 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/test.yml new file mode 100644 index 000000000..24e2c0353 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/test.yml @@ -0,0 +1,31 @@ +--- + +- hosts: osds + become: yes + tasks: + + - name: list all OSD directories + find: + paths: /var/lib/ceph/osd + file_type: directory + register: osd_paths + + - name: scan all OSD directories + command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_paths.files }}" + + - name: list all OSD JSON files + find: + paths: /etc/ceph/osd + file_type: file + register: osd_configs + + - name: activate all scanned OSDs + command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_configs.files }}" diff --git 
a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/vagrant_variables.yml new file mode 100644 index 000000000..b4aa759ab --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/bluestore/activate/vagrant_variables.yml @@ -0,0 +1,73 @@ +--- + +# DEPLOY CONTAINERIZED DAEMONS +docker: false + +# DEFINE THE NUMBER OF VMS TO RUN +mon_vms: 1 +osd_vms: 2 +mds_vms: 0 +rgw_vms: 0 +nfs_vms: 0 +rbd_mirror_vms: 0 +client_vms: 0 +iscsi_gw_vms: 0 +mgr_vms: 0 + + +# INSTALL SOURCE OF CEPH +# valid values are 'stable' and 'dev' +ceph_install_source: stable + +# SUBNETS TO USE FOR THE VMS +public_subnet: 192.168.1 +cluster_subnet: 192.168.2 + +# MEMORY +# set 1024 for CentOS +memory: 512 + +# Ethernet interface name +# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial +eth: 'eth1' + +# Disks +# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]" +# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]" +disks: "[ '/dev/sdb', '/dev/sdc' ]" + +# VAGRANT BOX +# Ceph boxes are *strongly* suggested. They are under better control and will +# not get updated frequently unless required for build systems. These are (for +# now): +# +# * ceph/ubuntu-xenial +# +# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64 +# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet +# libvirt CentOS: centos/7 +# parallels Ubuntu: parallels/ubuntu-14.04 +# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller' +# For more boxes have a look at: +# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q= +# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/ +vagrant_box: ceph/ubuntu-xenial +#ssh_private_key_path: "~/.ssh/id_rsa" +# The sync directory changes based on vagrant box +# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant +#vagrant_sync_dir: /home/vagrant/sync +#vagrant_sync_dir: / +# Disables synced folder creation. Not needed for testing, will skip mounting +# the vagrant directory on the remote box regardless of the provider. +vagrant_disable_synced_folder: true +# VAGRANT URL +# This is a URL to download an image from an alternate location. vagrant_box +# above should be set to the filename of the image. 
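The relocated tox.ini a few hunks above is what ties these new directories together: the envlist now expands across centos7/xenial and filestore/bluestore, and each environment changes into the matching <distro>/<objectstore>/activate directory before deploying with ceph-ansible and running the test.yml takeover playbook. A tiny sketch of the env-name to test-directory mapping implied by that envlist (illustrative only; tox performs this expansion itself):

    # Mapping implied by envlist = {centos7,xenial}-{filestore,bluestore}-{activate}
    from itertools import product

    envs = {}
    for distro, store in product(('centos7', 'xenial'), ('filestore', 'bluestore')):
        env_name = '%s-%s-activate' % (distro, store)
        envs[env_name] = '%s/%s/activate' % (distro, store)

    for env_name, changedir in sorted(envs.items()):
        print(env_name, '->', changedir)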
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box +# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box +# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box + +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } + diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/Vagrantfile new file mode 120000 index 000000000..16076e424 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/Vagrantfile @@ -0,0 +1 @@ +../../../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all new file mode 100644 index 000000000..8902bdda3 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/group_vars/all @@ -0,0 +1,19 @@ +--- + +ceph_dev: True +cluster: test +public_network: "192.168.1.0/24" +cluster_network: "192.168.2.0/24" +monitor_interface: eth1 +journal_size: 100 +osd_objectstore: "filestore" +ceph_origin: 'repository' +ceph_repository: 'dev' +copy_admin_key: true +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } +ceph_conf_overrides: + global: + osd_pool_default_pg_num: 8 + osd_pool_default_size: 1 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd0.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd0.yml new file mode 100644 index 000000000..2e1c7ee9e --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd0.yml @@ -0,0 +1,7 @@ +--- + +devices: + - '/dev/sdb' +dedicated_devices: + - '/dev/sdc' +osd_scenario: "non-collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd1.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd1.yml new file mode 100644 index 000000000..7e90071c9 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/host_vars/osd1.yml @@ -0,0 +1,6 @@ +--- + +devices: + - '/dev/sdb' + - '/dev/sdc' +osd_scenario: "collocated" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/hosts b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/hosts new file mode 100644 index 000000000..e0c08b946 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/hosts @@ -0,0 +1,9 @@ +[mons] +mon0 monitor_interface=eth1 + +[osds] +osd0 +osd1 + +[mgrs] +mon0 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml new file mode 100644 index 
000000000..24e2c0353 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml @@ -0,0 +1,31 @@ +--- + +- hosts: osds + become: yes + tasks: + + - name: list all OSD directories + find: + paths: /var/lib/ceph/osd + file_type: directory + register: osd_paths + + - name: scan all OSD directories + command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_paths.files }}" + + - name: list all OSD JSON files + find: + paths: /etc/ceph/osd + file_type: file + register: osd_configs + + - name: activate all scanned OSDs + command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + environment: + CEPH_VOLUME_DEBUG: 1 + with_items: + - "{{ osd_configs.files }}" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/vagrant_variables.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/vagrant_variables.yml new file mode 100644 index 000000000..b4aa759ab --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/vagrant_variables.yml @@ -0,0 +1,73 @@ +--- + +# DEPLOY CONTAINERIZED DAEMONS +docker: false + +# DEFINE THE NUMBER OF VMS TO RUN +mon_vms: 1 +osd_vms: 2 +mds_vms: 0 +rgw_vms: 0 +nfs_vms: 0 +rbd_mirror_vms: 0 +client_vms: 0 +iscsi_gw_vms: 0 +mgr_vms: 0 + + +# INSTALL SOURCE OF CEPH +# valid values are 'stable' and 'dev' +ceph_install_source: stable + +# SUBNETS TO USE FOR THE VMS +public_subnet: 192.168.1 +cluster_subnet: 192.168.2 + +# MEMORY +# set 1024 for CentOS +memory: 512 + +# Ethernet interface name +# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial +eth: 'eth1' + +# Disks +# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]" +# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]" +disks: "[ '/dev/sdb', '/dev/sdc' ]" + +# VAGRANT BOX +# Ceph boxes are *strongly* suggested. They are under better control and will +# not get updated frequently unless required for build systems. These are (for +# now): +# +# * ceph/ubuntu-xenial +# +# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64 +# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet +# libvirt CentOS: centos/7 +# parallels Ubuntu: parallels/ubuntu-14.04 +# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller' +# For more boxes have a look at: +# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q= +# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/ +vagrant_box: ceph/ubuntu-xenial +#ssh_private_key_path: "~/.ssh/id_rsa" +# The sync directory changes based on vagrant box +# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant +#vagrant_sync_dir: /home/vagrant/sync +#vagrant_sync_dir: / +# Disables synced folder creation. Not needed for testing, will skip mounting +# the vagrant directory on the remote box regardless of the provider. +vagrant_disable_synced_folder: true +# VAGRANT URL +# This is a URL to download an image from an alternate location. vagrant_box +# above should be set to the filename of the image. 
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box +# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box +# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box + +os_tuning_params: + - { name: kernel.pid_max, value: 4194303 } + - { name: fs.file-max, value: 26234859 } + diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/Vagrantfile b/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/Vagrantfile deleted file mode 120000 index 2572fa2c9..000000000 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/xenial/create/Vagrantfile +++ /dev/null @@ -1 +0,0 @@ -../../Vagrantfile \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py index 917469128..22b962b1d 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py @@ -1,10 +1,11 @@ import pytest import argparse +from ceph_volume import exceptions from ceph_volume.util import arg_validators invalid_lv_paths = [ - '', 'lv_name', '///', '/lv_name', 'lv_name/', + '', 'lv_name', '/lv_name', 'lv_name/', '/dev/lv_group/lv_name' ] @@ -22,3 +23,31 @@ class TestLVPath(object): def test_is_valid(self): path = 'vg/lv' assert self.validator(path) == path + + def test_abspath_is_valid(self): + path = '/' + assert self.validator(path) == path + + +class TestOSDPath(object): + + def setup(self): + self.validator = arg_validators.OSDPath() + + def test_is_not_root(self): + with pytest.raises(exceptions.SuperUserError): + self.validator('') + + def test_path_is_not_a_directory(self, is_root, tmpfile, monkeypatch): + monkeypatch.setattr(arg_validators.disk, 'is_partition', lambda x: False) + validator = arg_validators.OSDPath() + with pytest.raises(argparse.ArgumentError): + validator(tmpfile()) + + def test_files_are_missing(self, is_root, tmpdir, monkeypatch): + tmppath = str(tmpdir) + monkeypatch.setattr(arg_validators.disk, 'is_partition', lambda x: False) + validator = arg_validators.OSDPath() + with pytest.raises(argparse.ArgumentError) as error: + validator(tmppath) + assert 'Required file (ceph_fsid) was not found in OSD' in str(error) diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_system.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_system.py index 7cb6a1f14..56b88b3f4 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_system.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_system.py @@ -1,6 +1,7 @@ import os import pwd import getpass +import pytest from textwrap import dedent from ceph_volume.util import system @@ -34,7 +35,74 @@ class TestMkdirP(object): assert os.path.isdir(path) -class TestIsMounted(object): +@pytest.fixture +def fake_proc(tmpdir, monkeypatch): + PROCDIR = str(tmpdir) + proc_path = os.path.join(PROCDIR, 'mounts') + with open(proc_path, 'w') as f: + f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0 + rootfs / rootfs rw 0 0 + sysfs /sys sysfs rw,seclabel,nosuid,nodev,noexec,relatime 0 0 + proc /proc proc rw,nosuid,nodev,noexec,relatime 0 0 + devtmpfs /dev devtmpfs 
rw,seclabel,nosuid,size=238292k,nr_inodes=59573,mode=755 0 0 + securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0 + tmpfs /dev/shm tmpfs rw,seclabel,nosuid,nodev 0 0 + devpts /dev/pts devpts rw,seclabel,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 0 0 + tmpfs /run tmpfs rw,seclabel,nosuid,nodev,mode=755 0 0 + tmpfs /sys/fs/cgroup tmpfs ro,seclabel,nosuid,nodev,noexec,mode=755 0 0 + cgroup /sys/fs/cgroup/systemd cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd 0 0 + cgroup /sys/fs/cgroup/freezer cgroup rw,nosuid,nodev,noexec,relatime,freezer 0 0 + configfs /sys/kernel/config configfs rw,relatime 0 0 + /dev/mapper/VolGroup00-LogVol00 / xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0 + selinuxfs /sys/fs/selinux selinuxfs rw,relatime 0 0 + debugfs /sys/kernel/debug debugfs rw,relatime 0 0 + hugetlbfs /dev/hugepages hugetlbfs rw,seclabel,relatime 0 0 + mqueue /dev/mqueue mqueue rw,seclabel,relatime 0 0 + sunrpc /far/lib/nfs/rpc_pipefs rpc_pipefs rw,relatime 0 0 + /dev/sde4 /two/field/path + nfsd /proc/fs/nfsd nfsd rw,relatime 0 0 + /dev/sde2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0 + tmpfs /far/lib/ceph/osd/ceph-5 tmpfs rw,seclabel,relatime 0 0 + tmpfs /far/lib/ceph/osd/ceph-7 tmpfs rw,seclabel,relatime 0 0 + /dev/sda1 /far/lib/ceph/osd/ceph-0 xfs rw,seclabel,noatime,attr2,inode64,noquota 0 0 + tmpfs /run/user/1000 tmpfs rw,seclabel,nosuid,nodev,relatime,size=50040k,mode=700,uid=1000,gid=1000 0 0 + /dev/sdc2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0 + tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0""")) + monkeypatch.setattr(system, 'PROCDIR', PROCDIR) + monkeypatch.setattr(os.path, 'exists', lambda x: True) + + +class TestPathIsMounted(object): + + def test_is_mounted(self, fake_proc): + assert system.path_is_mounted('/boot') is True + + def test_is_not_mounted(self, fake_proc): + assert system.path_is_mounted('/far/fib/feph') is False + + def test_is_not_mounted_at_destination(self, fake_proc): + assert system.path_is_mounted('/boot', destination='/dev/sda1') is False + + def test_is_mounted_at_destination(self, fake_proc): + assert system.path_is_mounted('/boot', destination='/dev/sdc2') is True + + +class TestDeviceIsMounted(object): + + def test_is_mounted(self, fake_proc): + assert system.device_is_mounted('/dev/sda1') is True + + def test_path_is_not_device(self, fake_proc): + assert system.device_is_mounted('/far/lib/ceph/osd/ceph-7') is False + + def test_is_not_mounted_at_destination(self, fake_proc): + assert system.device_is_mounted('/dev/sda1', destination='/far/lib/ceph/osd/test-1') is False + + def test_is_mounted_at_destination(self, fake_proc): + assert system.device_is_mounted('/dev/sda1', destination='/far/lib/ceph/osd/ceph-7') is False + + +class TestGetMounts(object): def test_not_mounted(self, tmpdir, monkeypatch): PROCDIR = str(tmpdir) @@ -42,48 +110,47 @@ class TestIsMounted(object): with open(proc_path, 'w') as f: f.write('') monkeypatch.setattr(system, 'PROCDIR', PROCDIR) - assert system.is_mounted('sdb') is False + assert system.get_mounts() == {} - def test_is_mounted_(self, tmpdir, monkeypatch): - PROCDIR = str(tmpdir) - proc_path = os.path.join(PROCDIR, 'mounts') - with open(proc_path, 'w') as f: - f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0 - /dev/sdc2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0 - tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0""")) - 
monkeypatch.setattr(system, 'PROCDIR', PROCDIR) - monkeypatch.setattr(os.path, 'exists', lambda x: True) - assert system.is_mounted('/dev/sdc2') is True + def test_is_mounted_(self, fake_proc): + result = system.get_mounts() + assert result['/dev/sdc2'] == ['/boot'] - def test_ignores_two_fields(self, tmpdir, monkeypatch): - PROCDIR = str(tmpdir) - proc_path = os.path.join(PROCDIR, 'mounts') - with open(proc_path, 'w') as f: - f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0 - /dev/sdc2 /boot - tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0""")) - monkeypatch.setattr(system, 'PROCDIR', PROCDIR) - monkeypatch.setattr(os.path, 'exists', lambda x: True) - assert system.is_mounted('/dev/sdc2') is False + def test_ignores_two_fields(self, fake_proc): + result = system.get_mounts() + assert result.get('/dev/sde4') is None - def test_not_mounted_at_destination(self, tmpdir, monkeypatch): - PROCDIR = str(tmpdir) - proc_path = os.path.join(PROCDIR, 'mounts') - with open(proc_path, 'w') as f: - f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0 - /dev/sdc2 /var/lib/ceph/osd/ceph-9 xfs rw,attr2,inode64,noquota 0 0 - tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0""")) - monkeypatch.setattr(system, 'PROCDIR', PROCDIR) - monkeypatch.setattr(os.path, 'exists', lambda x: True) - assert system.is_mounted('/dev/sdc2', '/var/lib/ceph/osd/ceph-0') is False + def test_tmpfs_is_reported(self, fake_proc): + result = system.get_mounts() + assert result['tmpfs'][0] == '/dev/shm' + + def test_non_skip_devs_arent_reported(self, fake_proc): + result = system.get_mounts() + assert result.get('cgroup') is None + + def test_multiple_mounts_are_appended(self, fake_proc): + result = system.get_mounts() + assert len(result['tmpfs']) == 7 - def test_is_mounted_at_destination(self, tmpdir, monkeypatch): + def test_nonexistent_devices_are_skipped(self, tmpdir, monkeypatch): PROCDIR = str(tmpdir) proc_path = os.path.join(PROCDIR, 'mounts') with open(proc_path, 'w') as f: f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0 - /dev/sdc2 /var/lib/ceph/osd/ceph-0 xfs rw,attr2,inode64,noquota 0 0 - tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0""")) + /dev/sda1 /far/lib/ceph/osd/ceph-0 xfs rw,attr2,inode64,noquota 0 0 + /dev/sda2 /far/lib/ceph/osd/ceph-1 xfs rw,attr2,inode64,noquota 0 0""")) monkeypatch.setattr(system, 'PROCDIR', PROCDIR) - monkeypatch.setattr(os.path, 'exists', lambda x: True) - assert system.is_mounted('/dev/sdc2', '/var/lib/ceph/osd/ceph-0') is True + monkeypatch.setattr(os.path, 'exists', lambda x: False if x == '/dev/sda1' else True) + result = system.get_mounts() + assert result.get('/dev/sda1') is None + + +class TestIsBinary(object): + + def test_is_binary(self, tmpfile): + binary_path = tmpfile(contents='asd\n\nlkjh\x00') + assert system.is_binary(binary_path) + + def test_is_not_binary(self, tmpfile): + binary_path = tmpfile(contents='asd\n\nlkjh0') + assert system.is_binary(binary_path) is False diff --git a/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py b/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py index feb470716..349d5da17 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py +++ b/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py @@ -1,4 +1,8 @@ import argparse +import os +from ceph_volume import terminal +from ceph_volume import decorators +from ceph_volume.util import disk class LVPath(object): @@ -7,12 +11,20 @@ class LVPath(object): / + Or a full path to a device, 
like ``/dev/sda`` + Because for LVM it is better to be specific on what group does an lv belongs to. """ def __call__(self, string): error = None + if string.startswith('/'): + if not os.path.exists(string): + error = "Argument (device) does not exist: %s" % string + raise argparse.ArgumentError(None, error) + else: + return string try: vg, lv = string.split('/') except ValueError: @@ -27,3 +39,35 @@ class LVPath(object): if error: raise argparse.ArgumentError(None, error) return string + + +class OSDPath(object): + """ + Validate path exists and it looks like an OSD directory. + """ + + @decorators.needs_root + def __call__(self, string): + if not os.path.exists(string): + error = "Path does not exist: %s" % string + raise argparse.ArgumentError(None, error) + + arg_is_partition = disk.is_partition(string) + if arg_is_partition: + return os.path.abspath(string) + absolute_path = os.path.abspath(string) + if not os.path.isdir(absolute_path): + error = "Argument is not a directory or device which is required to scan" + raise argparse.ArgumentError(None, error) + key_files = ['ceph_fsid', 'fsid', 'keyring', 'ready', 'type', 'whoami'] + dir_files = os.listdir(absolute_path) + for key_file in key_files: + if key_file not in dir_files: + terminal.error('All following files must exist in path: %s' % ' '.join(key_files)) + error = "Required file (%s) was not found in OSD dir path: %s" % ( + key_file, + absolute_path + ) + raise argparse.ArgumentError(None, error) + + return os.path.abspath(string) diff --git a/ceph/src/ceph-volume/ceph_volume/util/disk.py b/ceph/src/ceph-volume/ceph_volume/util/disk.py index 0d3061d3c..da3dc9341 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/disk.py +++ b/ceph/src/ceph-volume/ceph_volume/util/disk.py @@ -1,3 +1,5 @@ +import os +import stat from ceph_volume import process @@ -22,3 +24,160 @@ def get_device_from_partuuid(partuuid): ['sudo', 'blkid', '-t', 'PARTUUID="%s"' % partuuid, '-o', 'device'] ) return ' '.join(out).strip() + + +def _stat_is_device(stat_obj): + """ + Helper function that will interpret ``os.stat`` output directly, so that other + functions can call ``os.stat`` once and interpret that result several times + """ + return stat.S_ISBLK(stat_obj) + + +def lsblk(device, columns=None): + """ + Create a dictionary of identifying values for a device using ``lsblk``. + Each supported column is a key, in its *raw* format (all uppercase + usually). ``lsblk`` has support for certain "columns" (in blkid these + would be labels), and these columns vary between distributions and + ``lsblk`` versions. The newer versions support a richer set of columns, + while older ones were a bit limited. 
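The LVPath change above now accepts either a vg/lv pair or an absolute device path, and the new OSDPath validator requires root and insists the target looks like a real OSD directory (ceph_fsid, fsid, keyring, ready, type, whoami). A hedged sketch of how LVPath behaves when called directly; it is presumably wired into argparse as a `type=` callable elsewhere in ceph-volume, which is an assumption here:

    # Illustration only: direct calls against the LVPath validator above.
    # Assumed argparse wiring: parser.add_argument('data', type=arg_validators.LVPath())
    import argparse
    from ceph_volume.util import arg_validators

    validator = arg_validators.LVPath()

    print(validator('vg/lv'))  # accepted: explicit volume_group/logical_volume

    for bad in ('lv_name', '/dev/does-not-exist'):
        try:
            validator(bad)     # no vg part, or a nonexistent absolute path
        except argparse.ArgumentError as err:
            print('rejected %s: %s' % (bad, err))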
+ + These are the default lsblk columns reported which are safe to use for + Ubuntu 14.04.5 LTS: + + NAME device name + KNAME internal kernel device name + MAJ:MIN major:minor device number + FSTYPE filesystem type + MOUNTPOINT where the device is mounted + LABEL filesystem LABEL + UUID filesystem UUID + RO read-only device + RM removable device + MODEL device identifier + SIZE size of the device + STATE state of the device + OWNER user name + GROUP group name + MODE device node permissions + ALIGNMENT alignment offset + MIN-IO minimum I/O size + OPT-IO optimal I/O size + PHY-SEC physical sector size + LOG-SEC logical sector size + ROTA rotational device + SCHED I/O scheduler name + RQ-SIZE request queue size + TYPE device type + DISC-ALN discard alignment offset + DISC-GRAN discard granularity + DISC-MAX discard max bytes + DISC-ZERO discard zeroes data + + There is a bug in ``lsblk`` where using all the available (supported) + columns will result in no output (!), in order to workaround this the + following columns have been removed from the default reporting columns: + + * RQ-SIZE (request queue size) + * MIN-IO minimum I/O size + * OPT-IO optimal I/O size + + These should be available however when using `columns`. For example:: + + >>> lsblk('/dev/sda1', columns=['OPT-IO']) + {'OPT-IO': '0'} + + Normal CLI output, as filtered by the flags in this function will look like :: + + $ sudo lsblk --nodeps -P -o NAME,KNAME,MAJ:MIN,FSTYPE,MOUNTPOINT + NAME="sda1" KNAME="sda1" MAJ:MIN="8:1" FSTYPE="ext4" MOUNTPOINT="/" + + :param columns: A list of columns to report as keys in its original form. + """ + default_columns = [ + 'NAME', 'KNAME', 'MAJ:MIN', 'FSTYPE', 'MOUNTPOINT', 'LABEL', 'UUID', + 'RO', 'RM', 'MODEL', 'SIZE', 'STATE', 'OWNER', 'GROUP', 'MODE', + 'ALIGNMENT', 'PHY-SEC', 'LOG-SEC', 'ROTA', 'SCHED', 'TYPE', 'DISC-ALN', + 'DISC-GRAN', 'DISC-MAX', 'DISC-ZERO' + ] + device = device.rstrip('/') + columns = columns or default_columns + # --nodeps -> Avoid adding children/parents to the device, only give information + # on the actual device we are querying for + # -P -> Produce pairs of COLUMN="value" + # -o -> Use the columns specified or default ones provided by this function + command = ['sudo', 'lsblk', '--nodeps', '-P', '-o'] + command.append(','.join(columns)) + command.append(device) + out, err, rc = process.call(command) + + if rc != 0: + return {} + + # parse the COLUMN="value" output to construct the dictionary + pairs = ' '.join(out).split() + parsed = {} + for pair in pairs: + try: + column, value = pair.split('=') + except ValueError: + continue + parsed[column] = value.strip().strip().strip('"') + return parsed + + +def _lsblk_type(device): + """ + Helper function that will use the ``TYPE`` label output of ``lsblk`` to determine + if a device is a partition or disk. + It does not process the output to return a boolean, but it does process it to return the + """ + out, err, rc = process.call( + ['sudo', 'blkid', '-s', 'PARTUUID', '-o', 'value', device] + ) + return ' '.join(out).strip() + + +def is_device(dev): + """ + Boolean to determine if a given device is a block device (**not** + a partition!) 
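The docstring above already quotes the raw `lsblk --nodeps -P -o ...` output; the function then turns those COLUMN="value" pairs into a plain dictionary. A minimal standalone sketch of that parsing step, reusing the sample line from the docstring (real output differs per system and lsblk version):

    # Parse the COLUMN="value" pairs the way lsblk() above does.
    sample = 'NAME="sda1" KNAME="sda1" MAJ:MIN="8:1" FSTYPE="ext4" MOUNTPOINT="/"'

    parsed = {}
    for pair in sample.split():
        try:
            column, value = pair.split('=')
        except ValueError:
            continue  # skip anything that is not a COLUMN="value" pair
        parsed[column] = value.strip('"')

    print(parsed['FSTYPE'])      # ext4
    print(parsed['MOUNTPOINT'])  # /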
+ + For example: /dev/sda would return True, but not /dev/sdc1 + """ + if not os.path.exists(dev): + return False + # use lsblk first, fall back to using stat + TYPE = lsblk(dev).get('TYPE') + if TYPE: + return TYPE == 'disk' + + # fallback to stat + return _stat_is_device(os.lstat(dev).st_mode) + if stat.S_ISBLK(os.lstat(dev)): + return True + return False + + +def is_partition(dev): + """ + Boolean to determine if a given device is a partition, like /dev/sda1 + """ + if not os.path.exists(dev): + return False + # use lsblk first, fall back to using stat + TYPE = lsblk(dev).get('TYPE') + if TYPE: + return TYPE == 'part' + + # fallback to stat + stat_obj = os.stat(dev) + if _stat_is_device(stat_obj.st_mode): + return False + + major = os.major(stat_obj.st_rdev) + minor = os.minor(stat_obj.st_rdev) + if os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor)): + return True + return False diff --git a/ceph/src/ceph-volume/ceph_volume/util/prepare.py b/ceph/src/ceph-volume/ceph_volume/util/prepare.py index eefa0adc2..6b38fe097 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/prepare.py +++ b/ceph/src/ceph-volume/ceph_volume/util/prepare.py @@ -57,8 +57,21 @@ def create_id(fsid, json_secrets): return ' '.join(stdout).strip() -def create_path(osd_id): +def mount_tmpfs(path): + process.run([ + 'sudo', + 'mount', + '-t', + 'tmpfs', 'tmpfs', + path + ]) + + +def create_osd_path(osd_id, tmpfs=False): + path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id) system.mkdir_p('/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)) + if tmpfs: + mount_tmpfs(path) def format_device(device): @@ -98,15 +111,39 @@ def mount_osd(device, osd_id): process.run(command) -def link_journal(journal_device, osd_id): - journal_path = '/var/lib/ceph/osd/%s-%s/journal' % ( +def _link_device(device, device_type, osd_id): + """ + Allow linking any device type in an OSD directory. ``device`` must the be + source, with an absolute path and ``device_type`` will be the destination + name, like 'journal', or 'block' + """ + device_path = '/var/lib/ceph/osd/%s-%s/%s' % ( conf.cluster, - osd_id + osd_id, + device_type ) - command = ['sudo', 'ln', '-s', journal_device, journal_path] + command = ['sudo', 'ln', '-s', device, device_path] + system.chown(device) + process.run(command) +def link_journal(journal_device, osd_id): + _link_device(journal_device, 'journal', osd_id) + + +def link_block(block_device, osd_id): + _link_device(block_device, 'block', osd_id) + + +def link_wal(wal_device, osd_id): + _link_device(wal_device, 'block.wal', osd_id) + + +def link_db(db_device, osd_id): + _link_device(db_device, 'block.db', osd_id) + + def get_monmap(osd_id): """ Before creating the OSD files, a monmap needs to be retrieved so that it @@ -130,7 +167,64 @@ def get_monmap(osd_id): ]) -def osd_mkfs(osd_id, fsid): +def osd_mkfs_bluestore(osd_id, fsid, keyring=None, wal=False, db=False): + """ + Create the files for the OSD to function. 
A normal call will look like: + + ceph-osd --cluster ceph --mkfs --mkkey -i 0 \ + --monmap /var/lib/ceph/osd/ceph-0/activate.monmap \ + --osd-data /var/lib/ceph/osd/ceph-0 \ + --osd-uuid 8d208665-89ae-4733-8888-5d3bfbeeec6c \ + --keyring /var/lib/ceph/osd/ceph-0/keyring \ + --setuser ceph --setgroup ceph + + In some cases it is required to use the keyring, when it is passed in as + a keywork argument it is used as part of the ceph-osd command + """ + path = '/var/lib/ceph/osd/%s-%s/' % (conf.cluster, osd_id) + monmap = os.path.join(path, 'activate.monmap') + + system.chown(path) + + base_command = [ + 'sudo', + 'ceph-osd', + '--cluster', conf.cluster, + # undocumented flag, sets the `type` file to contain 'bluestore' + '--osd-objectstore', 'bluestore', + '--mkfs', + '-i', osd_id, + '--monmap', monmap, + ] + + supplementary_command = [ + '--osd-data', path, + '--osd-uuid', fsid, + '--setuser', 'ceph', + '--setgroup', 'ceph' + ] + + if keyring is not None: + base_command.extend(['--key', keyring]) + + if wal: + base_command.extend( + ['--bluestore-block-wal-path', wal] + ) + system.chown(wal) + + if db: + base_command.extend( + ['--bluestore-block-db-path', db] + ) + system.chown(db) + + command = base_command + supplementary_command + + process.run(command, obfuscate='--key') + + +def osd_mkfs_filestore(osd_id, fsid): """ Create the files for the OSD to function. A normal call will look like: @@ -154,6 +248,8 @@ def osd_mkfs(osd_id, fsid): 'sudo', 'ceph-osd', '--cluster', conf.cluster, + # undocumented flag, sets the `type` file to contain 'filestore' + '--osd-objectstore', 'filestore', '--mkfs', '-i', osd_id, '--monmap', monmap, diff --git a/ceph/src/ceph-volume/ceph_volume/util/system.py b/ceph/src/ceph-volume/ceph_volume/util/system.py index 084a0e0d3..d580a4c28 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/system.py +++ b/ceph/src/ceph-volume/ceph_volume/util/system.py @@ -2,6 +2,7 @@ import errno import os import pwd import platform +import tempfile import uuid from ceph_volume import process from . import as_string @@ -68,37 +69,122 @@ def chown(path, recursive=True): os.chown(path, uid, gid) -def is_mounted(source, destination=None): +def is_binary(path): """ - Check if the given device is mounted, optionally validating destination. - This relies on absolute path devices, it will ignore non-absolute - entries like:: + Detect if a file path is a binary or not. Will falsely report as binary + when utf-16 encoded. 
In the ceph universe there is no such risk (yet) + """ + with open(path, 'rb') as fp: + contents = fp.read(8192) + if b'\x00' in contents: # a null byte may signal binary + return True + return False + + +class tmp_mount(object): + """ + Temporarily mount a device on a temporary directory, + and unmount it upon exit + """ + + def __init__(self, device): + self.device = device + self.path = None + + def __enter__(self): + self.path = tempfile.mkdtemp() + process.run([ + 'sudo', + 'mount', + '-v', + self.device, + self.path + ]) + return self.path + + def __exit__(self, exc_type, exc_val, exc_tb): + process.run([ + 'sudo', + 'umount', + '-v', + self.path + ]) + + +def path_is_mounted(path, destination=None): + """ + Check if the given path is mounted + """ + mounts = get_mounts(paths=True) + realpath = os.path.realpath(path) + mounted_locations = mounts.get(realpath, []) + + if destination: + if destination.startswith('/'): + destination = os.path.realpath(destination) + return destination in mounted_locations + return mounted_locations != [] + - tmpfs /run tmpfs rw,seclabel,nosuid,nodev,mode=755 0 0 +def device_is_mounted(dev, destination=None): + """ + Check if the given device is mounted, optionally validating that a + destination exists + """ + mounts = get_mounts(devices=True) + realpath = os.path.realpath(dev) if dev.startswith('/') else dev + destination = os.path.realpath(destination) if destination else None + mounted_locations = mounts.get(realpath, []) + + if destination: + return destination in mounted_locations + return mounted_locations != [] + + +def get_mounts(devices=False, paths=False): + """ + Create a mapping of all available system mounts so that other helpers can + detect nicely what path or device is mounted - But will parse paths that are absolute like:: + It ignores (most of) non existing devices, but since some setups might need + some extra device information, it will make an exception for: - /dev/sdc2 /boot xfs rw,attr2,inode64,noquota 0 0 + - tmpfs + - devtmpfs - When destination is passed in, it will check that the entry where the - source appears is mounted to where destination defines. This is useful so - that an error message can report that a source is not mounted at an - expected destination. 
+ If ``devices`` is set to ``True`` the mapping will be a device-to-path(s), + if ``paths`` is set to ``True`` then the mapping will be + a path-to-device(s) """ - dev = os.path.realpath(source) - with open(PROCDIR + '/mounts', 'rb') as proc_mounts: - for line in proc_mounts: - fields = line.split() - if len(fields) < 3: + devices_mounted = {} + paths_mounted = {} + do_not_skip = ['tmpfs', 'devtmpfs'] + default_to_devices = devices is False and paths is False + + with open(PROCDIR + '/mounts', 'rb') as mounts: + proc_mounts = mounts.readlines() + + for line in proc_mounts: + fields = [as_string(f) for f in line.split()] + if len(fields) < 3: + continue + device = os.path.realpath(fields[0]) if fields[0].startswith('/') else fields[0] + path = os.path.realpath(fields[1]) + # only care about actual existing devices + if not os.path.exists(device) or not device.startswith('/'): + if device not in do_not_skip: continue - mounted_device = fields[0] - mounted_path = fields[1] - if os.path.isabs(mounted_device) and os.path.exists(mounted_device): - mounted_device = os.path.realpath(mounted_device) - if as_string(mounted_device) == dev: - if destination: - destination = os.path.realpath(destination) - return destination == as_string(os.path.realpath(mounted_path)) - else: - return True - return False + if device in devices_mounted.keys(): + devices_mounted[device].append(path) + else: + devices_mounted[device] = [path] + if path in paths_mounted.keys(): + paths_mounted[path].append(device) + else: + paths_mounted[path] = [device] + + # Default to returning information for devices if + if devices is True or default_to_devices: + return devices_mounted + else: + return paths_mounted diff --git a/ceph/src/ceph.in b/ceph/src/ceph.in index cb1bd8694..7c1eda2c0 100755 --- a/ceph/src/ceph.in +++ b/ceph/src/ceph.in @@ -48,7 +48,7 @@ PRIO_USEFUL = 5 PRIO_UNINTERESTING = 2 PRIO_DEBUGONLY = 0 -PRIO_DEFAULT = PRIO_USEFUL +PRIO_DEFAULT = PRIO_INTERESTING # Make life easier on developers: # If our parent dir contains CMakeCache.txt and bin/init-ceph, @@ -228,7 +228,7 @@ def validate_target(target): file=sys.stderr) return False - if service_id in exist_ids: + if service_id in exist_ids or len(exist_ids) > 0 and service_id == '*': return True else: print('WARN: the service id you provided does not exist. 
service id should ' diff --git a/ceph/src/ceph_mgr.cc b/ceph/src/ceph_mgr.cc index 91043f6e8..5e8f6798e 100644 --- a/ceph/src/ceph_mgr.cc +++ b/ceph/src/ceph_mgr.cc @@ -16,7 +16,10 @@ #include +#include + #include "include/types.h" +#include "include/compat.h" #include "common/config.h" #include "common/ceph_argparse.h" #include "common/errno.h" @@ -38,6 +41,8 @@ static void usage() */ int main(int argc, const char **argv) { + ceph_pthread_setname(pthread_self(), "ceph-mgr"); + vector args; argv_to_vec(argc, argv, args); env_to_vec(args); diff --git a/ceph/src/ceph_mon.cc b/ceph/src/ceph_mon.cc index 3663bb04e..41a6ee0eb 100644 --- a/ceph/src/ceph_mon.cc +++ b/ceph/src/ceph_mon.cc @@ -247,7 +247,6 @@ int main(int argc, const char **argv) flags, "mon_data"); ceph_heap_profiler_init(); - uuid_d fsid; std::string val; for (std::vector::iterator i = args.begin(); i != args.end(); ) { if (ceph_argparse_double_dash(args, i)) { @@ -331,10 +330,11 @@ int main(int argc, const char **argv) MonMap monmap; // load or generate monmap - if (g_conf->monmap.length()) { - int err = monmapbl.read_file(g_conf->monmap.c_str(), &error); + const auto monmap_fn = g_conf->get_val("monmap"); + if (monmap_fn.length()) { + int err = monmapbl.read_file(monmap_fn.c_str(), &error); if (err < 0) { - derr << argv[0] << ": error reading " << g_conf->monmap << ": " << error << dendl; + derr << argv[0] << ": error reading " << monmap_fn << ": " << error << dendl; exit(1); } try { @@ -342,9 +342,8 @@ int main(int argc, const char **argv) // always mark seed/mkfs monmap as epoch 0 monmap.set_epoch(0); - } - catch (const buffer::error& e) { - derr << argv[0] << ": error decoding monmap " << g_conf->monmap << ": " << e.what() << dendl; + } catch (const buffer::error& e) { + derr << argv[0] << ": error decoding monmap " << monmap_fn << ": " << e.what() << dendl; exit(1); } } else { @@ -393,9 +392,10 @@ int main(int argc, const char **argv) } } - if (!g_conf->fsid.is_zero()) { - monmap.fsid = g_conf->fsid; - dout(0) << argv[0] << ": set fsid to " << g_conf->fsid << dendl; + const auto fsid = g_conf->get_val("fsid"); + if (!fsid.is_zero()) { + monmap.fsid = fsid; + dout(0) << argv[0] << ": set fsid to " << fsid << dendl; } if (monmap.fsid.is_zero()) { diff --git a/ceph/src/ceph_osd.cc b/ceph/src/ceph_osd.cc index d7e54a3a3..1cfda9c1d 100644 --- a/ceph/src/ceph_osd.cc +++ b/ceph/src/ceph_osd.cc @@ -266,29 +266,6 @@ int main(int argc, const char **argv) cephd_preload_embedded_plugins(); #endif - if (mkfs) { - common_init_finish(g_ceph_context); - MonClient mc(g_ceph_context); - if (mc.build_initial_monmap() < 0) - return -1; - if (mc.get_monmap_privately() < 0) - return -1; - - if (mc.monmap.fsid.is_zero()) { - derr << "must specify cluster fsid" << dendl; - return -EINVAL; - } - - int err = OSD::mkfs(g_ceph_context, store, g_conf->osd_data, - mc.monmap.fsid, whoami); - if (err < 0) { - derr << TEXT_RED << " ** ERROR: error creating empty object store in " - << g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; - exit(1); - } - derr << "created object store " << g_conf->osd_data - << " for osd." 
<< whoami << " fsid " << mc.monmap.fsid << dendl; - } if (mkkey) { common_init_finish(g_ceph_context); KeyRing *keyring = KeyRing::create_empty(); @@ -317,6 +294,29 @@ int main(int argc, const char **argv) derr << "created new key in keyring " << g_conf->keyring << dendl; } } + if (mkfs) { + common_init_finish(g_ceph_context); + MonClient mc(g_ceph_context); + if (mc.build_initial_monmap() < 0) + return -1; + if (mc.get_monmap_privately() < 0) + return -1; + + if (mc.monmap.fsid.is_zero()) { + derr << "must specify cluster fsid" << dendl; + return -EINVAL; + } + + int err = OSD::mkfs(g_ceph_context, store, g_conf->osd_data, + mc.monmap.fsid, whoami); + if (err < 0) { + derr << TEXT_RED << " ** ERROR: error creating empty object store in " + << g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; + exit(1); + } + derr << "created object store " << g_conf->osd_data + << " for osd." << whoami << " fsid " << mc.monmap.fsid << dendl; + } if (mkfs || mkkey) exit(0); if (mkjournal) { diff --git a/ceph/src/client/Client.cc b/ceph/src/client/Client.cc index 29a9c49f7..1d9277a61 100644 --- a/ceph/src/client/Client.cc +++ b/ceph/src/client/Client.cc @@ -5944,19 +5944,6 @@ void Client::unmount() ldout(cct, 2) << "unmounted." << dendl; } - - -class C_C_Tick : public Context { - Client *client; -public: - explicit C_C_Tick(Client *c) : client(c) {} - void finish(int r) override { - // Called back via Timer, which takes client_lock for us - assert(client->client_lock.is_locked_by_me()); - client->tick(); - } -}; - void Client::flush_cap_releases() { // send any cap releases @@ -5985,9 +5972,13 @@ void Client::tick() } ldout(cct, 21) << "tick" << dendl; - tick_event = new C_C_Tick(this); - timer.add_event_after(cct->_conf->client_tick_interval, tick_event); - + tick_event = timer.add_event_after( + cct->_conf->client_tick_interval, + new FunctionContext([this](int) { + // Called back via Timer, which takes client_lock for us + assert(client_lock.is_locked_by_me()); + tick(); + })); utime_t now = ceph_clock_now(); if (!mounted && !mds_requests.empty()) { diff --git a/ceph/src/client/Client.h b/ceph/src/client/Client.h index e89a25440..16aef0312 100644 --- a/ceph/src/client/Client.h +++ b/ceph/src/client/Client.h @@ -498,7 +498,6 @@ protected: friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb friend class C_Block_Sync; // Calls block map and protected helpers - friend class C_C_Tick; // Asserts on client_lock friend class C_Client_RequestInterrupt; friend class C_Client_Remount; friend void intrusive_ptr_release(Inode *in); diff --git a/ceph/src/cls/journal/cls_journal.cc b/ceph/src/cls/journal/cls_journal.cc index f966c07f2..2f1d18d0d 100644 --- a/ceph/src/cls/journal/cls_journal.cc +++ b/ceph/src/cls/journal/cls_journal.cc @@ -188,6 +188,7 @@ int expire_tags(cls_method_context_t hctx, const std::string *skip_client_id) { if (tag.tid >= minimum_tag_tid) { // no need to check for tag classes beyond this point vals.clear(); + more = false; break; } } @@ -1047,6 +1048,7 @@ int journal_tag_list(cls_method_context_t hctx, bufferlist *in, // completed calculation of tag class minimums if (tag.tid >= minimum_tag_tid) { vals.clear(); + more = false; break; } } else if (tag_pass == TAG_PASS_LIST) { diff --git a/ceph/src/cls/rbd/cls_rbd.cc b/ceph/src/cls/rbd/cls_rbd.cc index 79795dbc3..90a48a821 100644 --- a/ceph/src/cls/rbd/cls_rbd.cc +++ b/ceph/src/cls/rbd/cls_rbd.cc @@ -2460,7 +2460,7 @@ int 
object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out CLS_ERR("object map footer read failed"); return r; } - + try { bufferlist::iterator it = footer_bl.begin(); object_map.decode_footer(it); @@ -2496,13 +2496,14 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out } bool updated = false; - for (uint64_t object_no = start_object_no; object_no < end_object_no; - ++object_no) { - uint8_t state = object_map[object_no]; + auto it = object_map.begin() + start_object_no; + auto end_it = object_map.begin() + end_object_no; + for (; it != end_it; ++it) { + uint8_t state = *it; if ((!current_object_state || state == *current_object_state || (*current_object_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) && state != new_object_state) { - object_map[object_no] = new_object_state; + *it = new_object_state; updated = true; } } @@ -3167,6 +3168,22 @@ int uuid_get(cls_method_context_t hctx, std::string *mirror_uuid) { return 0; } +int list_watchers(cls_method_context_t hctx, + std::set *entities) { + obj_list_watch_response_t watchers; + int r = cls_cxx_list_watchers(hctx, &watchers); + if (r < 0 && r != -ENOENT) { + CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str()); + return r; + } + + entities->clear(); + for (auto &w : watchers.entries) { + entities->emplace(w.name, w.addr); + } + return 0; +} + int read_peers(cls_method_context_t hctx, std::vector *peers) { std::string last_read = PEER_KEY_PREFIX; @@ -3419,6 +3436,7 @@ int image_status_remove(cls_method_context_t hctx, } int image_status_get(cls_method_context_t hctx, const string &global_image_id, + const std::set &watchers, cls::rbd::MirrorImageStatus *status) { bufferlist bl; @@ -3441,23 +3459,9 @@ int image_status_get(cls_method_context_t hctx, const string &global_image_id, return -EIO; } - obj_list_watch_response_t watchers; - r = cls_cxx_list_watchers(hctx, &watchers); - if (r < 0 && r != -ENOENT) { - CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str()); - return r; - } *status = static_cast(ondisk_status); - status->up = false; - for (auto &w : watchers.entries) { - if (w.name == ondisk_status.origin.name && - w.addr == ondisk_status.origin.addr) { - status->up = true; - break; - } - } - + status->up = (watchers.find(ondisk_status.origin) != watchers.end()); return 0; } @@ -3469,11 +3473,17 @@ int image_status_list(cls_method_context_t hctx, int max_read = RBD_MAX_KEYS_READ; bool more = true; + std::set watchers; + int r = list_watchers(hctx, &watchers); + if (r < 0) { + return r; + } + while (more && mirror_images->size() < max_return) { std::map vals; CLS_LOG(20, "last_read = '%s'", last_read.c_str()); - int r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read, - &vals, &more); + r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read, &vals, + &more); if (r < 0) { CLS_ERR("error reading mirror image directory by name: %s", cpp_strerror(r).c_str()); @@ -3496,7 +3506,8 @@ int image_status_list(cls_method_context_t hctx, (*mirror_images)[image_id] = mirror_image; cls::rbd::MirrorImageStatus status; - int r1 = image_status_get(hctx, mirror_image.global_image_id, &status); + int r1 = image_status_get(hctx, mirror_image.global_image_id, watchers, + &status); if (r1 < 0) { continue; } @@ -3513,20 +3524,12 @@ int image_status_list(cls_method_context_t hctx, int image_status_get_summary(cls_method_context_t hctx, std::map *states) { - obj_list_watch_response_t watchers_; - int r = cls_cxx_list_watchers(hctx, &watchers_); + 
std::set watchers; + int r = list_watchers(hctx, &watchers); if (r < 0) { - if (r != -ENOENT) { - CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str()); - } return r; } - set watchers; - for (auto &w : watchers_.entries) { - watchers.insert(entity_inst_t(w.name, w.addr)); - } - states->clear(); string last_read = IMAGE_KEY_PREFIX; @@ -3559,7 +3562,7 @@ int image_status_get_summary(cls_method_context_t hctx, } cls::rbd::MirrorImageStatus status; - image_status_get(hctx, mirror_image.global_image_id, &status); + image_status_get(hctx, mirror_image.global_image_id, watchers, &status); cls::rbd::MirrorImageStatusState state = status.up ? status.state : cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN; @@ -3575,20 +3578,12 @@ int image_status_get_summary(cls_method_context_t hctx, } int image_status_remove_down(cls_method_context_t hctx) { - obj_list_watch_response_t watchers_; - int r = cls_cxx_list_watchers(hctx, &watchers_); + std::set watchers; + int r = list_watchers(hctx, &watchers); if (r < 0) { - if (r != -ENOENT) { - CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str()); - } return r; } - set watchers; - for (auto &w : watchers_.entries) { - watchers.insert(entity_inst_t(w.name, w.addr)); - } - string last_read = STATUS_GLOBAL_KEY_PREFIX; int max_read = RBD_MAX_KEYS_READ; bool more = true; @@ -4275,8 +4270,14 @@ int mirror_image_status_get(cls_method_context_t hctx, bufferlist *in, return -EINVAL; } + std::set watchers; + int r = mirror::list_watchers(hctx, &watchers); + if (r < 0) { + return r; + } + cls::rbd::MirrorImageStatus status; - int r = mirror::image_status_get(hctx, global_image_id, &status); + r = mirror::image_status_get(hctx, global_image_id, watchers, &status); if (r < 0) { return r; } diff --git a/ceph/src/cls/rgw/cls_rgw.cc b/ceph/src/cls/rgw/cls_rgw.cc index 17a618053..354a132da 100644 --- a/ceph/src/cls/rgw/cls_rgw.cc +++ b/ceph/src/cls/rgw/cls_rgw.cc @@ -445,8 +445,9 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) CLS_LOG(20, "entry %s[%s] is not valid\n", key.name.c_str(), key.instance.c_str()); continue; } - - if (!op.list_versions && !entry.is_visible()) { + + // filter out noncurrent versions, delete markers, and initial marker + if (!op.list_versions && (!entry.is_visible() || op.start_obj.name == key.name)) { CLS_LOG(20, "entry %s[%s] is not visible\n", key.name.c_str(), key.instance.c_str()); continue; } @@ -935,6 +936,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist unaccount_entry(header, remove_entry); if (op.log_op && !header.syncstopped) { + ++header.ver; // increment index version, or we'll overwrite keys previously written rc = log_index_operation(hctx, remove_key, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime, remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace); if (rc < 0) @@ -1863,7 +1865,8 @@ static int rgw_bucket_clear_olh(cls_method_context_t hctx, bufferlist *in, buffe return 0; } -int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +int rgw_dir_suggest_changes(cls_method_context_t hctx, + bufferlist *in, bufferlist *out) { CLS_LOG(1, "rgw_dir_suggest_changes()"); @@ -1956,8 +1959,21 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis } break; case CEPH_RGW_UPDATE: + if (!cur_disk.exists) { + // this update would only have been sent by the rgw client + // if the rgw_bucket_dir_entry existed, however between that + // check 
and now the entry has disappeared, so we were likely + // in the midst of a delete op, and we will not recreate the + // entry + CLS_LOG(10, + "CEPH_RGW_UPDATE not applied because rgw_bucket_dir_entry" + " no longer exists\n"); + break; + } + CLS_LOG(10, "CEPH_RGW_UPDATE name=%s instance=%s total_entries: %" PRId64 " -> %" PRId64 "\n", cur_change.key.name.c_str(), cur_change.key.instance.c_str(), stats.num_entries, stats.num_entries + 1); + stats.num_entries++; stats.total_size += cur_change.meta.accounted_size; stats.total_size_rounded += cls_rgw_get_rounded_size(cur_change.meta.accounted_size); @@ -1978,10 +1994,9 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis } } break; - } - } - - } + } // switch(op) + } // if (cur_disk.pending_map.empty()) + } // while (!in_iter.end()) if (header_changed) { return write_bucket_header(hctx, &header); @@ -2900,9 +2915,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 bool by_user = !user.empty(); uint32_t i = 0; string user_key; - - if (truncated) - *truncated = false; + bool truncated_status = false; if (!by_user) { usage_record_prefix_by_time(end, end_key); @@ -2922,11 +2935,14 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 } CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str()); - int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, truncated); + int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, &truncated_status); if (ret < 0) return ret; - + if (truncated) { + *truncated = truncated_status; + } + map::iterator iter = keys.begin(); if (iter == keys.end()) return 0; @@ -2939,11 +2955,17 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 if (!by_user && key.compare(end_key) >= 0) { CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); + if (truncated_status) { + key_iter = key; + } return 0; } if (by_user && key.compare(0, user_key.size(), user_key) != 0) { CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); + if (truncated_status) { + key_iter = key; + } return 0; } diff --git a/ceph/src/cls/user/cls_user.cc b/ceph/src/cls/user/cls_user.cc index 7912578aa..840470e9f 100644 --- a/ceph/src/cls/user/cls_user.cc +++ b/ceph/src/cls/user/cls_user.cc @@ -163,7 +163,6 @@ static int cls_user_set_buckets_info(cls_method_context_t hctx, bufferlist *in, if (!op.add){ apply_entry_stats(update_entry, &entry); } - entry.user_stats_sync = true; ret = write_entry(hctx, key, entry); diff --git a/ceph/src/cls/user/cls_user_types.h b/ceph/src/cls/user/cls_user_types.h index 8595f25fd..6ffd93323 100644 --- a/ceph/src/cls/user/cls_user_types.h +++ b/ceph/src/cls/user/cls_user_types.h @@ -101,14 +101,14 @@ struct cls_user_bucket_entry { cls_user_bucket bucket; size_t size; size_t size_rounded; - real_time creation_time; + ceph::real_time creation_time; uint64_t count; bool user_stats_sync; cls_user_bucket_entry() : size(0), size_rounded(0), count(0), user_stats_sync(false) {} void encode(bufferlist& bl) const { - ENCODE_START(7, 5, bl); + ENCODE_START(9, 5, bl); uint64_t s = size; __u32 mt = ceph::real_clock::to_time_t(creation_time); string empty_str; // originally had the bucket name here, but we encode bucket later @@ -121,10 +121,11 @@ struct cls_user_bucket_entry { ::encode(s, bl); ::encode(user_stats_sync, bl); ::encode(creation_time, bl); + //::encode(placement_rule, bl); removed in v9 ENCODE_FINISH(bl); } void 
decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl); __u32 mt; uint64_t s; string empty_str; // backward compatibility @@ -146,6 +147,10 @@ struct cls_user_bucket_entry { ::decode(user_stats_sync, bl); if (struct_v >= 7) ::decode(creation_time, bl); + if (struct_v == 8) { // added in v8, removed in v9 + std::string placement_rule; + ::decode(placement_rule, bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const; diff --git a/ceph/src/common/AsyncReserver.h b/ceph/src/common/AsyncReserver.h index 28512ac80..d5c7a852d 100644 --- a/ceph/src/common/AsyncReserver.h +++ b/ceph/src/common/AsyncReserver.h @@ -18,6 +18,8 @@ #include "common/Finisher.h" #include "common/Formatter.h" +#define rdout(x) lgeneric_subdout(cct,reserver,x) + /** * Manages a configurable number of asyncronous reservations. * @@ -27,38 +29,104 @@ */ template class AsyncReserver { + CephContext *cct; Finisher *f; unsigned max_allowed; unsigned min_priority; Mutex lock; - map > > queues; - map >::iterator > > queue_pointers; - set in_progress; + struct Reservation { + T item; + unsigned prio = 0; + Context *grant = 0; + Context *preempt = 0; + Reservation() {} + Reservation(T i, unsigned pr, Context *g, Context *p = 0) + : item(i), prio(pr), grant(g), preempt(p) {} + void dump(Formatter *f) const { + f->dump_stream("item") << item; + f->dump_unsigned("prio", prio); + f->dump_bool("can_preempt", !!preempt); + } + friend ostream& operator<<(ostream& out, const Reservation& r) { + return out << r.item << "(prio " << r.prio << " grant " << r.grant + << " preempt " << r.preempt << ")"; + } + }; + + map> queues; + map::iterator>> queue_pointers; + map in_progress; + set> preempt_by_prio; ///< in_progress that can be preempted + + void preempt_one() { + assert(!preempt_by_prio.empty()); + auto q = in_progress.find(preempt_by_prio.begin()->second); + assert(q != in_progress.end()); + Reservation victim = q->second; + rdout(10) << __func__ << " preempt " << victim << dendl; + f->queue(victim.preempt); + victim.preempt = nullptr; + in_progress.erase(q); + preempt_by_prio.erase(preempt_by_prio.begin()); + } void do_queues() { - typename map > >::reverse_iterator it; - for (it = queues.rbegin(); - it != queues.rend() && - in_progress.size() < max_allowed && - it->first >= min_priority; - ++it) { - while (in_progress.size() < max_allowed && - !it->second.empty()) { - pair p = it->second.front(); - queue_pointers.erase(p.first); - it->second.pop_front(); - f->queue(p.second); - in_progress.insert(p.first); + rdout(20) << __func__ << ":\n"; + JSONFormatter jf(true); + jf.open_object_section("queue"); + _dump(&jf); + jf.close_section(); + jf.flush(*_dout); + *_dout << dendl; + + // in case min_priority was adjusted up or max_allowed was adjusted down + while (!preempt_by_prio.empty() && + (in_progress.size() > max_allowed || + preempt_by_prio.begin()->first < min_priority)) { + preempt_one(); + } + + while (!queues.empty()) { + // choose highest priority queue + auto it = queues.end(); + --it; + assert(!it->second.empty()); + if (it->first < min_priority) { + break; + } + if (in_progress.size() >= max_allowed && + !preempt_by_prio.empty() && + it->first > preempt_by_prio.begin()->first) { + preempt_one(); + } + if (in_progress.size() >= max_allowed) { + break; // no room + } + // grant + Reservation p = it->second.front(); + rdout(10) << __func__ << " grant " << p << dendl; + queue_pointers.erase(p.item); + it->second.pop_front(); + if (it->second.empty()) { + 
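
The AsyncReserver rewrite above wraps each queued item in a Reservation carrying a priority and an optional preempt callback, and indexes preemptible grants in preempt_by_prio so that, when the grant limit is hit, a higher-priority request can revoke the lowest-priority in-progress reservation. A toy model of that decision logic follows; MiniReserver and its string items are illustrative stand-ins, and the real code keeps per-priority queues and fires Context callbacks through a Finisher:

#include <cassert>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>

struct MiniReserver {
  unsigned max_allowed;
  std::map<std::string, unsigned> in_progress;                  // item -> prio
  std::set<std::pair<unsigned, std::string>> preempt_by_prio;   // preemptible grants

  explicit MiniReserver(unsigned max) : max_allowed(max) {}

  void preempt_one() {
    assert(!preempt_by_prio.empty());
    auto victim = *preempt_by_prio.begin();              // lowest priority first
    std::cout << "preempt " << victim.second << "\n";    // real code queues the preempt Context
    in_progress.erase(victim.second);
    preempt_by_prio.erase(preempt_by_prio.begin());
  }

  void request(const std::string& item, unsigned prio, bool preemptible) {
    if (in_progress.size() >= max_allowed &&
        !preempt_by_prio.empty() &&
        prio > preempt_by_prio.begin()->first) {
      preempt_one();                                     // make room for higher prio
    }
    if (in_progress.size() >= max_allowed) {
      std::cout << "queue " << item << "\n";             // real code keeps per-prio queues
      return;
    }
    std::cout << "grant " << item << "\n";
    in_progress[item] = prio;
    if (preemptible)
      preempt_by_prio.insert({prio, item});
  }
};

int main() {
  MiniReserver r(1);
  r.request("pg1", 10, true);    // grant pg1
  r.request("pg2", 20, false);   // preempt pg1, grant pg2
  r.request("pg3", 5, true);     // no room and no lower-prio preemptible grant -> queue pg3
  return 0;
}
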
queues.erase(it); + } + f->queue(p.grant); + p.grant = nullptr; + in_progress[p.item] = p; + if (p.preempt) { + preempt_by_prio.insert(make_pair(p.prio, p.item)); } } } public: AsyncReserver( + CephContext *cct, Finisher *f, unsigned max_allowed, unsigned min_priority = 0) - : f(f), + : cct(cct), + f(f), max_allowed(max_allowed), min_priority(min_priority), lock("AsyncReserver::lock") {} @@ -77,27 +145,26 @@ public: void dump(Formatter *f) { Mutex::Locker l(lock); + _dump(f); + } + void _dump(Formatter *f) { f->dump_unsigned("max_allowed", max_allowed); f->dump_unsigned("min_priority", min_priority); f->open_array_section("queues"); - for (typename map > > ::const_iterator p = - queues.begin(); p != queues.end(); ++p) { + for (auto& p : queues) { f->open_object_section("queue"); - f->dump_unsigned("priority", p->first); + f->dump_unsigned("priority", p.first); f->open_array_section("items"); - for (typename list >::const_iterator q = - p->second.begin(); q != p->second.end(); ++q) { - f->dump_stream("item") << q->first; + for (auto& q : p.second) { + f->dump_object("item", q); } f->close_section(); f->close_section(); } f->close_section(); f->open_array_section("in_progress"); - for (typename set::const_iterator p = in_progress.begin(); - p != in_progress.end(); - ++p) { - f->dump_stream("item") << *p; + for (auto& p : in_progress) { + f->dump_object("item", p.second); } f->close_section(); } @@ -113,13 +180,17 @@ public: void request_reservation( T item, ///< [in] reservation key Context *on_reserved, ///< [in] callback to be called on reservation - unsigned prio + unsigned prio, ///< [in] priority + Context *on_preempt = 0 ///< [in] callback to be called if we are preempted (optional) ) { Mutex::Locker l(lock); + Reservation r(item, prio, on_reserved, on_preempt); + rdout(10) << __func__ << " queue " << r << dendl; assert(!queue_pointers.count(item) && !in_progress.count(item)); - queues[prio].push_back(make_pair(item, on_reserved)); - queue_pointers.insert(make_pair(item, make_pair(prio,--(queues[prio]).end()))); + queues[prio].push_back(r); + queue_pointers.insert(make_pair(item, + make_pair(prio,--(queues[prio]).end()))); do_queues(); } @@ -134,13 +205,31 @@ public: T item ///< [in] key for reservation to cancel ) { Mutex::Locker l(lock); - if (queue_pointers.count(item)) { - unsigned prio = queue_pointers[item].first; - delete queue_pointers[item].second->second; - queues[prio].erase(queue_pointers[item].second); - queue_pointers.erase(item); + auto i = queue_pointers.find(item); + if (i != queue_pointers.end()) { + unsigned prio = i->second.first; + const Reservation& r = *i->second.second; + rdout(10) << __func__ << " cancel " << r << " (was queued)" << dendl; + delete r.grant; + delete r.preempt; + queues[prio].erase(i->second.second); + if (queues[prio].empty()) { + queues.erase(prio); + } + queue_pointers.erase(i); } else { - in_progress.erase(item); + auto p = in_progress.find(item); + if (p != in_progress.end()) { + rdout(10) << __func__ << " cancel " << p->second + << " (was in progress)" << dendl; + if (p->second.preempt) { + preempt_by_prio.erase(make_pair(p->second.prio, p->second.item)); + delete p->second.preempt; + } + in_progress.erase(p); + } else { + rdout(10) << __func__ << " cancel " << item << " (not found)" << dendl; + } } do_queues(); } @@ -157,4 +246,5 @@ public: static const unsigned MAX_PRIORITY = (unsigned)-1; }; +#undef rdout #endif diff --git a/ceph/src/common/LogClient.cc b/ceph/src/common/LogClient.cc index aeb2f5bfc..6157194bb 100644 --- 
a/ceph/src/common/LogClient.cc +++ b/ceph/src/common/LogClient.cc @@ -88,7 +88,7 @@ int parse_log_client_options(CephContext *cct, return r; } - fsid = cct->_conf->fsid; + fsid = cct->_conf->get_val("fsid"); host = cct->_conf->host; return 0; } diff --git a/ceph/src/common/Timer.cc b/ceph/src/common/Timer.cc index f211a6f8f..45305f553 100644 --- a/ceph/src/common/Timer.cc +++ b/ceph/src/common/Timer.cc @@ -114,7 +114,7 @@ void SafeTimer::timer_thread() lock.Unlock(); } -bool SafeTimer::add_event_after(double seconds, Context *callback) +Context* SafeTimer::add_event_after(double seconds, Context *callback) { assert(lock.is_locked()); @@ -123,14 +123,14 @@ bool SafeTimer::add_event_after(double seconds, Context *callback) return add_event_at(when, callback); } -bool SafeTimer::add_event_at(utime_t when, Context *callback) +Context* SafeTimer::add_event_at(utime_t when, Context *callback) { assert(lock.is_locked()); ldout(cct,10) << __func__ << " " << when << " -> " << callback << dendl; if (stopping) { ldout(cct,5) << __func__ << " already shutdown, event not added" << dendl; delete callback; - return false; + return nullptr; } scheduled_map_t::value_type s_val(when, callback); scheduled_map_t::iterator i = schedule.insert(s_val); @@ -145,7 +145,7 @@ bool SafeTimer::add_event_at(utime_t when, Context *callback) * adjust our timeout. */ if (i == schedule.begin()) cond.Signal(); - return true; + return callback; } bool SafeTimer::cancel_event(Context *callback) diff --git a/ceph/src/common/Timer.h b/ceph/src/common/Timer.h index 861b239ca..8fd478a99 100644 --- a/ceph/src/common/Timer.h +++ b/ceph/src/common/Timer.h @@ -70,8 +70,8 @@ public: /* Schedule an event in the future * Call with the event_lock LOCKED */ - bool add_event_after(double seconds, Context *callback); - bool add_event_at(utime_t when, Context *callback); + Context* add_event_after(double seconds, Context *callback); + Context* add_event_at(utime_t when, Context *callback); /* Cancel an event. 
* Call with the event_lock LOCKED diff --git a/ceph/src/common/bit_vector.hpp b/ceph/src/common/bit_vector.hpp index 6a6e6b7d0..b010970b3 100644 --- a/ceph/src/common/bit_vector.hpp +++ b/ceph/src/common/bit_vector.hpp @@ -14,6 +14,7 @@ #include "common/Formatter.h" #include "include/assert.h" #include "include/encoding.h" +#include namespace ceph { @@ -28,36 +29,150 @@ private: // must be power of 2 BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1))); BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE); -public: - static const uint32_t BLOCK_SIZE; - class ConstReference { + template + class ReferenceImpl { + protected: + DataIterator m_data_iterator; + uint64_t m_shift; + + ReferenceImpl(const DataIterator& data_iterator, uint64_t shift) + : m_data_iterator(data_iterator), m_shift(shift) { + } + ReferenceImpl(DataIterator&& data_iterator, uint64_t shift) + : m_data_iterator(std::move(data_iterator)), m_shift(shift) { + } + public: - operator uint8_t() const; + inline operator uint8_t() const { + return (*m_data_iterator >> m_shift) & MASK; + } + }; + +public: + + class ConstReference : public ReferenceImpl { private: friend class BitVector; - const BitVector &m_bit_vector; - uint64_t m_offset; - ConstReference(const BitVector &bit_vector, uint64_t offset); + ConstReference(const bufferlist::const_iterator& data_iterator, + uint64_t shift) + : ReferenceImpl(data_iterator, shift) { + } + ConstReference(bufferlist::const_iterator&& data_iterator, uint64_t shift) + : ReferenceImpl(std::move(data_iterator), + shift) { + } }; - class Reference { + class Reference : public ReferenceImpl { public: - operator uint8_t() const; Reference& operator=(uint8_t v); + + private: + friend class BitVector; + + Reference(const bufferlist::iterator& data_iterator, uint64_t shift) + : ReferenceImpl(data_iterator, shift) { + } + Reference(bufferlist::iterator&& data_iterator, uint64_t shift) + : ReferenceImpl(std::move(data_iterator), shift) { + } + }; + +public: + template + class IteratorImpl { private: friend class BitVector; - BitVector &m_bit_vector; - uint64_t m_offset; - Reference(BitVector &bit_vector, uint64_t offset); + uint64_t m_offset = 0; + BitVectorT *m_bit_vector; + + // cached derived values + uint64_t m_index = 0; + uint64_t m_shift = 0; + DataIterator m_data_iterator; + + IteratorImpl(BitVectorT *bit_vector, uint64_t offset) + : m_bit_vector(bit_vector), + m_data_iterator(bit_vector->m_data.begin()) { + *this += offset; + } + + public: + inline IteratorImpl& operator++() { + ++m_offset; + + uint64_t index; + compute_index(m_offset, &index, &m_shift); + + assert(index == m_index || index == m_index + 1); + if (index > m_index) { + m_index = index; + ++m_data_iterator; + } + return *this; + } + inline IteratorImpl& operator+=(uint64_t offset) { + m_offset += offset; + compute_index(m_offset, &m_index, &m_shift); + if (m_offset < m_bit_vector->size()) { + m_data_iterator.seek(m_index); + } else { + m_data_iterator = m_bit_vector->m_data.end(); + } + return *this; + } + + inline IteratorImpl operator++(int) { + IteratorImpl iterator_impl(*this); + ++iterator_impl; + return iterator_impl; + } + inline IteratorImpl operator+(uint64_t offset) { + IteratorImpl iterator_impl(*this); + iterator_impl += offset; + return iterator_impl; + } + + inline bool operator==(const IteratorImpl& rhs) const { + return (m_offset == rhs.m_offset && m_bit_vector == rhs.m_bit_vector); + } + inline bool operator!=(const IteratorImpl& rhs) const { + return (m_offset != rhs.m_offset || m_bit_vector != 
rhs.m_bit_vector); + } + + inline ConstReference operator*() const { + return ConstReference(m_data_iterator, m_shift); + } + inline Reference operator*() { + return Reference(m_data_iterator, m_shift); + } }; + typedef IteratorImpl ConstIterator; + typedef IteratorImpl Iterator; + + static const uint32_t BLOCK_SIZE; static const uint8_t BIT_COUNT = _bit_count; BitVector(); + inline ConstIterator begin() const { + return ConstIterator(this, 0); + } + inline ConstIterator end() const { + return ConstIterator(this, m_size); + } + inline Iterator begin() { + return Iterator(this, 0); + } + inline Iterator end() { + return Iterator(this, m_size); + } + void set_crc_enabled(bool enabled) { m_crc_enabled = enabled; } @@ -345,55 +460,33 @@ bool BitVector<_b>::operator==(const BitVector &b) const { template typename BitVector<_b>::Reference BitVector<_b>::operator[](uint64_t offset) { - return Reference(*this, offset); -} - -template -typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const { - return ConstReference(*this, offset); -} - -template -BitVector<_b>::ConstReference::ConstReference(const BitVector<_b> &bit_vector, - uint64_t offset) - : m_bit_vector(bit_vector), m_offset(offset) -{ -} - -template -BitVector<_b>::ConstReference::operator uint8_t() const { uint64_t index; uint64_t shift; - this->m_bit_vector.compute_index(this->m_offset, &index, &shift); + compute_index(offset, &index, &shift); - return (this->m_bit_vector.m_data[index] >> shift) & MASK; + bufferlist::iterator data_iterator(m_data.begin()); + data_iterator.seek(index); + return Reference(std::move(data_iterator), shift); } template -BitVector<_b>::Reference::Reference(BitVector<_b> &bit_vector, uint64_t offset) - : m_bit_vector(bit_vector), m_offset(offset) -{ -} - -template -BitVector<_b>::Reference::operator uint8_t() const { +typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const { uint64_t index; uint64_t shift; - this->m_bit_vector.compute_index(this->m_offset, &index, &shift); + compute_index(offset, &index, &shift); - return (this->m_bit_vector.m_data[index] >> shift) & MASK; + bufferlist::const_iterator data_iterator(m_data.begin()); + data_iterator.seek(index); + return ConstReference(std::move(data_iterator), shift); } template typename BitVector<_b>::Reference& BitVector<_b>::Reference::operator=(uint8_t v) { - uint64_t index; - uint64_t shift; - this->m_bit_vector.compute_index(this->m_offset, &index, &shift); - - uint8_t mask = MASK << shift; - char packed_value = (this->m_bit_vector.m_data[index] & ~mask) | - ((v << shift) & mask); - this->m_bit_vector.m_data.copy_in(index, 1, &packed_value); + uint8_t mask = MASK << this->m_shift; + char packed_value = (*this->m_data_iterator & ~mask) | + ((v << this->m_shift) & mask); + bufferlist::iterator it(this->m_data_iterator); + it.copy_in(1, &packed_value, true); return *this; } diff --git a/ceph/src/common/buffer.cc b/ceph/src/common/buffer.cc index b8e87d1ee..18ae276cc 100644 --- a/ceph/src/common/buffer.cc +++ b/ceph/src/common/buffer.cc @@ -172,17 +172,17 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; char *data; unsigned len; std::atomic nref { 0 }; - int mempool = mempool::mempool_buffer_anon; + int mempool; mutable std::atomic_flag crc_spinlock = ATOMIC_FLAG_INIT; map, pair > crc_map; - explicit raw(unsigned l) - : data(NULL), len(l), nref(0) { + explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon) + : data(NULL), len(l), nref(0), mempool(mempool) { 
mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len); } - raw(char *c, unsigned l) - : data(c), len(l), nref(0) { + raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon) + : data(c), len(l), nref(0), mempool(mempool) { mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len); } virtual ~raw() { @@ -281,8 +281,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; class buffer::raw_combined : public buffer::raw { size_t alignment; public: - raw_combined(char *dataptr, unsigned l, unsigned align=0) - : raw(dataptr, l), + raw_combined(char *dataptr, unsigned l, unsigned align, + int mempool) + : raw(dataptr, l, mempool), alignment(align) { inc_total_alloc(len); inc_history_alloc(len); @@ -294,7 +295,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; return create(len, alignment); } - static raw_combined *create(unsigned len, unsigned align=0) { + static raw_combined *create(unsigned len, + unsigned align, + int mempool = mempool::mempool_buffer_anon) { if (!align) align = sizeof(size_t); size_t rawlen = ROUND_UP_TO(sizeof(buffer::raw_combined), @@ -314,7 +317,7 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; // actual data first, since it has presumably larger alignment restriction // then put the raw_combined at the end - return new (ptr + datalen) raw_combined(ptr, len, align); + return new (ptr + datalen) raw_combined(ptr, len, align, mempool); } static void operator delete(void *ptr) { @@ -771,6 +774,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; buffer::raw* buffer::create(unsigned len) { return buffer::create_aligned(len, sizeof(size_t)); } + buffer::raw* buffer::create_in_mempool(unsigned len, int mempool) { + return buffer::create_aligned_in_mempool(len, sizeof(size_t), mempool); + } buffer::raw* buffer::claim_char(unsigned len, char *buf) { return new raw_claimed_char(len, buf); } @@ -787,7 +793,8 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; return new raw_claim_buffer(buf, len, std::move(del)); } - buffer::raw* buffer::create_aligned(unsigned len, unsigned align) { + buffer::raw* buffer::create_aligned_in_mempool( + unsigned len, unsigned align, int mempool) { // If alignment is a page multiple, use a separate buffer::raw to // avoid fragmenting the heap. 
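
The bit_vector.hpp hunk above replaces the offset-holding Reference/ConstReference with proxies bound to a data iterator plus a bit shift, and adds begin()/end() iterators so a sequential sweep, such as the new object_map_update() loop, advances the underlying data position instead of recomputing the byte index through operator[] for every element. A stand-alone sketch of that idea, where Packed2 is an illustrative 2-bits-per-element container, not Ceph's BitVector<2>:

#include <cstdint>
#include <iostream>
#include <vector>

class Packed2 {                       // 2 bits per element, 4 per byte
  std::vector<uint8_t> data;
  size_t count = 0;
  static constexpr uint8_t MASK = 0x3;
public:
  void resize(size_t n) { count = n; data.assign((n + 3) / 4, 0); }

  class Ref {                         // proxy reference: byte position + shift
    uint8_t* byte; unsigned shift;
  public:
    Ref(uint8_t* b, unsigned s) : byte(b), shift(s) {}
    operator uint8_t() const { return (*byte >> shift) & MASK; }
    Ref& operator=(uint8_t v) {
      *byte = (*byte & ~(MASK << shift)) | ((v & MASK) << shift);
      return *this;
    }
  };

  class Iter {
    Packed2* v; size_t off;
  public:
    Iter(Packed2* p, size_t o) : v(p), off(o) {}
    Ref operator*() { return Ref(&v->data[off / 4], (off % 4) * 2); }
    Iter& operator++() { ++off; return *this; }
    bool operator!=(const Iter& rhs) const { return off != rhs.off; }
  };

  Iter begin() { return Iter(this, 0); }
  Iter end()   { return Iter(this, count); }
};

int main() {
  Packed2 v;
  v.resize(8);
  for (auto it = v.begin(); it != v.end(); ++it)
    *it = 1;                          // one sequential pass, like the new update loop
  for (auto it = v.begin(); it != v.end(); ++it)
    std::cout << int(uint8_t(*it));   // prints 11111111
  std::cout << "\n";
  return 0;
}
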
// @@ -805,7 +812,12 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; return new raw_hack_aligned(len, align); #endif } - return raw_combined::create(len, align); + return raw_combined::create(len, align, mempool); + } + buffer::raw* buffer::create_aligned( + unsigned len, unsigned align) { + return create_aligned_in_mempool(len, align, + mempool::mempool_buffer_anon); } buffer::raw* buffer::create_page_aligned(unsigned len) { @@ -952,6 +964,24 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; bool buffer::ptr::at_buffer_tail() const { return _off + _len == _raw->len; } + int buffer::ptr::get_mempool() const { + if (_raw) { + return _raw->mempool; + } + return mempool::mempool_buffer_anon; + } + + void buffer::ptr::reassign_to_mempool(int pool) { + if (_raw) { + _raw->reassign_to_mempool(pool); + } + } + void buffer::ptr::try_assign_to_mempool(int pool) { + if (_raw) { + _raw->try_assign_to_mempool(pool); + } + } + const char *buffer::ptr::c_str() const { assert(_raw); if (buffer_track_c_str) @@ -1493,7 +1523,6 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; { std::swap(_len, other._len); std::swap(_memcopy_count, other._memcopy_count); - std::swap(_mempool, other._mempool); _buffers.swap(other._buffers); append_buffer.swap(other.append_buffer); //last_p.swap(other.last_p); @@ -1666,9 +1695,16 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; return is_aligned(CEPH_PAGE_SIZE); } + int buffer::list::get_mempool() const + { + if (_buffers.empty()) { + return mempool::mempool_buffer_anon; + } + return _buffers.back().get_mempool(); + } + void buffer::list::reassign_to_mempool(int pool) { - _mempool = pool; if (append_buffer.get_raw()) { append_buffer.get_raw()->reassign_to_mempool(pool); } @@ -1679,7 +1715,6 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; void buffer::list::try_assign_to_mempool(int pool) { - _mempool = pool; if (append_buffer.get_raw()) { append_buffer.get_raw()->try_assign_to_mempool(pool); } @@ -1778,10 +1813,7 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; void buffer::list::reserve(size_t prealloc) { if (append_buffer.unused_tail_length() < prealloc) { - append_buffer = buffer::create(prealloc); - if (_mempool >= 0) { - append_buffer.get_raw()->reassign_to_mempool(_mempool); - } + append_buffer = buffer::create_in_mempool(prealloc, get_mempool()); append_buffer.set_length(0); // unused, so far. } } @@ -1879,11 +1911,9 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; unsigned gap = append_buffer.unused_tail_length(); if (!gap) { // make a new append_buffer! - append_buffer = raw_combined::create(CEPH_BUFFER_APPEND_SIZE); + append_buffer = raw_combined::create(CEPH_BUFFER_APPEND_SIZE, 0, + get_mempool()); append_buffer.set_length(0); // unused, so far. - if (_mempool >= 0) { - append_buffer.get_raw()->reassign_to_mempool(_mempool); - } } append(append_buffer, append_buffer.append(c) - 1, 1); // add segment to the list } @@ -1909,11 +1939,8 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; size_t need = ROUND_UP_TO(len, sizeof(size_t)) + sizeof(raw_combined); size_t alen = ROUND_UP_TO(need, CEPH_BUFFER_ALLOC_UNIT) - sizeof(raw_combined); - append_buffer = raw_combined::create(alen); + append_buffer = raw_combined::create(alen, 0, get_mempool()); append_buffer.set_length(0); // unused, so far. 
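
The buffer.cc changes above make each buffer::raw remember the mempool it was charged to and drop the cached _mempool field from the list: the list's pool is now derived from its buffers (get_mempool() asks the last ptr) and new append space is allocated directly in that pool via create_in_mempool()/raw_combined::create(len, align, mempool). A small model of that accounting scheme, with Pool/Chunk/ChunkList as illustrative stand-ins for the real buffer types:

#include <iostream>
#include <memory>
#include <vector>

enum Pool { POOL_ANON = 0, POOL_OSD_DATA = 1 };

struct Chunk {
  std::vector<char> bytes;
  Pool pool;
  Chunk(size_t len, Pool p) : bytes(len), pool(p) {}   // charged to its pool at creation
};

struct ChunkList {
  std::vector<std::shared_ptr<Chunk>> chunks;

  // Derived, not cached: the pool of the last chunk, or the anon pool.
  Pool get_mempool() const {
    return chunks.empty() ? POOL_ANON : chunks.back()->pool;
  }
  void append(size_t len, Pool p) {
    chunks.push_back(std::make_shared<Chunk>(len, p));
  }
  void append_more(size_t len) {
    // New append space lands in whatever pool the list already uses,
    // mirroring create_in_mempool(prealloc, get_mempool()).
    chunks.push_back(std::make_shared<Chunk>(len, get_mempool()));
  }
};

int main() {
  ChunkList bl;
  std::cout << bl.get_mempool() << "\n";   // 0: empty list -> anon pool
  bl.append(4096, POOL_OSD_DATA);
  bl.append_more(4096);
  std::cout << bl.get_mempool() << "\n";   // 1: follows the last chunk
  return 0;
}
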
- if (_mempool >= 0) { - append_buffer.get_raw()->reassign_to_mempool(_mempool); - } } } diff --git a/ceph/src/common/ceph_context.cc b/ceph/src/common/ceph_context.cc index 2cf0f7d8c..64b2ec674 100644 --- a/ceph/src/common/ceph_context.cc +++ b/ceph/src/common/ceph_context.cc @@ -259,7 +259,7 @@ public: } if (log->graylog() && changed.count("fsid")) { - log->graylog()->set_fsid(conf->fsid); + log->graylog()->set_fsid(conf->get_val("fsid")); } } }; diff --git a/ceph/src/common/common_init.cc b/ceph/src/common/common_init.cc index 9cb1b1207..7889f42a0 100644 --- a/ceph/src/common/common_init.cc +++ b/ceph/src/common/common_init.cc @@ -58,6 +58,10 @@ CephContext *common_preinit(const CephInitParameters &iparams, conf->set_val_or_die("err_to_stderr", "false"); conf->set_val_or_die("log_flush_on_exit", "false"); } + if (code_env != CODE_ENVIRONMENT_DAEMON) { + // NOTE: disable ms subsystem gathering in clients by default + conf->set_val_or_die("debug_ms", "0/0"); + } return cct; } diff --git a/ceph/src/common/config.cc b/ceph/src/common/config.cc index ea372bfb3..3cbb27e34 100644 --- a/ceph/src/common/config.cc +++ b/ceph/src/common/config.cc @@ -492,7 +492,10 @@ int md_config_t::parse_argv(std::vector& args) set_val_or_die("client_mountpoint", val.c_str()); } else { - parse_option(args, i, NULL); + int r = parse_option(args, i, NULL); + if (r < 0) { + return r; + } } } @@ -536,8 +539,16 @@ int md_config_t::parse_option(std::vector& args, std::string as_option("--"); as_option += "debug_"; as_option += subsys.get_name(o); - if (ceph_argparse_witharg(args, i, &val, + ostringstream err; + if (ceph_argparse_witharg(args, i, &val, err, as_option.c_str(), (char*)NULL)) { + if (err.tellp()) { + if (oss) { + *oss << err.str(); + } + ret = -EINVAL; + break; + } int log, gather; int r = sscanf(val.c_str(), "%d/%d", &log, &gather); if (r >= 1) { diff --git a/ceph/src/common/legacy_config_opts.h b/ceph/src/common/legacy_config_opts.h index cb6b406bb..e0f0ad7e3 100644 --- a/ceph/src/common/legacy_config_opts.h +++ b/ceph/src/common/legacy_config_opts.h @@ -14,15 +14,11 @@ /* note: no header guard */ OPTION(host, OPT_STR) // "" means that ceph will use short hostname -OPTION(fsid, OPT_UUID) OPTION(public_addr, OPT_ADDR) OPTION(public_bind_addr, OPT_ADDR) OPTION(cluster_addr, OPT_ADDR) OPTION(public_network, OPT_STR) OPTION(cluster_network, OPT_STR) -OPTION(monmap, OPT_STR) -OPTION(mon_host, OPT_STR) -OPTION(mon_dns_srv_name, OPT_STR) OPTION(lockdep, OPT_BOOL) OPTION(lockdep_force_backtrace, OPT_BOOL) // always gather current backtrace at every lock OPTION(run_dir, OPT_STR) // the "/var/run/ceph" dir, created on daemon startup @@ -239,8 +235,6 @@ OPTION(mon_timecheck_interval, OPT_FLOAT) // on leader, timecheck (clock drift c OPTION(mon_timecheck_skew_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds) OPTION(mon_pg_stuck_threshold, OPT_INT) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info) OPTION(mon_pg_min_inactive, OPT_U64) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR. 
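
The legacy_config_opts.h removals above (fsid, monmap, mon_host, the mgr_* settings, and so on) reflect the move of these options into the typed option table, with callers switching to the typed accessor, as in the conf->get_val("fsid") calls earlier in this patch (presumably md_config_t::get_val<uuid_d>(...) in the real tree; the template argument has been lost in this flattened rendering). A minimal sketch of that lookup pattern, where TinyConf is an illustrative stand-in, not md_config_t:

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

class TinyConf {
  std::map<std::string, std::string> values;   // the real table stores typed values
public:
  void set_val(const std::string& key, const std::string& val) {
    values[key] = val;
  }
  template <typename T>
  T get_val(const std::string& key) const;     // specialised per option type
};

template <>
std::string TinyConf::get_val<std::string>(const std::string& key) const {
  auto it = values.find(key);
  if (it == values.end())
    throw std::out_of_range("unknown option: " + key);
  return it->second;
}

int main() {
  TinyConf conf;
  conf.set_val("fsid", "00000000-0000-0000-0000-000000000001");
  std::cout << "fsid " << conf.get_val<std::string>("fsid") << "\n";
  return 0;
}
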
-OPTION(mon_pg_warn_min_per_osd, OPT_INT) // min # pgs per (in) osd before we warn the admin -OPTION(mon_pg_warn_max_per_osd, OPT_INT) // max # pgs per (in) osd before we warn the admin OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT) // max skew few average in objects per pg OPTION(mon_pg_warn_min_objects, OPT_INT) // do not warn below this object # OPTION(mon_pg_warn_min_pool_objects, OPT_INT) // do not warn on pools below this object # @@ -267,7 +261,6 @@ OPTION(mon_max_mdsmap_epochs, OPT_INT) OPTION(mon_max_osd, OPT_INT) OPTION(mon_probe_timeout, OPT_DOUBLE) OPTION(mon_client_bytes, OPT_U64) // client msg data allowed in memory (in bytes) -OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client OPTION(mon_log_max_summary, OPT_U64) OPTION(mon_daemon_bytes, OPT_U64) // mds, osd message memory cap (in bytes) OPTION(mon_max_log_entries_per_event, OPT_INT) @@ -1541,27 +1534,6 @@ OPTION(rgw_shard_warning_threshold, OPT_DOUBLE) // pct of safe max OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled -OPTION(mgr_module_path, OPT_STR) // where to load python modules from -OPTION(mgr_initial_modules, OPT_STR) // Which modules to load -OPTION(mgr_data, OPT_STR) // where to find keyring etc -OPTION(mgr_tick_period, OPT_INT) // How frequently to tick -OPTION(mgr_stats_period, OPT_INT) // How frequently clients send stats -OPTION(mgr_client_bytes, OPT_U64) // bytes from clients -OPTION(mgr_client_messages, OPT_U64) // messages from clients -OPTION(mgr_osd_bytes, OPT_U64) // bytes from osds -OPTION(mgr_osd_messages, OPT_U64) // messages from osds -OPTION(mgr_mds_bytes, OPT_U64) // bytes from mdss -OPTION(mgr_mds_messages, OPT_U64) // messages from mdss -OPTION(mgr_mon_bytes, OPT_U64) // bytes from mons -OPTION(mgr_mon_messages, OPT_U64) // messages from mons - -OPTION(mgr_connect_retry_interval, OPT_DOUBLE) -OPTION(mgr_service_beacon_grace, OPT_DOUBLE) - -OPTION(mon_mgr_digest_period, OPT_INT) // How frequently to send digests -OPTION(mon_mgr_beacon_grace, OPT_INT) // How long to wait to failover -OPTION(mon_mgr_inactive_grace, OPT_INT) // How long before health WARN -> ERR -OPTION(mon_mgr_mkfs_grace, OPT_INT) // How long before we complain about MGR_DOWN OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms diff --git a/ceph/src/common/options.cc b/ceph/src/common/options.cc index 0be052e1b..33f920525 100644 --- a/ceph/src/common/options.cc +++ b/ceph/src/common/options.cc @@ -11,6 +11,9 @@ #include #include +// Definitions for enums +#include "common/perf_counters.h" + void Option::dump_value(const char *field_name, const Option::value_t &v, Formatter *f) const @@ -160,12 +163,24 @@ std::vector