From f64942e41c1f59e95cdc1205bbe5d32ed6dfd429 Mon Sep 17 00:00:00 2001 From: Alwin Antreich Date: Wed, 6 Feb 2019 09:29:01 +0100 Subject: [PATCH] update source to 12.2.11 Signed-off-by: Alwin Antreich --- Makefile | 4 +- ceph/CMakeLists.txt | 19 +- ceph/PendingReleaseNotes | 33 + ceph/alpine/APKBUILD | 6 +- ceph/ceph.spec | 7 +- ceph/ceph.spec.in | 1 + ceph/debian/changelog | 6 + ceph/debian/control | 2 + .../upgrade/luminous-p2p/% => doc/README.md} | 0 ceph/doc/_ext/edit_on_github.py | 43 + ceph/doc/_static/js/ceph.js | 41 + ceph/doc/_templates/page.html | 21 + ceph/doc/ceph-volume/lvm/zap.rst | 49 +- ceph/doc/cephfs/dirfrags.rst | 7 +- ceph/doc/cephfs/eviction.rst | 6 +- ceph/doc/cephfs/fuse.rst | 4 +- ceph/doc/cephfs/health-messages.rst | 2 +- ceph/doc/cephfs/mds-config-ref.rst | 34 - ceph/doc/conf.py | 14 + ceph/doc/man/8/ceph-volume.rst | 11 + ceph/doc/man/8/crushtool.rst | 8 + ceph/doc/mgr/balancer.rst | 4 + .../configuration/bluestore-config-ref.rst | 160 +++- ceph/doc/rados/operations/add-or-rm-mons.rst | 5 + ceph/doc/rados/operations/crush-map-edits.rst | 293 ++++--- ceph/doc/rados/operations/crush-map.rst | 5 + ceph/doc/rados/operations/user-management.rst | 6 - .../troubleshooting/troubleshooting-mon.rst | 5 +- ceph/doc/radosgw/adminops.rst | 5 +- ceph/doc/radosgw/config-ref.rst | 11 + ceph/doc/radosgw/encryption.rst | 5 + ceph/doc/radosgw/frontends.rst | 29 +- ceph/doc/start/hardware-recommendations.rst | 28 +- ceph/doc/start/quick-ceph-deploy.rst | 2 +- ceph/examples/librados/Makefile | 10 +- ceph/examples/librados/hello_world.readme | 2 +- ceph/install-deps.sh | 1 + .../cephfs/clusters/1-mds-1-client-coloc.yaml | 12 + ceph/qa/cephfs/clusters/1-mds-1-client.yaml | 7 +- .../cephfs/clusters/1-mds-2-client-coloc.yaml | 12 + ceph/qa/cephfs/clusters/1-mds-2-client.yaml | 7 +- ceph/qa/cephfs/clusters/1-mds-3-client.yaml | 15 + .../cephfs/clusters/1-mds-4-client-coloc.yaml | 12 + ceph/qa/cephfs/clusters/1-mds-4-client.yaml | 7 +- ceph/qa/cephfs/clusters/3-mds.yaml | 7 +- ceph/qa/cephfs/clusters/9-mds.yaml | 7 +- ceph/qa/cephfs/clusters/fixed-2-ucephfs.yaml | 4 +- ceph/qa/run-standalone.sh | 3 +- ceph/qa/standalone/ceph-helpers.sh | 14 +- ceph/qa/standalone/scrub/osd-scrub-repair.sh | 61 ++ .../smoke/basic/2-ceph/ceph_ansible.yaml | 2 +- .../tasks/cfuse_workunit_suites_pjd.yaml | 1 + .../clusters/1-mds-4-client-coloc.yaml | 1 + .../clusters/4-remote-clients.yaml | 10 - .../fs/basic_functional/tasks/damage.yaml | 2 + .../tasks/cfuse_workunit_suites_pjd.yaml | 1 + .../clusters/small-cluster.yaml | 2 + .../multiclient/clusters/1-mds-2-client.yaml | 1 + .../multiclient/clusters/1-mds-3-client.yaml | 1 + .../multiclient/clusters/three_clients.yaml | 15 - .../fs/multiclient/clusters/two_clients.yaml | 14 - .../fs/multifs/clusters/2-remote-clients.yaml | 10 - .../tasks/cfuse_workunit_suites_pjd.yaml | 1 + .../thrash/clusters/1-mds-1-client-coloc.yaml | 1 + .../thrash/clusters/mds-1active-1standby.yaml | 10 - .../thrash/msgr-failures/osd-mds-delay.yaml | 2 +- .../tasks/cfuse_workunit_suites_pjd.yaml | 1 + .../tasks/kclient_workunit_suites_pjd.yaml | 1 + .../suites/kcephfs/recovery/tasks/damage.yaml | 2 + .../tasks/cfuse_workunit_suites_pjd.yaml | 1 + .../singleton/all/mon-config-key-caps.yaml | 17 + .../1.1-pg-log-overrides/normal_pg_log.yaml | 1 + .../1.1-pg-log-overrides/short_pg_log.yaml | 6 + .../1.1-pg-log-overrides/normal_pg_log.yaml | 1 + .../1.1-pg-log-overrides/short_pg_log.yaml | 6 + .../1.1-pg-log-overrides/normal_pg_log.yaml | 1 + .../1.1-pg-log-overrides/short_pg_log.yaml | 6 + 
.../2-partial-upgrade/firsthalf.yaml | 5 + .../stress-split/5-finish-upgrade.yaml | 10 + .../luminous-p2p/luminous-p2p-parallel/% | 0 .../luminous-p2p/luminous-p2p-parallel/.qa | 1 + .../point-to-point-upgrade.yaml | 33 + .../luminous-p2p-parallel/supported | 1 + .../luminous-p2p/luminous-p2p-stress-split/% | 0 .../luminous-p2p-stress-split/0-cluster/+ | 0 .../0-cluster}/.qa | 0 .../0-cluster/openstack.yaml | 6 + .../0-cluster/start.yaml | 20 + .../1-ceph-install/luminous.yaml | 19 + .../1.1-pg-log-overrides/normal_pg_log.yaml | 1 + .../1.1-pg-log-overrides/short_pg_log.yaml | 6 + .../2-partial-upgrade/.qa | 1 + .../2-partial-upgrade/firsthalf.yaml | 17 + .../luminous-p2p-stress-split/3-thrash/.qa | 1 + .../3-thrash/default.yaml | 25 + .../luminous-p2p-stress-split/4-workload/+ | 0 .../luminous-p2p-stress-split/4-workload/.qa | 1 + .../4-workload/radosbench.yaml | 40 + .../4-workload/rbd-cls.yaml | 10 + .../4-workload/rbd-import-export.yaml | 12 + .../4-workload/rbd_api.yaml | 10 + .../4-workload/readwrite.yaml | 16 + .../4-workload/snaps-few-objects.yaml | 18 + .../5-finish-upgrade.yaml | 14 + .../7-final-workload/+ | 0 .../7-final-workload/.qa | 1 + .../7-final-workload/rbd-python.yaml | 9 + .../7-final-workload/rgw-swift.yaml | 11 + .../7-final-workload/snaps-many-objects.yaml | 16 + .../luminous-p2p-stress-split/supported | 1 + .../thrashosds-health.yaml | 1 + ceph/qa/suites/upgrade/luminous-p2p/supported | 1 - ceph/qa/tasks/cephfs/filesystem.py | 52 +- ceph/qa/tasks/cephfs/fuse_mount.py | 39 +- ceph/qa/tasks/cephfs/kernel_mount.py | 14 +- ceph/qa/tasks/cephfs/test_client_limits.py | 6 +- ceph/qa/tasks/cephfs/test_client_recovery.py | 28 +- ceph/qa/tasks/cephfs/test_damage.py | 71 +- ceph/qa/tasks/cephfs/test_data_scan.py | 4 +- ceph/qa/tasks/cephfs/test_flush.py | 4 +- ceph/qa/tasks/cephfs/test_forward_scrub.py | 4 +- ceph/qa/tasks/cephfs/test_fragment.py | 1 - .../qa/tasks/cephfs/test_journal_migration.py | 5 +- ceph/qa/tasks/cephfs/test_journal_repair.py | 10 +- ceph/qa/tasks/cephfs/test_misc.py | 92 +- ceph/qa/tasks/cephfs/test_recovery_pool.py | 29 +- ceph/qa/tasks/qemu.py | 4 +- ceph/qa/tasks/thrashosds-health.yaml | 2 +- ceph/qa/tasks/workunit.py | 2 +- .../ceph-tests/ceph-admin-commands.sh | 7 +- ceph/qa/workunits/mon/test_config_key_caps.sh | 201 +++++ .../qa/workunits/rados/test_librados_build.sh | 4 +- ceph/qa/workunits/rbd/run_devstack_tempest.sh | 4 +- .../suites/cephfs_journal_tool_smoke.sh | 2 +- ceph/run-make-check.sh | 90 +- ceph/src/.git_version | 4 +- ceph/src/auth/AuthSessionHandler.cc | 4 + ceph/src/ceph-create-keys | 12 +- ceph/src/ceph-volume/ceph_volume/api/lvm.py | 3 + .../ceph_volume/devices/lvm/activate.py | 8 +- .../ceph_volume/devices/lvm/batch.py | 11 +- .../devices/lvm/strategies/bluestore.py | 59 +- .../devices/lvm/strategies/filestore.py | 64 +- .../devices/lvm/strategies/strategies.py | 50 ++ .../ceph_volume/devices/lvm/zap.py | 288 +++++-- .../ceph-volume/ceph_volume/inventory/main.py | 6 +- .../ceph-volume/ceph_volume/tests/conftest.py | 4 +- .../tests/devices/lvm/test_batch.py | 7 + .../ceph_volume/tests/devices/lvm/test_zap.py | 153 ++++ .../ceph_volume/tests/devices/test_zap.py | 4 +- .../bluestore/mixed-type-dmcrypt/test_zap.yml | 1 + .../centos7/bluestore/mixed-type/test_zap.yml | 1 + .../single-type-dmcrypt/test_zap.yml | 1 + .../bluestore/single-type/test_zap.yml | 1 + .../filestore/mixed-type-dmcrypt/test_zap.yml | 1 + .../centos7/filestore/mixed-type/test_zap.yml | 1 + .../single-type-dmcrypt/test_zap.yml | 1 + 
.../filestore/single-type/test_zap.yml | 1 + .../functional/batch/playbooks/test_zap.yml | 31 + .../tests/functional/batch/tox.ini | 3 + .../single-type-dmcrypt/test_zap.yml | 1 + .../xenial/bluestore/single-type/test_zap.yml | 1 + .../single-type-dmcrypt/test_zap.yml | 1 + .../xenial/filestore/single-type/test_zap.yml | 1 + .../lvm/centos7/bluestore/dmcrypt/test.yml | 11 + .../lvm/centos7/filestore/dmcrypt/test.yml | 31 + .../lvm/playbooks/test_bluestore.yml | 42 + .../lvm/playbooks/test_filestore.yml | 49 ++ .../lvm/xenial/bluestore/dmcrypt/test.yml | 11 + .../lvm/xenial/filestore/dmcrypt/test.yml | 31 + .../tests/functional/playbooks/deploy.yml | 8 +- .../ceph_volume/tests/util/test_device.py | 161 +++- .../ceph_volume/tests/util/test_disk.py | 50 ++ .../ceph_volume/tests/util/test_encryption.py | 18 + .../ceph_volume/tests/util/test_util.py | 36 +- .../ceph-volume/ceph_volume/util/__init__.py | 23 +- .../ceph_volume/util/arg_validators.py | 11 +- .../ceph-volume/ceph_volume/util/device.py | 87 +- ceph/src/ceph-volume/ceph_volume/util/disk.py | 67 +- .../ceph_volume/util/encryption.py | 4 +- ceph/src/client/Client.cc | 32 +- ceph/src/cls/lock/cls_lock.cc | 71 +- ceph/src/cls/lock/cls_lock_client.cc | 21 +- ceph/src/cls/lock/cls_lock_client.h | 43 +- ceph/src/cls/lock/cls_lock_ops.cc | 2 +- ceph/src/cls/lock/cls_lock_ops.h | 3 + ceph/src/cls/lock/cls_lock_types.h | 29 +- ceph/src/cls/rgw/cls_rgw.cc | 8 +- ceph/src/cls/rgw/cls_rgw_client.cc | 23 +- ceph/src/cls/rgw/cls_rgw_client.h | 36 +- ceph/src/cls/rgw/cls_rgw_types.h | 21 + ceph/src/common/Cond.h | 100 +-- ceph/src/common/CondVar.h | 109 +++ ceph/src/common/TrackedOp.cc | 4 +- ceph/src/common/WeightedPriorityQueue.h | 21 +- ceph/src/common/buffer.cc | 26 + ceph/src/common/ceph_context.cc | 16 +- ceph/src/common/cmdparse.h | 53 +- ceph/src/common/config.cc | 140 +-- ceph/src/common/config.h | 59 +- ceph/src/common/hobject.h | 31 +- ceph/src/common/legacy_config_opts.h | 16 +- ceph/src/common/options.cc | 82 +- ceph/src/crush/CrushCompiler.cc | 10 +- ceph/src/crush/CrushTester.cc | 77 ++ ceph/src/crush/CrushTester.h | 2 + ceph/src/crush/CrushWrapper.cc | 424 ++++++++- ceph/src/crush/CrushWrapper.h | 14 + ceph/src/include/buffer.h | 1 + ceph/src/include/ceph_features.h | 2 + ceph/src/include/ceph_fs.h | 4 +- ceph/src/include/cephfs/libcephfs.h | 2 + ceph/src/include/config-h.in.cmake | 3 + ceph/src/include/rados.h | 4 +- ceph/src/include/rados/librados.hpp | 17 +- ceph/src/librados/librados.cc | 6 +- ceph/src/librbd/librbd.cc | 1 + ceph/src/librbd/operation/ResizeRequest.cc | 1 + ceph/src/mds/CInode.cc | 13 +- ceph/src/mds/CInode.h | 1 + ceph/src/mds/FSMap.cc | 8 - ceph/src/mds/Locker.cc | 16 +- ceph/src/mds/MDBalancer.cc | 2 +- ceph/src/mds/MDCache.cc | 201 +++-- ceph/src/mds/MDCache.h | 11 +- ceph/src/mds/MDLog.cc | 8 +- ceph/src/mds/MDSDaemon.cc | 18 +- ceph/src/mds/MDSMap.h | 16 + ceph/src/mds/MDSRank.cc | 645 ++++++++++---- ceph/src/mds/MDSRank.h | 10 +- ceph/src/mds/PurgeQueue.cc | 83 +- ceph/src/mds/PurgeQueue.h | 5 +- ceph/src/mds/Server.cc | 218 +++-- ceph/src/mds/Server.h | 9 +- ceph/src/mds/SessionMap.h | 1 + ceph/src/mds/StrayManager.cc | 18 +- ceph/src/mgr/DaemonServer.cc | 7 +- ceph/src/mgr/DaemonState.cc | 13 +- ceph/src/mon/AuthMonitor.cc | 34 +- ceph/src/mon/ConfigKeyService.cc | 6 +- ceph/src/mon/FSCommands.cc | 72 +- ceph/src/mon/LogMonitor.cc | 32 +- ceph/src/mon/MDSMonitor.cc | 22 +- ceph/src/mon/MgrMonitor.cc | 43 +- ceph/src/mon/MonCap.cc | 8 +- ceph/src/mon/MonCommands.h | 6 +- ceph/src/mon/Monitor.cc | 20 +- 
ceph/src/mon/MonmapMonitor.cc | 39 +- ceph/src/mon/OSDMonitor.cc | 407 +++++---- ceph/src/os/bluestore/BlueFS.cc | 24 + ceph/src/os/bluestore/BlueFS.h | 7 + ceph/src/os/bluestore/BlueStore.cc | 91 +- ceph/src/os/bluestore/BlueStore.h | 10 +- ceph/src/os/bluestore/bluestore_tool.cc | 81 +- ceph/src/os/filestore/LFNIndex.h | 2 +- ceph/src/osd/OSD.cc | 66 +- ceph/src/osd/OSD.h | 3 +- ceph/src/osd/OSDMap.cc | 48 +- ceph/src/osd/OSDMap.h | 7 +- ceph/src/osd/PG.cc | 52 +- ceph/src/osd/PG.h | 7 +- ceph/src/osd/PGLog.cc | 41 +- ceph/src/osd/PGLog.h | 3 +- ceph/src/osd/PrimaryLogPG.cc | 95 ++- ceph/src/osd/PrimaryLogPG.h | 1 + ceph/src/osd/osd_types.h | 11 +- ceph/src/osdc/Journaler.cc | 10 +- ceph/src/osdc/ObjectCacher.cc | 29 +- ceph/src/osdc/ObjectCacher.h | 1 + ceph/src/osdc/Objecter.cc | 11 +- ceph/src/pybind/ceph_volume_client.py | 26 +- ceph/src/pybind/mgr/balancer/module.py | 24 +- ceph/src/pybind/mgr/influx/module.py | 2 + ceph/src/pybind/mgr/prometheus/module.py | 29 +- ceph/src/pybind/mgr/restful/common.py | 2 +- ceph/src/pybind/mgr/restful/module.py | 6 +- ceph/src/pybind/mgr/status/module.py | 4 +- ceph/src/pybind/rbd/rbd.pyx | 1 + ceph/src/rgw/CMakeLists.txt | 22 +- ceph/src/rgw/librgw.cc | 5 + ceph/src/rgw/rgw_admin.cc | 65 +- ceph/src/rgw/rgw_asio_client.cc | 58 +- ceph/src/rgw/rgw_asio_client.h | 22 +- ceph/src/rgw/rgw_asio_frontend.cc | 386 ++++++--- ceph/src/rgw/rgw_auth.cc | 5 + ceph/src/rgw/rgw_auth_s3.cc | 3 +- ceph/src/rgw/rgw_bucket.cc | 219 ++++- ceph/src/rgw/rgw_bucket.h | 11 +- ceph/src/rgw/rgw_common.cc | 32 +- ceph/src/rgw/rgw_common.h | 4 + ceph/src/rgw/rgw_cr_rados.cc | 2 +- ceph/src/rgw/rgw_crypt.cc | 8 +- ceph/src/rgw/rgw_data_sync.cc | 4 + ceph/src/rgw/rgw_file.h | 7 + ceph/src/rgw/rgw_iam_policy.cc | 2 +- ceph/src/rgw/rgw_metadata.cc | 4 +- ceph/src/rgw/rgw_metadata.h | 4 +- ceph/src/rgw/rgw_op.cc | 27 +- ceph/src/rgw/rgw_op.h | 30 + ceph/src/rgw/rgw_quota.cc | 23 + ceph/src/rgw/rgw_quota.h | 4 + ceph/src/rgw/rgw_rados.cc | 285 +++++-- ceph/src/rgw/rgw_rados.h | 24 +- ceph/src/rgw/rgw_reshard.cc | 513 +++++++---- ceph/src/rgw/rgw_reshard.h | 96 ++- ceph/src/rgw/rgw_rest.cc | 15 +- ceph/src/rgw/rgw_rest_s3.cc | 5 +- ceph/src/rgw/rgw_rest_swift.cc | 2 + ceph/src/rgw/rgw_rest_user.cc | 42 +- ceph/src/rgw/rgw_sync_log_trim.cc | 73 +- ceph/src/rgw/rgw_sync_module_es.cc | 58 +- ceph/src/rgw/rgw_user.cc | 19 +- ceph/src/test/cli/crushtool/crush-classes/a | Bin 0 -> 2358 bytes ceph/src/test/cli/crushtool/crush-classes/b | Bin 0 -> 20656 bytes .../test/cli/crushtool/crush-classes/beesly | Bin 0 -> 64806 bytes ceph/src/test/cli/crushtool/crush-classes/c | Bin 0 -> 8801 bytes ceph/src/test/cli/crushtool/crush-classes/d | Bin 0 -> 3657 bytes ceph/src/test/cli/crushtool/crush-classes/e | Bin 0 -> 7094 bytes ceph/src/test/cli/crushtool/crush-classes/f | Bin 0 -> 61002 bytes .../src/test/cli/crushtool/crush-classes/flax | Bin 0 -> 8184 bytes ceph/src/test/cli/crushtool/crush-classes/g | Bin 0 -> 43071 bytes .../src/test/cli/crushtool/crush-classes/gabe | Bin 0 -> 61114 bytes .../test/cli/crushtool/crush-classes/gabe2 | Bin 0 -> 61002 bytes ceph/src/test/cli/crushtool/help.t | 9 + ceph/src/test/cli/crushtool/reclassify.t | 588 +++++++++++++ ceph/src/test/cli/radosgw-admin/help.t | 2 + ceph/src/test/cls_lock/test_cls_lock.cc | 178 +++- ceph/src/test/cls_rgw/test_cls_rgw.cc | 12 +- ceph/src/test/compressor/CMakeLists.txt | 2 +- ceph/src/test/encoding/readable.sh | 6 + ceph/src/test/librados/aio.cc | 805 +++++++----------- ceph/src/test/librados/lock.cc | 16 +- 
.../librados_test_stub/LibradosTestStub.cc | 6 + ceph/src/test/objectstore/store_test.cc | 67 ++ ceph/src/test/osd/TestOSDMap.cc | 137 +++ ceph/src/test/rgw/rgw_multi/multisite.py | 5 +- ceph/src/test/rgw/rgw_multi/tests.py | 33 +- ceph/src/test/rgw/test_rgw_iam_policy.cc | 4 +- ceph/src/tools/cephfs/JournalTool.cc | 57 +- ceph/src/tools/cephfs/JournalTool.h | 11 + ceph/src/tools/cephfs/RoleSelector.cc | 5 +- ceph/src/tools/cephfs/RoleSelector.h | 3 +- ceph/src/tools/crushtool.cc | 106 ++- ceph/src/tools/rados/rados.cc | 8 +- ceph/src/tools/rbd_mirror/ImageReplayer.cc | 4 +- 345 files changed, 9014 insertions(+), 2848 deletions(-) rename ceph/{qa/suites/upgrade/luminous-p2p/% => doc/README.md} (100%) create mode 100644 ceph/doc/_ext/edit_on_github.py create mode 100644 ceph/doc/_static/js/ceph.js create mode 100644 ceph/doc/_templates/page.html create mode 100644 ceph/qa/cephfs/clusters/1-mds-1-client-coloc.yaml create mode 100644 ceph/qa/cephfs/clusters/1-mds-2-client-coloc.yaml create mode 100644 ceph/qa/cephfs/clusters/1-mds-3-client.yaml create mode 100644 ceph/qa/cephfs/clusters/1-mds-4-client-coloc.yaml create mode 120000 ceph/qa/suites/fs/basic_functional/clusters/1-mds-4-client-coloc.yaml delete mode 100644 ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml create mode 120000 ceph/qa/suites/fs/multiclient/clusters/1-mds-2-client.yaml create mode 120000 ceph/qa/suites/fs/multiclient/clusters/1-mds-3-client.yaml delete mode 100644 ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml delete mode 100644 ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml delete mode 100644 ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml create mode 120000 ceph/qa/suites/fs/thrash/clusters/1-mds-1-client-coloc.yaml delete mode 100644 ceph/qa/suites/fs/thrash/clusters/mds-1active-1standby.yaml create mode 100644 ceph/qa/suites/rados/singleton/all/mon-config-key-caps.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/normal_pg_log.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/short_pg_log.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/normal_pg_log.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/short_pg_log.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/normal_pg_log.yaml create mode 100644 ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/short_pg_log.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/% create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/.qa rename ceph/qa/suites/upgrade/luminous-p2p/{ => luminous-p2p-parallel}/point-to-point-upgrade.yaml (85%) create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/supported create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/% create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/+ rename ceph/qa/suites/upgrade/luminous-p2p/{ => luminous-p2p-stress-split/0-cluster}/.qa (100%) create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/openstack.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/start.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1-ceph-install/luminous.yaml create mode 100644 
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/normal_pg_log.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/short_pg_log.yaml create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/.qa create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/firsthalf.yaml create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/.qa create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/default.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/+ create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/.qa create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/radosbench.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-cls.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-import-export.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd_api.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/readwrite.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/snaps-few-objects.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/5-finish-upgrade.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/+ create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/.qa create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rbd-python.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rgw-swift.yaml create mode 100644 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/snaps-many-objects.yaml create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/supported create mode 120000 ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/thrashosds-health.yaml delete mode 120000 ceph/qa/suites/upgrade/luminous-p2p/supported create mode 100755 ceph/qa/workunits/mon/test_config_key_caps.sh create mode 100644 ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/strategies.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type-dmcrypt/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type-dmcrypt/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type-dmcrypt/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type-dmcrypt/test_zap.yml create mode 120000 
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type/test_zap.yml create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/playbooks/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type-dmcrypt/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type-dmcrypt/test_zap.yml create mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type/test_zap.yml create mode 100644 ceph/src/common/CondVar.h create mode 100644 ceph/src/test/cli/crushtool/crush-classes/a create mode 100644 ceph/src/test/cli/crushtool/crush-classes/b create mode 100644 ceph/src/test/cli/crushtool/crush-classes/beesly create mode 100644 ceph/src/test/cli/crushtool/crush-classes/c create mode 100644 ceph/src/test/cli/crushtool/crush-classes/d create mode 100644 ceph/src/test/cli/crushtool/crush-classes/e create mode 100644 ceph/src/test/cli/crushtool/crush-classes/f create mode 100644 ceph/src/test/cli/crushtool/crush-classes/flax create mode 100644 ceph/src/test/cli/crushtool/crush-classes/g create mode 100644 ceph/src/test/cli/crushtool/crush-classes/gabe create mode 100644 ceph/src/test/cli/crushtool/crush-classes/gabe2 mode change 100755 => 100644 ceph/src/test/cli/crushtool/help.t create mode 100644 ceph/src/test/cli/crushtool/reclassify.t diff --git a/Makefile b/Makefile index 8bb42e96f..505198c63 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ -RELEASE=5.2 +RELEASE=5.3 PACKAGE=ceph -VER=12.2.10 +VER=12.2.11 DEBREL=pve1 SRCDIR=ceph diff --git a/ceph/CMakeLists.txt b/ceph/CMakeLists.txt index 35c193936..5403de8f4 100644 --- a/ceph/CMakeLists.txt +++ b/ceph/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8.11) project(ceph) -set(VERSION 12.2.10) +set(VERSION 12.2.11) if(POLICY CMP0046) # Tweak policies (this one disables "missing" dependency warning) @@ -367,6 +367,8 @@ endif() option(WITH_RADOSGW "Rados Gateway is enabled" ON) option(WITH_RADOSGW_FCGI_FRONTEND "Rados Gateway's FCGI frontend is enabled" OFF) option(WITH_RADOSGW_BEAST_FRONTEND "Rados Gateway's Beast frontend is enabled" ON) +option(WITH_RADOSGW_BEAST_OPENSSL "Rados Gateway's Beast frontend uses OpenSSL" ON) + if(WITH_RADOSGW) find_package(EXPAT REQUIRED) if(WITH_RADOSGW_FCGI_FRONTEND) @@ -376,14 +378,7 @@ if(WITH_RADOSGW) message(WARNING "disabling WITH_RADOSGW_BEAST_FRONTEND, which depends on WITH_BOOST_CONTEXT") set(WITH_RADOSGW_BEAST_FRONTEND OFF) endif() -endif(WITH_RADOSGW) - -if (WITH_RADOSGW) - if (NOT DEFINED OPENSSL_FOUND) - message(STATUS "Looking for openssl anyways, because radosgw selected") - find_package(OpenSSL) - endif() # https://curl.haxx.se/docs/install.html mentions the # configure flags for various ssl backends execute_process( @@ -396,7 +391,13 @@ if (WITH_RADOSGW) if (CURL_CONFIG_ERRORS) message(WARNING "unable to run curl-config; rgw cannot make ssl requests to external systems reliably") endif() - find_package(OpenSSL) + + if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) + find_package(OpenSSL REQUIRED) + else() + find_package(OpenSSL) + endif() + if (OPENSSL_FOUND) if (NOT NO_CURL_SSL_LINK) message(STATUS "libcurl is linked with openssl: explicitly setting locks") diff --git a/ceph/PendingReleaseNotes b/ceph/PendingReleaseNotes index 00ee957e0..b75c79fb1 100644 --- 
a/ceph/PendingReleaseNotes +++ b/ceph/PendingReleaseNotes @@ -1,3 +1,13 @@ +>= 12.2.11 +---------- +* `cephfs-journal-tool` makes rank argument (--rank) mandatory. Rank is + of format `filesystem:rank`, where `filesystem` is the cephfs filesystem + and `rank` is the MDS rank on which the operation is to be executed. To + operate on all ranks, use `all` or `*` as the rank specifier. Note that, + operations that dump journal information to file will now dump to per-rank + suffixed dump files. Importing journal information from dump files is + disallowed if operation is targeted for all ranks. + >= 12.1.2 --------- * When running 'df' on a CephFS filesystem comprising exactly one data pool, @@ -122,3 +132,26 @@ a clean upgrade path is added to the pg log hard limit patches. See also: http://tracker.ceph.com/issues/36686 + +12.2.11 ------- + +* The default memory utilization for the mons has been increased + somewhat. Rocksdb now uses 512 MB of RAM by default, which should + be sufficient for small to medium-sized clusters; large clusters + should tune this up. Also, the ``mon_osd_cache_size`` has been + increased from 10 OSDMaps to 500, which will translate to an + additional 500 MB to 1 GB of RAM for large clusters, and much less + for small clusters. + +* New CephFS file system attributes session_timeout and session_autoclose + are configurable via `ceph fs set`. The MDS config options + mds_session_timeout, mds_session_autoclose, and mds_max_file_size are now + obsolete. + +* This release fixes the pg log hard limit bug (https://tracker.ceph.com/issues/23979). + A flag called pglog_hardlimit has been introduced. It is off by default. + This flag enables the feature that limits the length of the pg log. Users should run + 'ceph osd set pglog_hardlimit' after completely upgrading to 12.2.11. Once all the OSDs + have this flag set, the length of the pg log will be capped by a hard limit. We do not + recommend unsetting this flag beyond this point. diff --git a/ceph/alpine/APKBUILD b/ceph/alpine/APKBUILD index 26f824c7a..220346e45 100644 --- a/ceph/alpine/APKBUILD +++ b/ceph/alpine/APKBUILD @@ -1,7 +1,7 @@ # Contributor: John Coyle # Maintainer: John Coyle pkgname=ceph -pkgver=12.2.10 +pkgver=12.2.11 pkgrel=0 pkgdesc="Ceph is a distributed object store and file system" pkgusers="ceph" @@ -63,7 +63,7 @@ makedepends=" xmlstarlet yasm " -source="ceph-12.2.10.tar.bz2" +source="ceph-12.2.11.tar.bz2" subpackages=" $pkgname-base $pkgname-common @@ -116,7 +116,7 @@ _sysconfdir=/etc _udevrulesdir=/etc/udev/rules.d _python_sitelib=/usr/lib/python2.7/site-packages -builddir=$srcdir/ceph-12.2.10 +builddir=$srcdir/ceph-12.2.11 build() { export CEPH_BUILD_VIRTUALENV=$builddir diff --git a/ceph/ceph.spec b/ceph/ceph.spec index 94d44b690..d10206738 100644 --- a/ceph/ceph.spec +++ b/ceph/ceph.spec @@ -61,7 +61,7 @@ # main package definition ################################################################################# Name: ceph -Version: 12.2.10 +Version: 12.2.11 Release: 0%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 @@ -77,7 +77,7 @@ License: LGPL-2.1 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and Group: System/Filesystems %endif URL: http://ceph.com/ -Source0: http://ceph.com/download/ceph-12.2.10.tar.bz2 +Source0: http://ceph.com/download/ceph-12.2.11.tar.bz2 %if 0%{?suse_version} %if 0%{?is_opensuse} ExclusiveArch: x86_64 aarch64 ppc64 ppc64le @@ -788,7 +788,7 @@ python-rbd, python-rgw or python-cephfs instead. 
# common ################################################################################# %prep -%autosetup -p1 -n ceph-12.2.10 +%autosetup -p1 -n ceph-12.2.11 %build %if 0%{with cephfs_java} @@ -806,6 +806,7 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` export CPPFLAGS="$java_inc" export CFLAGS="$RPM_OPT_FLAGS" export CXXFLAGS="$RPM_OPT_FLAGS" +export LDFLAGS="$RPM_LD_FLAGS" env | sort diff --git a/ceph/ceph.spec.in b/ceph/ceph.spec.in index d708aea33..fa34ade2d 100644 --- a/ceph/ceph.spec.in +++ b/ceph/ceph.spec.in @@ -806,6 +806,7 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` export CPPFLAGS="$java_inc" export CFLAGS="$RPM_OPT_FLAGS" export CXXFLAGS="$RPM_OPT_FLAGS" +export LDFLAGS="$RPM_LD_FLAGS" env | sort diff --git a/ceph/debian/changelog b/ceph/debian/changelog index 2b61ec882..eaed6bfbb 100644 --- a/ceph/debian/changelog +++ b/ceph/debian/changelog @@ -1,3 +1,9 @@ +ceph (12.2.11-1) stable; urgency=medium + + * New upstream release + + -- Ceph Release Team Wed, 30 Jan 2019 15:51:24 +0000 + ceph (12.2.10-1) stable; urgency=medium * New upstream release diff --git a/ceph/debian/control b/ceph/debian/control index 6d01e3115..65c29ed0d 100644 --- a/ceph/debian/control +++ b/ceph/debian/control @@ -402,11 +402,13 @@ Replaces: ceph (<< 10), ceph-test (<< 9.0.3-1646), librbd1 (<< 0.92-1238), python-ceph (<< 0.92-1223), + radosgw (<< 12.0.3) Breaks: ceph (<< 10), ceph-fs-common (<< 11.0), ceph-test (<< 9.0.3-1646), librbd1 (<< 0.92-1238), python-ceph (<< 0.92-1223), + radosgw (<< 12.0.3) Suggests: ceph-base (= ${binary:Version}), ceph-mds (= ${binary:Version}), Description: common utilities to mount and interact with a ceph storage cluster diff --git a/ceph/qa/suites/upgrade/luminous-p2p/% b/ceph/doc/README.md similarity index 100% rename from ceph/qa/suites/upgrade/luminous-p2p/% rename to ceph/doc/README.md diff --git a/ceph/doc/_ext/edit_on_github.py b/ceph/doc/_ext/edit_on_github.py new file mode 100644 index 000000000..290f4b424 --- /dev/null +++ b/ceph/doc/_ext/edit_on_github.py @@ -0,0 +1,43 @@ +""" +Adapted from https://gist.github.com/mgedmin/6052926 + +Sphinx extension to add ReadTheDocs-style "Edit on GitHub" links to the +sidebar. 
+ +Loosely based on https://github.com/astropy/astropy/pull/347 +""" + +import os +import warnings + + +__licence__ = 'BSD (3 clause)' + + +def get_github_url(app, view, path): + return 'https://github.com/{project}/{view}/{branch}/doc/{path}'.format( + project=app.config.edit_on_github_project, + view=view, + branch=app.config.edit_on_github_branch, + path=path) + + +def html_page_context(app, pagename, templatename, context, doctree): + if templatename != 'page.html': + return + + if not app.config.edit_on_github_project: + warnings.warn("edit_on_github_project not specified") + return + + path = os.path.relpath(doctree.get('source'), app.builder.srcdir) + show_url = get_github_url(app, 'blob', path) + edit_url = get_github_url(app, 'edit', path) + + context['show_on_github_url'] = show_url + context['edit_on_github_url'] = edit_url + +def setup(app): + app.add_config_value('edit_on_github_project', '', True) + app.add_config_value('edit_on_github_branch', 'master', True) + app.connect('html-page-context', html_page_context) diff --git a/ceph/doc/_static/js/ceph.js b/ceph/doc/_static/js/ceph.js new file mode 100644 index 000000000..61f95fb6a --- /dev/null +++ b/ceph/doc/_static/js/ceph.js @@ -0,0 +1,41 @@ +$(function() { + var releases_url = "http://docs.ceph.com/docs/master/releases.json"; + + function show_edit(branch, data) { + if (branch) { + if (branch === "master") { + $("#dev-warning").show(); + return true; + } + if (data && data.releases && branch in data.releases) { + var eol = ("actual_eol" in data.releases[branch]); + if (eol) { + $("#eol-warning").show(); + } + return !eol; + } + } + $("#dev-warning").show(); + return false; + } + + function get_branch() { + var url = window.location.href; + var res = url.match(/docs.ceph.com\/docs\/([a-z]+)\/?/i) + if (res) { + return res[1] + } + return null; + } + + $.getJSON(releases_url, function(data) { + var branch = get_branch(); + if (show_edit(branch, data)) { + // patch the edit-on-github URL for correct branch + var url = $("#edit-on-github").attr("href"); + url = url.replace("master", branch); + $("#edit-on-github").attr("href", url); + $("#docubetter").show(); + } + }); +}); diff --git a/ceph/doc/_templates/page.html b/ceph/doc/_templates/page.html new file mode 100644 index 000000000..914a752fa --- /dev/null +++ b/ceph/doc/_templates/page.html @@ -0,0 +1,21 @@ +{% extends "!page.html" %} +{% block body %} + + + + + +{%- if edit_on_github_url %} + +{%- endif %} + + {{ super() }} +{% endblock %} diff --git a/ceph/doc/ceph-volume/lvm/zap.rst b/ceph/doc/ceph-volume/lvm/zap.rst index 2236ad4ef..367d74693 100644 --- a/ceph/doc/ceph-volume/lvm/zap.rst +++ b/ceph/doc/ceph-volume/lvm/zap.rst @@ -15,18 +15,51 @@ on the given lv or partition will be removed and all data will be purged. Zapping a logical volume:: - ceph-volume lvm zap {vg name/lv name} + ceph-volume lvm zap {vg name/lv name} Zapping a partition:: - ceph-volume lvm zap /dev/sdc1 + ceph-volume lvm zap /dev/sdc1 -If you are zapping a raw device or partition and would like any vgs or lvs created -from that device removed use the ``--destroy`` flag. A common use case is to simply -deploy OSDs using a whole raw device. If you do so and then wish to reuse that device for -another OSD you must use the ``--destroy`` flag when zapping so that the vgs and lvs that -ceph-volume created on the raw device will be removed. +Removing Devices +---------------- +When zapping, and looking for full removal of the device (lv, vg, or partition) +use the ``--destroy`` flag. 
A common use case is to simply deploy OSDs using +a whole raw device. If you do so and then wish to reuse that device for another +OSD you must use the ``--destroy`` flag when zapping so that the vgs and lvs +that ceph-volume created on the raw device will be removed. + +.. note:: Multiple devices can be accepted at once, to zap them all Zapping a raw device and destroying any vgs or lvs present:: - ceph-volume lvm zap /dev/sdc --destroy + ceph-volume lvm zap /dev/sdc --destroy + + +This action can be performed on partitions, and logical volumes as well:: + + ceph-volume lvm zap /dev/sdc1 --destroy + ceph-volume lvm zap osd-vg/data-lv --destroy + + +Finally, multiple devices can be detected if filtering by OSD ID and/or OSD +FSID. Either identifier can be used or both can be used at the same time. This +is useful in situations where multiple devices associated with a specific ID +need to be purged. When using the FSID, the filtering is stricter, and might +not match other (possibly invalid) devices associated to an ID. + +By ID only:: + + ceph-volume lvm zap --destroy --osd-id 1 + +By FSID:: + + ceph-volume lvm zap --destroy --osd-fsid 2E8FBE58-0328-4E3B-BFB7-3CACE4E9A6CE + +By both:: + + ceph-volume lvm zap --destroy --osd-fsid 2E8FBE58-0328-4E3B-BFB7-3CACE4E9A6CE --osd-id 1 + + +.. warning:: If the systemd unit associated with the OSD ID to be zapped is + detected as running, the tool will refuse to zap until the daemon is stopped. diff --git a/ceph/doc/cephfs/dirfrags.rst b/ceph/doc/cephfs/dirfrags.rst index 717553fea..24b05edfc 100644 --- a/ceph/doc/cephfs/dirfrags.rst +++ b/ceph/doc/cephfs/dirfrags.rst @@ -25,10 +25,9 @@ fragments may be *merged* to reduce the number of fragments in the directory. Splitting and merging ===================== -An MDS will only consider doing splits and merges if the ``mds_bal_frag`` -setting is true in the MDS's configuration file, and the allow_dirfrags -setting is true in the filesystem map (set on the mons). These settings -are both true by default since the *Luminous* (12.2.x) release of Ceph. +An MDS will only consider doing splits if the allow_dirfrags setting is true in +the file system map (set on the mons). This setting is true by default since +the *Luminous* release (12.2.X). When an MDS identifies a directory fragment to be split, it does not do the split immediately. Because splitting interrupts metadata IO, diff --git a/ceph/doc/cephfs/eviction.rst b/ceph/doc/cephfs/eviction.rst index 8f0f20b84..e803da179 100644 --- a/ceph/doc/cephfs/eviction.rst +++ b/ceph/doc/cephfs/eviction.rst @@ -23,9 +23,9 @@ Automatic client eviction There are three situations in which a client may be evicted automatically: -On an active MDS daemon, if a client has not communicated with the MDS for -over ``mds_session_autoclose`` seconds (300 seconds by default), then it -will be evicted automatically. +On an active MDS daemon, if a client has not communicated with the MDS for over +``session_autoclose`` (a file system variable) seconds (300 seconds by +default), then it will be evicted automatically. On an active MDS daemon, if a client has not responded to cap revoke messages for over ``mds_cap_revoke_eviction_timeout`` (configuration option) seconds. diff --git a/ceph/doc/cephfs/fuse.rst b/ceph/doc/cephfs/fuse.rst index 02a4d485c..251253703 100644 --- a/ceph/doc/cephfs/fuse.rst +++ b/ceph/doc/cephfs/fuse.rst @@ -26,7 +26,7 @@ For additional details on ``cephx`` configuration, see To mount the Ceph file system as a FUSE, you may use the ``ceph-fuse`` command. 
For example:: - sudo mkdir /home/usernname/cephfs + sudo mkdir /home/username/cephfs sudo ceph-fuse -m 192.168.0.1:6789 /home/username/cephfs If you have more than one filesystem, specify which one to mount using @@ -48,5 +48,5 @@ A persistent mount point can be setup via:: sudo systemctl enable ceph-fuse@/mnt.service .. _ceph-fuse: ../../man/8/ceph-fuse/ -.. _fstab: ./fstab +.. _fstab: ../fstab/#fuse .. _CEPHX Config Reference: ../../rados/configuration/auth-config-ref diff --git a/ceph/doc/cephfs/health-messages.rst b/ceph/doc/cephfs/health-messages.rst index 7b82c2f87..3a6217c7b 100644 --- a/ceph/doc/cephfs/health-messages.rst +++ b/ceph/doc/cephfs/health-messages.rst @@ -67,7 +67,7 @@ are like locks. Sometimes, for example when another client needs access, the MDS will request clients release their capabilities. If the client is unresponsive or buggy, it might fail to do so promptly or fail to do so at all. This message appears if a client has taken longer than -``mds_session_timeout`` (default 60s) to comply. +``session_timeout`` (default 60s) to comply. Message: "Client *name* failing to respond to cache pressure" Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY diff --git a/ceph/doc/cephfs/mds-config-ref.rst b/ceph/doc/cephfs/mds-config-ref.rst index 2fd47ae33..70a97c90f 100644 --- a/ceph/doc/cephfs/mds-config-ref.rst +++ b/ceph/doc/cephfs/mds-config-ref.rst @@ -10,15 +10,6 @@ :Type: Boolean :Default: ``true`` - -``mds max file size`` - -:Description: The maximum allowed file size to set when creating a - new file system. - -:Type: 64-bit Integer Unsigned -:Default: ``1ULL << 40`` - ``mds cache memory limit`` :Description: The memory limit the MDS should enforce for its cache. @@ -102,24 +93,6 @@ :Default: ``24.0*60.0`` -``mds session timeout`` - -:Description: The interval (in seconds) of client inactivity before Ceph - times out capabilities and leases. - -:Type: Float -:Default: ``60`` - - -``mds session autoclose`` - -:Description: The interval (in seconds) before Ceph closes - a laggy client's session. - -:Type: Float -:Default: ``300`` - - ``mds reconnect timeout`` :Description: The interval (in seconds) to wait for clients to reconnect @@ -249,13 +222,6 @@ :Default: ``0`` -``mds bal frag`` - -:Description: Determines whether the MDS will fragment directories. -:Type: Boolean -:Default: ``false`` - - ``mds bal split size`` :Description: The maximum directory size before the MDS will split a directory diff --git a/ceph/doc/conf.py b/ceph/doc/conf.py index ce1e5af97..6bd56ba40 100644 --- a/ceph/doc/conf.py +++ b/ceph/doc/conf.py @@ -33,16 +33,20 @@ html_logo = 'logo.png' html_favicon = 'favicon.ico' html_use_smartypants = True html_show_sphinx = False +html_static_path = ["_static"] html_sidebars = { '**': ['smarttoc.html', 'searchbox.html'], } +sys.path.insert(0, os.path.abspath('_ext')) + extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.graphviz', 'sphinx.ext.todo', 'sphinxcontrib.ditaa', 'breathe', + 'edit_on_github', ] ditaa = 'ditaa' todo_include_todos = True @@ -66,3 +70,13 @@ breathe_domain_by_extension = {'py': 'py', 'c': 'c', 'h': 'c', 'cc': 'cxx', 'hpp pybind = os.path.join(top_level, 'src/pybind') if pybind not in sys.path: sys.path.insert(0, pybind) + +# the docs are rendered with github links pointing to master. the javascript +# snippet in _static/ceph.js rewrites the edit links when a page is loaded, to +# point to the correct branch. 
+edit_on_github_project = 'ceph/ceph' +edit_on_github_branch = 'master' + +# handles edit-on-github and old version warning display +def setup(app): + app.add_javascript('js/ceph.js') diff --git a/ceph/doc/man/8/ceph-volume.rst b/ceph/doc/man/8/ceph-volume.rst index af5775997..9ad5a5237 100644 --- a/ceph/doc/man/8/ceph-volume.rst +++ b/ceph/doc/man/8/ceph-volume.rst @@ -226,6 +226,17 @@ Usage, for logical partitions:: ceph-volume lvm zap /dev/sdc1 +For full removal of the device use the ``--destroy`` flag (allowed for all +device types):: + + ceph-volume lvm zap --destroy /dev/sdc1 + +Multiple devices can be removed by specifying the OSD ID and/or the OSD FSID:: + + ceph-volume lvm zap --destroy --osd-id 1 + ceph-volume lvm zap --destroy --osd-id 1 --osd-fsid C9605912-8395-4D76-AFC0-7DFDAC315D59 + + Positional arguments: * Either in the form of ``vg/lv`` for logical volumes, diff --git a/ceph/doc/man/8/crushtool.rst b/ceph/doc/man/8/crushtool.rst index 897f62ec4..c5fae504a 100644 --- a/ceph/doc/man/8/crushtool.rst +++ b/ceph/doc/man/8/crushtool.rst @@ -258,6 +258,14 @@ creating a new Ceph cluster. They can be further edited with:: # recompile crushtool -c map.txt -o crushmap +Reclassify +========== + +The *reclassify* function allows users to transition from older maps that +maintain parallel hierarchies for OSDs of different types to a modern CRUSH +map that makes use of the *device class* feature. For more information, +see http://docs.ceph.com/docs/master/rados/operations/crush-map-edits/#migrating-from-a-legacy-ssd-rule-to-device-classes. + Example output from --test ========================== diff --git a/ceph/doc/mgr/balancer.rst b/ceph/doc/mgr/balancer.rst index 191c45593..f3cb86f7d 100644 --- a/ceph/doc/mgr/balancer.rst +++ b/ceph/doc/mgr/balancer.rst @@ -129,6 +129,10 @@ The name is provided by the user and can be any useful identifying string. The ceph balancer show +All plans can be shown with:: + + ceph balancer ls + Old plans can be discarded with:: ceph balancer rm diff --git a/ceph/doc/rados/configuration/bluestore-config-ref.rst b/ceph/doc/rados/configuration/bluestore-config-ref.rst index 542ba151a..d7e70ee92 100644 --- a/ceph/doc/rados/configuration/bluestore-config-ref.rst +++ b/ceph/doc/rados/configuration/bluestore-config-ref.rst @@ -51,8 +51,164 @@ To specify a WAL device and/or DB device, :: ceph-disk prepare --bluestore --block.wal --block-db -Cache size -========== +Provisioning strategies +----------------------- +Although there are multiple ways to deploy a Bluestore OSD (unlike Filestore +which had 1) here are two common use cases that should help clarify the +initial deployment strategy: + +.. _bluestore-single-type-device-config: + +**block (data) only** +^^^^^^^^^^^^^^^^^^^^^ +If all the devices are the same type, for example all are spinning drives, and +there are no fast devices to combine these, it makes sense to just deploy with +block only and not try to separate ``block.db`` or ``block.wal``. The +:ref:`ceph-volume-lvm` call for a single ``/dev/sda`` device would look like:: + + ceph-volume lvm create --bluestore --data /dev/sda + +If logical volumes have already been created for each device (1 LV using 100% +of the device), then the :ref:`ceph-volume-lvm` call for an lv named +``ceph-vg/block-lv`` would look like:: + + ceph-volume lvm create --bluestore --data ceph-vg/block-lv + +.. 
_bluestore-mixed-device-config: + +**block and block.db** +^^^^^^^^^^^^^^^^^^^^^^ +If there is a mix of fast and slow devices (spinning and solid state), +it is recommended to place ``block.db`` on the faster device while ``block`` +(data) lives on the slower (spinning drive). Sizing for ``block.db`` should be +as large as possible to avoid performance penalties otherwise. The +``ceph-volume`` tool is currently not able to create these automatically, so +the volume groups and logical volumes need to be created manually. + +For the below example, let's assume 4 spinning drives (sda, sdb, sdc, and sdd) +and 1 solid state drive (sdx). First create the volume groups:: + + $ vgcreate ceph-block-0 /dev/sda + $ vgcreate ceph-block-1 /dev/sdb + $ vgcreate ceph-block-2 /dev/sdc + $ vgcreate ceph-block-3 /dev/sdd + +Now create the logical volumes for ``block``:: + + $ lvcreate -l 100%FREE -n block-0 ceph-block-0 + $ lvcreate -l 100%FREE -n block-1 ceph-block-1 + $ lvcreate -l 100%FREE -n block-2 ceph-block-2 + $ lvcreate -l 100%FREE -n block-3 ceph-block-3 + +We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB +SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB:: + + $ vgcreate ceph-db-0 /dev/sdx + $ lvcreate -L 50GB -n db-0 ceph-db-0 + $ lvcreate -L 50GB -n db-1 ceph-db-0 + $ lvcreate -L 50GB -n db-2 ceph-db-0 + $ lvcreate -L 50GB -n db-3 ceph-db-0 + +Finally, create the 4 OSDs with ``ceph-volume``:: + + $ ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0 + $ ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1 + $ ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2 + $ ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3 + +These operations should end up creating 4 OSDs, with ``block`` on the slower +spinning drives and a 50GB logical volume for each coming from the solid state +drive. + +Sizing +====== +When using a :ref:`mixed spinning and solid drive setup +<bluestore-mixed-device-config>` it is important to make a large-enough +``block.db`` logical volume for Bluestore. Generally, ``block.db`` should have +*as large as possible* logical volumes. + +It is recommended that the ``block.db`` size isn't smaller than 4% of +``block``. For example, if the ``block`` size is 1TB, then ``block.db`` +shouldn't be less than 40GB. + +If *not* using a mix of fast and slow devices, it isn't required to create +separate logical volumes for ``block.db`` (or ``block.wal``). Bluestore will +automatically manage these within the space of ``block``. + + +Automatic Cache Sizing +====================== + +Bluestore can be configured to automatically resize its caches when tc_malloc +is configured as the memory allocator and the ``bluestore_cache_autotune`` +setting is enabled. This option is currently enabled by default. Bluestore +will attempt to keep OSD heap memory usage under a designated target size via +the ``osd_memory_target`` configuration option. This is a best effort +algorithm and caches will not shrink smaller than the amount specified by +``osd_memory_cache_min``. Cache ratios will be chosen based on a hierarchy +of priorities. If priority information is not available, the +``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio`` options are +used as fallbacks. + +``bluestore_cache_autotune`` + +:Description: Automatically tune the ratios assigned to different bluestore caches while respecting minimum values. 
+:Type: Boolean +:Required: Yes +:Default: ``True`` + +``osd_memory_target`` + +:Description: When tcmalloc is available and cache autotuning is enabled, try to keep this many bytes mapped in memory. Note: This may not exactly match the RSS memory usage of the process. While the total amount of heap memory mapped by the process should generally stay close to this target, there is no guarantee that the kernel will actually reclaim memory that has been unmapped. During initial development, it was found that some kernels result in the OSD's RSS Memory exceeding the mapped memory by up to 20%. It is hypothesised however, that the kernel generally may be more aggressive about reclaiming unmapped memory when there is a high amount of memory pressure. Your mileage may vary. +:Type: Unsigned Integer +:Required: Yes +:Default: ``4294967296`` + +``bluestore_cache_autotune_chunk_size`` + +:Description: The chunk size in bytes to allocate to caches when cache autotune is enabled. When the autotuner assigns memory to different caches, it will allocate memory in chunks. This is done to avoid evictions when there are minor fluctuations in the heap size or autotuned cache ratios. +:Type: Unsigned Integer +:Required: No +:Default: ``33554432`` + +``bluestore_cache_autotune_interval`` + +:Description: The number of seconds to wait between rebalances when cache autotune is enabled. This setting changes how quickly the ratios of the different caches are recomputed. Note: Setting the interval too small can result in high CPU usage and lower performance. +:Type: Float +:Required: No +:Default: ``5`` + +``osd_memory_base`` + +:Description: When tcmalloc and cache autotuning is enabled, estimate the minimum amount of memory in bytes the OSD will need. This is used to help the autotuner estimate the expected aggregate memory consumption of the caches. +:Type: Unsigned Integer +:Required: No +:Default: ``805306368`` + +``osd_memory_expected_fragmentation`` + +:Description: When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation. This is used to help the autotuner estimate the expected aggregate memory consumption of the caches. +:Type: Float +:Required: No +:Default: ``0.15`` + +``osd_memory_cache_min`` + +:Description: When tcmalloc and cache autotuning is enabled, set the minimum amount of memory used for caches. Note: Setting this value too low can result in significant cache thrashing. +:Type: Unsigned Integer +:Required: No +:Default: ``134217728`` + +``osd_memory_cache_resize_interval`` + +:Description: When tcmalloc and cache autotuning is enabled, wait this many seconds between resizing caches. This setting changes the total amount of memory available for bluestore to use for caching. Note: Setting the interval too small can result in memory allocator thrashing and lower performance. +:Type: Float +:Required: No +:Default: ``1`` + + +Manual Cache Sizing +=================== The amount of memory consumed by each OSD for BlueStore's cache is determined by the ``bluestore_cache_size`` configuration option. If diff --git a/ceph/doc/rados/operations/add-or-rm-mons.rst b/ceph/doc/rados/operations/add-or-rm-mons.rst index 0cdc4313c..20cba1bca 100644 --- a/ceph/doc/rados/operations/add-or-rm-mons.rst +++ b/ceph/doc/rados/operations/add-or-rm-mons.rst @@ -1,3 +1,5 @@ +.. 
_adding-and-removing-monitors: + ========================== Adding/Removing Monitors ========================== @@ -6,6 +8,8 @@ When you have a cluster up and running, you may add or remove monitors from the cluster at runtime. To bootstrap a monitor, see `Manual Deployment`_ or `Monitor Bootstrap`_. +.. _adding-monitors: + Adding Monitors =============== @@ -121,6 +125,7 @@ on ``mon.a``). ceph-mon -i {mon-id} --public-addr {ip:port} +.. _removing-monitors: Removing Monitors ================= diff --git a/ceph/doc/rados/operations/crush-map-edits.rst b/ceph/doc/rados/operations/crush-map-edits.rst index 36a902083..64d37c714 100644 --- a/ceph/doc/rados/operations/crush-map-edits.rst +++ b/ceph/doc/rados/operations/crush-map-edits.rst @@ -475,144 +475,161 @@ A rule takes the following form:: .. important:: A given CRUSH rule may be assigned to multiple pools, but it is not possible for a single pool to have multiple CRUSH rules. - -Placing Different Pools on Different OSDS: -========================================== - -Suppose you want to have most pools default to OSDs backed by large hard drives, -but have some pools mapped to OSDs backed by fast solid-state drives (SSDs). -It's possible to have multiple independent CRUSH hierarchies within the same -CRUSH map. Define two hierarchies with two different root nodes--one for hard -disks (e.g., "root platter") and one for SSDs (e.g., "root ssd") as shown -below:: - - device 0 osd.0 - device 1 osd.1 - device 2 osd.2 - device 3 osd.3 - device 4 osd.4 - device 5 osd.5 - device 6 osd.6 - device 7 osd.7 - - host ceph-osd-ssd-server-1 { - id -1 - alg straw - hash 0 - item osd.0 weight 1.00 - item osd.1 weight 1.00 - } - - host ceph-osd-ssd-server-2 { - id -2 - alg straw - hash 0 - item osd.2 weight 1.00 - item osd.3 weight 1.00 - } - - host ceph-osd-platter-server-1 { - id -3 - alg straw - hash 0 - item osd.4 weight 1.00 - item osd.5 weight 1.00 - } - - host ceph-osd-platter-server-2 { - id -4 - alg straw - hash 0 - item osd.6 weight 1.00 - item osd.7 weight 1.00 - } - - root platter { - id -5 - alg straw - hash 0 - item ceph-osd-platter-server-1 weight 2.00 - item ceph-osd-platter-server-2 weight 2.00 - } - - root ssd { - id -6 - alg straw - hash 0 - item ceph-osd-ssd-server-1 weight 2.00 - item ceph-osd-ssd-server-2 weight 2.00 - } - - rule data { - ruleset 0 - type replicated - min_size 2 - max_size 2 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule metadata { - ruleset 1 - type replicated - min_size 0 - max_size 10 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule rbd { - ruleset 2 - type replicated - min_size 0 - max_size 10 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule platter { - ruleset 3 - type replicated - min_size 0 - max_size 10 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule ssd { - ruleset 4 - type replicated - min_size 0 - max_size 4 - step take ssd - step chooseleaf firstn 0 type host - step emit - } - - rule ssd-primary { - ruleset 5 - type replicated - min_size 5 - max_size 10 - step take ssd - step chooseleaf firstn 1 type host - step emit - step take platter - step chooseleaf firstn -1 type host - step emit - } - -You can then set a pool to use the SSD rule by:: - - ceph osd pool set crush_ruleset 4 - -Similarly, using the ``ssd-primary`` rule will cause each placement group in the -pool to be placed with an SSD as the primary and platters as the replicas. - +.. 
_crush-reclassify: + +Migrating from a legacy SSD rule to device classes +-------------------------------------------------- + +It used to be necessary to manually edit your CRUSH map and maintain a +parallel hierarchy for each specialized device type (e.g., SSD) in order to +write rules that apply to those devices. Since the Luminous release, +the *device class* feature has enabled this transparently. + +However, migrating from an existing, manually customized per-device map to +the new device class rules in the trivial way will cause all data in the +system to be reshuffled. + +The ``crushtool`` has a few commands that can transform a legacy rule +and hierarchy so that you can start using the new class-based rules. +There are three types of transformations possible: + +#. ``--reclassify-root <root-name> <device-class>`` + + This will take everything in the hierarchy beneath root-name and + adjust any rules that reference that root via a ``take + <root-name>`` to instead ``take <root-name> class <device-class>``. + It renumbers the buckets in such a way that the old IDs are instead + used for the specified class's "shadow tree" so that no data + movement takes place. + + For example, imagine you have an existing rule like:: + + rule replicated_ruleset { + id 0 + type replicated + min_size 1 + max_size 10 + step take default + step chooseleaf firstn 0 type rack + step emit + } + + If you reclassify the root `default` as class `hdd`, the rule will + become:: + + rule replicated_ruleset { + id 0 + type replicated + min_size 1 + max_size 10 + step take default class hdd + step chooseleaf firstn 0 type rack + step emit + } + +#. ``--set-subtree-class <bucket-name> <device-class>`` + + This will mark every device in the subtree rooted at *bucket-name* + with the specified device class. + + This is normally used in conjunction with the ``--reclassify-root`` + option to ensure that all devices in that root are labeled with the + correct class. In some situations, however, some of those devices + (correctly) have a different class and we do not want to relabel + them. In such cases, one can exclude the ``--set-subtree-class`` + option. This means that the remapping process will not be perfect, + since the previous rule distributed across devices of multiple + classes but the adjusted rules will only map to devices of the + specified *device-class*, but that often is an accepted level of + data movement when the number of outlier devices is small. + +#. ``--reclassify-bucket <match-pattern> <device-class> <default-parent>`` + + This will allow you to merge a parallel type-specific hierarchy with the normal hierarchy. For example, many users have maps like:: + + host node1 { + id -2 # do not change unnecessarily + # weight 109.152 + alg straw + hash 0 # rjenkins1 + item osd.0 weight 9.096 + item osd.1 weight 9.096 + item osd.2 weight 9.096 + item osd.3 weight 9.096 + item osd.4 weight 9.096 + item osd.5 weight 9.096 + ... + } + + host node1-ssd { + id -10 # do not change unnecessarily + # weight 2.000 + alg straw + hash 0 # rjenkins1 + item osd.80 weight 2.000 + ... + } + + root default { + id -1 # do not change unnecessarily + alg straw + hash 0 # rjenkins1 + item node1 weight 110.967 + ... + } + + root ssd { + id -18 # do not change unnecessarily + # weight 16.000 + alg straw + hash 0 # rjenkins1 + item node1-ssd weight 2.000 + ... + } + + This function will reclassify each bucket that matches a + pattern. The pattern can look like ``%suffix`` or ``prefix%``. + For example, in the above example, we would use the pattern + ``%-ssd``. 
For each matched bucket, the remaining portion of the + name (that matches the ``%`` wildcard) specifies the *base bucket*. + All devices in the matched bucket are labeled with the specified + device class and then moved to the base bucket. If the base bucket + does not exist (e.g., ``node12-ssd`` exists but ``node12`` does + not), then it is created and linked underneath the specified + *default parent* bucket. In each case, we are careful to preserve + the old bucket IDs for the new shadow buckets to prevent data + movement. Any rules with ``take`` steps referencing the old + buckets are adjusted. + +#. ``--reclassify-bucket <bucket-name> <device-class> <default-parent>`` + + The same command can also be used without a wildcard to map a + single bucket. For example, in the previous map, we want the + ``ssd`` bucket to be mapped to the ``default`` bucket. + +The final command to convert the map comprising the above fragments would be something like:: + + $ ceph osd getcrushmap -o original + $ crushtool -i original --reclassify \ + --set-subtree-class default hdd \ + --reclassify-root default hdd \ + --reclassify-bucket %-ssd ssd default \ + --reclassify-bucket ssd ssd default \ + -o adjusted + +In order to ensure that the conversion is correct, there is a ``--compare`` command that will test a large sample of inputs against the CRUSH map and check that the same result comes back out. These inputs are controlled by the same options that apply to the ``--test`` command. For the above example:: + + $ crushtool -i original --compare adjusted + rule 0 had 0/10240 mismatched mappings (0) + rule 1 had 0/10240 mismatched mappings (0) + maps appear equivalent + +If there were differences, you would see what ratio of inputs was remapped +in the parentheses. + +If you are satisfied with the adjusted map, you can apply it to the cluster with something like:: + + ceph osd setcrushmap -i adjusted Tuning CRUSH, the hard way -------------------------- diff --git a/ceph/doc/rados/operations/crush-map.rst b/ceph/doc/rados/operations/crush-map.rst index 05fa4ff69..e9d667344 100644 --- a/ceph/doc/rados/operations/crush-map.rst +++ b/ceph/doc/rados/operations/crush-map.rst @@ -243,6 +243,11 @@ with:: ceph osd crush tree --show-shadow +For older clusters created before Luminous that relied on manually +crafted CRUSH maps to maintain per-device-type hierarchies, there is a +*reclassify* tool available to help transition to device classes +without triggering data movement (see :ref:`crush-reclassify`). + Weights sets ------------ diff --git a/ceph/doc/rados/operations/user-management.rst b/ceph/doc/rados/operations/user-management.rst index 8a35a501a..8c0874107 100644 --- a/ceph/doc/rados/operations/user-management.rst +++ b/ceph/doc/rados/operations/user-management.rst @@ -387,12 +387,6 @@ For example:: ceph auth caps client.paul mon 'allow rw' osd 'allow rwx pool=liverpool' ceph auth caps client.brian-manager mon 'allow *' osd 'allow *' -To remove a capability, you may reset the capability. If you want the user -to have no access to a particular daemon that was previously set, specify -an empty string. For example:: - - ceph auth caps client.ringo mon ' ' osd ' ' - See `Authorization (Capabilities)`_ for additional details on capabilities. 
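As a minimal sketch of the capability update shown in the hunk above (reusing the ``client.paul`` entity from the example; the exact caps strings are illustrative assumptions), the effect of ``ceph auth caps`` can be verified by dumping the entity before and after the change::

    # show the current key and caps for the user
    $ ceph auth get client.paul
    # set the caps; ceph auth caps replaces the previous set, so every
    # capability the user should keep must be restated here
    $ ceph auth caps client.paul mon 'allow r' osd 'allow rw pool=liverpool'
    # confirm the new caps took effect
    $ ceph auth get client.paul

Because ``ceph auth caps`` replaces rather than appends, re-running it with a narrower set of capabilities is the usual way to reduce a user's access.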
diff --git a/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst b/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst index 89fb94c32..642b2e07b 100644 --- a/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst +++ b/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst @@ -402,8 +402,8 @@ or:: Recovery using healthy monitor(s) --------------------------------- -If there is any survivers, we can always `replace`_ the corrupted one with a -new one. And after booting up, the new joiner will sync up with a healthy +If there are any survivors, we can always :ref:`replace ` the corrupted one with a +new one. After booting up, the new joiner will sync up with a healthy peer, and once it is fully sync'ed, it will be able to serve the clients. Recovery using OSDs @@ -563,5 +563,4 @@ Finally, you should reach out to us on the mailing lists, on IRC or file a new issue on the `tracker`_. .. _cluster map: ../../architecture#cluster-map -.. _replace: ../operation/add-or-rm-mons .. _tracker: http://tracker.ceph.com/projects/ceph/issues/new diff --git a/ceph/doc/radosgw/adminops.rst b/ceph/doc/radosgw/adminops.rst index 5da13a8b9..16efd5f84 100644 --- a/ceph/doc/radosgw/adminops.rst +++ b/ceph/doc/radosgw/adminops.rst @@ -1858,8 +1858,9 @@ Valid parameters for quotas include: the maximum number of objects. A negative value disables this setting. - **Maximum Size:** The ``max-size`` option allows you to specify a quota - for the maximum number of bytes. A negative value disables this setting. - + for the maximum number of bytes. The ``max-size-kb`` option allows you + to specify it in KiB. A negative value disables this setting. + - **Quota Type:** The ``quota-type`` option sets the scope for the quota. The options are ``bucket`` and ``user``. diff --git a/ceph/doc/radosgw/config-ref.rst b/ceph/doc/radosgw/config-ref.rst index 45054a9ec..d86baf126 100644 --- a/ceph/doc/radosgw/config-ref.rst +++ b/ceph/doc/radosgw/config-ref.rst @@ -576,6 +576,17 @@ Swift Settings :Default: ``false`` +``rgw trust forwarded https`` + +:Description: When a proxy in front of radosgw is used for ssl termination, radosgw + does not know whether incoming http connections are secure. Enable + this option to trust the ``Forwarded`` and ``X-Forwarded-Proto`` headers + sent by the proxy when determining whether the connection is secure. + This is required for some features, such as server side encryption. +:Type: Boolean +:Default: ``false`` + + Logging Settings ================ diff --git a/ceph/doc/radosgw/encryption.rst b/ceph/doc/radosgw/encryption.rst index a7bb7e2e9..ea89e502a 100644 --- a/ceph/doc/radosgw/encryption.rst +++ b/ceph/doc/radosgw/encryption.rst @@ -9,6 +9,11 @@ with 3 options for the management of encryption keys. Server-side encryption means that the data is sent over HTTP in its unencrypted form, and the Ceph Object Gateway stores that data in the Ceph Storage Cluster in encrypted form. +.. note:: Requests for server-side encryption must be sent over a secure HTTPS + connection to avoid sending secrets in plaintext. If a proxy is used + for SSL termination, ``rgw trust forwarded https`` must be enabled + before forwarded requests will be trusted as secure. + Customer-Provided Keys ====================== diff --git a/ceph/doc/radosgw/frontends.rst b/ceph/doc/radosgw/frontends.rst index ff6323ee4..7c0b2cced 100644 --- a/ceph/doc/radosgw/frontends.rst +++ b/ceph/doc/radosgw/frontends.rst @@ -18,7 +18,7 @@ and the Boost.Asio library for asynchronous network i/o. 
Options ------- -``port`` +``port`` and ``ssl_port`` :Description: Sets the listening port number. Can be specified multiple times as in ``port=80 port=8000``. @@ -27,18 +27,37 @@ Options :Default: ``80`` -``endpoint`` +``endpoint`` and ``ssl_endpoint`` :Description: Sets the listening address in the form ``address[:port]``, where the address is an IPv4 address string in dotted decimal - form, or an IPv6 address in hexadecimal notation. The - optional port defaults to 80. Can be specified multiple times - as in ``endpoint=::1 endpoint=192.168.0.100:8000``. + form, or an IPv6 address in hexadecimal notation surrounded + by square brackets. The optional port defaults to 80 for + ``endpoint`` and 443 for ``ssl_endpoint``. Can be specified + multiple times as in ``endpoint=[::1] endpoint=192.168.0.100:8000``. :Type: Integer :Default: None +``ssl_certificate`` + +:Description: Path to the SSL certificate file used for SSL-enabled endpoints. + +:Type: String +:Default: None + + +``ssl_private_key`` + +:Description: Optional path to the private key file used for SSL-enabled + endpoints. If one is not given, the ``ssl_certificate`` file + is used as the private key. + +:Type: String +:Default: None + + Civetweb ======== diff --git a/ceph/doc/start/hardware-recommendations.rst b/ceph/doc/start/hardware-recommendations.rst index eac5dc8c9..2ad982e39 100644 --- a/ceph/doc/start/hardware-recommendations.rst +++ b/ceph/doc/start/hardware-recommendations.rst @@ -39,11 +39,29 @@ separate hosts. RAM === -Metadata servers and monitors must be capable of serving their data quickly, so -they should have plenty of RAM (e.g., 1GB of RAM per daemon instance). OSDs do -not require as much RAM for regular operations (e.g., 500MB of RAM per daemon -instance); however, during recovery they need significantly more RAM (e.g., ~1GB -per 1TB of storage per daemon). Generally, more RAM is better. +Generally, more RAM is better. + +Monitors and managers (ceph-mon and ceph-mgr) +--------------------------------------------- + +Monitor and manager daemon memory usage generally scales with the size of the +cluster. For small clusters, 1-2 GB is generally sufficient. For +large clusters, you should provide more (5-10 GB). You may also want +to consider tuning settings like ``mon_osd_cache_size`` or +``rocksdb_cache_size``. + +Metadata servers (ceph-mds) +--------------------------- + +The metadata daemon memory utilization depends on how much memory its cache is +configured to consume. We recommend 1 GB as a minimum for most systems. See +``mds_cache_memory``. + +OSDs (ceph-osd) +--------------- + +By default, OSDs that use the BlueStore backend require 3-5 GB of RAM. You can +adjust the amount of memory the OSD consumes with the ``osd_memory_target`` configuration option when BlueStore is in use. When using the legacy FileStore backend, the operating system page cache is used for caching data, so no tuning is normally needed, and the OSD memory consumption is generally related to the number of PGs per daemon in the system. Data Storage diff --git a/ceph/doc/start/quick-ceph-deploy.rst b/ceph/doc/start/quick-ceph-deploy.rst index 50b7f307f..dcb01e7a0 100644 --- a/ceph/doc/start/quick-ceph-deploy.rst +++ b/ceph/doc/start/quick-ceph-deploy.rst @@ -124,7 +124,7 @@ configuration details, perform the following steps using ``ceph-deploy``. ceph-deploy mgr create node1 *Required only for luminous+ builds, i.e >= 12.x builds* #. Add three OSDs. 
For the purposes of these instructions, we assume you have an - unused disk in each node called ``/dev/vdb``. *Be sure that the device is not currently in use and does not contain any important data.* + unused disk in each node called ``/dev/vdb``. *Be sure that the device is not currently in use and does not contain any important data.* :: ceph-deploy osd create {ceph-node}:{device} diff --git a/ceph/examples/librados/Makefile b/ceph/examples/librados/Makefile index 2b6109c4c..e51c045a6 100644 --- a/ceph/examples/librados/Makefile +++ b/ceph/examples/librados/Makefile @@ -3,13 +3,13 @@ CXX?=g++ CXX_FLAGS?=-std=c++11 -Wall -Wextra -Werror -g CXX_LIBS?=-lrados -lradosstriper CXX_INC?=$(LOCAL_LIBRADOS_INC) -CXX_CC=$(CXX) $(CXX_FLAGS) $(CXX_INC) $(LOCAL_LIBRADOS) $(CXX_LIBS) +CXX_CC=$(CXX) $(CXX_FLAGS) $(CXX_INC) $(LOCAL_LIBRADOS) CC?=gcc CC_FLAGS=-Wall -Wextra -Werror -g CC_INC=$(LOCAL_LIBRADOS_INC) CC_LIBS?=-lrados -CC_CC=$(CC) $(CC_FLAGS) $(CC_INC) $(LOCAL_LIBRADOS) $(CC_LIBS) +CC_CC=$(CC) $(CC_FLAGS) $(CC_INC) $(LOCAL_LIBRADOS) # Relative path to the Ceph source: CEPH_SRC_HOME?=../../src @@ -26,13 +26,13 @@ all-system: LOCAL_LIBRADOS_INC= all-system: all hello_world_cpp: hello_world.cc - $(CXX_CC) -o hello_world_cpp hello_world.cc + $(CXX_CC) -o hello_world_cpp hello_world.cc $(CXX_LIBS) hello_radosstriper_cpp: hello_radosstriper.cc - $(CXX_CC) -o hello_radosstriper_cpp hello_radosstriper.cc + $(CXX_CC) -o hello_radosstriper_cpp hello_radosstriper.cc $(CXX_LIBS) hello_world_c: hello_world_c.c - $(CC_CC) -o hello_world_c hello_world_c.c + $(CC_CC) -o hello_world_c hello_world_c.c $(CC_LIBS) clean: rm -f hello_world_cpp hello_radosstriper_cpp hello_world_c diff --git a/ceph/examples/librados/hello_world.readme b/ceph/examples/librados/hello_world.readme index d438f932e..afa1cb32e 100644 --- a/ceph/examples/librados/hello_world.readme +++ b/ceph/examples/librados/hello_world.readme @@ -6,7 +6,7 @@ build tree (ie. using relative paths). If you would like to build the examples a your system librados and headers, use "make all-system". And executed using -./librados_hello_world -c ../../src/ceph.conf +./hello_world_cpp -c ../../src/ceph.conf (or whatever path to a ceph.conf is appropriate to you, or by explicitly specifying monitors, user id, and keys). 
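The Makefile hunk above moves ``$(CXX_LIBS)`` and ``$(CC_LIBS)`` to the end of each link command because linkers that resolve symbols left to right (the common case with GNU ld) expect libraries to come after the objects that reference them. A minimal sketch of the equivalent manual build against a system-wide librados (the paths and ceph.conf location are assumptions; adjust to your environment)::

    # compile and link the C++ example; note that the libraries come last
    g++ -std=c++11 -Wall -Wextra -Werror -g hello_world.cc -o hello_world_cpp -lrados -lradosstriper

    # run it against a local cluster configuration
    ./hello_world_cpp -c /etc/ceph/ceph.conf

This mirrors what ``make all-system`` does for the C++ example, using the corrected binary name (``hello_world_cpp``) from the readme fix above.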
diff --git a/ceph/install-deps.sh b/ceph/install-deps.sh index 9ead1056d..e73e05f6b 100755 --- a/ceph/install-deps.sh +++ b/ceph/install-deps.sh @@ -90,6 +90,7 @@ if [ x`uname`x = xFreeBSDx ]; then net/socat \ textproc/expat2 \ textproc/gsed \ + lang/gawk \ textproc/libxml2 \ textproc/xmlstarlet \ textproc/jq \ diff --git a/ceph/qa/cephfs/clusters/1-mds-1-client-coloc.yaml b/ceph/qa/cephfs/clusters/1-mds-1-client-coloc.yaml new file mode 100644 index 000000000..abcfffec6 --- /dev/null +++ b/ceph/qa/cephfs/clusters/1-mds-1-client-coloc.yaml @@ -0,0 +1,12 @@ +roles: +- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0] +- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7] +openstack: +- volumes: # attached to each instance + count: 4 + size: 20 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/1-mds-1-client.yaml b/ceph/qa/cephfs/clusters/1-mds-1-client.yaml index e64b0b88d..966f0dcc8 100644 --- a/ceph/qa/cephfs/clusters/1-mds-1-client.yaml +++ b/ceph/qa/cephfs/clusters/1-mds-1-client.yaml @@ -5,4 +5,9 @@ roles: openstack: - volumes: # attached to each instance count: 4 - size: 10 # GB + size: 20 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/1-mds-2-client-coloc.yaml b/ceph/qa/cephfs/clusters/1-mds-2-client-coloc.yaml new file mode 100644 index 000000000..9f0f0dc39 --- /dev/null +++ b/ceph/qa/cephfs/clusters/1-mds-2-client-coloc.yaml @@ -0,0 +1,12 @@ +roles: +- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0] +- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.1] +openstack: +- volumes: # attached to each instance + count: 4 + size: 20 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/1-mds-2-client.yaml b/ceph/qa/cephfs/clusters/1-mds-2-client.yaml index 006e15a7b..656178c0f 100644 --- a/ceph/qa/cephfs/clusters/1-mds-2-client.yaml +++ b/ceph/qa/cephfs/clusters/1-mds-2-client.yaml @@ -6,4 +6,9 @@ roles: openstack: - volumes: # attached to each instance count: 4 - size: 10 # GB + size: 30 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/1-mds-3-client.yaml b/ceph/qa/cephfs/clusters/1-mds-3-client.yaml new file mode 100644 index 000000000..02e6d6dc6 --- /dev/null +++ b/ceph/qa/cephfs/clusters/1-mds-3-client.yaml @@ -0,0 +1,15 @@ +roles: +- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3] +- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7] +- [client.0] +- [client.1] +- [client.2] +openstack: +- volumes: # attached to each instance + count: 4 + size: 30 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/1-mds-4-client-coloc.yaml b/ceph/qa/cephfs/clusters/1-mds-4-client-coloc.yaml new file mode 100644 index 000000000..6ff916c4e --- /dev/null +++ b/ceph/qa/cephfs/clusters/1-mds-4-client-coloc.yaml @@ -0,0 +1,12 @@ +roles: +- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0, client.1] +- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.2, client.3] +openstack: +- volumes: # attached to each instance + count: 4 + size: 30 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/1-mds-4-client.yaml b/ceph/qa/cephfs/clusters/1-mds-4-client.yaml index a6be36dea..f17c83b82 100644 --- a/ceph/qa/cephfs/clusters/1-mds-4-client.yaml +++ 
b/ceph/qa/cephfs/clusters/1-mds-4-client.yaml @@ -8,4 +8,9 @@ roles: openstack: - volumes: # attached to each instance count: 4 - size: 10 # GB + size: 30 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/3-mds.yaml b/ceph/qa/cephfs/clusters/3-mds.yaml index c0d463a90..f9fc10808 100644 --- a/ceph/qa/cephfs/clusters/3-mds.yaml +++ b/ceph/qa/cephfs/clusters/3-mds.yaml @@ -5,4 +5,9 @@ roles: openstack: - volumes: # attached to each instance count: 4 - size: 10 # GB + size: 30 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/9-mds.yaml b/ceph/qa/cephfs/clusters/9-mds.yaml index 0bf240272..414fb2ba4 100644 --- a/ceph/qa/cephfs/clusters/9-mds.yaml +++ b/ceph/qa/cephfs/clusters/9-mds.yaml @@ -5,4 +5,9 @@ roles: openstack: - volumes: # attached to each instance count: 4 - size: 10 # GB + size: 30 # GB +- machine: + disk: 200 # GB +log-rotate: + ceph-mds: 10G + ceph-osd: 10G diff --git a/ceph/qa/cephfs/clusters/fixed-2-ucephfs.yaml b/ceph/qa/cephfs/clusters/fixed-2-ucephfs.yaml index 94948f4c3..129aac6ce 100644 --- a/ceph/qa/cephfs/clusters/fixed-2-ucephfs.yaml +++ b/ceph/qa/cephfs/clusters/fixed-2-ucephfs.yaml @@ -4,7 +4,9 @@ roles: openstack: - volumes: # attached to each instance count: 4 - size: 10 # GB + size: 30 # GB +- machine: + disk: 200 # GB log-rotate: ceph-mds: 10G ceph-osd: 10G diff --git a/ceph/qa/run-standalone.sh b/ceph/qa/run-standalone.sh index 9321cba65..2c7ceaa34 100755 --- a/ceph/qa/run-standalone.sh +++ b/ceph/qa/run-standalone.sh @@ -6,7 +6,8 @@ if [ ! -e Makefile -o ! -d bin ]; then exit 1 fi -if [ ! -d /tmp/ceph-disk-virtualenv -o ! -d /tmp/ceph-detect-init-virtualenv ]; then +TEMP_DIR=${TMPDIR:-/tmp} +if [ ! -d $TEMP_DIR/ceph-disk-virtualenv -o ! -d $TEMP_DIR/ceph-detect-init-virtualenv ]; then echo '/tmp/*-virtualenv directories not built. Please run "make check" first.' exit 1 fi diff --git a/ceph/qa/standalone/ceph-helpers.sh b/ceph/qa/standalone/ceph-helpers.sh index f12f0698a..3883a6f58 100755 --- a/ceph/qa/standalone/ceph-helpers.sh +++ b/ceph/qa/standalone/ceph-helpers.sh @@ -19,7 +19,9 @@ # TIMEOUT=300 PG_NUM=4 -: ${CEPH_BUILD_VIRTUALENV:=/tmp} +TMPDIR=${TMPDIR:-/tmp} +CEPH_BUILD_VIRTUALENV=${TMPDIR} +TESTDIR=${TESTDIR:-${TMPDIR}} if type xmlstarlet > /dev/null 2>&1; then XMLSTARLET=xmlstarlet @@ -32,10 +34,12 @@ fi if [ `uname` = FreeBSD ]; then SED=gsed + AWK=gawk DIFFCOLOPTS="" KERNCORE="kern.corefile" else SED=sed + AWK=awk termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/') if [ -n "$termwidth" -a "$termwidth" != "0" ]; then termwidth="-W ${termwidth}" @@ -202,8 +206,8 @@ function teardown() { function __teardown_btrfs() { local btrfs_base_dir=$1 - local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}') - local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir") + local btrfs_root=$(df -P . | tail -1 | $AWK '{print $NF}') + local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list -t . 
| $AWK '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir") for subvolume in $btrfs_dirs; do sudo btrfs subvolume delete $btrfs_root/$subvolume done @@ -1350,7 +1354,7 @@ function test_is_clean() { ####################################################################### -calc() { awk "BEGIN{print $*}"; } +calc() { $AWK "BEGIN{print $*}"; } ## # Return a list of numbers that are increasingly larger and whose @@ -1757,7 +1761,7 @@ function run_in_background() { local pid_variable=$1 shift # Execute the command and prepend the output with its pid - # We enforce to return the exit status of the command and not the awk one. + # We enforce to return the exit status of the command and not the sed one. ("$@" |& sed 's/^/'$$': /'; return "${PIPESTATUS[0]}") >&2 & eval "$pid_variable+=\" $!\"" } diff --git a/ceph/qa/standalone/scrub/osd-scrub-repair.sh b/ceph/qa/standalone/scrub/osd-scrub-repair.sh index a266aed90..b6d541bb3 100755 --- a/ceph/qa/standalone/scrub/osd-scrub-repair.sh +++ b/ceph/qa/standalone/scrub/osd-scrub-repair.sh @@ -5565,6 +5565,67 @@ EOF teardown $dir || return 1 } +function TEST_request_scrub_priority() { + local dir=$1 + local poolname=psr_pool + local objname=POBJ + local OBJECTS=64 + local PGS=8 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " + ceph_osd_args+="--osd_scrub_backoff_ratio=0" + run_osd $dir 0 $ceph_osd_args || return 1 + + create_pool $poolname $PGS $PGS || return 1 + wait_for_clean || return 1 + + local osd=0 + add_something $dir $poolname $objname noscrub || return 1 + local primary=$(get_primary $poolname $objname) + local pg=$(get_pg $poolname $objname) + poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }') + + local otherpgs + for i in $(seq 0 $(expr $PGS - 1)) + do + opg="${poolid}.${i}" + if [ "$opg" = "$pg" ]; then + continue + fi + otherpgs="${otherpgs}${opg} " + local other_last_scrub=$(get_last_scrub_stamp $pg) + # Fake a schedule scrub + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \ + trigger_scrub $opg || return 1 + done + + sleep 15 + flush_pg_stats + + # Request a regular scrub and it will be done + local last_scrub=$(get_last_scrub_stamp $pg) + ceph pg scrub $pg + + ceph osd unset noscrub || return 1 + ceph osd unset nodeep-scrub || return 1 + + wait_for_scrub $pg "$last_scrub" + + for opg in $otherpgs $pg + do + wait_for_scrub $opg "$other_last_scrub" + done + + # Verify that the requested scrub ran first + grep "log_channel.*scrub ok" $dir/osd.${primary}.log | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1 + + return 0 +} + + main osd-scrub-repair "$@" # Local Variables: diff --git a/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml b/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml index 5ca4bd609..750fa6dd7 100644 --- a/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml +++ b/ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml @@ -3,8 +3,8 @@ meta: overrides: ceph_ansible: + branch: stable-3.2 vars: - branch: stable-3.2 ceph_conf_overrides: global: osd default pool size: 2 diff --git a/ceph/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_pjd.yaml b/ceph/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_pjd.yaml index a1e2ada19..37e315f7e 100644 --- a/ceph/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_pjd.yaml +++ 
b/ceph/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_pjd.yaml @@ -6,6 +6,7 @@ overrides: fuse default permissions: false tasks: - workunit: + timeout: 6h clients: all: - suites/pjd.sh diff --git a/ceph/qa/suites/fs/basic_functional/clusters/1-mds-4-client-coloc.yaml b/ceph/qa/suites/fs/basic_functional/clusters/1-mds-4-client-coloc.yaml new file mode 120000 index 000000000..e5444ae22 --- /dev/null +++ b/ceph/qa/suites/fs/basic_functional/clusters/1-mds-4-client-coloc.yaml @@ -0,0 +1 @@ +.qa/cephfs/clusters/1-mds-4-client-coloc.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml b/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml deleted file mode 100644 index 1c540a4ef..000000000 --- a/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml +++ /dev/null @@ -1,10 +0,0 @@ -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mds.a, mds.b, client.1, client.2, client.3] -- [client.0, osd.4, osd.5, osd.6, osd.7] -openstack: -- volumes: # attached to each instance - count: 2 - size: 10 # GB -log-rotate: - ceph-mds: 10G - ceph-osd: 10G diff --git a/ceph/qa/suites/fs/basic_functional/tasks/damage.yaml b/ceph/qa/suites/fs/basic_functional/tasks/damage.yaml index 3f4aac9e5..9ae738f01 100644 --- a/ceph/qa/suites/fs/basic_functional/tasks/damage.yaml +++ b/ceph/qa/suites/fs/basic_functional/tasks/damage.yaml @@ -17,6 +17,8 @@ overrides: - Corrupt dentry - Scrub error on inode - Metadata damage detected + - MDS_READ_ONLY + - force file system read-only tasks: - cephfs_test_runner: diff --git a/ceph/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_pjd.yaml b/ceph/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_pjd.yaml index a1e2ada19..37e315f7e 100644 --- a/ceph/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_pjd.yaml +++ b/ceph/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_pjd.yaml @@ -6,6 +6,7 @@ overrides: fuse default permissions: false tasks: - workunit: + timeout: 6h clients: all: - suites/pjd.sh diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml index 12047bd7a..5cd97a3ae 100644 --- a/ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml +++ b/ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml @@ -4,6 +4,8 @@ openstack: - volumes: # attached to each instance count: 2 size: 10 # GB +- machine: + disk: 100 # GB log-rotate: ceph-mds: 10G ceph-osd: 10G diff --git a/ceph/qa/suites/fs/multiclient/clusters/1-mds-2-client.yaml b/ceph/qa/suites/fs/multiclient/clusters/1-mds-2-client.yaml new file mode 120000 index 000000000..9f4f161a3 --- /dev/null +++ b/ceph/qa/suites/fs/multiclient/clusters/1-mds-2-client.yaml @@ -0,0 +1 @@ +.qa/cephfs/clusters/1-mds-2-client.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/multiclient/clusters/1-mds-3-client.yaml b/ceph/qa/suites/fs/multiclient/clusters/1-mds-3-client.yaml new file mode 120000 index 000000000..6b25e07c4 --- /dev/null +++ b/ceph/qa/suites/fs/multiclient/clusters/1-mds-3-client.yaml @@ -0,0 +1 @@ +.qa/cephfs/clusters/1-mds-3-client.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml b/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml deleted file mode 100644 index a533af5c6..000000000 --- a/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml +++ /dev/null @@ -1,15 +0,0 @@ -roles: -- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, 
osd.1, osd.2, osd.3] -- [client.2] -- [client.1] -- [client.0] - -openstack: -- volumes: # attached to each instance - count: 1 - size: 10 # GB - -log-rotate: - ceph-mds: 10G - ceph-osd: 10G - diff --git a/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml b/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml deleted file mode 100644 index 00f3815cb..000000000 --- a/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml +++ /dev/null @@ -1,14 +0,0 @@ -roles: -- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3] -- [client.1] -- [client.0] - -openstack: -- volumes: # attached to each instance - count: 3 - size: 10 # GB - -log-rotate: - ceph-mds: 10G - ceph-osd: 10G - diff --git a/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml b/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml deleted file mode 100644 index 2ae772c3f..000000000 --- a/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml +++ /dev/null @@ -1,10 +0,0 @@ -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mon.b, mds.a, mds.b, client.1] -- [mds.c, mds.d, mon.c, client.0, osd.4, osd.5, osd.6, osd.7] -openstack: -- volumes: # attached to each instance - count: 2 - size: 10 # GB -log-rotate: - ceph-mds: 10G - ceph-osd: 10G diff --git a/ceph/qa/suites/fs/permission/tasks/cfuse_workunit_suites_pjd.yaml b/ceph/qa/suites/fs/permission/tasks/cfuse_workunit_suites_pjd.yaml index 2dd8ac779..09be26675 100644 --- a/ceph/qa/suites/fs/permission/tasks/cfuse_workunit_suites_pjd.yaml +++ b/ceph/qa/suites/fs/permission/tasks/cfuse_workunit_suites_pjd.yaml @@ -7,6 +7,7 @@ overrides: client acl type: posix_acl tasks: - workunit: + timeout: 6h clients: all: - suites/pjd.sh diff --git a/ceph/qa/suites/fs/thrash/clusters/1-mds-1-client-coloc.yaml b/ceph/qa/suites/fs/thrash/clusters/1-mds-1-client-coloc.yaml new file mode 120000 index 000000000..d15ecfda0 --- /dev/null +++ b/ceph/qa/suites/fs/thrash/clusters/1-mds-1-client-coloc.yaml @@ -0,0 +1 @@ +.qa/cephfs/clusters/1-mds-1-client-coloc.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/thrash/clusters/mds-1active-1standby.yaml b/ceph/qa/suites/fs/thrash/clusters/mds-1active-1standby.yaml deleted file mode 100644 index d02524866..000000000 --- a/ceph/qa/suites/fs/thrash/clusters/mds-1active-1standby.yaml +++ /dev/null @@ -1,10 +0,0 @@ -roles: -- [mon.a, mon.c, osd.0, osd.1, osd.2, mds.b-s-a] -- [mon.b, mgr.x, mds.a, osd.3, osd.4, osd.5, client.0] -openstack: -- volumes: # attached to each instance - count: 3 - size: 10 # GB -log-rotate: - ceph-mds: 10G - ceph-osd: 10G diff --git a/ceph/qa/suites/fs/thrash/msgr-failures/osd-mds-delay.yaml b/ceph/qa/suites/fs/thrash/msgr-failures/osd-mds-delay.yaml index adcebc0ba..4dc0086e6 100644 --- a/ceph/qa/suites/fs/thrash/msgr-failures/osd-mds-delay.yaml +++ b/ceph/qa/suites/fs/thrash/msgr-failures/osd-mds-delay.yaml @@ -3,6 +3,6 @@ overrides: conf: global: ms inject socket failures: 2500 - mds inject delay type: osd mds + ms inject delay type: osd mds ms inject delay probability: .005 ms inject delay max: 1 diff --git a/ceph/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_pjd.yaml b/ceph/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_pjd.yaml index a1e2ada19..37e315f7e 100644 --- a/ceph/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_pjd.yaml +++ b/ceph/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_pjd.yaml @@ -6,6 +6,7 @@ overrides: fuse default permissions: false tasks: - workunit: + timeout: 6h clients: all: - suites/pjd.sh diff --git 
a/ceph/qa/suites/kcephfs/cephfs/tasks/kclient_workunit_suites_pjd.yaml b/ceph/qa/suites/kcephfs/cephfs/tasks/kclient_workunit_suites_pjd.yaml index 09abaeb6e..1f24a5506 100644 --- a/ceph/qa/suites/kcephfs/cephfs/tasks/kclient_workunit_suites_pjd.yaml +++ b/ceph/qa/suites/kcephfs/cephfs/tasks/kclient_workunit_suites_pjd.yaml @@ -1,6 +1,7 @@ tasks: - kclient: - workunit: + timeout: 6h clients: all: - suites/pjd.sh diff --git a/ceph/qa/suites/kcephfs/recovery/tasks/damage.yaml b/ceph/qa/suites/kcephfs/recovery/tasks/damage.yaml index 3f4aac9e5..9ae738f01 100644 --- a/ceph/qa/suites/kcephfs/recovery/tasks/damage.yaml +++ b/ceph/qa/suites/kcephfs/recovery/tasks/damage.yaml @@ -17,6 +17,8 @@ overrides: - Corrupt dentry - Scrub error on inode - Metadata damage detected + - MDS_READ_ONLY + - force file system read-only tasks: - cephfs_test_runner: diff --git a/ceph/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_pjd.yaml b/ceph/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_pjd.yaml index a1e2ada19..37e315f7e 100644 --- a/ceph/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_pjd.yaml +++ b/ceph/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_pjd.yaml @@ -6,6 +6,7 @@ overrides: fuse default permissions: false tasks: - workunit: + timeout: 6h clients: all: - suites/pjd.sh diff --git a/ceph/qa/suites/rados/singleton/all/mon-config-key-caps.yaml b/ceph/qa/suites/rados/singleton/all/mon-config-key-caps.yaml new file mode 100644 index 000000000..0b0b95c52 --- /dev/null +++ b/ceph/qa/suites/rados/singleton/all/mon-config-key-caps.yaml @@ -0,0 +1,17 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +tasks: +- install: +- ceph: + log-whitelist: + - overall HEALTH_ + - \(AUTH_BAD_CAPS\) +- workunit: + clients: + all: + - mon/test_config_key_caps.sh diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/normal_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/normal_pg_log.yaml new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/normal_pg_log.yaml @@ -0,0 +1 @@ + diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/short_pg_log.yaml new file mode 100644 index 000000000..20cc101de --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/normal_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/normal_pg_log.yaml new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/normal_pg_log.yaml @@ -0,0 +1 @@ + diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/short_pg_log.yaml new file mode 100644 index 000000000..20cc101de --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/normal_pg_log.yaml 
b/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/normal_pg_log.yaml new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/normal_pg_log.yaml @@ -0,0 +1 @@ + diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/short_pg_log.yaml new file mode 100644 index 000000000..20cc101de --- /dev/null +++ b/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/2-partial-upgrade/firsthalf.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split/2-partial-upgrade/firsthalf.yaml index 442dcf105..a73b87beb 100644 --- a/ceph/qa/suites/upgrade/jewel-x/stress-split/2-partial-upgrade/firsthalf.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/stress-split/2-partial-upgrade/firsthalf.yaml @@ -10,3 +10,8 @@ tasks: - ceph.restart: daemons: [mon.a,mon.b,mon.c,osd.0, osd.1, osd.2] - print: "**** done ceph.restart 1st half" +- exec: + osd.0: + - ceph osd set pglog_hardlimit && exit 1 || true + - ceph osd dump --format=json-pretty | grep "flags" +- print: "**** try to set pglog_hardlimit, should not succeed" diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/5-finish-upgrade.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split/5-finish-upgrade.yaml index 1d528cd5d..faea6fdbf 100644 --- a/ceph/qa/suites/upgrade/jewel-x/stress-split/5-finish-upgrade.yaml +++ b/ceph/qa/suites/upgrade/jewel-x/stress-split/5-finish-upgrade.yaml @@ -6,4 +6,14 @@ tasks: daemons: [osd.3, osd.4, osd.5] wait-for-healthy: false wait-for-osds-up: true +- exec: + osd.0: + - ceph osd require-osd-release luminous +- print: "**** done `ceph osd require-osd-release luminous`" +- exec: + osd.0: + - ceph osd dump --format=json-pretty | grep "flags" + - ceph osd set pglog_hardlimit + - ceph osd dump --format=json-pretty | grep "flags" +- print: "**** try to set pglog_hardlimit again, should succeed" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/% b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/% new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/.qa new file mode 120000 index 000000000..a23f7e045 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/.qa @@ -0,0 +1 @@ +../../.qa \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/point-to-point-upgrade.yaml similarity index 85% rename from ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml rename to ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/point-to-point-upgrade.yaml index 9deeb4c49..c0b6ebbd5 100644 --- a/ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/point-to-point-upgrade.yaml @@ -11,6 +11,11 @@ meta: run workload and upgrade-sequence in parallel install ceph/luminous v12.2.8 point version run workload and upgrade-sequence in parallel + install ceph/luminous v12.2.9 point version + run workload and upgrade-sequence in parallel + install ceph/luminous v12.2.10 point version + run workload and 
upgrade-sequence in parallel + install ceph/luminous latest version run workload and upgrade-sequence in parallel overrides: @@ -119,6 +124,34 @@ tasks: - upgrade-sequence_luminous - print: "**** done parallel luminous v12.2.8" + +#### upgrade to v12.2.9 +- install.upgrade: + #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev'] + mon.a: + tag: v12.2.9 + mon.b: + tag: v12.2.9 + # Note that client.a IS NOT upgraded at this point +- parallel: + - workload_luminous + - upgrade-sequence_luminous +- print: "**** done parallel luminous v12.2.9" + +#### upgrade to v12.2.10 +- install.upgrade: + #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev'] + mon.a: + tag: v12.2.10 + mon.b: + tag: v12.2.10 + # Note that client.a IS NOT upgraded at this point +- parallel: + - workload_luminous + - upgrade-sequence_luminous +- print: "**** done parallel luminous v12.2.10" + + #### upgrade to latest luminous - install.upgrade: #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev'] diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/supported b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/supported new file mode 120000 index 000000000..79010c36a --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/supported @@ -0,0 +1 @@ +../../../../distros/supported \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/% b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/% new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/+ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/+ new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/upgrade/luminous-p2p/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/.qa similarity index 100% rename from ceph/qa/suites/upgrade/luminous-p2p/.qa rename to ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/.qa diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/openstack.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/openstack.yaml new file mode 100644 index 000000000..a0d5c2019 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/openstack.yaml @@ -0,0 +1,6 @@ +openstack: + - machine: + disk: 100 # GB + - volumes: # attached to each instance + count: 3 + size: 30 # GB diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/start.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/start.yaml new file mode 100644 index 000000000..4f40219b5 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/start.yaml @@ -0,0 +1,20 @@ +meta: +- desc: | + Run ceph on two nodes, + with a separate client-only node. + Use xfs beneath the osds. 
+overrides: + ceph: + fs: xfs +roles: +- - mon.a + - mon.b + - mon.c + - mgr.x + - osd.0 + - osd.1 + - osd.2 +- - osd.3 + - osd.4 + - osd.5 +- - client.0 diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1-ceph-install/luminous.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1-ceph-install/luminous.yaml new file mode 100644 index 000000000..b66e0ca99 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1-ceph-install/luminous.yaml @@ -0,0 +1,19 @@ +meta: +- desc: install ceph/luminous latest +tasks: +- install: + tag: v12.2.10 + exclude_packages: ['librados3'] + extra_packages: ['librados2'] +- print: "**** done install luminous v12.2.10" +- ceph: +- exec: + osd.0: + - ceph osd require-osd-release luminous + - ceph osd set-require-min-compat-client luminous +- print: "**** done ceph" +overrides: + ceph: + conf: + mon: + mon warn on osd down out interval zero: false diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/normal_pg_log.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/normal_pg_log.yaml new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/normal_pg_log.yaml @@ -0,0 +1 @@ + diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/short_pg_log.yaml new file mode 100644 index 000000000..20cc101de --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/.qa new file mode 120000 index 000000000..a602a0353 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/firsthalf.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/firsthalf.yaml new file mode 100644 index 000000000..a73b87beb --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/firsthalf.yaml @@ -0,0 +1,17 @@ +meta: +- desc: | + install upgrade ceph/-x on one node only + 1st half + restart : osd.0,1,2 +tasks: +- install.upgrade: + osd.0: +- print: "**** done install.upgrade osd.0" +- ceph.restart: + daemons: [mon.a,mon.b,mon.c,osd.0, osd.1, osd.2] +- print: "**** done ceph.restart 1st half" +- exec: + osd.0: + - ceph osd set pglog_hardlimit && exit 1 || true + - ceph osd dump --format=json-pretty | grep "flags" +- print: "**** try to set pglog_hardlimit, should not succeed" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/.qa new file mode 120000 index 000000000..a602a0353 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/default.yaml 
b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/default.yaml new file mode 100644 index 000000000..b3fddefc7 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/default.yaml @@ -0,0 +1,25 @@ +meta: +- desc: | + randomly kill and revive osd + small chance to increase the number of pgs +overrides: + ceph: + log-whitelist: + - but it is still running + - wrongly marked me down + - objects unfound and apparently lost + - log bound mismatch +tasks: +- parallel: + - stress-tasks +stress-tasks: +- thrashosds: + timeout: 1200 + chance_pgnum_grow: 1 + chance_pgpnum_fix: 1 + chance_thrash_cluster_full: 0 + chance_thrash_pg_upmap: 0 + chance_thrash_pg_upmap_items: 0 + disable_objectstore_tool_tests: true + chance_force_recovery: 0 +- print: "**** done thrashosds 3-thrash" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/+ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/+ new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/.qa new file mode 120000 index 000000000..a602a0353 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/radosbench.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/radosbench.yaml new file mode 100644 index 000000000..626ae8ea6 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/radosbench.yaml @@ -0,0 +1,40 @@ +meta: +- desc: | + run randomized correctness test for rados operations + generate write load with rados bench +stress-tasks: +- full_sequential: + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 + - radosbench: + clients: [client.0] + time: 150 +- print: "**** done radosbench 7-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-cls.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-cls.yaml new file mode 100644 index 000000000..f8cc4d8ac --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-cls.yaml @@ -0,0 +1,10 @@ +meta: +- desc: | + run basic cls tests for rbd +stress-tasks: +- workunit: + branch: luminous + clients: + client.0: + - cls/test_cls_rbd.sh +- print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-import-export.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-import-export.yaml new file mode 100644 index 000000000..30a677af6 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-import-export.yaml @@ -0,0 +1,12 @@ +meta: +- desc: | + run basic import/export cli tests for rbd +stress-tasks: +- workunit: + 
branch: luminous + clients: + client.0: + - rbd/import_export.sh + env: + RBD_CREATE_ARGS: --new-format +- print: "**** done rbd/import_export.sh 5-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd_api.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd_api.yaml new file mode 100644 index 000000000..9079aa33b --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd_api.yaml @@ -0,0 +1,10 @@ +meta: +- desc: | + librbd C and C++ api tests +stress-tasks: +- workunit: + branch: luminous + clients: + client.0: + - rbd/test_librbd.sh +- print: "**** done rbd/test_librbd.sh 7-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/readwrite.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/readwrite.yaml new file mode 100644 index 000000000..41e34d6d7 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/readwrite.yaml @@ -0,0 +1,16 @@ +meta: +- desc: | + randomized correctness test for rados operations on a replicated pool, + using only reads, writes, and deletes +stress-tasks: +- full_sequential: + - rados: + clients: [client.0] + ops: 4000 + objects: 500 + write_append_excl: false + op_weights: + read: 45 + write: 45 + delete: 10 +- print: "**** done rados/readwrite 5-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/snaps-few-objects.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/snaps-few-objects.yaml new file mode 100644 index 000000000..f56d0de0f --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/snaps-few-objects.yaml @@ -0,0 +1,18 @@ +meta: +- desc: | + randomized correctness test for rados operations on a replicated pool with snapshot operations +stress-tasks: +- full_sequential: + - rados: + clients: [client.0] + ops: 4000 + objects: 50 + write_append_excl: false + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 50 +- print: "**** done rados/snaps-few-objects 5-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/5-finish-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/5-finish-upgrade.yaml new file mode 100644 index 000000000..9d5a96c6c --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/5-finish-upgrade.yaml @@ -0,0 +1,14 @@ +tasks: +- install.upgrade: + osd.3: + client.0: +- ceph.restart: + daemons: [osd.3, osd.4, osd.5] + wait-for-healthy: false + wait-for-osds-up: true +- exec: + osd.0: + - ceph osd set pglog_hardlimit + - ceph osd dump --format=json-pretty | grep "flags" +- print: "**** try to set pglog_hardlimit again, should succeed" + diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/+ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/+ new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/.qa new file mode 120000 index 000000000..a602a0353 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git 
a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rbd-python.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rbd-python.yaml new file mode 100644 index 000000000..56ba21d7a --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rbd-python.yaml @@ -0,0 +1,9 @@ +meta: +- desc: | + librbd python api tests +tasks: +- workunit: + clients: + client.0: + - rbd/test_librbd_python.sh +- print: "**** done rbd/test_librbd_python.sh 9-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rgw-swift.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rgw-swift.yaml new file mode 100644 index 000000000..76e5d6fc2 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rgw-swift.yaml @@ -0,0 +1,11 @@ +meta: +- desc: | + swift api tests for rgw +tasks: +- rgw: + client.0: +- print: "**** done rgw 9-workload" +- swift: + client.0: + rgw_server: client.0 +- print: "**** done swift 9-workload" diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/snaps-many-objects.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/snaps-many-objects.yaml new file mode 100644 index 000000000..805bf97c3 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/snaps-many-objects.yaml @@ -0,0 +1,16 @@ +meta: +- desc: | + randomized correctness test for rados operations on a replicated pool with snapshot operations +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 500 + write_append_excl: false + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 50 diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/supported b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/supported new file mode 120000 index 000000000..79010c36a --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/supported @@ -0,0 +1 @@ +../../../../distros/supported \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/thrashosds-health.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/thrashosds-health.yaml new file mode 120000 index 000000000..e0426dbe4 --- /dev/null +++ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/luminous-p2p/supported b/ceph/qa/suites/upgrade/luminous-p2p/supported deleted file mode 120000 index dd0d7f1d5..000000000 --- a/ceph/qa/suites/upgrade/luminous-p2p/supported +++ /dev/null @@ -1 +0,0 @@ -../../../distros/supported/ \ No newline at end of file diff --git a/ceph/qa/tasks/cephfs/filesystem.py b/ceph/qa/tasks/cephfs/filesystem.py index 7f9253aab..e03001225 100644 --- a/ceph/qa/tasks/cephfs/filesystem.py +++ b/ceph/qa/tasks/cephfs/filesystem.py @@ -171,8 +171,10 @@ class CephCluster(object): del self._ctx.ceph['ceph'].conf[subsys][key] write_conf(self._ctx) - def json_asok(self, command, service_type, service_id): - proc = self.mon_manager.admin_socket(service_type, service_id, command) + def json_asok(self, command, service_type, service_id, timeout=None): + if timeout is None: + timeout = 15*60 + proc = self.mon_manager.admin_socket(service_type, 
service_id, command, timeout=timeout) response_data = proc.stdout.getvalue() log.info("_json_asok output: {0}".format(response_data)) if response_data.strip(): @@ -444,10 +446,10 @@ class Filesystem(MDSCluster): self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a) def set_max_mds(self, max_mds): - self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds) + self.set_var("max_mds", "%d" % max_mds) def set_allow_dirfrags(self, yes): - self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it') + self.set_var("allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it') def get_pgs_per_fs_pool(self): """ @@ -559,8 +561,10 @@ class Filesystem(MDSCluster): def _df(self): return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")) - def get_mds_map(self): - return self.status().get_fsmap(self.id)['mdsmap'] + def get_mds_map(self, status=None): + if status is None: + status = self.status() + return status.get_fsmap(self.id)['mdsmap'] def get_var(self, var): return self.status().get_fsmap(self.id)['mdsmap'][var] @@ -855,15 +859,15 @@ class Filesystem(MDSCluster): return version - def mds_asok(self, command, mds_id=None): + def mds_asok(self, command, mds_id=None, timeout=None): if mds_id is None: mds_id = self.get_lone_mds_id() - return self.json_asok(command, 'mds', mds_id) + return self.json_asok(command, 'mds', mds_id, timeout=timeout) - def rank_asok(self, command, rank=0): - info = self.get_rank(rank=rank) - return self.json_asok(command, 'mds', info['name']) + def rank_asok(self, command, rank=0, status=None, timeout=None): + info = self.get_rank(rank=rank, status=status) + return self.json_asok(command, 'mds', info['name'], timeout=timeout) def read_cache(self, path, depth=None): cmd = ["dump", "tree", path] @@ -893,9 +897,17 @@ class Filesystem(MDSCluster): while True: status = self.status() if rank is not None: - mds_info = status.get_rank(self.id, rank) - current_state = mds_info['state'] if mds_info else None - log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state)) + try: + mds_info = status.get_rank(self.id, rank) + current_state = mds_info['state'] if mds_info else None + log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state)) + except: + mdsmap = self.get_mds_map(status=status) + if rank in mdsmap['failed']: + log.info("Waiting for rank {0} to come back.".format(rank)) + current_state = None + else: + raise elif mds_id is not None: # mds_info is None if no daemon with this ID exists in the map mds_info = status.get_mds(mds_id) @@ -1166,6 +1178,9 @@ class Filesystem(MDSCluster): """ return "" + def _make_rank(self, rank): + return "{}:{}".format(self.name, rank) + def _run_tool(self, tool, args, rank=None, quiet=False): # Tests frequently have [client] configuration that jacks up # the objecter log level (unlikely to be interesting here) @@ -1176,7 +1191,7 @@ class Filesystem(MDSCluster): base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1'] if rank is not None: - base_args.extend(["--rank", "%d" % rank]) + base_args.extend(["--rank", "%s" % str(rank)]) t1 = datetime.datetime.now() r = self.tool_remote.run( @@ -1198,11 +1213,12 @@ class Filesystem(MDSCluster): mds_id = self.mds_ids[0] return self.mds_daemons[mds_id].remote - def journal_tool(self, args, rank=None, quiet=False): + def journal_tool(self, args, rank, quiet=False): """ - Invoke cephfs-journal-tool with the passed arguments, and 
return its stdout + Invoke cephfs-journal-tool with the passed arguments for a rank, and return its stdout """ - return self._run_tool("cephfs-journal-tool", args, rank, quiet) + fs_rank = self._make_rank(rank) + return self._run_tool("cephfs-journal-tool", args, fs_rank, quiet) def table_tool(self, args, quiet=False): """ diff --git a/ceph/qa/tasks/cephfs/fuse_mount.py b/ceph/qa/tasks/cephfs/fuse_mount.py index b121680b0..33bcf8c60 100644 --- a/ceph/qa/tasks/cephfs/fuse_mount.py +++ b/ceph/qa/tasks/cephfs/fuse_mount.py @@ -50,6 +50,7 @@ class FuseMount(CephFSMount): '--', self.mountpoint, ], + timeout=(15*60) ) run_cmd = [ @@ -88,12 +89,14 @@ class FuseMount(CephFSMount): def list_connections(): self.client_remote.run( args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"], - check_status=False + check_status=False, + timeout=(15*60) ) p = self.client_remote.run( args=["ls", "/sys/fs/fuse/connections"], stdout=StringIO(), - check_status=False + check_status=False, + timeout=(15*60) ) if p.exitstatus != 0: return [] @@ -163,7 +166,8 @@ class FuseMount(CephFSMount): ], stdout=StringIO(), stderr=StringIO(), - wait=False + wait=False, + timeout=(15*60) ) try: proc.wait() @@ -202,11 +206,18 @@ class FuseMount(CephFSMount): # Now that we're mounted, set permissions so that the rest of the test will have # unrestricted access to the filesystem mount. - self.client_remote.run( - args=['sudo', 'chmod', '1777', self.mountpoint]) + try: + stderr = StringIO() + self.client_remote.run(args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(15*60), stderr=stderr) + except run.CommandFailedError: + stderr = stderr.getvalue() + if "Read-only file system".lower() in stderr.lower(): + pass + else: + raise def _mountpoint_exists(self): - return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0 + return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False, timeout=(15*60)).exitstatus == 0 def umount(self): try: @@ -218,6 +229,7 @@ class FuseMount(CephFSMount): '-u', self.mountpoint, ], + timeout=(30*60), ) except run.CommandFailedError: log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name)) @@ -229,7 +241,7 @@ class FuseMount(CephFSMount): run.Raw(';'), 'ps', 'auxf', - ]) + ], timeout=(60*15)) # abort the fuse mount, killing all hung processes if self._fuse_conn: @@ -252,7 +264,8 @@ class FuseMount(CephFSMount): '-f', self.mountpoint, ], - stderr=stderr + stderr=stderr, + timeout=(60*15) ) except CommandFailedError: if self.is_mounted(): @@ -307,7 +320,8 @@ class FuseMount(CephFSMount): '--', self.mountpoint, ], - stderr=stderr + stderr=stderr, + timeout=(60*5) ) except CommandFailedError: if "No such file or directory" in stderr.getvalue(): @@ -354,6 +368,7 @@ class FuseMount(CephFSMount): '-rf', self.mountpoint, ], + timeout=(60*5) ) def _asok_path(self): @@ -392,15 +407,15 @@ print find_socket("{client_name}") # Find the admin socket p = self.client_remote.run(args=[ - 'python', '-c', pyscript - ], stdout=StringIO()) + 'sudo', 'python2', '-c', pyscript + ], stdout=StringIO(), timeout=(15*60)) asok_path = p.stdout.getvalue().strip() log.info("Found client admin socket at {0}".format(asok_path)) # Query client ID from admin socket p = self.client_remote.run( args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args, - stdout=StringIO()) + stdout=StringIO(), timeout=(15*60)) return json.loads(p.stdout.getvalue()) def get_global_id(self): 
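
Note on the FuseMount hunks above: they all follow one pattern — every client_remote.run() call gains an explicit timeout, and the post-mount chmod is allowed to fail only when the client mount has gone read-only. The standalone sketch below illustrates that pattern with the standard subprocess module rather than teuthology's remote API; the helper name, the subprocess substitution, and the temporary-directory invocation are illustrative assumptions, not part of the patch.

    import subprocess
    import tempfile

    READONLY_MARKER = "read-only file system"  # matched case-insensitively, as in the patch

    def chmod_mountpoint(mountpoint, timeout=15 * 60):
        """Best-effort chmod: bounded by a timeout, and a failure is
        swallowed only when stderr reports a read-only filesystem."""
        proc = subprocess.run(
            ["chmod", "1777", mountpoint],
            capture_output=True, text=True, timeout=timeout,
        )
        if proc.returncode != 0:
            if READONLY_MARKER in proc.stderr.lower():
                return  # expected when the mount flipped read-only mid-test
            raise RuntimeError(proc.stderr.strip())

    if __name__ == "__main__":
        with tempfile.TemporaryDirectory() as scratch:
            chmod_mountpoint(scratch)

The kernel_mount.py hunks that follow apply the same timeout discipline to the kernel client helpers.
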
diff --git a/ceph/qa/tasks/cephfs/kernel_mount.py b/ceph/qa/tasks/cephfs/kernel_mount.py index 80271a6eb..4fdbd1b0c 100644 --- a/ceph/qa/tasks/cephfs/kernel_mount.py +++ b/ceph/qa/tasks/cephfs/kernel_mount.py @@ -43,6 +43,7 @@ class KernelMount(CephFSMount): run.Raw('>'), filename, ], + timeout=(5*60), ) def mount(self, mount_path=None, mount_fs_name=None): @@ -60,6 +61,7 @@ class KernelMount(CephFSMount): '--', self.mountpoint, ], + timeout=(5*60), ) if mount_path is None: @@ -84,10 +86,11 @@ class KernelMount(CephFSMount): '-o', opts ], + timeout=(30*60), ) self.client_remote.run( - args=['sudo', 'chmod', '1777', self.mountpoint]) + args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(5*60)) self.mounted = True @@ -99,7 +102,7 @@ class KernelMount(CephFSMount): cmd.append('-f') try: - self.client_remote.run(args=cmd, timeout=(5*60)) + self.client_remote.run(args=cmd, timeout=(15*60)) except Exception as e: self.client_remote.run(args=[ 'sudo', @@ -107,7 +110,7 @@ class KernelMount(CephFSMount): 'lsof', run.Raw(';'), 'ps', 'auxf', - ]) + ], timeout=(15*60)) raise e rproc = self.client_remote.run( @@ -194,6 +197,7 @@ class KernelMount(CephFSMount): '--', self.mountpoint, ], + timeout=(5*60), ) def _find_debug_dir(self): @@ -219,7 +223,7 @@ class KernelMount(CephFSMount): p = self.client_remote.run(args=[ 'sudo', 'python', '-c', pyscript - ], stdout=StringIO()) + ], stdout=StringIO(), timeout=(5*60)) client_id_to_dir = json.loads(p.stdout.getvalue()) try: @@ -241,7 +245,7 @@ class KernelMount(CephFSMount): p = self.client_remote.run(args=[ 'sudo', 'python', '-c', pyscript - ], stdout=StringIO()) + ], stdout=StringIO(), timeout=(5*60)) return p.stdout.getvalue() def get_global_id(self): diff --git a/ceph/qa/tasks/cephfs/test_client_limits.py b/ceph/qa/tasks/cephfs/test_client_limits.py index b06d5123d..1f1d54670 100644 --- a/ceph/qa/tasks/cephfs/test_client_limits.py +++ b/ceph/qa/tasks/cephfs/test_client_limits.py @@ -134,10 +134,10 @@ class TestClientLimits(CephFSTestCase): # Client B tries to stat the file that client A created rproc = self.mount_b.write_background("file1") - # After mds_session_timeout, we should see a health warning (extra lag from + # After session_timeout, we should see a health warning (extra lag from # MDS beacon period) - mds_session_timeout = float(self.fs.get_config("mds_session_timeout")) - self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_session_timeout + 10) + session_timeout = self.fs.get_var("session_timeout") + self.wait_for_health("MDS_CLIENT_LATE_RELEASE", session_timeout + 10) # Client B should still be stuck self.assertFalse(rproc.finished) diff --git a/ceph/qa/tasks/cephfs/test_client_recovery.py b/ceph/qa/tasks/cephfs/test_client_recovery.py index 829ca3d5c..2b91cbfe6 100644 --- a/ceph/qa/tasks/cephfs/test_client_recovery.py +++ b/ceph/qa/tasks/cephfs/test_client_recovery.py @@ -30,10 +30,9 @@ class TestClientNetworkRecovery(CephFSTestCase): REQUIRE_ONE_CLIENT_REMOTE = True CLIENTS_REQUIRED = 2 - LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"] + LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"] # Environment references - mds_session_timeout = None mds_reconnect_timeout = None ms_max_backoff = None @@ -45,6 +44,8 @@ class TestClientNetworkRecovery(CephFSTestCase): I/O after failure. 
""" + session_timeout = self.fs.get_var("session_timeout") + # We only need one client self.mount_b.umount_wait() @@ -67,7 +68,7 @@ class TestClientNetworkRecovery(CephFSTestCase): # ...then it should block self.assertFalse(write_blocked.finished) self.assert_session_state(client_id, "open") - time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale + time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale self.assertFalse(write_blocked.finished) self.assert_session_state(client_id, "stale") @@ -87,10 +88,9 @@ class TestClientRecovery(CephFSTestCase): REQUIRE_KCLIENT_REMOTE = True CLIENTS_REQUIRED = 2 - LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"] + LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"] # Environment references - mds_session_timeout = None mds_reconnect_timeout = None ms_max_backoff = None @@ -214,6 +214,8 @@ class TestClientRecovery(CephFSTestCase): self.mount_a.create_destroy() def test_stale_caps(self): + session_timeout = self.fs.get_var("session_timeout") + # Capability release from stale session # ===================================== cap_holder = self.mount_a.open_background() @@ -226,7 +228,7 @@ class TestClientRecovery(CephFSTestCase): self.mount_a.kill() try: - # Now, after mds_session_timeout seconds, the waiter should + # Now, after session_timeout seconds, the waiter should # complete their operation when the MDS marks the holder's # session stale. cap_waiter = self.mount_b.write_background() @@ -239,9 +241,9 @@ class TestClientRecovery(CephFSTestCase): cap_waited = b - a log.info("cap_waiter waited {0}s".format(cap_waited)) - self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0, + self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0, "Capability handover took {0}, expected approx {1}".format( - cap_waited, self.mds_session_timeout + cap_waited, session_timeout )) cap_holder.stdin.close() @@ -261,6 +263,8 @@ class TestClientRecovery(CephFSTestCase): # Eviction while holding a capability # =================================== + session_timeout = self.fs.get_var("session_timeout") + # Take out a write capability on a file on client A, # and then immediately kill it. 
cap_holder = self.mount_a.open_background() @@ -290,9 +294,9 @@ class TestClientRecovery(CephFSTestCase): log.info("cap_waiter waited {0}s".format(cap_waited)) # This is the check that it happened 'now' rather than waiting # for the session timeout - self.assertLess(cap_waited, self.mds_session_timeout / 2.0, + self.assertLess(cap_waited, session_timeout / 2.0, "Capability handover took {0}, expected less than {1}".format( - cap_waited, self.mds_session_timeout / 2.0 + cap_waited, session_timeout / 2.0 )) cap_holder.stdin.close() @@ -479,6 +483,8 @@ class TestClientRecovery(CephFSTestCase): if not isinstance(self.mount_a, FuseMount): raise SkipTest("Require FUSE client to handle signal STOP/CONT") + session_timeout = self.fs.get_var("session_timeout") + self.mount_a.run_shell(["mkdir", "testdir"]) self.mount_a.run_shell(["touch", "testdir/file1"]) # populate readdir cache @@ -497,7 +503,7 @@ class TestClientRecovery(CephFSTestCase): self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid]) self.assert_session_state(mount_b_gid, "open") - time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale + time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale self.assert_session_state(mount_b_gid, "stale") self.mount_a.run_shell(["touch", "testdir/file2"]) diff --git a/ceph/qa/tasks/cephfs/test_damage.py b/ceph/qa/tasks/cephfs/test_damage.py index 380b49c4b..01e9d5803 100644 --- a/ceph/qa/tasks/cephfs/test_damage.py +++ b/ceph/qa/tasks/cephfs/test_damage.py @@ -12,6 +12,7 @@ DAMAGED_ON_START = "damaged_on_start" DAMAGED_ON_LS = "damaged_on_ls" CRASHED = "server crashed" NO_DAMAGE = "no damage" +READONLY = "readonly" FAILED_CLIENT = "client failed" FAILED_SERVER = "server failed" @@ -134,8 +135,8 @@ class TestDamage(CephFSTestCase): mutations = [] # Removals - for obj_id in objects: - if obj_id in [ + for o in objects: + if o in [ # JournalPointers are auto-replaced if missing (same path as upgrade) "400.00000000", # Missing dirfrags for non-system dirs result in empty directory @@ -148,29 +149,37 @@ class TestDamage(CephFSTestCase): expectation = DAMAGED_ON_START log.info("Expectation on rm '{0}' will be '{1}'".format( - obj_id, expectation + o, expectation )) mutations.append(MetadataMutation( - obj_id, - "Delete {0}".format(obj_id), - lambda o=obj_id: self.fs.rados(["rm", o]), + o, + "Delete {0}".format(o), + lambda o=o: self.fs.rados(["rm", o]), expectation )) # Blatant corruptions - mutations.extend([ - MetadataMutation( - o, - "Corrupt {0}".format(o), - lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk), - DAMAGED_ON_START - ) for o in data_objects - ]) - - # Truncations for obj_id in data_objects: if obj_id == "500.00000000": + # purge queue corruption results in read-only FS + mutations.append(MetadataMutation( + obj_id, + "Corrupt {0}".format(obj_id), + lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk), + READONLY + )) + else: + mutations.append(MetadataMutation( + obj_id, + "Corrupt {0}".format(obj_id), + lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk), + DAMAGED_ON_START + )) + + # Truncations + for o in data_objects: + if o == "500.00000000": # The PurgeQueue is allowed to be empty: Journaler interprets # an empty header object as an empty journal. 
expectation = NO_DAMAGE @@ -182,7 +191,7 @@ class TestDamage(CephFSTestCase): o, "Truncate {0}".format(o), lambda o=o: self.fs.rados(["truncate", o, "0"]), - DAMAGED_ON_START + expectation )) # OMAP value corruptions @@ -204,22 +213,22 @@ class TestDamage(CephFSTestCase): ) # OMAP header corruptions - for obj_id in omap_header_objs: - if re.match("60.\.00000000", obj_id) \ - or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]: + for o in omap_header_objs: + if re.match("60.\.00000000", o) \ + or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]: expectation = DAMAGED_ON_START else: expectation = NO_DAMAGE log.info("Expectation on corrupt header '{0}' will be '{1}'".format( - obj_id, expectation + o, expectation )) mutations.append( MetadataMutation( - obj_id, - "Corrupt omap header on {0}".format(obj_id), - lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]), + o, + "Corrupt omap header on {0}".format(o), + lambda o=o: self.fs.rados(["setomapheader", o, junk]), expectation ) ) @@ -314,7 +323,17 @@ class TestDamage(CephFSTestCase): else: log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc)) results[mutation] = FAILED_SERVER - + elif mutation.expectation == READONLY: + proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False) + try: + proc.wait() + except CommandFailedError: + stderr = proc.stderr.getvalue() + log.info(stderr) + if "Read-only file system".lower() in stderr.lower(): + pass + else: + raise else: try: wait([proc], 20) @@ -480,7 +499,7 @@ class TestDamage(CephFSTestCase): # Drop everything from the MDS cache self.mds_cluster.mds_stop() - self.fs.journal_tool(['journal', 'reset']) + self.fs.journal_tool(['journal', 'reset'], 0) self.mds_cluster.mds_fail_restart() self.fs.wait_for_daemons() diff --git a/ceph/qa/tasks/cephfs/test_data_scan.py b/ceph/qa/tasks/cephfs/test_data_scan.py index a2d315768..1e7745541 100644 --- a/ceph/qa/tasks/cephfs/test_data_scan.py +++ b/ceph/qa/tasks/cephfs/test_data_scan.py @@ -362,9 +362,9 @@ class TestDataScan(CephFSTestCase): if False: with self.assertRaises(CommandFailedError): # Normal reset should fail when no objects are present, we'll use --force instead - self.fs.journal_tool(["journal", "reset"]) + self.fs.journal_tool(["journal", "reset"], 0) - self.fs.journal_tool(["journal", "reset", "--force"]) + self.fs.journal_tool(["journal", "reset", "--force"], 0) self.fs.data_scan(["init"]) self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers) self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers) diff --git a/ceph/qa/tasks/cephfs/test_flush.py b/ceph/qa/tasks/cephfs/test_flush.py index 1f84e4200..ee0b1c92b 100644 --- a/ceph/qa/tasks/cephfs/test_flush.py +++ b/ceph/qa/tasks/cephfs/test_flush.py @@ -44,7 +44,7 @@ class TestFlush(CephFSTestCase): # ...and the journal is truncated to just a single subtreemap from the # newly created segment - summary_output = self.fs.journal_tool(["event", "get", "summary"]) + summary_output = self.fs.journal_tool(["event", "get", "summary"], 0) try: self.assertEqual(summary_output, dedent( @@ -72,7 +72,7 @@ class TestFlush(CephFSTestCase): ).strip()) flush_data = self.fs.mds_asok(["flush", "journal"]) self.assertEqual(flush_data['return_code'], 0) - self.assertEqual(self.fs.journal_tool(["event", "get", "summary"]), + self.assertEqual(self.fs.journal_tool(["event", "get", "summary"], 0), dedent( """ Events by type: diff --git a/ceph/qa/tasks/cephfs/test_forward_scrub.py 
b/ceph/qa/tasks/cephfs/test_forward_scrub.py index 1f80366af..e165780f3 100644 --- a/ceph/qa/tasks/cephfs/test_forward_scrub.py +++ b/ceph/qa/tasks/cephfs/test_forward_scrub.py @@ -242,10 +242,10 @@ class TestForwardScrub(CephFSTestCase): # is all that will be in the InoTable in memory) self.fs.journal_tool(["event", "splice", - "--inode={0}".format(inos["./file2_sixmegs"]), "summary"]) + "--inode={0}".format(inos["./file2_sixmegs"]), "summary"], 0) self.fs.journal_tool(["event", "splice", - "--inode={0}".format(inos["./file3_sixmegs"]), "summary"]) + "--inode={0}".format(inos["./file3_sixmegs"]), "summary"], 0) # Revert to old inotable. for key, value in inotable_copy.iteritems(): diff --git a/ceph/qa/tasks/cephfs/test_fragment.py b/ceph/qa/tasks/cephfs/test_fragment.py index a62ef7432..54a49cea2 100644 --- a/ceph/qa/tasks/cephfs/test_fragment.py +++ b/ceph/qa/tasks/cephfs/test_fragment.py @@ -33,7 +33,6 @@ class TestFragmentation(CephFSTestCase): Apply kwargs as MDS configuration settings, enable dirfrags and restart the MDSs. """ - kwargs['mds_bal_frag'] = "true" for k, v in kwargs.items(): self.ceph_cluster.set_ceph_conf("mds", k, v.__str__()) diff --git a/ceph/qa/tasks/cephfs/test_journal_migration.py b/ceph/qa/tasks/cephfs/test_journal_migration.py index 64fe93980..5f956be93 100644 --- a/ceph/qa/tasks/cephfs/test_journal_migration.py +++ b/ceph/qa/tasks/cephfs/test_journal_migration.py @@ -82,13 +82,14 @@ class TestJournalMigration(CephFSTestCase): )) # Verify that cephfs-journal-tool can now read the rewritten journal - inspect_out = self.fs.journal_tool(["journal", "inspect"]) + inspect_out = self.fs.journal_tool(["journal", "inspect"], 0) if not inspect_out.endswith(": OK"): raise RuntimeError("Unexpected journal-tool result: '{0}'".format( inspect_out )) - self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"]) + self.fs.journal_tool(["event", "get", "json", + "--path", "/tmp/journal.json"], 0) p = self.fs.tool_remote.run( args=[ "python", diff --git a/ceph/qa/tasks/cephfs/test_journal_repair.py b/ceph/qa/tasks/cephfs/test_journal_repair.py index 62cbbb068..9832f91a1 100644 --- a/ceph/qa/tasks/cephfs/test_journal_repair.py +++ b/ceph/qa/tasks/cephfs/test_journal_repair.py @@ -77,7 +77,7 @@ class TestJournalRepair(CephFSTestCase): self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) # Execute the dentry recovery, this should populate the backing store - self.fs.journal_tool(['event', 'recover_dentries', 'list']) + self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0) # Dentries in ROOT_INO are present self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head'])) @@ -87,7 +87,7 @@ class TestJournalRepair(CephFSTestCase): # Now check the MDS can read what we wrote: truncate the journal # and start the mds. 
- self.fs.journal_tool(['journal', 'reset']) + self.fs.journal_tool(['journal', 'reset'], 0) self.fs.mds_fail_restart() self.fs.wait_for_daemons() @@ -265,10 +265,10 @@ class TestJournalRepair(CephFSTestCase): self.fs.mds_stop(active_mds_names[0]) self.fs.mds_fail(active_mds_names[0]) # Invoke recover_dentries quietly, because otherwise log spews millions of lines - self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True) - self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True) + self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True) + self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True) self.fs.table_tool(["0", "reset", "session"]) - self.fs.journal_tool(["journal", "reset"], rank=0) + self.fs.journal_tool(["journal", "reset"], 0) self.fs.erase_mds_objects(1) self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, '--yes-i-really-mean-it') diff --git a/ceph/qa/tasks/cephfs/test_misc.py b/ceph/qa/tasks/cephfs/test_misc.py index 4158538fd..c27278008 100644 --- a/ceph/qa/tasks/cephfs/test_misc.py +++ b/ceph/qa/tasks/cephfs/test_misc.py @@ -7,15 +7,13 @@ import errno import time import json import logging +import time log = logging.getLogger(__name__) class TestMisc(CephFSTestCase): CLIENTS_REQUIRED = 2 - LOAD_SETTINGS = ["mds_session_autoclose"] - mds_session_autoclose = None - def test_getattr_caps(self): """ Check if MDS recognizes the 'mask' parameter of open request. @@ -43,6 +41,16 @@ class TestMisc(CephFSTestCase): self.mount_a.kill_background(p) + def test_root_rctime(self): + """ + Check that the root inode has a non-default rctime on startup. + """ + + t = time.time() + rctime = self.mount_a.getfattr(".", "ceph.dir.rctime") + log.info("rctime = {}".format(rctime)) + self.assertGreaterEqual(rctime, t-10) + def test_fs_new(self): data_pool_name = self.fs.get_data_pool_name() @@ -106,6 +114,8 @@ class TestMisc(CephFSTestCase): only session """ + session_autoclose = self.fs.get_var("session_autoclose") + self.mount_b.umount_wait() ls_data = self.fs.mds_asok(['session', 'ls']) self.assert_session_count(1, ls_data) @@ -113,7 +123,7 @@ class TestMisc(CephFSTestCase): self.mount_a.kill() self.mount_a.kill_cleanup() - time.sleep(self.mds_session_autoclose * 1.5) + time.sleep(session_autoclose * 1.5) ls_data = self.fs.mds_asok(['session', 'ls']) self.assert_session_count(1, ls_data) @@ -128,7 +138,7 @@ class TestMisc(CephFSTestCase): self.mount_a.kill() self.mount_a.kill_cleanup() - time.sleep(self.mds_session_autoclose * 1.5) + time.sleep(session_autoclose * 1.5) ls_data = self.fs.mds_asok(['session', 'ls']) self.assert_session_count(1, ls_data) @@ -202,3 +212,75 @@ class TestMisc(CephFSTestCase): ratio = raw_avail / fs_avail assert 0.9 < ratio < 1.1 + + def _run_drop_cache_cmd(self, timeout, use_tell): + drop_res = None + if use_tell: + mds_id = self.fs.get_lone_mds_id() + drop_res = json.loads( + self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), + "cache", "drop", str(timeout))) + else: + drop_res = self.fs.mds_asok(["cache", "drop", str(timeout)]) + return drop_res + + def _drop_cache_command(self, timeout, use_tell=True): + self.mount_b.umount_wait() + ls_data = self.fs.mds_asok(['session', 'ls']) + self.assert_session_count(1, ls_data) + + # create some files + self.mount_a.create_n_files("dc-dir/dc-file", 1000) + # drop cache + drop_res = self._run_drop_cache_cmd(timeout, use_tell) + + self.assertTrue(drop_res['client_recall']['return_code'] == 0) + 
self.assertTrue(drop_res['flush_journal']['return_code'] == 0) + + def _drop_cache_command_timeout(self, timeout, use_tell=True): + self.mount_b.umount_wait() + ls_data = self.fs.mds_asok(['session', 'ls']) + self.assert_session_count(1, ls_data) + + # create some files + self.mount_a.create_n_files("dc-dir/dc-file-t", 1000) + + # simulate client death and try drop cache + self.mount_a.kill() + drop_res = self._run_drop_cache_cmd(timeout, use_tell) + + self.assertTrue(drop_res['client_recall']['return_code'] == -errno.ETIMEDOUT) + self.assertTrue(drop_res['flush_journal']['return_code'] == 0) + + self.mount_a.kill_cleanup() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + def test_drop_cache_command_asok(self): + """ + Basic test for checking drop cache command using admin socket. + Note that the cache size post trimming is not checked here. + """ + self._drop_cache_command(10, use_tell=False) + + def test_drop_cache_command_tell(self): + """ + Basic test for checking drop cache command using tell interface. + Note that the cache size post trimming is not checked here. + """ + self._drop_cache_command(10) + + def test_drop_cache_command_timeout_asok(self): + """ + Check drop cache command with non-responding client using admin + socket. Note that the cache size post trimming is not checked here. + """ + self._drop_cache_command_timeout(5, use_tell=False) + + def test_drop_cache_command_timeout_tell(self): + """ + Check drop cache command with non-responding client using tell + interface. Note that the cache size post trimming is not checked + here. + """ + self._drop_cache_command_timeout(5) diff --git a/ceph/qa/tasks/cephfs/test_recovery_pool.py b/ceph/qa/tasks/cephfs/test_recovery_pool.py index 097342a9d..97049b9c0 100644 --- a/ceph/qa/tasks/cephfs/test_recovery_pool.py +++ b/ceph/qa/tasks/cephfs/test_recovery_pool.py @@ -141,10 +141,6 @@ class TestRecoveryPool(CephFSTestCase): self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, '--yes-i-really-mean-it') - def get_state(mds_id): - info = self.mds_cluster.get_mds_info(mds_id) - return info['state'] if info is not None else None - self.fs.table_tool([self.fs.name + ":0", "reset", "session"]) self.fs.table_tool([self.fs.name + ":0", "reset", "snap"]) self.fs.table_tool([self.fs.name + ":0", "reset", "inode"]) @@ -153,7 +149,7 @@ class TestRecoveryPool(CephFSTestCase): if False: with self.assertRaises(CommandFailedError): # Normal reset should fail when no objects are present, we'll use --force instead - self.fs.journal_tool(["journal", "reset"]) + self.fs.journal_tool(["journal", "reset"], 0) self.fs.mds_stop() self.fs.data_scan(['scan_extents', '--alternate-pool', @@ -163,22 +159,18 @@ class TestRecoveryPool(CephFSTestCase): recovery_pool, '--filesystem', self.fs.name, '--force-corrupt', '--force-init', self.fs.get_data_pool_name()]) - self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event', - 'recover_dentries', 'list', - '--alternate-pool', recovery_pool]) + self.fs.journal_tool(['event', 'recover_dentries', 'list', + '--alternate-pool', recovery_pool], 0) self.fs.data_scan(['init', '--force-init', '--filesystem', self.fs.name]) self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name, '--force-corrupt', '--force-init', self.fs.get_data_pool_name()]) - self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event', - 'recover_dentries', 'list']) + self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0) - self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal', - 'reset', '--force']) 
- self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal', - 'reset', '--force']) + self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0) + self.fs.journal_tool(['journal', 'reset', '--force'], 0) self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', recovery_fs + ":0") @@ -190,12 +182,11 @@ class TestRecoveryPool(CephFSTestCase): self.recovery_fs.mds_restart() self.fs.wait_for_daemons() self.recovery_fs.wait_for_daemons() - for mds_id in self.recovery_fs.mds_ids: - self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id, + status = self.recovery_fs.status() + for rank in self.recovery_fs.get_ranks(status=status): + self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'], 'injectargs', '--debug-mds=20') - self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id, - 'scrub_path', '/', - 'recursive', 'repair') + self.fs.rank_asok(['scrub_path', '/', 'recursive', 'repair'], rank=rank['rank'], status=status) log.info(str(self.mds_cluster.status())) # Mount a client diff --git a/ceph/qa/tasks/qemu.py b/ceph/qa/tasks/qemu.py index f597c08d6..b2bca00d8 100644 --- a/ceph/qa/tasks/qemu.py +++ b/ceph/qa/tasks/qemu.py @@ -115,7 +115,7 @@ def generate_iso(ctx, config): (remote,) = ctx.cluster.only(client).remotes.keys() - clone_dir = '{tdir}/clone.{role}'.format(tdir=testdir, role=client) + clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=client) remote.run(args=refspec.clone(git_url, clone_dir)) src_dir = os.path.dirname(__file__) @@ -212,7 +212,7 @@ def generate_iso(ctx, config): os.path.join(testdir, 'qemu', 'userdata.' + client), os.path.join(testdir, 'qemu', 'metadata.' + client), '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client), - '{tdir}/clone.{client}'.format(tdir=testdir, client=client), + '{tdir}/qemu_clone.{client}'.format(tdir=testdir, client=client), ], ) diff --git a/ceph/qa/tasks/thrashosds-health.yaml b/ceph/qa/tasks/thrashosds-health.yaml index 111e2d8c4..9a4d35abf 100644 --- a/ceph/qa/tasks/thrashosds-health.yaml +++ b/ceph/qa/tasks/thrashosds-health.yaml @@ -12,4 +12,4 @@ overrides: - \(REQUEST_SLOW\) - \(TOO_FEW_PGS\) - \(MON_DOWN\) - - slow requests + - slow request diff --git a/ceph/qa/tasks/workunit.py b/ceph/qa/tasks/workunit.py index 0a46ade76..17cfaf7f1 100644 --- a/ceph/qa/tasks/workunit.py +++ b/ceph/qa/tasks/workunit.py @@ -410,7 +410,7 @@ def _run_tests(ctx, refspec, role, tests, env, basedir, ) if cleanup: args=['sudo', 'rm', '-rf', '--', scratch_tmp] - remote.run(logger=log.getChild(role), args=args, timeout=(15*60)) + remote.run(logger=log.getChild(role), args=args, timeout=(60*60)) finally: log.info('Stopping %s on %s...', tests, role) args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir] diff --git a/ceph/qa/workunits/ceph-tests/ceph-admin-commands.sh b/ceph/qa/workunits/ceph-tests/ceph-admin-commands.sh index 30e74cce5..4a9f0a66f 100755 --- a/ceph/qa/workunits/ceph-tests/ceph-admin-commands.sh +++ b/ceph/qa/workunits/ceph-tests/ceph-admin-commands.sh @@ -1,12 +1,9 @@ -#!/bin/sh -e +#!/bin/sh -ex -#check ceph health ceph -s -#list pools rados lspools -#lisr rbd images rbd ls -#check that the monitors work +# check that the monitors work ceph osd set nodown ceph osd unset nodown diff --git a/ceph/qa/workunits/mon/test_config_key_caps.sh b/ceph/qa/workunits/mon/test_config_key_caps.sh new file mode 100755 index 000000000..77b4b53b7 --- /dev/null +++ b/ceph/qa/workunits/mon/test_config_key_caps.sh @@ -0,0 +1,201 @@ +#!/usr/bin/env bash + +set -x +set -e + +tmp=$(mktemp -d -p /tmp 
test_mon_config_key_caps.XXXXX) +entities=() + +function cleanup() +{ + set +e + set +x + if [[ -e $tmp/keyring ]] && [[ -e $tmp/keyring.orig ]]; then + grep '\[.*\..*\]' $tmp/keyring.orig > $tmp/entities.orig + for e in $(grep '\[.*\..*\]' $tmp/keyring | \ + diff $tmp/entities.orig - | \ + sed -n 's/^.*\[\(.*\..*\)\]/\1/p'); + do + ceph auth rm $e 2>&1 >& /dev/null + done + fi + #rm -fr $tmp +} + +trap cleanup 0 # cleanup on exit + +function expect_false() +{ + set -x + if "$@"; then return 1; else return 0; fi +} + +# for cleanup purposes +ceph auth export -o $tmp/keyring.orig + +k=$tmp/keyring + +# setup a few keys +ceph config-key ls +ceph config-key set daemon-private/osd.123/test-foo +ceph config-key set mgr/test-foo +ceph config-key set device/test-foo +ceph config-key set test/foo + +allow_aa=client.allow_aa +allow_bb=client.allow_bb +allow_cc=client.allow_cc + +mgr_a=mgr.a +mgr_b=mgr.b +osd_a=osd.100 +osd_b=osd.200 + +prefix_aa=client.prefix_aa +prefix_bb=client.prefix_bb +prefix_cc=client.prefix_cc +match_aa=client.match_aa +match_bb=client.match_bb + +fail_aa=client.fail_aa +fail_bb=client.fail_bb +fail_cc=client.fail_cc +fail_dd=client.fail_dd +fail_ee=client.fail_ee +fail_ff=client.fail_ff +fail_gg=client.fail_gg +fail_writes=client.fail_writes + +ceph auth get-or-create $allow_aa mon 'allow *' +ceph auth get-or-create $allow_bb mon 'allow service config-key rwx' +ceph auth get-or-create $allow_cc mon 'allow command "config-key get"' + +ceph auth get-or-create $mgr_a mon 'allow profile mgr' +ceph auth get-or-create $mgr_b mon 'allow profile mgr' +ceph auth get-or-create $osd_a mon 'allow profile osd' +ceph auth get-or-create $osd_b mon 'allow profile osd' + +ceph auth get-or-create $prefix_aa mon \ + "allow command \"config-key get\" with key prefix client/$prefix_aa" + +cap="allow command \"config-key set\" with key prefix client/" +cap="$cap,allow command \"config-key get\" with key prefix client/$prefix_bb" +ceph auth get-or-create $prefix_bb mon "$cap" + +cap="allow command \"config-key get\" with key prefix client/" +cap="$cap, allow command \"config-key set\" with key prefix client/" +cap="$cap, allow command \"config-key ls\"" +ceph auth get-or-create $prefix_cc mon "$cap" + +cap="allow command \"config-key get\" with key=client/$match_aa/foo" +ceph auth get-or-create $match_aa mon "$cap" +cap="allow command \"config-key get\" with key=client/$match_bb/foo" +cap="$cap,allow command \"config-key set\" with key=client/$match_bb/foo" +ceph auth get-or-create $match_bb mon "$cap" + +ceph auth get-or-create $fail_aa mon 'allow rx' +ceph auth get-or-create $fail_bb mon 'allow r,allow w' +ceph auth get-or-create $fail_cc mon 'allow rw' +ceph auth get-or-create $fail_dd mon 'allow rwx' +ceph auth get-or-create $fail_ee mon 'allow profile bootstrap-rgw' +ceph auth get-or-create $fail_ff mon 'allow profile bootstrap-rbd' +# write commands will require rw; wx is not enough +ceph auth get-or-create $fail_gg mon 'allow service config-key wx' +# read commands will only require 'r'; 'rx' should be enough. 
+ceph auth get-or-create $fail_writes mon 'allow service config-key rx' + +# grab keyring +ceph auth export -o $k + +# keys will all the caps can do whatever +for c in $allow_aa $allow_bb $allow_cc $mgr_a $mgr_b; do + ceph -k $k --name $c config-key get daemon-private/osd.123/test-foo + ceph -k $k --name $c config-key get mgr/test-foo + ceph -k $k --name $c config-key get device/test-foo + ceph -k $k --name $c config-key get test/foo +done + +for c in $osd_a $osd_b; do + ceph -k $k --name $c config-key put daemon-private/$c/test-foo + ceph -k $k --name $c config-key get daemon-private/$c/test-foo + expect_false ceph -k $k --name $c config-key ls + expect_false ceph -k $k --name $c config-key get mgr/test-foo + expect_false ceph -k $k --name $c config-key get device/test-foo + expect_false ceph -k $k --name $c config-key get test/foo +done + +expect_false ceph -k $k --name $osd_a get daemon-private/$osd_b/test-foo +expect_false ceph -k $k --name $osd_b get daemon-private/$osd_a/test-foo + +expect_false ceph -k $k --name $prefix_aa \ + config-key ls +expect_false ceph -k $k --name $prefix_aa \ + config-key get daemon-private/osd.123/test-foo +expect_false ceph -k $k --name $prefix_aa \ + config-key set test/bar +expect_false ceph -k $k --name $prefix_aa \ + config-key set client/$prefix_aa/foo + +# write something so we can read, use a custom entity +ceph -k $k --name $allow_bb config-key set client/$prefix_aa/foo +ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/foo +# check one writes to the other's prefix, the other is able to read +ceph -k $k --name $prefix_bb config-key set client/$prefix_aa/bar +ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/bar + +ceph -k $k --name $prefix_bb config-key set client/$prefix_bb/foo +ceph -k $k --name $prefix_bb config-key get client/$prefix_bb/foo + +expect_false ceph -k $k --name $prefix_bb config-key get client/$prefix_aa/bar +expect_false ceph -k $k --name $prefix_bb config-key ls +expect_false ceph -k $k --name $prefix_bb \ + config-key get daemon-private/osd.123/test-foo +expect_false ceph -k $k --name $prefix_bb config-key get mgr/test-foo +expect_false ceph -k $k --name $prefix_bb config-key get device/test-foo +expect_false ceph -k $k --name $prefix_bb config-key get test/bar +expect_false ceph -k $k --name $prefix_bb config-key set test/bar + +ceph -k $k --name $prefix_cc config-key set client/$match_aa/foo +ceph -k $k --name $prefix_cc config-key set client/$match_bb/foo +ceph -k $k --name $prefix_cc config-key get client/$match_aa/foo +ceph -k $k --name $prefix_cc config-key get client/$match_bb/foo +expect_false ceph -k $k --name $prefix_cc config-key set other/prefix +expect_false ceph -k $k --name $prefix_cc config-key get mgr/test-foo +ceph -k $k --name $prefix_cc config-key ls >& /dev/null + +ceph -k $k --name $match_aa config-key get client/$match_aa/foo +expect_false ceph -k $k --name $match_aa config-key get client/$match_bb/foo +expect_false ceph -k $k --name $match_aa config-key set client/$match_aa/foo +ceph -k $k --name $match_bb config-key get client/$match_bb/foo +ceph -k $k --name $match_bb config-key set client/$match_bb/foo +expect_false ceph -k $k --name $match_bb config-key get client/$match_aa/foo +expect_false ceph -k $k --name $match_bb config-key set client/$match_aa/foo + +keys=(daemon-private/osd.123/test-foo + mgr/test-foo + device/test-foo + test/foo + client/$prefix_aa/foo + client/$prefix_bb/foo + client/$match_aa/foo + client/$match_bb/foo +) +# expect these all to fail accessing 
config-key +for c in $fail_aa $fail_bb $fail_cc \ + $fail_dd $fail_ee $fail_ff \ + $fail_gg; do + for m in get set; do + for key in ${keys[*]} client/$prefix_aa/foo client/$prefix_bb/foo; do + expect_false ceph -k $k --name $c config-key $m $key + done + done +done + +# fail writes but succeed on reads +expect_false ceph -k $k --name $fail_writes config-key set client/$match_aa/foo +expect_false ceph -k $k --name $fail_writes config-key set test/foo +ceph -k $k --name $fail_writes config-key ls +ceph -k $k --name $fail_writes config-key get client/$match_aa/foo +ceph -k $k --name $fail_writes config-key get daemon-private/osd.123/test-foo + +echo "OK" diff --git a/ceph/qa/workunits/rados/test_librados_build.sh b/ceph/qa/workunits/rados/test_librados_build.sh index 43ded25b2..3aaaec7eb 100755 --- a/ceph/qa/workunits/rados/test_librados_build.sh +++ b/ceph/qa/workunits/rados/test_librados_build.sh @@ -20,8 +20,8 @@ hello_world_cpp " BINARIES="${BINARIES_TO_RUN}hello_radosstriper_cpp " -DL_PREFIX="http://git.ceph.com/?p=ceph.git;a=blob_plain;f=examples/librados/" -#DL_PREFIX="https://raw.githubusercontent.com/ceph/ceph/master/examples/librados/" +DL_PREFIX="http://git.ceph.com/?p=ceph.git;a=blob_plain;hb=luminous;f=examples/librados/" +#DL_PREFIX="https://raw.githubusercontent.com/ceph/ceph/luminous/examples/librados/" DESTDIR=$(pwd) function cleanup () { diff --git a/ceph/qa/workunits/rbd/run_devstack_tempest.sh b/ceph/qa/workunits/rbd/run_devstack_tempest.sh index 7ee21f09f..65a45d8b7 100755 --- a/ceph/qa/workunits/rbd/run_devstack_tempest.sh +++ b/ceph/qa/workunits/rbd/run_devstack_tempest.sh @@ -1,7 +1,7 @@ #!/bin/bash -ex -STACK_BRANCH=stable/pike -TEMPEST_BRANCH=17.2.0 +STACK_BRANCH=stable/rocky +TEMPEST_BRANCH=19.0.0 STACK_USER=${STACK_USER:-stack} STACK_GROUP=${STACK_GROUP:-stack} diff --git a/ceph/qa/workunits/suites/cephfs_journal_tool_smoke.sh b/ceph/qa/workunits/suites/cephfs_journal_tool_smoke.sh index 60e914965..7e4ad3bd3 100755 --- a/ceph/qa/workunits/suites/cephfs_journal_tool_smoke.sh +++ b/ceph/qa/workunits/suites/cephfs_journal_tool_smoke.sh @@ -3,7 +3,7 @@ set -e set -x -export BIN="${BIN:-cephfs-journal-tool}" +export BIN="${BIN:-cephfs-journal-tool --rank=cephfs:0}" export JOURNAL_FILE=/tmp/journal.bin export JSON_OUTPUT=/tmp/json.tmp export BINARY_OUTPUT=/tmp/binary.tmp diff --git a/ceph/run-make-check.sh b/ceph/run-make-check.sh index 078345422..2244e5ea5 100755 --- a/ceph/run-make-check.sh +++ b/ceph/run-make-check.sh @@ -13,8 +13,31 @@ # # -# Return MAX(1, (number of processors / 2)) by default or NPROC +# To just look at what this script will do, run it like this: # +# $ DRY_RUN=echo ./run-make-check.sh +# + +set -e + +trap clean_up_after_myself EXIT + +ORIGINAL_CCACHE_CONF="$HOME/.ccache/ccache.conf" +SAVED_CCACHE_CONF="$HOME/.run-make-check-saved-ccache-conf" + +function save_ccache_conf() { + test -f $ORIGINAL_CCACHE_CONF && cp $ORIGINAL_CCACHE_CONF $SAVED_CCACHE_CONF || true +} + +function restore_ccache_conf() { + test -f $SAVED_CCACHE_CONF && mv $SAVED_CCACHE_CONF $ORIGINAL_CCACHE_CONF || true +} + +function clean_up_after_myself() { + rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv* + restore_ccache_conf +} + function get_processors() { if test -n "$NPROC" ; then echo $NPROC @@ -54,26 +77,72 @@ function run() { exit 1 fi if [ -n "$install_cmd" ]; then - $DRY_RUN sudo $install_cmd ccache jq $which_pkg + $DRY_RUN sudo $install_cmd ccache $which_pkg else echo "WARNING: Don't know how to install packages" >&2 echo "This probably means distribution $ID is not 
supported by run-make-check.sh" >&2 fi + if ! type ccache > /dev/null 2>&1 ; then + echo "ERROR: ccache could not be installed" + exit 1 + fi + if test -f ./install-deps.sh ; then $DRY_RUN ./install-deps.sh || return 1 + trap clean_up_after_myself EXIT fi # Init defaults after deps are installed. get_processors() depends on coreutils nproc. DEFAULT_MAKEOPTS=${DEFAULT_MAKEOPTS:--j$(get_processors)} BUILD_MAKEOPTS=${BUILD_MAKEOPTS:-$DEFAULT_MAKEOPTS} + test "$BUILD_MAKEOPTS" && echo "make will run with option(s) $BUILD_MAKEOPTS" CHECK_MAKEOPTS=${CHECK_MAKEOPTS:-$DEFAULT_MAKEOPTS} - $DRY_RUN ./do_cmake.sh $@ || return 1 + if type python2 > /dev/null 2>&1 ; then + # gtest-parallel requires Python 2 + CMAKE_PYTHON_OPTS="-DWITH_GTEST_PARALLEL=ON" + else + CMAKE_PYTHON_OPTS="-DWITH_PYTHON2=OFF -DWITH_PYTHON3=ON -DMGR_PYTHON_VERSION=3 -DWITH_GTEST_PARALLEL=OFF" + fi + + CMAKE_BUILD_OPTS="" + + cat <= 1024 + $DRY_RUN ulimit -n $(ulimit -Hn) + if [ $(ulimit -n) -lt 1024 ];then + echo "***ulimit -n too small, better bigger than 1024 for test***" + return 1 + fi + if ! $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure; then rm -f ${TMPDIR:-/tmp}/ceph-asok.* return 1 @@ -86,21 +155,14 @@ function main() { echo "with the ability to run commands as root via sudo." fi echo -n "Checking hostname sanity... " - if hostname --fqdn >/dev/null 2>&1 ; then + if $DRY_RUN hostname --fqdn >/dev/null 2>&1 ; then echo "OK" else echo "NOT OK" echo "Please fix 'hostname --fqdn', otherwise 'make check' will fail" return 1 fi - if run "$@" ; then - rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv* - echo "cmake check: successful run on $(git rev-parse HEAD)" - return 0 - else - rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv* - return 1 - fi + run "$@" && echo "make check: successful run on $(git rev-parse HEAD)" } main "$@" diff --git a/ceph/src/.git_version b/ceph/src/.git_version index fc407817f..268e02b63 100644 --- a/ceph/src/.git_version +++ b/ceph/src/.git_version @@ -1,2 +1,2 @@ -177915764b752804194937482a39e95e0ca3de94 -v12.2.10 +26dc3775efc7bb286a1d6d66faee0ba30ea23eee +v12.2.11 diff --git a/ceph/src/auth/AuthSessionHandler.cc b/ceph/src/auth/AuthSessionHandler.cc index ab46b60c5..286e383f6 100644 --- a/ceph/src/auth/AuthSessionHandler.cc +++ b/ceph/src/auth/AuthSessionHandler.cc @@ -30,6 +30,10 @@ AuthSessionHandler *get_auth_session_handler(CephContext *cct, int protocol, Cry switch (protocol) { case CEPH_AUTH_CEPHX: + // if there is no session key, there is no session handler. 
+ if (key.get_type() == CEPH_CRYPTO_NONE) { + return nullptr; + } return new CephxSessionHandler(cct, key, features); case CEPH_AUTH_NONE: return new AuthNoneSessionHandler(cct, key); diff --git a/ceph/src/ceph-create-keys b/ceph/src/ceph-create-keys index c14c02f28..41d76e157 100755 --- a/ceph/src/ceph-create-keys +++ b/ceph/src/ceph-create-keys @@ -91,12 +91,12 @@ def get_key(cluster, mon_id, wait_count=600): pathdir = os.path.dirname(path) if not os.path.exists(pathdir): os.makedirs(pathdir) - os.chmod(pathdir, 0770) + os.chmod(pathdir, 0o770) os.chown(pathdir, get_ceph_uid(), get_ceph_gid()) while wait_count > 0: try: - with file(tmp, 'w') as f: - os.fchmod(f.fileno(), 0600) + with open(tmp, 'w') as f: + os.fchmod(f.fileno(), 0o600) os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid()) LOG.info('Talking to monitor...') @@ -201,13 +201,13 @@ def bootstrap_key(cluster, type_, wait_count=600): pathdir = os.path.dirname(path) if not os.path.exists(pathdir): os.makedirs(pathdir) - os.chmod(pathdir, 0770) + os.chmod(pathdir, 0o770) os.chown(pathdir, get_ceph_uid(), get_ceph_gid()) while wait_count > 0: try: - with file(tmp, 'w') as f: - os.fchmod(f.fileno(), 0600) + with open(tmp, 'w') as f: + os.fchmod(f.fileno(), 0o600) os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid()) LOG.info('Talking to monitor...') returncode = subprocess.call( diff --git a/ceph/src/ceph-volume/ceph_volume/api/lvm.py b/ceph/src/ceph-volume/ceph_volume/api/lvm.py index aed4a8f64..bcb54d65b 100644 --- a/ceph/src/ceph-volume/ceph_volume/api/lvm.py +++ b/ceph/src/ceph-volume/ceph_volume/api/lvm.py @@ -466,6 +466,9 @@ def remove_vg(vg_name): """ Removes a volume group. """ + if not vg_name: + logger.warning('Skipping removal of invalid VG name: "%s"', vg_name) + return fail_msg = "Unable to remove vg %s" % vg_name process.run( [ diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py index 852c314c2..1ad15bc80 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py @@ -63,6 +63,9 @@ def activate_filestore(lvs, no_systemd=False): if not system.device_is_mounted(source, destination=destination): prepare_utils.mount_osd(source, osd_id, is_vdo=is_vdo) + # ensure that the OSD destination is always chowned properly + system.chown(destination) + # always re-do the symlink regardless if it exists, so that the journal # device path that may have changed can be mapped correctly every time destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id) @@ -151,7 +154,10 @@ def activate_bluestore(lvs, no_systemd=False): db_device_path = get_osd_device_path(osd_lv, lvs, 'db', dmcrypt_secret=dmcrypt_secret) wal_device_path = get_osd_device_path(osd_lv, lvs, 'wal', dmcrypt_secret=dmcrypt_secret) - # Once symlinks are removed, the osd dir can be 'primed again. + # Once symlinks are removed, the osd dir can be 'primed again. 
chown first, + # regardless of what currently exists so that ``prime-osd-dir`` can succeed + # even if permissions are somehow messed up + system.chown(osd_path) prime_command = [ 'ceph-bluestore-tool', '--cluster=%s' % conf.cluster, 'prime-osd-dir', '--dev', osd_lv_path, diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py index cce58b166..76a52f37d 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py @@ -139,14 +139,11 @@ class Batch(object): self.argv = argv def get_devices(self): - all_devices = disk.get_devices() # remove devices with partitions - # XXX Should be optional when getting device info - for device, detail in all_devices.items(): - if detail.get('partitions') != {}: - del all_devices[device] - devices = sorted(all_devices.items(), key=lambda x: (x[0], x[1]['size'])) - return device_formatter(devices) + devices = [(device, details) for device, details in + disk.get_devices().items() if details.get('partitions') == {}] + size_sort = lambda x: (x[0], x[1]['size']) + return device_formatter(sorted(devices, key=size_sort)) def print_help(self): return self._help.format( diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/bluestore.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/bluestore.py index 92dc3a2e9..ee269a394 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/bluestore.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/bluestore.py @@ -1,47 +1,28 @@ from __future__ import print_function -import json from ceph_volume.util import disk, prepare from ceph_volume.api import lvm from . import validators +from .strategies import Strategy +from .strategies import MixedStrategy from ceph_volume.devices.lvm.create import Create from ceph_volume.devices.lvm.prepare import Prepare from ceph_volume.util import templates from ceph_volume.exceptions import SizeAllocationError -class SingleType(object): +class SingleType(Strategy): """ Support for all SSDs, or all HDDS """ def __init__(self, devices, args): - self.args = args - self.osds_per_device = args.osds_per_device - self.devices = devices - # TODO: add --fast-devices and --slow-devices so these can be customized - self.hdds = [device for device in devices if device.sys_api['rotational'] == '1'] - self.ssds = [device for device in devices if device.sys_api['rotational'] == '0'] - self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices} - if self.devices: - self.validate() - self.compute() - else: - self.computed["changed"] = False + super(SingleType, self).__init__(devices, args) + self.validate_compute() @staticmethod def type(): return "bluestore.SingleType" - @property - def total_osds(self): - if self.hdds: - return len(self.hdds) * self.osds_per_device - else: - return len(self.ssds) * self.osds_per_device - - def report_json(self): - print(json.dumps(self.computed, indent=4, sort_keys=True)) - def report_pretty(self): string = "" if self.args.filtered_devices: @@ -141,32 +122,19 @@ class SingleType(object): Create(command).main() -class MixedType(object): +class MixedType(MixedStrategy): def __init__(self, devices, args): - self.args = args - self.devices = devices - self.osds_per_device = args.osds_per_device - # TODO: add --fast-devices and --slow-devices so these can be customized - self.hdds = [device for device in devices if device.sys_api['rotational'] == '1'] - self.ssds = [device for 
device in devices if device.sys_api['rotational'] == '0'] - self.computed = {'osds': [], 'filtered_devices': args.filtered_devices} + super(MixedType, self).__init__(devices, args) self.block_db_size = self.get_block_size() self.system_vgs = lvm.VolumeGroups() self.dbs_needed = len(self.hdds) * self.osds_per_device - if self.devices: - self.validate() - self.compute() - else: - self.computed["changed"] = False + self.validate_compute() @staticmethod def type(): return "bluestore.MixedType" - def report_json(self): - print(json.dumps(self.computed, indent=4, sort_keys=True)) - def get_block_size(self): if self.args.block_db_size: return disk.Size(b=self.args.block_db_size) @@ -319,17 +287,6 @@ class MixedType(object): else: Create(command).main() - def get_common_vg(self): - # find all the vgs associated with the current device - for ssd in self.ssds: - for pv in ssd.pvs_api: - vg = self.system_vgs.get(vg_name=pv.vg_name) - if not vg: - continue - # this should give us just one VG, it would've been caught by - # the validator otherwise - return vg - def validate(self): """ HDDs represent data devices, and solid state devices are for block.db, diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/filestore.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/filestore.py index b94cc6ea3..c01e83721 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/filestore.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/filestore.py @@ -1,8 +1,9 @@ from __future__ import print_function -import json from ceph_volume.util import disk, prepare from ceph_volume.api import lvm from . import validators +from .strategies import Strategy +from .strategies import MixedStrategy from ceph_volume.devices.lvm.create import Create from ceph_volume.devices.lvm.prepare import Prepare from ceph_volume.util import templates @@ -20,40 +21,21 @@ def get_journal_size(args): return prepare.get_journal_size(lv_format=False) -class SingleType(object): +class SingleType(Strategy): """ Support for all SSDs, or all HDDs, data and journal LVs will be colocated in the same device """ def __init__(self, devices, args): - self.args = args - self.osds_per_device = args.osds_per_device - self.devices = devices - self.hdds = [device for device in devices if device.sys_api['rotational'] == '1'] - self.ssds = [device for device in devices if device.sys_api['rotational'] == '0'] - self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices} + super(SingleType, self).__init__(devices, args) self.journal_size = get_journal_size(args) - if self.devices: - self.validate() - self.compute() - else: - self.computed["changed"] = False + self.validate_compute() @staticmethod def type(): return "filestore.SingleType" - @property - def total_osds(self): - if self.hdds: - return len(self.hdds) * self.osds_per_device - else: - return len(self.ssds) * self.osds_per_device - - def report_json(self): - print(json.dumps(self.computed, indent=4, sort_keys=True)) - def report_pretty(self): string = "" if self.args.filtered_devices: @@ -176,7 +158,7 @@ class SingleType(object): Create(command).main() -class MixedType(object): +class MixedType(MixedStrategy): """ Supports HDDs with SSDs, journals will be placed on SSDs, while HDDs will be used fully for data. 
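
Aside on the strategy refactor: the filestore.py and bluestore.py hunks in this patch all make the same move — the duplicated __init__/total_osds/report_json plumbing is dropped from SingleType and MixedType and inherited from the Strategy and MixedStrategy base classes added in strategies.py further down, with validate_compute() replacing the repeated "if self.devices: validate(); compute()" branch. A toy subclass, sketched below under the assumption of plain dicts instead of the real Device objects and argparse namespace, shows how a concrete strategy now only supplies validate() and compute(); the AllFlash class and dict layout are hypothetical.

    import json

    class Strategy(object):
        """Trimmed stand-in for the shared base class in strategies.py."""

        def __init__(self, devices, osds_per_device=1):
            self.devices = devices
            self.osds_per_device = osds_per_device
            self.hdds = [d for d in devices if d['rotational'] == '1']
            self.ssds = [d for d in devices if d['rotational'] == '0']
            self.computed = {'osds': [], 'vgs': []}

        def validate_compute(self):
            # shared replacement for the per-class "if self.devices" branch
            if self.devices:
                self.validate()
                self.compute()
            else:
                self.computed['changed'] = False

        @property
        def total_osds(self):
            return len(self.hdds or self.ssds) * self.osds_per_device

        def report_json(self):
            print(json.dumps(self.computed, indent=4, sort_keys=True))

        def validate(self):
            raise NotImplementedError('validate() must be implemented in a child class')

        def compute(self):
            raise NotImplementedError('compute() must be implemented in a child class')

    class AllFlash(Strategy):
        """Hypothetical strategy: osds_per_device OSD slots per SSD, no journals."""

        def validate(self):
            assert self.ssds and not self.hdds, 'expects solid state devices only'

        def compute(self):
            for ssd in self.ssds:
                self.computed['osds'].extend([ssd['path']] * self.osds_per_device)

    if __name__ == '__main__':
        strategy = AllFlash([{'path': '/dev/sdb', 'rotational': '0'},
                             {'path': '/dev/sdc', 'rotational': '0'}],
                            osds_per_device=2)
        strategy.validate_compute()
        print('total osds:', strategy.total_osds)
        strategy.report_json()
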
@@ -186,36 +168,17 @@ class MixedType(object): """ def __init__(self, devices, args): - self.args = args - self.osds_per_device = args.osds_per_device - self.devices = devices - self.hdds = [device for device in devices if device.sys_api['rotational'] == '1'] - self.ssds = [device for device in devices if device.sys_api['rotational'] == '0'] - self.computed = {'osds': [], 'vg': None, 'filtered_devices': args.filtered_devices} + super(MixedType, self).__init__(devices, args) self.blank_ssds = [] self.journals_needed = len(self.hdds) * self.osds_per_device self.journal_size = get_journal_size(args) self.system_vgs = lvm.VolumeGroups() - if self.devices: - self.validate() - self.compute() - else: - self.computed["changed"] = False + self.validate_compute() @staticmethod def type(): return "filestore.MixedType" - def report_json(self): - print(json.dumps(self.computed, indent=4, sort_keys=True)) - - @property - def total_osds(self): - if self.hdds: - return len(self.hdds) * self.osds_per_device - else: - return len(self.ssds) * self.osds_per_device - def report_pretty(self): string = "" if self.args.filtered_devices: @@ -252,17 +215,6 @@ class MixedType(object): print(string) - def get_common_vg(self): - # find all the vgs associated with the current device - for ssd in self.ssds: - for pv in ssd.pvs_api: - vg = self.system_vgs.get(vg_name=pv.vg_name) - if not vg: - continue - # this should give us just one VG, it would've been caught by - # the validator otherwise - return vg - def validate(self): """ Ensure that the minimum requirements for this type of scenario is diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/strategies.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/strategies.py new file mode 100644 index 000000000..d4ec5a730 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/strategies.py @@ -0,0 +1,50 @@ +import json + +class Strategy(object): + + def __init__(self, devices, args): + self.args = args + self.osds_per_device = args.osds_per_device + self.devices = devices + self.hdds = [device for device in devices if device.sys_api['rotational'] == '1'] + self.ssds = [device for device in devices if device.sys_api['rotational'] == '0'] + self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices} + + def validate_compute(self): + if self.devices: + self.validate() + self.compute() + else: + self.computed["changed"] = False + + def report_json(self): + print(json.dumps(self.computed, indent=4, sort_keys=True)) + + @property + def total_osds(self): + if self.hdds: + return len(self.hdds) * self.osds_per_device + else: + return len(self.ssds) * self.osds_per_device + + # protect against base class instantiation and incomplete implementations. 
+ # We could also use the abc module and implement this as an + # AbstractBaseClass + def compute(self): + raise NotImplementedError('compute() must be implemented in a child class') + + def execute(self): + raise NotImplementedError('execute() must be implemented in a child class') + +class MixedStrategy(Strategy): + + def get_common_vg(self): + # find all the vgs associated with the current device + for ssd in self.ssds: + for pv in ssd.pvs_api: + vg = self.system_vgs.get(vg_name=pv.vg_name) + if not vg: + continue + # this should give us just one VG, it would've been caught by + # the validator otherwise + return vg diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py index 8e0e3a3c5..328a03615 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -1,11 +1,14 @@ import argparse +import os import logging from textwrap import dedent from ceph_volume import decorators, terminal, process from ceph_volume.api import lvm as api -from ceph_volume.util import system, encryption, disk +from ceph_volume.util import system, encryption, disk, arg_validators +from ceph_volume.util.device import Device +from ceph_volume.systemd import systemctl logger = logging.getLogger(__name__) mlogger = terminal.MultiLogger(__name__) @@ -39,6 +42,79 @@ def zap_data(path): ]) +def find_associated_devices(osd_id=None, osd_fsid=None): + """ + From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the + system that match those tag values, further detect if any partitions are + part of the OSD, and then return the set of LVs and partitions (if any). + """ + lv_tags = {} + if osd_id: + lv_tags['ceph.osd_id'] = osd_id + if osd_fsid: + lv_tags['ceph.osd_fsid'] = osd_fsid + lvs = api.Volumes() + lvs.filter(lv_tags=lv_tags) + if not lvs: + raise RuntimeError('Unable to find any LV for zapping OSD: %s' % osd_id or osd_fsid) + + devices_to_zap = ensure_associated_lvs(lvs) + + return [Device(path) for path in set(devices_to_zap) if path] + + +def ensure_associated_lvs(lvs): + """ + Go through each LV and ensure if backing devices (journal, wal, block) + are LVs or partitions, so that they can be accurately reported. + """ + # look for many LVs for each backing type, because it is possible to + # receive a filtering for osd.1, and have multiple failed deployments + # leaving many journals with osd.1 - usually, only a single LV will be + # returned + journal_lvs = lvs._filter(lv_tags={'ceph.type': 'journal'}) + db_lvs = lvs._filter(lv_tags={'ceph.type': 'db'}) + wal_lvs = lvs._filter(lv_tags={'ceph.type': 'wal'}) + backing_devices = [ + (journal_lvs, 'journal'), + (db_lvs, 'block'), + (wal_lvs, 'wal') + ] + + verified_devices = [] + + for lv in lvs: + # go through each lv and append it, otherwise query `blkid` to find + # a physical device. 
Do this for each type (journal,db,wal) regardless + # if they have been processed in the previous LV, so that bad devices + # with the same ID can be caught + for ceph_lvs, _type in backing_devices: + if ceph_lvs: + verified_devices.extend([l.lv_path for l in ceph_lvs]) + continue + + # must be a disk partition, by querying blkid by the uuid we are + # ensuring that the device path is always correct + try: + device_uuid = lv.tags['ceph.%s_uuid' % _type] + except KeyError: + # Bluestore will not have ceph.journal_uuid, and Filestore + # will not not have ceph.db_uuid + continue + + osd_device = disk.get_device_from_partuuid(device_uuid) + if not osd_device: + # if the osd_device is not found by the partuuid, then it is + # not possible to ensure this device exists anymore, so skip it + continue + verified_devices.append(osd_device) + + verified_devices.append(lv.lv_path) + + # reduce the list from all the duplicates that were added + return list(set(verified_devices)) + + class Zap(object): help = 'Removes all data and filesystems from a logical volume or partition.' @@ -59,70 +135,128 @@ class Zap(object): if dmcrypt and dmcrypt_uuid: self.dmcrypt_close(dmcrypt_uuid) + def zap_lv(self, device): + """ + Device examples: vg-name/lv-name, /dev/vg-name/lv-name + Requirements: Must be a logical volume (LV) + """ + lv = api.get_lv(lv_name=device.lv_name, vg_name=device.vg_name) + self.unmount_lv(lv) + + wipefs(device.abspath) + zap_data(device.abspath) + + if self.args.destroy: + lvs = api.Volumes() + lvs.filter(vg_name=device.vg_name) + if len(lvs) <= 1: + mlogger.info('Only 1 LV left in VG, will proceed to destroy volume group %s', device.vg_name) + api.remove_vg(device.vg_name) + else: + mlogger.info('More than 1 LV left in VG, will proceed to destroy LV only') + mlogger.info('Removing LV because --destroy was given: %s', device.abspath) + api.remove_lv(device.abspath) + elif lv: + # just remove all lvm metadata, leaving the LV around + lv.clear_tags() + + def zap_partition(self, device): + """ + Device example: /dev/sda1 + Requirements: Must be a partition + """ + if device.is_encrypted: + # find the holder + holders = [ + '/dev/%s' % holder for holder in device.sys_api.get('holders', []) + ] + for mapper_uuid in os.listdir('/dev/mapper'): + mapper_path = os.path.join('/dev/mapper', mapper_uuid) + if os.path.realpath(mapper_path) in holders: + self.dmcrypt_close(mapper_uuid) + + if system.device_is_mounted(device.abspath): + mlogger.info("Unmounting %s", device.abspath) + system.unmount(device.abspath) + + wipefs(device.abspath) + zap_data(device.abspath) + + if self.args.destroy: + mlogger.info("Destroying partition since --destroy was used: %s" % device.abspath) + disk.remove_partition(device) + + def zap_lvm_member(self, device): + """ + An LVM member may have more than one LV and or VG, for example if it is + a raw device with multiple partitions each belonging to a different LV + + Device example: /dev/sda + Requirements: An LV or VG present in the device, making it an LVM member + """ + for lv in device.lvs: + self.zap_lv(Device(lv.lv_path)) + + + def zap_raw_device(self, device): + """ + Any whole (raw) device passed in as input will be processed here, + checking for LVM membership and partitions (if any). 
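The encrypted-partition handling in zap_partition() above hinges on matching entries under /dev/mapper against the partition's kernel holders by real path. A rough standalone sketch of that lookup; the function name and defaults are illustrative, not the ceph-volume API:

    import os

    def find_dm_holder(partition_holders, mapper_dir='/dev/mapper'):
        """Return the device-mapper name (e.g. a dmcrypt uuid) backing the
        partition, or None if it has no device-mapper holder."""
        holder_paths = ['/dev/%s' % holder for holder in partition_holders]  # e.g. ['/dev/dm-0']
        for name in os.listdir(mapper_dir):
            if os.path.realpath(os.path.join(mapper_dir, name)) in holder_paths:
                return name
        return None

    # Usage sketch: if find_dm_holder(['dm-0']) returns a name, that mapping is
    # closed (dmcrypt_close) before the partition is unmounted and wiped.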
+ + Device example: /dev/sda + Requirements: None + """ + if not self.args.destroy: + # the use of dd on a raw device causes the partition table to be + # destroyed + mlogger.warning( + '--destroy was not specified, but zapping a whole device will remove the partition table' + ) + + # look for partitions and zap those + for part_name in device.sys_api.get('partitions', {}).keys(): + self.zap_partition(Device('/dev/%s' % part_name)) + + wipefs(device.abspath) + zap_data(device.abspath) + @decorators.needs_root - def zap(self, args): - for device in args.devices: - if disk.is_mapper_device(device): + def zap(self, devices=None): + devices = devices or self.args.devices + + for device in devices: + mlogger.info("Zapping: %s", device.abspath) + if device.is_mapper: terminal.error("Refusing to zap the mapper device: {}".format(device)) raise SystemExit(1) - lv = api.get_lv_from_argument(device) - if lv: - # we are zapping a logical volume - path = lv.lv_path - self.unmount_lv(lv) - else: - # we are zapping a partition - #TODO: ensure device is a partition - path = device - # check to if it is encrypted to close - partuuid = disk.get_partuuid(device) - if encryption.status("/dev/mapper/{}".format(partuuid)): - dmcrypt_uuid = partuuid - self.dmcrypt_close(dmcrypt_uuid) - - mlogger.info("Zapping: %s", path) - - # check if there was a pv created with the - # name of device - pvs = api.PVolumes() - pvs.filter(pv_name=device) - vgs = set([pv.vg_name for pv in pvs]) - for pv in pvs: - vg_name = pv.vg_name - lv = None - if pv.lv_uuid: - lv = api.get_lv(vg_name=vg_name, lv_uuid=pv.lv_uuid) - - if lv: - self.unmount_lv(lv) - - if args.destroy: - for vg_name in vgs: - mlogger.info("Destroying volume group %s because --destroy was given", vg_name) - api.remove_vg(vg_name) - if not lv: - mlogger.info("Destroying physical volume %s because --destroy was given", device) - api.remove_pv(device) - - wipefs(path) - zap_data(path) - - if lv and not pvs: - if args.destroy: - lvs = api.Volumes() - lvs.filter(vg_name=lv.vg_name) - if len(lvs) <= 1: - mlogger.info('Only 1 LV left in VG, will proceed to destroy volume group %s', lv.vg_name) - api.remove_vg(lv.vg_name) - else: - mlogger.info('More than 1 LV left in VG, will proceed to destroy LV only') - mlogger.info('Removing LV because --destroy was given: %s', lv) - api.remove_lv(lv) - else: - # just remove all lvm metadata, leaving the LV around - lv.clear_tags() - - terminal.success("Zapping successful for: %s" % ", ".join(args.devices)) + if device.is_lvm_member: + self.zap_lvm_member(device) + if device.is_lv: + self.zap_lv(device) + if device.is_partition: + self.zap_partition(device) + if device.is_device: + self.zap_raw_device(device) + + if self.args.devices: + terminal.success( + "Zapping successful for: %s" % ", ".join([str(d) for d in self.args.devices]) + ) + else: + terminal.success( + "Zapping successful for OSD: %s" % self.args.osd_id or self.args.osd_fsid + ) + + @decorators.needs_root + def zap_osd(self): + if self.args.osd_id: + osd_is_running = systemctl.osd_is_active(self.args.osd_id) + if osd_is_running: + mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id) + mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id) + raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id) + devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid) + self.zap(devices) def dmcrypt_close(self, dmcrypt_uuid): dmcrypt_path = "/dev/mapper/{}".format(dmcrypt_uuid) @@ -155,6 +289,14 @@ class 
Zap(object): ceph-volume lvm zap /dev/sda /dev/sdb /db/sdc + Zapping devices associated with an OSD ID: + + ceph-volume lvm zap --osd-id 1 + + Optionally include the OSD FSID + + ceph-volume lvm zap --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D + If the --destroy flag is given and you are zapping a raw device or partition then all vgs and lvs that exist on that raw device or partition will be destroyed. @@ -179,17 +321,35 @@ class Zap(object): 'devices', metavar='DEVICES', nargs='*', + type=arg_validators.ValidDevice(gpt_ok=True), default=[], help='Path to one or many lv (as vg/lv), partition (as /dev/sda1) or device (as /dev/sda)' ) + parser.add_argument( '--destroy', action='store_true', default=False, help='Destroy all volume groups and logical volumes if you are zapping a raw device or partition', ) + + parser.add_argument( + '--osd-id', + help='Specify an OSD ID to detect associated devices for zapping', + ) + + parser.add_argument( + '--osd-fsid', + help='Specify an OSD FSID to detect associated devices for zapping', + ) + if len(self.argv) == 0: print(sub_command_help) return - args = parser.parse_args(self.argv) - self.zap(args) + + self.args = parser.parse_args(self.argv) + + if self.args.osd_id or self.args.osd_fsid: + self.zap_osd() + else: + self.zap() diff --git a/ceph/src/ceph-volume/ceph_volume/inventory/main.py b/ceph/src/ceph-volume/ceph_volume/inventory/main.py index f4c732cab..1d821b602 100644 --- a/ceph/src/ceph-volume/ceph_volume/inventory/main.py +++ b/ceph/src/ceph-volume/ceph_volume/inventory/main.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import argparse -import pprint +import json from ceph_volume.util.device import Devices, Device @@ -39,8 +39,8 @@ class Inventory(object): def format_report(self, inventory): if self.args.format == 'json': - print(inventory.json_report()) + print(json.dumps(inventory.json_report())) elif self.args.format == 'json-pretty': - pprint.pprint(inventory.json_report()) + print(json.dumps(inventory.json_report(), indent=4, sort_keys=True)) else: print(inventory.pretty_report()) diff --git a/ceph/src/ceph-volume/ceph_volume/tests/conftest.py b/ceph/src/ceph-volume/ceph_volume/tests/conftest.py index cf7dd5d8f..8ec99bb84 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/conftest.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/conftest.py @@ -192,10 +192,11 @@ def tmpfile(tmpdir): @pytest.fixture def device_info(monkeypatch): - def apply(devices=None, lsblk=None, lv=None, blkid=None): + def apply(devices=None, lsblk=None, lv=None, blkid=None, udevadm=None): devices = devices if devices else {} lsblk = lsblk if lsblk else {} blkid = blkid if blkid else {} + udevadm = udevadm if udevadm else {} lv = Factory(**lv) if lv else None monkeypatch.setattr("ceph_volume.sys_info.devices", {}) monkeypatch.setattr("ceph_volume.util.device.disk.get_devices", lambda: devices) @@ -206,4 +207,5 @@ def device_info(monkeypatch): monkeypatch.setattr("ceph_volume.util.device.lvm.get_lv", lambda vg_name, lv_uuid: lv) monkeypatch.setattr("ceph_volume.util.device.disk.lsblk", lambda path: lsblk) monkeypatch.setattr("ceph_volume.util.device.disk.blkid", lambda path: blkid) + monkeypatch.setattr("ceph_volume.util.disk.udevadm_property", lambda *a, **kw: udevadm) return apply diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py index d1f9046a0..50ef61b83 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py +++ 
b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py @@ -1,6 +1,13 @@ from ceph_volume.devices.lvm import batch +class TestBatchSmoke(object): + + def test_batch_instance(self, is_root): + b = batch.Batch([]) + b.main() + + class TestFilterDevices(object): def test_filter_used_device(self, factory): diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py new file mode 100644 index 000000000..55daa4f87 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py @@ -0,0 +1,153 @@ +import pytest +from ceph_volume.api import lvm as api +from ceph_volume.devices.lvm import zap + + +class TestFindAssociatedDevices(object): + + def test_no_lvs_found_that_match_id(self, volumes, monkeypatch, device_info): + monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes) + tags = 'ceph.osd_id=9,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags) + volumes.append(osd) + with pytest.raises(RuntimeError): + zap.find_associated_devices(osd_id=10) + + def test_no_lvs_found_that_match_fsid(self, volumes, monkeypatch, device_info): + monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes) + tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags) + volumes.append(osd) + with pytest.raises(RuntimeError): + zap.find_associated_devices(osd_fsid='aaaa-lkjh') + + def test_no_lvs_found_that_match_id_fsid(self, volumes, monkeypatch, device_info): + monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes) + tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags) + volumes.append(osd) + with pytest.raises(RuntimeError): + zap.find_associated_devices(osd_id='9', osd_fsid='aaaa-lkjh') + + def test_no_ceph_lvs_found(self, volumes, monkeypatch): + monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes) + osd = api.Volume( + lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags='') + volumes.append(osd) + with pytest.raises(RuntimeError): + zap.find_associated_devices(osd_id=100) + + def test_lv_is_matched_id(self, volumes, monkeypatch): + monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes) + tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(osd) + result = zap.find_associated_devices(osd_id='0') + assert result[0].abspath == '/dev/VolGroup/lv' + + def test_lv_is_matched_fsid(self, volumes, monkeypatch): + monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes) + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(osd) + result = zap.find_associated_devices(osd_fsid='asdf-lkjh') + assert result[0].abspath == '/dev/VolGroup/lv' + + def test_lv_is_matched_id_fsid(self, volumes, monkeypatch): + monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes) + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags) + 
volumes.append(osd) + result = zap.find_associated_devices(osd_id='0', osd_fsid='asdf-lkjh') + assert result[0].abspath == '/dev/VolGroup/lv' + + +class TestEnsureAssociatedLVs(object): + + def test_nothing_is_found(self, volumes): + result = zap.ensure_associated_lvs(volumes) + assert result == [] + + def test_data_is_found(self, volumes): + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/data', lv_tags=tags) + volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert result == ['/dev/VolGroup/data'] + + def test_block_is_found(self, volumes): + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags) + volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert result == ['/dev/VolGroup/block'] + + def test_block_and_partition_are_found(self, volumes, monkeypatch): + monkeypatch.setattr(zap.disk, 'get_device_from_partuuid', lambda x: '/dev/sdb1') + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags) + volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert '/dev/sdb1' in result + assert '/dev/VolGroup/block' in result + + def test_journal_is_found(self, volumes): + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal' + osd = api.Volume( + lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert result == ['/dev/VolGroup/lv'] + + def test_multiple_journals_are_found(self, volumes): + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal' + for i in range(3): + osd = api.Volume( + lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags) + volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert '/dev/VolGroup/lv0' in result + assert '/dev/VolGroup/lv1' in result + assert '/dev/VolGroup/lv2' in result + + def test_multiple_dbs_are_found(self, volumes): + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=db' + for i in range(3): + osd = api.Volume( + lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags) + volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert '/dev/VolGroup/lv0' in result + assert '/dev/VolGroup/lv1' in result + assert '/dev/VolGroup/lv2' in result + + def test_multiple_wals_are_found(self, volumes): + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.wal_uuid=x,ceph.type=wal' + for i in range(3): + osd = api.Volume( + lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags) + volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert '/dev/VolGroup/lv0' in result + assert '/dev/VolGroup/lv1' in result + assert '/dev/VolGroup/lv2' in result + + def test_multiple_backing_devs_are_found(self, volumes): + for _type in ['journal', 'db', 'wal']: + tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.wal_uuid=x,ceph.type=%s' % _type + osd = api.Volume( + lv_name='volume%s' % _type, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % _type, lv_tags=tags) + 
volumes.append(osd) + result = zap.ensure_associated_lvs(volumes) + assert '/dev/VolGroup/lvjournal' in result + assert '/dev/VolGroup/lvwal' in result + assert '/dev/VolGroup/lvdb' in result diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py index 493c74c50..6333e3a4e 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py @@ -19,7 +19,9 @@ class TestZap(object): '/dev/mapper/foo', '/dev/dm-0', ]) - def test_can_not_zap_mapper_device(self, capsys, is_root, device_name): + def test_can_not_zap_mapper_device(self, monkeypatch, device_info, capsys, is_root, device_name): + monkeypatch.setattr('os.path.exists', lambda x: True) + device_info() with pytest.raises(SystemExit): lvm.zap.Zap(argv=[device_name]).main() stdout, stderr = capsys.readouterr() diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type-dmcrypt/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type-dmcrypt/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type-dmcrypt/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type-dmcrypt/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type-dmcrypt/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type-dmcrypt/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ 
b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type-dmcrypt/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type-dmcrypt/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/playbooks/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/playbooks/test_zap.yml new file mode 100644 index 000000000..850ecc94e --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/playbooks/test_zap.yml @@ -0,0 +1,31 @@ + +- hosts: osds + become: yes + tasks: + + - name: stop ceph-osd daemons + service: + name: "ceph-osd@{{ item }}" + state: stopped + with_items: "{{ osd_ids }}" + + +- hosts: mons + become: yes + tasks: + + - name: purge osds + command: "ceph --cluster {{ cluster }} osd purge osd.{{ item }} --yes-i-really-mean-it" + with_items: "{{ osd_ids }}" + + +- hosts: osds + become: yes + tasks: + + - name: zap devices used for OSDs + command: "ceph-volume --cluster {{ cluster }} lvm zap --osd-id {{ item }} --destroy" + with_items: "{{ osd_ids }}" + environment: + CEPH_VOLUME_DEBUG: 1 + diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini index c2725a09f..4c3af6811 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini @@ -63,4 +63,7 @@ commands= # retest to ensure cluster came back up correctly testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # test zap OSDs by ID + ansible-playbook -vv -i {changedir}/hosts {changedir}/test_zap.yml + vagrant destroy {env:VAGRANT_DESTROY_FLAGS:"--force"} diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type-dmcrypt/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type-dmcrypt/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ 
b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type-dmcrypt/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type-dmcrypt/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type/test_zap.yml new file mode 120000 index 000000000..cb969fa1d --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type/test_zap.yml @@ -0,0 +1 @@ +../../../playbooks/test_zap.yml \ No newline at end of file diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml index bebe6dc36..8caa1ce38 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml @@ -32,6 +32,17 @@ environment: CEPH_VOLUME_DEBUG: 1 + # partitions have been completely removed, so re-create them again + - name: re-create partition /dev/sdd for lvm data usage + parted: + device: /dev/sdd + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + label: gpt + state: present + - name: redeploy osd.2 using /dev/sdd1 command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml index c48e4bece..17b74d524 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml @@ -40,6 +40,27 @@ environment: CEPH_VOLUME_DEBUG: 1 + # partitions have been completely removed, so re-create them again + - name: re-create partition /dev/sdd for lvm data usage + parted: + device: /dev/sdd + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + label: gpt + state: present + + - name: re-create partition /dev/sdd lvm journals + parted: + device: /dev/sdd + number: 2 + part_start: 50% + part_end: 100% + unit: '%' + state: present + label: gpt + - name: redeploy osd.2 using /dev/sdd1 command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2" environment: @@ -56,6 +77,16 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: re-create partition /dev/sdc1 + parted: + device: /dev/sdc + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + state: present + label: gpt + - name: prepare osd.0 again using test_group/data-lv1 command: "ceph-volume --cluster {{ cluster }} lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml 
b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml index e4e804a70..353df127c 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml @@ -35,6 +35,17 @@ environment: CEPH_VOLUME_DEBUG: 1 + # partitions have been completely removed, so re-create them again + - name: re-create partition /dev/sdd for lvm data usage + parted: + device: /dev/sdd + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + label: gpt + state: present + - name: redeploy osd.2 using /dev/sdd1 command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2" environment: @@ -51,6 +62,37 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: find all OSD directories + find: + paths: /var/lib/ceph/osd + recurse: no + file_type: directory + register: osd_directories + + - name: find all OSD symlinks + find: + paths: /var/lib/ceph/osd + recurse: yes + depth: 2 + file_type: link + register: osd_symlinks + + # set the OSD dir and the block/block.db links to root:root permissions, to + # ensure that the OSD will be able to activate regardless + - file: + path: "{{ item.path }}" + owner: root + group: root + with_items: + - "{{ osd_directories.files }}" + + - file: + path: "{{ item.path }}" + owner: root + group: root + with_items: + - "{{ osd_symlinks.files }}" + - name: activate all to start the previously prepared osd.0 command: "ceph-volume lvm activate --all" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml index 4aa3cf19d..e896c41b0 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml @@ -41,6 +41,27 @@ environment: CEPH_VOLUME_DEBUG: 1 + # partitions have been completely removed, so re-create them again + - name: re-create partition /dev/sdd for lvm data usage + parted: + device: /dev/sdd + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + label: gpt + state: present + + - name: re-create partition /dev/sdd lvm journals + parted: + device: /dev/sdd + number: 2 + part_start: 50% + part_end: 100% + unit: '%' + state: present + label: gpt + - name: redeploy osd.2 using /dev/sdd1 command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2" environment: @@ -65,6 +86,34 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: find all OSD paths + find: + paths: /var/lib/ceph/osd + recurse: no + file_type: directory + register: osd_paths + + # set all OSD paths to root:rootto ensure that the OSD will be able to + # activate regardless + - name: mangle permissions to root + file: + path: "{{ item.path }}" + owner: root + group: root + recurse: yes + with_items: + - "{{ osd_paths.files }}" + + - name: stop ceph-osd@2 daemon + service: + name: ceph-osd@2 + state: stopped + + - name: stop ceph-osd@1 daemon + service: + name: ceph-osd@1 + state: stopped + - name: activate all to start the previously prepared osd.0 command: "ceph-volume lvm activate --filestore --all" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml index 19209b1d2..3e032e202 100644 --- 
a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml @@ -33,6 +33,17 @@ environment: CEPH_VOLUME_DEBUG: 1 + # partitions have been completely removed, so re-create them again + - name: re-create partition /dev/sdd for lvm data usage + parted: + device: /dev/sdd + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + label: gpt + state: present + - name: redeploy osd.2 using /dev/sdd1 command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml index c48e4bece..17b74d524 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml @@ -40,6 +40,27 @@ environment: CEPH_VOLUME_DEBUG: 1 + # partitions have been completely removed, so re-create them again + - name: re-create partition /dev/sdd for lvm data usage + parted: + device: /dev/sdd + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + label: gpt + state: present + + - name: re-create partition /dev/sdd lvm journals + parted: + device: /dev/sdd + number: 2 + part_start: 50% + part_end: 100% + unit: '%' + state: present + label: gpt + - name: redeploy osd.2 using /dev/sdd1 command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2" environment: @@ -56,6 +77,16 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: re-create partition /dev/sdc1 + parted: + device: /dev/sdc + number: 1 + part_start: 0% + part_end: 50% + unit: '%' + state: present + label: gpt + - name: prepare osd.0 again using test_group/data-lv1 command: "ceph-volume --cluster {{ cluster }} lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/playbooks/deploy.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/playbooks/deploy.yml index 3564cf3cd..f46fcb1d4 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/playbooks/deploy.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/playbooks/deploy.yml @@ -75,10 +75,10 @@ become: True any_errors_fatal: true roles: - - role: ceph-defaults - tags: ['ceph_update_config'] - - role: ceph-handler - - role: ceph-common + - ceph-defaults + - ceph-facts + - ceph-handler + - ceph-common tasks: - name: rsync ceph-volume to test nodes on centos synchronize: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py index 99e1d494c..8be5f8e4b 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py @@ -18,12 +18,37 @@ class TestDevice(object): disk = device.Device("vg/lv") assert disk.is_lv - def test_is_device(self, device_info): + def test_vgs_is_empty(self, device_info, pvolumes, monkeypatch): + BarPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={}) + pvolumes.append(BarPVolume) + monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes) + lsblk = {"TYPE": "disk"} + device_info(lsblk=lsblk) + disk = device.Device("/dev/nvme0n1") + assert disk.vgs == [] + + def test_vgs_is_not_empty(self, 
device_info, pvolumes, monkeypatch): + BarPVolume = api.PVolume(vg_name='foo', lv_uuid='111', pv_name='/dev/nvme0n1', pv_uuid="0000", pv_tags={}) + pvolumes.append(BarPVolume) + monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes) + lsblk = {"TYPE": "disk"} + device_info(lsblk=lsblk) + disk = device.Device("/dev/nvme0n1") + assert len(disk.vgs) == 1 + + def test_device_is_device(self, device_info, pvolumes): data = {"/dev/sda": {"foo": "bar"}} lsblk = {"TYPE": "device"} device_info(devices=data, lsblk=lsblk) disk = device.Device("/dev/sda") - assert disk.is_device + assert disk.is_device is True + + def test_disk_is_device(self, device_info, pvolumes): + data = {"/dev/sda": {"foo": "bar"}} + lsblk = {"TYPE": "disk"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.is_device is True def test_is_partition(self, device_info, pvolumes): data = {"/dev/sda": {"foo": "bar"}} @@ -51,6 +76,11 @@ class TestDevice(object): disk = device.Device("/dev/mapper/foo") assert disk.is_mapper + def test_dm_is_mapper_device(self, device_info): + device_info() + disk = device.Device("/dev/dm-4") + assert disk.is_mapper + def test_is_not_mapper_device(self, device_info): device_info() disk = device.Device("/dev/sda") @@ -62,6 +92,14 @@ class TestDevice(object): disk = device.Device("/dev/sda") assert disk.is_ceph_disk_member + def test_is_ceph_disk_member_not_available(self, device_info): + lsblk = {"PARTLABEL": "ceph data"} + device_info(lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.is_ceph_disk_member + assert not disk.available + assert "Used by ceph-disk" in disk.rejected_reasons + def test_is_not_ceph_disk_member_lsblk(self, device_info): lsblk = {"PARTLABEL": "gluster partition"} device_info(lsblk=lsblk) @@ -117,6 +155,125 @@ class TestDevice(object): disk = device.Device("/dev/sda") assert not disk.used_by_ceph + def test_get_device_id(self, device_info): + udev = {k:k for k in ['ID_VENDOR', 'ID_MODEL', 'ID_SCSI_SERIAL']} + device_info(udevadm=udev) + disk = device.Device("/dev/sda") + assert disk._get_device_id() == 'ID_VENDOR_ID_MODEL_ID_SCSI_SERIAL' + + + +class TestDeviceEncryption(object): + + def test_partition_is_not_encrypted_lsblk(self, device_info, pvolumes): + lsblk = {'TYPE': 'part', 'FSTYPE': 'xfs'} + device_info(lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.is_encrypted is False + + def test_partition_is_encrypted_lsblk(self, device_info, pvolumes): + lsblk = {'TYPE': 'part', 'FSTYPE': 'crypto_LUKS'} + device_info(lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.is_encrypted is True + + def test_partition_is_not_encrypted_blkid(self, device_info, pvolumes): + lsblk = {'TYPE': 'part'} + blkid = {'TYPE': 'ceph data'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + assert disk.is_encrypted is False + + def test_partition_is_encrypted_blkid(self, device_info, pvolumes): + lsblk = {'TYPE': 'part'} + blkid = {'TYPE': 'crypto_LUKS'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + assert disk.is_encrypted is True + + def test_mapper_is_encrypted_luks1(self, device_info, pvolumes, monkeypatch): + status = {'type': 'LUKS1'} + monkeypatch.setattr(device, 'encryption_status', lambda x: status) + lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/mapper/uuid") + assert disk.is_encrypted is True + + def test_mapper_is_encrypted_luks2(self, device_info, pvolumes, monkeypatch): 
+ status = {'type': 'LUKS2'} + monkeypatch.setattr(device, 'encryption_status', lambda x: status) + lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/mapper/uuid") + assert disk.is_encrypted is True + + def test_mapper_is_encrypted_plain(self, device_info, pvolumes, monkeypatch): + status = {'type': 'PLAIN'} + monkeypatch.setattr(device, 'encryption_status', lambda x: status) + lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/mapper/uuid") + assert disk.is_encrypted is True + + def test_mapper_is_not_encrypted_plain(self, device_info, pvolumes, monkeypatch): + monkeypatch.setattr(device, 'encryption_status', lambda x: {}) + lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/mapper/uuid") + assert disk.is_encrypted is False + + def test_lv_is_encrypted_blkid(self, device_info, pvolumes): + lsblk = {'TYPE': 'lvm'} + blkid = {'TYPE': 'crypto_LUKS'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + disk.lv_api = {} + assert disk.is_encrypted is True + + def test_lv_is_not_encrypted_blkid(self, factory, device_info, pvolumes): + lsblk = {'TYPE': 'lvm'} + blkid = {'TYPE': 'xfs'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + disk.lv_api = factory(encrypted=None) + assert disk.is_encrypted is False + + def test_lv_is_encrypted_lsblk(self, device_info, pvolumes): + lsblk = {'FSTYPE': 'crypto_LUKS', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + disk.lv_api = {} + assert disk.is_encrypted is True + + def test_lv_is_not_encrypted_lsblk(self, factory, device_info, pvolumes): + lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + disk.lv_api = factory(encrypted=None) + assert disk.is_encrypted is False + + def test_lv_is_encrypted_lvm_api(self, factory, device_info, pvolumes): + lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + disk.lv_api = factory(encrypted=True) + assert disk.is_encrypted is True + + def test_lv_is_not_encrypted_lvm_api(self, factory, device_info, pvolumes): + lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'} + blkid = {'TYPE': 'mapper'} + device_info(lsblk=lsblk, blkid=blkid) + disk = device.Device("/dev/sda") + disk.lv_api = factory(encrypted=False) + assert disk.is_encrypted is False + class TestDeviceOrdering(object): diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py index 5d1bd82b6..e40c982d1 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py @@ -45,6 +45,34 @@ class TestBlkid(object): assert result['UUID'] == '62416664-cbaf-40bd-9689-10bd337379c3' assert result['TYPE'] == 'xfs' +class TestUdevadmProperty(object): + + def test_good_output(self, stub_call): + output = """ID_MODEL=SK_hynix_SC311_SATA_512GB +ID_PART_TABLE_TYPE=gpt +ID_SERIAL_SHORT=MS83N71801150416A""".split() + stub_call((output, [], 0)) + result = disk.udevadm_property('dev/sda') + assert result['ID_MODEL'] == 'SK_hynix_SC311_SATA_512GB' + assert result['ID_PART_TABLE_TYPE'] == 'gpt' + 
assert result['ID_SERIAL_SHORT'] == 'MS83N71801150416A' + + def test_property_filter(self, stub_call): + output = """ID_MODEL=SK_hynix_SC311_SATA_512GB +ID_PART_TABLE_TYPE=gpt +ID_SERIAL_SHORT=MS83N71801150416A""".split() + stub_call((output, [], 0)) + result = disk.udevadm_property('dev/sda', ['ID_MODEL', + 'ID_SERIAL_SHORT']) + assert result['ID_MODEL'] == 'SK_hynix_SC311_SATA_512GB' + assert 'ID_PART_TABLE_TYPE' not in result + + def test_fail_on_broken_output(self, stub_call): + output = ["ID_MODEL:SK_hynix_SC311_SATA_512GB"] + stub_call((output, [], 0)) + with pytest.raises(ValueError): + disk.udevadm_property('dev/sda') + class TestDeviceFamily(object): @@ -239,6 +267,28 @@ class TestGetDevices(object): assert len(result) == 1 assert result == [ceph_data_path] + def test_sda1_partition(self, tmpfile, tmpdir): + block_path, dev_path, mapper_path = self.setup_paths(tmpdir) + block_sda_path = os.path.join(block_path, 'sda') + block_sda1_path = os.path.join(block_sda_path, 'sda1') + block_sda1_holders = os.path.join(block_sda1_path, 'holders') + dev_sda_path = os.path.join(dev_path, 'sda') + dev_sda1_path = os.path.join(dev_path, 'sda1') + os.makedirs(block_sda_path) + os.makedirs(block_sda1_path) + os.makedirs(dev_sda1_path) + os.makedirs(block_sda1_holders) + os.makedirs(dev_sda_path) + tmpfile('size', '1024', directory=block_sda_path) + tmpfile('partition', '1', directory=block_sda1_path) + result = disk.get_devices( + _sys_block_path=block_path, + _dev_path=dev_path, + _mapper_path=mapper_path) + assert dev_sda_path in list(result.keys()) + assert '/dev/sda1' in list(result.keys()) + assert result['/dev/sda1']['holders'] == [] + def test_sda_size(self, tmpfile, tmpdir): block_path, dev_path, mapper_path = self.setup_paths(tmpdir) block_sda_path = os.path.join(block_path, 'sda') diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_encryption.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_encryption.py index 8cca42689..e1420b440 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_encryption.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_encryption.py @@ -33,3 +33,21 @@ class TestDmcryptClose(object): file_name = '/path/does/not/exist' encryption.dmcrypt_close(file_name) assert fake_run.calls == [] + + +class TestDmcryptKey(object): + + def test_dmcrypt_with_default_size(self, conf_ceph_stub): + conf_ceph_stub('[global]\nfsid=asdf-lkjh') + result = encryption.create_dmcrypt_key() + assert len(result) == 172 + + def test_dmcrypt_with_custom_size(self, conf_ceph_stub): + conf_ceph_stub(''' + [global] + fsid=asdf + [osd] + osd_dmcrypt_size=8 + ''') + result = encryption.create_dmcrypt_key() + assert len(result) == 172 diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_util.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_util.py index 82f2ef27f..1a094d33f 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_util.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_util.py @@ -15,12 +15,27 @@ class TestAsBytes(object): class TestStrToInt(object): - def test_passing_a_float_str(self): - result = util.str_to_int("1.99") + def test_passing_a_float_str_comma(self): + result = util.str_to_int("1,99") assert result == 1 - def test_passing_a_float_does_not_round(self): - result = util.str_to_int("1.99", round_down=False) + def test_passing_a_float_does_not_round_comma(self): + result = util.str_to_int("1,99", round_down=False) + assert result == 2 + + @pytest.mark.parametrize("value", ['2', 2]) + def test_passing_an_int(self, 
value): + result = util.str_to_int(value) + assert result == 2 + + @pytest.mark.parametrize("value", ['1.99', 1.99]) + def test_passing_a_float(self, value): + result = util.str_to_int(value) + assert result == 1 + + @pytest.mark.parametrize("value", ['1.99', 1.99]) + def test_passing_a_float_does_not_round(self, value): + result = util.str_to_int(value, round_down=False) assert result == 2 def test_text_is_not_an_integer_like(self): @@ -28,6 +43,11 @@ class TestStrToInt(object): util.str_to_int("1.4GB") assert str(error.value) == "Unable to convert to integer: '1.4GB'" + def test_input_is_not_string(self): + with pytest.raises(RuntimeError) as error: + util.str_to_int(None) + assert str(error.value) == "Unable to convert to integer: 'None'" + def true_responses(upper_casing=False): if upper_casing: @@ -75,22 +95,22 @@ class TestPromptBool(object): def test_trueish(self, response): fake_input = lambda x: response qx = 'what the what?' - assert util.prompt_bool(qx, _raw_input=fake_input) is True + assert util.prompt_bool(qx, input_=fake_input) is True @pytest.mark.parametrize('response', false_responses()) def test_falseish(self, response): fake_input = lambda x: response qx = 'what the what?' - assert util.prompt_bool(qx, _raw_input=fake_input) is False + assert util.prompt_bool(qx, input_=fake_input) is False def test_try_again_true(self): responses = ['g', 'h', 'y'] fake_input = lambda x: responses.pop(0) qx = 'what the what?' - assert util.prompt_bool(qx, _raw_input=fake_input) is True + assert util.prompt_bool(qx, input_=fake_input) is True def test_try_again_false(self): responses = ['g', 'h', 'n'] fake_input = lambda x: responses.pop(0) qx = 'what the what?' - assert util.prompt_bool(qx, _raw_input=fake_input) is False + assert util.prompt_bool(qx, input_=fake_input) is False diff --git a/ceph/src/ceph-volume/ceph_volume/util/__init__.py b/ceph/src/ceph-volume/ceph_volume/util/__init__.py index cdcf3a5b0..43c9c9d68 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/__init__.py +++ b/ceph/src/ceph-volume/ceph_volume/util/__init__.py @@ -2,6 +2,10 @@ import logging from math import floor from ceph_volume import terminal +try: + input = raw_input # pylint: disable=redefined-builtin +except NameError: + pass logger = logging.getLogger(__name__) @@ -31,10 +35,21 @@ def str_to_int(string, round_down=True): """ Parses a string number into an integer, optionally converting to a float and rounding down. + + Some LVM values may come with a comma instead of a dot to define decimals. + This function normalizes a comma into a dot """ error_msg = "Unable to convert to integer: '%s'" % str(string) try: - integer = float(string) + integer = float(string.replace(',', '.')) + except AttributeError: + # this might be a integer already, so try to use it, otherwise raise + # the original exception + if isinstance(string, (int, float)): + integer = string + else: + logger.exception(error_msg) + raise RuntimeError(error_msg) except (TypeError, ValueError): logger.exception(error_msg) raise RuntimeError(error_msg) @@ -68,12 +83,12 @@ def str_to_bool(val): raise ValueError("Invalid input value: %s" % val) -def prompt_bool(question, _raw_input=None): +def prompt_bool(question, input_=None): """ Interface to prompt a boolean (or boolean-like) response from a user. Usually a confirmation. 
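A condensed, standalone sketch of the normalization the updated str_to_int() performs: comma decimals (as LVM can emit) are turned into dots, and values that are already numeric are accepted as-is. The helper below is an illustration, not the ceph_volume.util function itself:

    from math import ceil, floor

    def str_to_int_sketch(value, round_down=True):
        error_msg = "Unable to convert to integer: '%s'" % value
        try:
            number = float(value.replace(',', '.'))   # '1,99' and '1.99' both become 1.99
        except AttributeError:
            # no .replace(): may already be an int or float
            if not isinstance(value, (int, float)):
                raise RuntimeError(error_msg)
            number = float(value)
        except (TypeError, ValueError):
            raise RuntimeError(error_msg)
        return int(floor(number)) if round_down else int(ceil(number))

    assert str_to_int_sketch('1,99') == 1
    assert str_to_int_sketch(1.99, round_down=False) == 2
    assert str_to_int_sketch(2) == 2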
""" - input_prompt = _raw_input or raw_input + input_prompt = input_ or input prompt_format = '--> {question} '.format(question=question) response = input_prompt(prompt_format) try: @@ -82,4 +97,4 @@ def prompt_bool(question, _raw_input=None): terminal.error('Valid true responses are: y, yes, ') terminal.error('Valid false responses are: n, no') terminal.error('That response was invalid, please try again') - return prompt_bool(question, _raw_input=input_prompt) + return prompt_bool(question, input_=input_prompt) diff --git a/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py b/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py index 534c9aa64..a04c19924 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py +++ b/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py @@ -8,15 +8,22 @@ from ceph_volume.util.device import Device class ValidDevice(object): - def __init__(self, as_string=False): + def __init__(self, as_string=False, gpt_ok=False): self.as_string = as_string + self.gpt_ok = gpt_ok def __call__(self, string): device = Device(string) error = None if not device.exists: error = "Unable to proceed with non-existing device: %s" % string - elif device.has_gpt_headers: + # FIXME this is not a nice API, this validator was meant to catch any + # non-existing devices upfront, not check for gpt headers. Now this + # needs to optionally skip checking gpt headers which is beyond + # verifying if the device exists. The better solution would be to + # configure this with a list of checks that can be excluded/included on + # __init__ + elif device.has_gpt_headers and not self.gpt_ok: error = "GPT headers found, they must be removed on: %s" % string if error: diff --git a/ceph/src/ceph-volume/ceph_volume/util/device.py b/ceph/src/ceph-volume/ceph_volume/util/device.py index 181044886..06f90cd37 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/device.py +++ b/ceph/src/ceph-volume/ceph_volume/util/device.py @@ -10,6 +10,16 @@ report_template = """ {dev:<25} {size:<12} {rot!s:<7} {available!s:<9} {model}""" +def encryption_status(abspath): + """ + Helper function to run ``encryption.status()``. It is done here to avoid + a circular import issue (encryption module imports from this module) and to + ease testing by allowing monkeypatching of this function. 
+ """ + from ceph_volume.util import encryption + return encryption.status(abspath) + + class Devices(object): """ A container for Device instances with reporting @@ -79,6 +89,7 @@ class Device(object): self._is_lvm_member = None self._parse() self.available, self.rejected_reasons = self._check_reject_reasons() + self.device_id = self._get_device_id() def __lt__(self, other): ''' @@ -172,6 +183,32 @@ class Device(object): output['lvs'] = [lv.report() for lv in self.lvs] return output + def _get_device_id(self): + """ + Please keep this implementation in sync with get_device_id() in + src/common/blkdev.cc + """ + props = ['ID_VENDOR','ID_MODEL','ID_SERIAL_SHORT', 'ID_SERIAL', + 'ID_SCSI_SERIAL'] + p = disk.udevadm_property(self.abspath, props) + if 'ID_VENDOR' in p and 'ID_MODEL' in p and 'ID_SCSI_SERIAL' in p: + dev_id = '_'.join([p['ID_VENDOR'], p['ID_MODEL'], + p['ID_SCSI_SERIAL']]) + elif 'ID_MODEL' in p and 'ID_SERIAL_SHORT' in p: + dev_id = '_'.join([p['ID_MODEL'], p['ID_SERIAL_SHORT']]) + elif 'ID_SERIAL' in p: + dev_id = p['ID_SERIAL'] + if dev_id.startswith('MTFD'): + # Micron NVMes hide the vendor + dev_id = 'Micron_' + dev_id + else: + # the else branch should fallback to using sysfs and ioctl to + # retrieve device_id on FreeBSD. Still figuring out if/how the + # python ioctl implementation does that on FreeBSD + dev_id = '' + dev_id.replace(' ', '_') + return dev_id + def _set_lvm_membership(self): if self._is_lvm_member is None: # this is contentious, if a PV is recognized by LVM but has no @@ -185,6 +222,7 @@ class Device(object): pvs.filter(pv_name=path) has_vgs = [pv.vg_name for pv in pvs if pv.vg_name] if has_vgs: + self.vgs = list(set(has_vgs)) # a pv can only be in one vg, so this should be safe self.vg_name = has_vgs[0] self._is_lvm_member = True @@ -194,6 +232,8 @@ class Device(object): lv = lvm.get_lv(vg_name=pv.vg_name, lv_uuid=pv.lv_uuid) if lv: self.lvs.append(lv) + else: + self.vgs = [] return self._is_lvm_member def _get_pv_paths(self): @@ -239,11 +279,18 @@ class Device(object): @property def is_ceph_disk_member(self): - return self.ceph_disk.is_member + is_member = self.ceph_disk.is_member + if self.sys_api.get("partitions"): + for part in self.sys_api.get("partitions").keys(): + part = Device("/dev/%s" % part) + if part.is_ceph_disk_member: + is_member = True + break + return is_member @property def is_mapper(self): - return self.path.startswith('/dev/mapper') + return self.path.startswith(('/dev/mapper', '/dev/dm-')) @property def is_lv(self): @@ -258,9 +305,40 @@ class Device(object): @property def is_device(self): if self.disk_api: - return self.disk_api['TYPE'] == 'device' + is_device = self.disk_api['TYPE'] == 'device' + is_disk = self.disk_api['TYPE'] == 'disk' + if is_device or is_disk: + return True return False + @property + def is_encrypted(self): + """ + Only correct for LVs, device mappers, and partitions. Will report a ``None`` + for raw devices. 
+ """ + crypt_reports = [self.blkid_api.get('TYPE', ''), self.disk_api.get('FSTYPE', '')] + if self.is_lv: + # if disk APIs are reporting this is encrypted use that: + if 'crypto_LUKS' in crypt_reports: + return True + # if ceph-volume created this, then a tag would let us know + elif self.lv_api.encrypted: + return True + return False + elif self.is_partition: + return 'crypto_LUKS' in crypt_reports + elif self.is_mapper: + active_mapper = encryption_status(self.abspath) + if active_mapper: + # normalize a bit to ensure same values regardless of source + encryption_type = active_mapper['type'].lower().strip('12') # turn LUKS1 or LUKS2 into luks + return True if encryption_type in ['plain', 'luks'] else False + else: + return False + else: + return None + @property def used_by_ceph(self): # only filter out data devices as journals could potentially be reused @@ -282,6 +360,9 @@ class Device(object): ] rejected = [reason for (k, v, reason) in reasons if self.sys_api.get(k, '') == v] + if self.is_ceph_disk_member: + rejected.append("Used by ceph-disk") + return len(rejected) == 0, rejected diff --git a/ceph/src/ceph-volume/ceph_volume/util/disk.py b/ceph/src/ceph-volume/ceph_volume/util/disk.py index ccc2ff7a1..c85d3be9a 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/disk.py +++ b/ceph/src/ceph-volume/ceph_volume/util/disk.py @@ -127,6 +127,23 @@ def get_device_from_partuuid(partuuid): return ' '.join(out).strip() +def remove_partition(device): + """ + Removes a partition using parted + + :param device: A ``Device()`` object + """ + parent_device = '/dev/%s' % device.disk_api['PKNAME'] + udev_info = udevadm_property(device.abspath) + partition_number = udev_info.get('ID_PART_ENTRY_NUMBER') + if not partition_number: + raise RuntimeError('Unable to detect the partition number for device: %s' % device.abspath) + + process.run( + ['parted', parent_device, '--script', '--', 'rm', partition_number] + ) + + def _stat_is_device(stat_obj): """ Helper function that will interpret ``os.stat`` output directly, so that other @@ -170,6 +187,47 @@ def device_family(device): return devices +def udevadm_property(device, properties=[]): + """ + Query udevadm for information about device properties. + Optionally pass a list of properties to return. A requested property might + not be returned if not present. + + Expected output format:: + # udevadm info --query=property --name=/dev/sda :( + DEVNAME=/dev/sda + DEVTYPE=disk + ID_ATA=1 + ID_BUS=ata + ID_MODEL=SK_hynix_SC311_SATA_512GB + ID_PART_TABLE_TYPE=gpt + ID_PART_TABLE_UUID=c8f91d57-b26c-4de1-8884-0c9541da288c + ID_PATH=pci-0000:00:17.0-ata-3 + ID_PATH_TAG=pci-0000_00_17_0-ata-3 + ID_REVISION=70000P10 + ID_SERIAL=SK_hynix_SC311_SATA_512GB_MS83N71801150416A + TAGS=:systemd: + USEC_INITIALIZED=16117769 + ... + """ + out = _udevadm_info(device) + ret = {} + for line in out: + p, v = line.split('=', 1) + if not properties or p in properties: + ret[p] = v + return ret + + +def _udevadm_info(device): + """ + Call udevadm and return the output + """ + cmd = ['udevadm', 'info', '--query=property', device] + out, _err, _rc = process.call(cmd) + return out + + def lsblk(device, columns=None, abspath=False): """ Create a dictionary of identifying values for a device using ``lsblk``. 
@@ -631,7 +689,7 @@ def get_partitions_facts(sys_block_path): folder_path = os.path.join(sys_block_path, folder) if os.path.exists(os.path.join(folder_path, 'partition')): contents = get_file_contents(os.path.join(folder_path, 'partition')) - if '1' in contents: + if contents: part = {} partname = folder part_sys_block_path = os.path.join(sys_block_path, partname) @@ -645,6 +703,9 @@ def get_partitions_facts(sys_block_path): part['sectorsize'] = get_file_contents( part_sys_block_path + "/queue/hw_sector_size", 512) part['size'] = human_readable_size(float(part['sectors']) * 512) + part['holders'] = [] + for holder in os.listdir(part_sys_block_path + '/holders'): + part['holders'].append(holder) partition_metadata[partname] = part return partition_metadata @@ -754,5 +815,9 @@ def get_devices(_sys_block_path='/sys/block', _dev_path='/dev', _mapper_path='/d metadata['path'] = diskname metadata['locked'] = is_locked_raw_device(metadata['path']) + for part_name, part_metadata in metadata['partitions'].items(): + part_abspath = '/dev/%s' % part_name + device_facts[part_abspath] = part_metadata + device_facts[diskname] = metadata return device_facts diff --git a/ceph/src/ceph-volume/ceph_volume/util/encryption.py b/ceph/src/ceph-volume/ceph_volume/util/encryption.py index f6e3fdd7e..e2b3ca164 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/encryption.py +++ b/ceph/src/ceph-volume/ceph_volume/util/encryption.py @@ -23,7 +23,7 @@ def create_dmcrypt_key(): ) # The size of the key is defined in bits, so we must transform that # value to bytes (dividing by 8) because we read in bytes, not bits - random_string = os.urandom(dmcrypt_key_size / 8) + random_string = os.urandom(int(dmcrypt_key_size / 8)) key = base64.b64encode(random_string).decode('utf-8') return key @@ -60,6 +60,7 @@ def plain_open(key, device, mapping): 'cryptsetup', '--key-file', '-', + '--allow-discards', # allow discards (aka TRIM) requests for device 'open', device, mapping, @@ -84,6 +85,7 @@ def luks_open(key, device, mapping): 'cryptsetup', '--key-file', '-', + '--allow-discards', # allow discards (aka TRIM) requests for device 'luksOpen', device, mapping, diff --git a/ceph/src/client/Client.cc b/ceph/src/client/Client.cc index fc99ad53b..9f78b24a5 100644 --- a/ceph/src/client/Client.cc +++ b/ceph/src/client/Client.cc @@ -456,6 +456,7 @@ void Client::dump_status(Formatter *f) f->dump_int("mds_epoch", mdsmap->get_epoch()); f->dump_int("osd_epoch", osd_epoch); f->dump_int("osd_epoch_barrier", cap_epoch_barrier); + f->dump_bool("blacklisted", blacklisted); } } @@ -2475,6 +2476,12 @@ void Client::handle_osd_map(MOSDMap *m) return o.is_blacklisted(myaddr);}); } + // Always subscribe to next osdmap for blacklisted client + // until this client is not blacklisted. 
+ if (blacklisted) { + objecter->maybe_request_map(); + } + if (objecter->osdmap_full_flag()) { _handle_full_flag(-1); } else { @@ -2611,13 +2618,14 @@ void Client::handle_fs_map_user(MFSMapUser *m) void Client::handle_mds_map(MMDSMap* m) { + mds_gid_t old_inc, new_inc; if (m->get_epoch() <= mdsmap->get_epoch()) { ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << " is identical to or older than our " << mdsmap->get_epoch() << dendl; m->put(); return; - } + } ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl; @@ -2664,6 +2672,13 @@ void Client::handle_mds_map(MMDSMap* m) if (!mdsmap->is_up(mds)) { session->con->mark_down(); } else if (mdsmap->get_inst(mds) != session->inst) { + old_inc = oldmap->get_incarnation(mds); + new_inc = mdsmap->get_incarnation(mds); + if (old_inc != new_inc) { + ldout(cct, 1) << "mds incarnation changed from " + << old_inc << " to " << new_inc << dendl; + oldstate = MDSMap::STATE_NULL; + } session->con->mark_down(); session->inst = mdsmap->get_inst(mds); // When new MDS starts to take over, notify kernel to trim unused entries @@ -2674,6 +2689,11 @@ void Client::handle_mds_map(MMDSMap* m) continue; // no change session->mds_state = newstate; + if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) { + // missed reconnect close the session so that it can be reopened + _closed_mds_session(session); + continue; + } if (newstate == MDSMap::STATE_RECONNECT) { session->con = messenger->get_connection(session->inst); send_reconnect(session); @@ -4862,7 +4882,6 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m) tcap->cap_id = m->peer.cap_id; tcap->seq = m->peer.seq - 1; tcap->issue_seq = tcap->seq; - tcap->mseq = m->peer.mseq; tcap->issued |= cap->issued; tcap->implemented |= cap->issued; if (cap == in->auth_cap) @@ -9196,6 +9215,8 @@ int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, in int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, const struct iovec *iov, int iovcnt) { + uint64_t fpos = 0; + if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large! return -EFBIG; @@ -9235,7 +9256,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, } } offset = f->pos; - f->pos = offset+size; + fpos = offset+size; unlock_fh_pos(f); } @@ -9385,6 +9406,11 @@ success: lat -= start; logger->tinc(l_c_wrlat, lat); + if (fpos) { + lock_fh_pos(f); + f->pos = fpos; + unlock_fh_pos(f); + } totalwritten = size; r = (int)totalwritten; diff --git a/ceph/src/cls/lock/cls_lock.cc b/ceph/src/cls/lock/cls_lock.cc index 6e2ae4bbd..1dab0dd72 100644 --- a/ceph/src/cls/lock/cls_lock.cc +++ b/ceph/src/cls/lock/cls_lock.cc @@ -34,7 +34,18 @@ CLS_NAME(lock) #define LOCK_PREFIX "lock." 
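
The _write() change above defers updating f->pos: the prospective position is computed while the fh position lock is held, and it is written back only once the write has succeeded. A self-contained sketch of that ordering, using std::mutex instead of the Client locking primitives, might look like this; FileHandle and append_write() are illustrative names only.

    #include <cstdint>
    #include <mutex>

    // Illustrative only: compute the prospective position under the lock,
    // perform the write without publishing it, and commit the new position
    // only on the success path.
    struct FileHandle {
      std::mutex pos_lock;
      uint64_t pos = 0;
    };

    template <typename WriteFn>
    int append_write(FileHandle& fh, uint64_t size, WriteFn do_write)
    {
      uint64_t offset, fpos;
      {
        std::lock_guard<std::mutex> l(fh.pos_lock);
        offset = fh.pos;
        fpos = offset + size;          // not yet visible to other callers
      }
      int r = do_write(offset, size);  // may fail; fh.pos is still unchanged
      if (r >= 0) {
        std::lock_guard<std::mutex> l(fh.pos_lock);
        fh.pos = fpos;                 // publish only after the data is written
      }
      return r;
    }
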
-static int read_lock(cls_method_context_t hctx, const string& name, lock_info_t *lock) +static int clean_lock(cls_method_context_t hctx) +{ + int r = cls_cxx_remove(hctx); + if (r < 0) + return r; + + return 0; +} + +static int read_lock(cls_method_context_t hctx, + const string& name, + lock_info_t *lock) { bufferlist bl; string key = LOCK_PREFIX; @@ -67,16 +78,20 @@ static int read_lock(cls_method_context_t hctx, const string& name, lock_info_t map::iterator iter = lock->lockers.begin(); while (iter != lock->lockers.end()) { - map::iterator next = iter; - ++next; - struct locker_info_t& info = iter->second; if (!info.expiration.is_zero() && info.expiration < now) { CLS_LOG(20, "expiring locker"); - lock->lockers.erase(iter); + iter = lock->lockers.erase(iter); + } else { + ++iter; } + } - iter = next; + if (lock->lockers.empty() && cls_lock_is_ephemeral(lock->lock_type)) { + r = clean_lock(hctx); + if (r < 0) { + CLS_ERR("error, on read, cleaning lock object %s", cpp_strerror(r).c_str()); + } } return 0; @@ -121,24 +136,35 @@ static int lock_obj(cls_method_context_t hctx, const string& cookie, const string& tag) { - bool exclusive = lock_type == LOCK_EXCLUSIVE; + bool exclusive = cls_lock_is_exclusive(lock_type); lock_info_t linfo; - bool fail_if_exists = (flags & LOCK_FLAG_RENEW) == 0; + bool fail_if_exists = (flags & LOCK_FLAG_MAY_RENEW) == 0; + bool fail_if_does_not_exist = flags & LOCK_FLAG_MUST_RENEW; - CLS_LOG(20, "requested lock_type=%s fail_if_exists=%d", cls_lock_type_str(lock_type), fail_if_exists); - if (lock_type != LOCK_EXCLUSIVE && - lock_type != LOCK_SHARED) + CLS_LOG(20, + "requested lock_type=%s fail_if_exists=%d fail_if_does_not_exist=%d", + cls_lock_type_str(lock_type), fail_if_exists, fail_if_does_not_exist); + if (!cls_lock_is_valid(lock_type)) { return -EINVAL; + } if (name.empty()) return -EINVAL; + if (!fail_if_exists && fail_if_does_not_exist) { + // at most one of LOCK_FLAG_MAY_RENEW and LOCK_FLAG_MUST_RENEW may + // be set since they have different implications if the lock does + // not already exist + return -EINVAL; + } + // see if there's already a locker int r = read_lock(hctx, name, &linfo); if (r < 0 && r != -ENOENT) { CLS_ERR("Could not read lock info: %s", cpp_strerror(r).c_str()); return r; } + map& lockers = linfo.lockers; map::iterator iter; @@ -160,11 +186,13 @@ static int lock_obj(cls_method_context_t hctx, CLS_LOG(20, "existing_lock_type=%s", cls_lock_type_str(existing_lock_type)); iter = lockers.find(id); if (iter != lockers.end()) { - if (fail_if_exists) { + if (fail_if_exists && !fail_if_does_not_exist) { return -EEXIST; } else { lockers.erase(iter); // remove old entry } + } else if (fail_if_does_not_exist) { + return -ENOENT; } if (!lockers.empty()) { @@ -235,9 +263,9 @@ static int lock_op(cls_method_context_t hctx, * entity or cookie is wrong), or -errno on other error. */ static int remove_lock(cls_method_context_t hctx, - const string& name, - entity_name_t& locker, - const string& cookie) + const string& name, + entity_name_t& locker, + const string& cookie) { // get current lockers lock_info_t linfo; @@ -257,7 +285,12 @@ static int remove_lock(cls_method_context_t hctx, } lockers.erase(iter); - r = write_lock(hctx, name, linfo); + if (cls_lock_is_ephemeral(linfo.lock_type)) { + ceph_assert(lockers.empty()); + r = clean_lock(hctx); + } else { + r = write_lock(hctx, name, linfo); + } return r; } @@ -301,7 +334,7 @@ static int unlock_op(cls_method_context_t hctx, * is wrong), or -errno on other (unexpected) error. 
*/ static int break_lock(cls_method_context_t hctx, - bufferlist *in, bufferlist *out) + bufferlist *in, bufferlist *out) { CLS_LOG(20, "break_lock"); cls_lock_break_op op; @@ -421,7 +454,7 @@ int assert_locked(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return -EINVAL; } - if (op.type != LOCK_EXCLUSIVE && op.type != LOCK_SHARED) { + if (!cls_lock_is_valid(op.type)) { return -EINVAL; } @@ -493,7 +526,7 @@ int set_cookie(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return -EINVAL; } - if (op.type != LOCK_EXCLUSIVE && op.type != LOCK_SHARED) { + if (!cls_lock_is_valid(op.type)) { return -EINVAL; } diff --git a/ceph/src/cls/lock/cls_lock_client.cc b/ceph/src/cls/lock/cls_lock_client.cc index 3a3cef367..776fe4889 100644 --- a/ceph/src/cls/lock/cls_lock_client.cc +++ b/ceph/src/cls/lock/cls_lock_client.cc @@ -208,14 +208,19 @@ namespace rados { rados_op->exec("lock", "set_cookie", in); } + void Lock::assert_locked_shared(ObjectOperation *op) + { + assert_locked(op, name, LOCK_SHARED, cookie, tag); + } + void Lock::assert_locked_exclusive(ObjectOperation *op) { assert_locked(op, name, LOCK_EXCLUSIVE, cookie, tag); } - void Lock::assert_locked_shared(ObjectOperation *op) + void Lock::assert_locked_exclusive_ephemeral(ObjectOperation *op) { - assert_locked(op, name, LOCK_SHARED, cookie, tag); + assert_locked(op, name, LOCK_EXCLUSIVE_EPHEMERAL, cookie, tag); } void Lock::lock_shared(ObjectWriteOperation *op) @@ -242,6 +247,18 @@ namespace rados { cookie, tag, description, duration, flags); } + void Lock::lock_exclusive_ephemeral(ObjectWriteOperation *op) + { + lock(op, name, LOCK_EXCLUSIVE_EPHEMERAL, + cookie, tag, description, duration, flags); + } + + int Lock::lock_exclusive_ephemeral(IoCtx *ioctx, const string& oid) + { + return lock(ioctx, oid, name, LOCK_EXCLUSIVE_EPHEMERAL, + cookie, tag, description, duration, flags); + } + void Lock::unlock(ObjectWriteOperation *op) { rados::cls::lock::unlock(op, name, cookie); diff --git a/ceph/src/cls/lock/cls_lock_client.h b/ceph/src/cls/lock/cls_lock_client.h index 7aa06238f..0066dc3c0 100644 --- a/ceph/src/cls/lock/cls_lock_client.h +++ b/ceph/src/cls/lock/cls_lock_client.h @@ -4,6 +4,8 @@ #ifndef CEPH_CLS_LOCK_CLIENT_H #define CEPH_CLS_LOCK_CLIENT_H +#include + #include "cls/lock/cls_lock_types.h" namespace librados { @@ -87,26 +89,53 @@ namespace rados { void set_tag(const std::string& t) { tag = t; } void set_description(const std::string& desc) { description = desc; } void set_duration(const utime_t& e) { duration = e; } - void set_renew(bool renew) { + void set_duration(const ceph::timespan& d) { + duration = utime_t(ceph::real_clock::time_point::min() + d); + } + + void set_may_renew(bool renew) { if (renew) { - flags |= LOCK_FLAG_RENEW; + flags |= LOCK_FLAG_MAY_RENEW; + flags &= ~LOCK_FLAG_MUST_RENEW; // if may then not must } else { - flags &= ~LOCK_FLAG_RENEW; + flags &= ~LOCK_FLAG_MAY_RENEW; + } + } + + void set_must_renew(bool renew) { + if (renew) { + flags |= LOCK_FLAG_MUST_RENEW; + flags &= ~LOCK_FLAG_MAY_RENEW; // if must then not may + } else { + flags &= ~LOCK_FLAG_MUST_RENEW; } } - void assert_locked_exclusive(librados::ObjectOperation *rados_op); void assert_locked_shared(librados::ObjectOperation *rados_op); + void assert_locked_exclusive(librados::ObjectOperation *rados_op); + void assert_locked_exclusive_ephemeral(librados::ObjectOperation *rados_op); /* ObjectWriteOperation */ - void lock_exclusive(librados::ObjectWriteOperation *ioctx); void lock_shared(librados::ObjectWriteOperation *ioctx); + 
void lock_exclusive(librados::ObjectWriteOperation *ioctx); + + // Be careful when using an exclusive ephemeral lock; it is + // intended strictly for cases when a lock object exists + // solely for a lock in a given process and the object is no + // longer needed when the lock is unlocked or expired, as the + // cls back-end will make an effort to delete it. + void lock_exclusive_ephemeral(librados::ObjectWriteOperation *ioctx); void unlock(librados::ObjectWriteOperation *ioctx); - void break_lock(librados::ObjectWriteOperation *ioctx, const entity_name_t& locker); + void break_lock(librados::ObjectWriteOperation *ioctx, + const entity_name_t& locker); /* IoCtx */ - int lock_exclusive(librados::IoCtx *ioctx, const std::string& oid); int lock_shared(librados::IoCtx *ioctx, const std::string& oid); + int lock_exclusive(librados::IoCtx *ioctx, const std::string& oid); + + // NB: see above comment on exclusive ephemeral locks + int lock_exclusive_ephemeral(librados::IoCtx *ioctx, + const std::string& oid); int unlock(librados::IoCtx *ioctx, const std::string& oid); int break_lock(librados::IoCtx *ioctx, const std::string& oid, const entity_name_t& locker); diff --git a/ceph/src/cls/lock/cls_lock_ops.cc b/ceph/src/cls/lock/cls_lock_ops.cc index 10d005900..96a2b1ae5 100644 --- a/ceph/src/cls/lock/cls_lock_ops.cc +++ b/ceph/src/cls/lock/cls_lock_ops.cc @@ -45,7 +45,7 @@ void cls_lock_lock_op::generate_test_instances(list& o) i->tag = "tag"; i->description = "description"; i->duration = utime_t(5, 0); - i->flags = LOCK_FLAG_RENEW; + i->flags = LOCK_FLAG_MAY_RENEW; o.push_back(i); o.push_back(new cls_lock_lock_op); } diff --git a/ceph/src/cls/lock/cls_lock_ops.h b/ceph/src/cls/lock/cls_lock_ops.h index dbdddfe21..b9388e788 100644 --- a/ceph/src/cls/lock/cls_lock_ops.h +++ b/ceph/src/cls/lock/cls_lock_ops.h @@ -1,3 +1,6 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + #ifndef CEPH_CLS_LOCK_OPS_H #define CEPH_CLS_LOCK_OPS_H diff --git a/ceph/src/cls/lock/cls_lock_types.h b/ceph/src/cls/lock/cls_lock_types.h index 36d39c890..5f44126b4 100644 --- a/ceph/src/cls/lock/cls_lock_types.h +++ b/ceph/src/cls/lock/cls_lock_types.h @@ -1,3 +1,6 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + #ifndef CEPH_CLS_LOCK_TYPES_H #define CEPH_CLS_LOCK_TYPES_H @@ -7,12 +10,14 @@ #include "msg/msg_types.h" /* lock flags */ -#define LOCK_FLAG_RENEW 0x1 /* idempotent lock acquire */ +#define LOCK_FLAG_MAY_RENEW 0x1 /* idempotent lock acquire */ +#define LOCK_FLAG_MUST_RENEW 0x2 /* lock must already be acquired */ enum ClsLockType { - LOCK_NONE = 0, - LOCK_EXCLUSIVE = 1, - LOCK_SHARED = 2, + LOCK_NONE = 0, + LOCK_EXCLUSIVE = 1, + LOCK_SHARED = 2, + LOCK_EXCLUSIVE_EPHEMERAL = 3, /* lock object is removed @ unlock */ }; static inline const char *cls_lock_type_str(ClsLockType type) @@ -24,11 +29,27 @@ static inline const char *cls_lock_type_str(ClsLockType type) return "exclusive"; case LOCK_SHARED: return "shared"; + case LOCK_EXCLUSIVE_EPHEMERAL: + return "exclusive-ephemeral"; default: return ""; } } +inline bool cls_lock_is_exclusive(ClsLockType type) { + return LOCK_EXCLUSIVE == type || LOCK_EXCLUSIVE_EPHEMERAL == type; +} + +inline bool cls_lock_is_ephemeral(ClsLockType type) { + return LOCK_EXCLUSIVE_EPHEMERAL == type; +} + +inline bool cls_lock_is_valid(ClsLockType type) { + return LOCK_SHARED == type || + LOCK_EXCLUSIVE == type || + LOCK_EXCLUSIVE_EPHEMERAL == type; +} + namespace rados { namespace cls { 
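
The new LOCK_EXCLUSIVE_EPHEMERAL type and the MAY_RENEW/MUST_RENEW split are consumed through the Lock helper declared in cls_lock_client.h. A hedged usage sketch follows; the lock name, cookie, and duration are illustrative, and error handling is reduced to the return codes the methods above produce.

    #include "cls/lock/cls_lock_client.h"
    #include "include/rados/librados.hpp"

    // Illustrative usage of the ephemeral exclusive lock: the backing object
    // is created on first acquisition and removed by cls_lock once the lock
    // is unlocked or expires.
    int take_ephemeral_lock(librados::IoCtx& ioctx, const std::string& oid)
    {
      rados::cls::lock::Lock l("example-lock");   // name is illustrative
      l.set_cookie("cookie-1");
      l.set_duration(utime_t(30, 0));             // expires if not renewed in 30s

      int r = l.lock_exclusive_ephemeral(&ioctx, oid);
      if (r < 0)
        return r;

      // MUST_RENEW turns a later acquire into a lease extension: it succeeds
      // only if this locker already holds the lock, otherwise -ENOENT.
      l.set_must_renew(true);
      return l.lock_exclusive_ephemeral(&ioctx, oid);
    }
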
namespace lock { diff --git a/ceph/src/cls/rgw/cls_rgw.cc b/ceph/src/cls/rgw/cls_rgw.cc index 13b3e92dc..fba47d460 100644 --- a/ceph/src/cls/rgw/cls_rgw.cc +++ b/ceph/src/cls/rgw/cls_rgw.cc @@ -1437,13 +1437,15 @@ static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, buffer if (ret < 0) { return ret; } + } + + removing = existed && op.delete_marker; + if (!removing) { ret = other_obj.unlink(); if (ret < 0) { return ret; } } - - removing = existed && op.delete_marker; } else { removing = (existed && !obj.is_delete_marker() && op.delete_marker); } @@ -3758,7 +3760,7 @@ static int rgw_set_bucket_resharding(cls_method_context_t hctx, bufferlist *in, static int rgw_clear_bucket_resharding(cls_method_context_t hctx, bufferlist *in, bufferlist *out) { - cls_rgw_set_bucket_resharding_op op; + cls_rgw_clear_bucket_resharding_op op; bufferlist::iterator in_iter = in->begin(); try { diff --git a/ceph/src/cls/rgw/cls_rgw_client.cc b/ceph/src/cls/rgw/cls_rgw_client.cc index 3c4ed919a..93ef2b522 100644 --- a/ceph/src/cls/rgw/cls_rgw_client.cc +++ b/ceph/src/cls/rgw/cls_rgw_client.cc @@ -92,14 +92,16 @@ bool BucketIndexAioManager::wait_for_completions(int valid_ret_code, return true; } -void cls_rgw_bucket_init(ObjectWriteOperation& o) +// note: currently only called by tesing code +void cls_rgw_bucket_init_index(ObjectWriteOperation& o) { bufferlist in; o.exec(RGW_CLASS, RGW_BUCKET_INIT_INDEX, in); } static bool issue_bucket_index_init_op(librados::IoCtx& io_ctx, - const string& oid, BucketIndexAioManager *manager) { + const string& oid, + BucketIndexAioManager *manager) { bufferlist in; librados::ObjectWriteOperation op; op.create(true); @@ -107,6 +109,15 @@ static bool issue_bucket_index_init_op(librados::IoCtx& io_ctx, return manager->aio_operate(io_ctx, oid, &op); } +static bool issue_bucket_index_clean_op(librados::IoCtx& io_ctx, + const string& oid, + BucketIndexAioManager *manager) { + bufferlist in; + librados::ObjectWriteOperation op; + op.remove(); + return manager->aio_operate(io_ctx, oid, &op); +} + static bool issue_bucket_set_tag_timeout_op(librados::IoCtx& io_ctx, const string& oid, uint64_t timeout, BucketIndexAioManager *manager) { bufferlist in; @@ -126,11 +137,16 @@ int CLSRGWIssueBucketIndexInit::issue_op(int shard_id, const string& oid) void CLSRGWIssueBucketIndexInit::cleanup() { // Do best effort removal - for (map::iterator citer = objs_container.begin(); citer != iter; ++citer) { + for (auto citer = objs_container.begin(); citer != iter; ++citer) { io_ctx.remove(citer->second); } } +int CLSRGWIssueBucketIndexClean::issue_op(int shard_id, const string& oid) +{ + return issue_bucket_index_clean_op(io_ctx, oid, &manager); +} + int CLSRGWIssueSetTagTimeout::issue_op(int shard_id, const string& oid) { return issue_bucket_set_tag_timeout_op(io_ctx, oid, tag_timeout, &manager); @@ -956,4 +972,3 @@ int CLSRGWIssueSetBucketResharding::issue_op(int shard_id, const string& oid) { return issue_set_bucket_resharding(io_ctx, oid, entry, &manager); } - diff --git a/ceph/src/cls/rgw/cls_rgw_client.h b/ceph/src/cls/rgw/cls_rgw_client.h index c4ab0f648..97a950cf0 100644 --- a/ceph/src/cls/rgw/cls_rgw_client.h +++ b/ceph/src/cls/rgw/cls_rgw_client.h @@ -1,3 +1,6 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + #ifndef CEPH_CLS_RGW_CLIENT_H #define CEPH_CLS_RGW_CLIENT_H @@ -230,7 +233,7 @@ public: }; /* bucket index */ -void cls_rgw_bucket_init(librados::ObjectWriteOperation& o); +void 
cls_rgw_bucket_init_index(librados::ObjectWriteOperation& o); class CLSRGWConcurrentIO { protected: @@ -252,9 +255,15 @@ protected: virtual void reset_container(map& objs) {} public: - CLSRGWConcurrentIO(librados::IoCtx& ioc, map& _objs_container, - uint32_t _max_aio) : io_ctx(ioc), objs_container(_objs_container), max_aio(_max_aio) {} - virtual ~CLSRGWConcurrentIO() {} + + CLSRGWConcurrentIO(librados::IoCtx& ioc, + map& _objs_container, + uint32_t _max_aio) : + io_ctx(ioc), objs_container(_objs_container), max_aio(_max_aio) + {} + + virtual ~CLSRGWConcurrentIO() + {} int operator()() { int ret = 0; @@ -305,6 +314,23 @@ public: CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio) {} }; + +class CLSRGWIssueBucketIndexClean : public CLSRGWConcurrentIO { +protected: + int issue_op(int shard_id, const string& oid) override; + int valid_ret_code() override { + return -ENOENT; + } + +public: + CLSRGWIssueBucketIndexClean(librados::IoCtx& ioc, + map& _bucket_objs, + uint32_t _max_aio) : + CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio) + {} +}; + + class CLSRGWIssueSetTagTimeout : public CLSRGWConcurrentIO { uint64_t tag_timeout; protected: @@ -536,7 +562,7 @@ int cls_rgw_reshard_get(librados::IoCtx& io_ctx, const string& oid, cls_rgw_resh int cls_rgw_reshard_get_head(librados::IoCtx& io_ctx, const string& oid, cls_rgw_reshard_entry& entry); void cls_rgw_reshard_remove(librados::ObjectWriteOperation& op, const cls_rgw_reshard_entry& entry); -/* resharding attribute */ +/* resharding attribute on bucket index shard headers */ int cls_rgw_set_bucket_resharding(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_bucket_instance_entry& entry); int cls_rgw_clear_bucket_resharding(librados::IoCtx& io_ctx, const string& oid); diff --git a/ceph/src/cls/rgw/cls_rgw_types.h b/ceph/src/cls/rgw/cls_rgw_types.h index 51107c325..baa61c9fb 100644 --- a/ceph/src/cls/rgw/cls_rgw_types.h +++ b/ceph/src/cls/rgw/cls_rgw_types.h @@ -1,3 +1,6 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + #ifndef CEPH_CLS_RGW_TYPES_H #define CEPH_CLS_RGW_TYPES_H @@ -609,6 +612,24 @@ enum cls_rgw_reshard_status { CLS_RGW_RESHARD_DONE = 2, }; +static inline std::string to_string(const enum cls_rgw_reshard_status status) +{ + switch (status) { + case CLS_RGW_RESHARD_NONE: + return "CLS_RGW_RESHARD_NONE"; + break; + case CLS_RGW_RESHARD_IN_PROGRESS: + return "CLS_RGW_RESHARD_IN_PROGRESS"; + break; + case CLS_RGW_RESHARD_DONE: + return "CLS_RGW_RESHARD_DONE"; + break; + default: + break; + }; + return "Unknown reshard status"; +} + struct cls_rgw_bucket_instance_entry { cls_rgw_reshard_status reshard_status{CLS_RGW_RESHARD_NONE}; string new_bucket_instance_id; diff --git a/ceph/src/common/Cond.h b/ceph/src/common/Cond.h index aa53b60f2..1777827e3 100644 --- a/ceph/src/common/Cond.h +++ b/ceph/src/common/Cond.h @@ -17,105 +17,7 @@ #define CEPH_COND_H #include "include/Context.h" - -class Cond { - // my bits - pthread_cond_t _c; - - Mutex *waiter_mutex; - - // don't allow copying. 
- void operator=(Cond &C); - Cond(const Cond &C); - - public: - Cond() : waiter_mutex(NULL) { - int r = pthread_cond_init(&_c,NULL); - assert(r == 0); - } - virtual ~Cond() { - pthread_cond_destroy(&_c); - } - - int Wait(Mutex &mutex) { - // make sure this cond is used with one mutex only - assert(waiter_mutex == NULL || waiter_mutex == &mutex); - waiter_mutex = &mutex; - - assert(mutex.is_locked()); - - mutex._pre_unlock(); - int r = pthread_cond_wait(&_c, &mutex._m); - mutex._post_lock(); - return r; - } - - int WaitUntil(Mutex &mutex, utime_t when) { - // make sure this cond is used with one mutex only - assert(waiter_mutex == NULL || waiter_mutex == &mutex); - waiter_mutex = &mutex; - - assert(mutex.is_locked()); - - struct timespec ts; - when.to_timespec(&ts); - - mutex._pre_unlock(); - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - mutex._post_lock(); - - return r; - } - - int WaitInterval(Mutex &mutex, utime_t interval) { - utime_t when = ceph_clock_now(); - when += interval; - return WaitUntil(mutex, when); - } - - template - int WaitInterval(Mutex &mutex, Duration interval) { - ceph::real_time when(ceph::real_clock::now()); - when += interval; - - struct timespec ts = ceph::real_clock::to_timespec(when); - - mutex._pre_unlock(); - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - mutex._post_lock(); - - return r; - } - - int SloppySignal() { - int r = pthread_cond_broadcast(&_c); - return r; - } - int Signal() { - // make sure signaler is holding the waiter's lock. - assert(waiter_mutex == NULL || - waiter_mutex->is_locked()); - - int r = pthread_cond_broadcast(&_c); - return r; - } - int SignalOne() { - // make sure signaler is holding the waiter's lock. - assert(waiter_mutex == NULL || - waiter_mutex->is_locked()); - - int r = pthread_cond_signal(&_c); - return r; - } - int SignalAll() { - // make sure signaler is holding the waiter's lock. - assert(waiter_mutex == NULL || - waiter_mutex->is_locked()); - - int r = pthread_cond_broadcast(&_c); - return r; - } -}; +#include "CondVar.h" /** * context to signal a cond diff --git a/ceph/src/common/CondVar.h b/ceph/src/common/CondVar.h new file mode 100644 index 000000000..c193b9988 --- /dev/null +++ b/ceph/src/common/CondVar.h @@ -0,0 +1,109 @@ +#ifndef CEPH_COND_VAR_H +#define CEPH_COND_VAR_H + +#include "include/utime.h" + +#include "Clock.h" +#include "Mutex.h" +#include "pthread.h" + +class Cond { + // my bits + pthread_cond_t _c; + + Mutex *waiter_mutex; + + // don't allow copying. 
+ void operator=(Cond &C); + Cond(const Cond &C); + + public: + Cond() : waiter_mutex(NULL) { + int r = pthread_cond_init(&_c,NULL); + assert(r == 0); + } + virtual ~Cond() { + pthread_cond_destroy(&_c); + } + + int Wait(Mutex &mutex) { + // make sure this cond is used with one mutex only + assert(waiter_mutex == NULL || waiter_mutex == &mutex); + waiter_mutex = &mutex; + + assert(mutex.is_locked()); + + mutex._pre_unlock(); + int r = pthread_cond_wait(&_c, &mutex._m); + mutex._post_lock(); + return r; + } + + int WaitUntil(Mutex &mutex, utime_t when) { + // make sure this cond is used with one mutex only + assert(waiter_mutex == NULL || waiter_mutex == &mutex); + waiter_mutex = &mutex; + + assert(mutex.is_locked()); + + struct timespec ts; + when.to_timespec(&ts); + + mutex._pre_unlock(); + int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); + mutex._post_lock(); + + return r; + } + + int WaitInterval(Mutex &mutex, utime_t interval) { + utime_t when = ceph_clock_now(); + when += interval; + return WaitUntil(mutex, when); + } + + template + int WaitInterval(Mutex &mutex, Duration interval) { + ceph::real_time when(ceph::real_clock::now()); + when += interval; + + struct timespec ts = ceph::real_clock::to_timespec(when); + + mutex._pre_unlock(); + int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); + mutex._post_lock(); + + return r; + } + + int SloppySignal() { + int r = pthread_cond_broadcast(&_c); + return r; + } + int Signal() { + // make sure signaler is holding the waiter's lock. + assert(waiter_mutex == NULL || + waiter_mutex->is_locked()); + + int r = pthread_cond_broadcast(&_c); + return r; + } + int SignalOne() { + // make sure signaler is holding the waiter's lock. + assert(waiter_mutex == NULL || + waiter_mutex->is_locked()); + + int r = pthread_cond_signal(&_c); + return r; + } + int SignalAll() { + // make sure signaler is holding the waiter's lock. 
+ assert(waiter_mutex == NULL || + waiter_mutex->is_locked()); + + int r = pthread_cond_broadcast(&_c); + return r; + } +}; + +#endif // CEPH_COND_VAR_H diff --git a/ceph/src/common/TrackedOp.cc b/ceph/src/common/TrackedOp.cc index 4ed2fa48b..788b29744 100644 --- a/ceph/src/common/TrackedOp.cc +++ b/ceph/src/common/TrackedOp.cc @@ -342,8 +342,10 @@ bool OpTracker::check_ops_in_flight(std::vector &warning_vector, int *sl while (i != sdata->ops_in_flight_sharded.end() && i->get_initiated() < too_old) { - if (!i->warn_interval_multiplier) + if (!i->warn_interval_multiplier) { + ++i; continue; + } (*slow)++; diff --git a/ceph/src/common/WeightedPriorityQueue.h b/ceph/src/common/WeightedPriorityQueue.h index 64ac120bf..fa463b4c4 100644 --- a/ceph/src/common/WeightedPriorityQueue.h +++ b/ceph/src/common/WeightedPriorityQueue.h @@ -67,8 +67,11 @@ class WeightedPriorityQueue : public OpQueue K key; // klass ListPairs lp; Klass(K& k) : - key(k) - {} + key(k) { + } + ~Klass() { + lp.clear_and_dispose(DelItem()); + } friend bool operator< (const Klass &a, const Klass &b) { return a.key < b.key; } friend bool operator> (const Klass &a, const Klass &b) @@ -129,8 +132,11 @@ class WeightedPriorityQueue : public OpQueue Kit next; SubQueue(unsigned& p) : key(p), - next(klasses.begin()) - {} + next(klasses.begin()) { + } + ~SubQueue() { + klasses.clear_and_dispose(DelItem()); + } friend bool operator< (const SubQueue &a, const SubQueue &b) { return a.key < b.key; } friend bool operator> (const SubQueue &a, const SubQueue &b) @@ -195,8 +201,11 @@ class WeightedPriorityQueue : public OpQueue Queue() : total_prio(0), max_cost(0), - size(0) - {} + size(0) { + } + ~Queue() { + queues.clear_and_dispose(DelItem()); + } bool empty() const { return !size; } diff --git a/ceph/src/common/buffer.cc b/ceph/src/common/buffer.cc index 09dcc67b2..cf63639a0 100644 --- a/ceph/src/common/buffer.cc +++ b/ceph/src/common/buffer.cc @@ -1723,6 +1723,32 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT; } } + uint64_t buffer::list::get_wasted_space() const + { + if (_buffers.size() == 1) + return _buffers.back().wasted(); + + std::vector raw_vec; + raw_vec.reserve(_buffers.size()); + for (const auto& p : _buffers) + raw_vec.push_back(p.get_raw()); + std::sort(raw_vec.begin(), raw_vec.end()); + + uint64_t total = 0; + const raw *last = nullptr; + for (const auto r : raw_vec) { + if (r == last) + continue; + last = r; + total += r->len; + } + // If multiple buffers are sharing the same raw buffer and they overlap + // with each other, the wasted space will be underestimated. 
+ if (total <= length()) + return 0; + return total - length(); + } + void buffer::list::rebuild() { if (_len == 0) { diff --git a/ceph/src/common/ceph_context.cc b/ceph/src/common/ceph_context.cc index 0afdc3ac6..87194f7dd 100644 --- a/ceph/src/common/ceph_context.cc +++ b/ceph/src/common/ceph_context.cc @@ -34,7 +34,8 @@ namespace { class LockdepObs : public md_config_obs_t { public: - explicit LockdepObs(CephContext *cct) : m_cct(cct), m_registered(false) { + explicit LockdepObs(CephContext *cct) + : m_cct(cct), m_registered(false), lock("lock_dep_obs", false, true) { } ~LockdepObs() override { if (m_registered) { @@ -49,6 +50,7 @@ public: void handle_conf_change(const md_config_t *conf, const std::set &changed) override { + Mutex::Locker locker(lock); if (conf->lockdep && !m_registered) { lockdep_register_ceph_context(m_cct); m_registered = true; @@ -60,14 +62,17 @@ public: private: CephContext *m_cct; bool m_registered; + Mutex lock; }; class MempoolObs : public md_config_obs_t, public AdminSocketHook { CephContext *cct; + Mutex lock; public: - explicit MempoolObs(CephContext *cct) : cct(cct) { + explicit MempoolObs(CephContext *cct) + : cct(cct), lock("mem_pool_obs", false, true) { cct->_conf->add_observer(this); int r = cct->get_admin_socket()->register_command( "dump_mempools", @@ -92,6 +97,7 @@ public: void handle_conf_change(const md_config_t *conf, const std::set &changed) override { + Mutex::Locker locker(lock); if (changed.count("mempool_debug")) { mempool::set_debug_mode(cct->_conf->mempool_debug); } @@ -184,9 +190,12 @@ private: */ class LogObs : public md_config_obs_t { ceph::logging::Log *log; + Mutex lock; public: - explicit LogObs(ceph::logging::Log *l) : log(l) {} + explicit LogObs(ceph::logging::Log *l) + : log(l), lock("log_obs", false, true) { + } const char** get_tracked_conf_keys() const override { static const char *KEYS[] = { @@ -211,6 +220,7 @@ public: void handle_conf_change(const md_config_t *conf, const std::set &changed) override { + Mutex::Locker locker(lock); // stderr if (changed.count("log_to_stderr") || changed.count("err_to_stderr")) { int l = conf->log_to_stderr ? 99 : (conf->err_to_stderr ? 
-1 : -2); diff --git a/ceph/src/common/cmdparse.h b/ceph/src/common/cmdparse.h index 41495f555..38d6f98aa 100644 --- a/ceph/src/common/cmdparse.h +++ b/ceph/src/common/cmdparse.h @@ -46,31 +46,74 @@ void handle_bad_get(CephContext *cct, const std::string& k, const char *name); std::string cmd_vartype_stringify(const cmd_vartype& v); +struct bad_cmd_get : public std::exception { + std::string desc; + bad_cmd_get(const std::string& f, const cmdmap_t& cmdmap) { + desc = "bad or missing field '" + f + "'"; + } + const char *what() const throw() override { + return desc.c_str(); + } +}; + template -bool -cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val) +bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, + T& val) { if (cmdmap.count(k)) { try { val = boost::get(cmdmap.find(k)->second); return true; - } catch (boost::bad_get) { + } catch (boost::bad_get&) { handle_bad_get(cct, k, typeid(T).name()); } } return false; } +template +bool cmd_getval_throws(CephContext *cct, const cmdmap_t& cmdmap, + const std::string& k, T& val) +{ + if (cmdmap.count(k)) { + try { + val = boost::get(cmdmap.find(k)->second); + return true; + } catch (boost::bad_get&) { + throw bad_cmd_get(k, cmdmap); + } + } + return false; +} + // with default template -void -cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val, const T& defval) +void cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, + T& val, const T& defval) { if (!cmd_getval(cct, cmdmap, k, val)) val = defval; } +template +bool cmd_getval_throws( + CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, + T& val, const T& defval) +{ + if (cmdmap.count(k)) { + try { + val = boost::get(cmdmap.find(k)->second); + return true; + } catch (boost::bad_get&) { + throw bad_cmd_get(k, cmdmap); + } + } else { + val = defval; + return true; + } +} + template void cmd_putval(CephContext *cct, cmdmap_t& cmdmap, const std::string& k, const T& val) diff --git a/ceph/src/common/config.cc b/ceph/src/common/config.cc index b3a98a595..ef348f95d 100644 --- a/ceph/src/common/config.cc +++ b/ceph/src/common/config.cc @@ -20,6 +20,7 @@ #include "osd/osd_types.h" #include "common/errno.h" #include "common/hostname.h" +#include "common/backport14.h" #include @@ -197,11 +198,16 @@ void md_config_t::add_observer(md_config_obs_t* observer_) obs_map_t::value_type val(*k, observer_); observers.insert(val); } + obs_call_gate.emplace(observer_, ceph::make_unique()); } void md_config_t::remove_observer(md_config_obs_t* observer_) { Mutex::Locker l(lock); + + call_gate_close(observer_); + obs_call_gate.erase(observer_); + bool found_obs = false; for (obs_map_t::iterator o = observers.begin(); o != observers.end(); ) { if (o->second == observer_) { @@ -665,12 +671,21 @@ int md_config_t::parse_injectargs(std::vector& args, void md_config_t::apply_changes(std::ostream *oss) { - Mutex::Locker l(lock); - /* - * apply changes until the cluster name is assigned - */ - if (cluster.size()) - _apply_changes(oss); + rev_obs_map_t rev_obs; + { + Mutex::Locker l(lock); + /* + * apply changes until the cluster name is assigned + */ + if (cluster.size()) { + for_each_change( + oss, [this, &rev_obs](md_config_obs_t *obs, const std::string &key) { + map_observer_changes(obs, key, &rev_obs); + }); + } + } + + call_observers(rev_obs); } bool md_config_t::_internal_field(const string& s) @@ -680,12 +695,8 @@ bool md_config_t::_internal_field(const string& s) return false; } -void 
md_config_t::_apply_changes(std::ostream *oss) +void md_config_t::for_each_change(std::ostream *oss, config_gather_cb callback) { - /* Maps observers to the configuration options that they care about which - * have changed. */ - typedef std::map < md_config_obs_t*, std::set > rev_obs_map_t; - expand_all_meta(); // expand_all_meta could have modified anything. Copy it all out again. @@ -697,9 +708,6 @@ void md_config_t::_apply_changes(std::ostream *oss) update_legacy_val(option, ptr); } - // create the reverse observer mapping, mapping observers to the set of - // changed keys that they'll get. - rev_obs_map_t robs; std::set empty_set; char buf[128]; char *bufptr = (char*)buf; @@ -717,71 +725,68 @@ void md_config_t::_apply_changes(std::ostream *oss) } } for (obs_map_t::iterator r = range.first; r != range.second; ++r) { - rev_obs_map_t::value_type robs_val(r->second, empty_set); - pair < rev_obs_map_t::iterator, bool > robs_ret(robs.insert(robs_val)); - std::set &keys(robs_ret.first->second); - keys.insert(key); + callback(r->second, key); } } changed.clear(); - - // Make any pending observer callbacks - for (rev_obs_map_t::const_iterator r = robs.begin(); r != robs.end(); ++r) { - md_config_obs_t *obs = r->first; - obs->handle_conf_change(this, r->second); - } - } void md_config_t::call_all_observers() { - std::map > obs; + rev_obs_map_t rev_obs; { Mutex::Locker l(lock); expand_all_meta(); for (auto r = observers.begin(); r != observers.end(); ++r) { - obs[r->second].insert(r->first); + map_observer_changes(r->second, r->first, &rev_obs); } } - for (auto p = obs.begin(); - p != obs.end(); - ++p) { - p->first->handle_conf_change(this, p->second); - } + + call_observers(rev_obs); } int md_config_t::injectargs(const std::string& s, std::ostream *oss) { int ret; - Mutex::Locker l(lock); - char b[s.length()+1]; - strcpy(b, s.c_str()); - std::vector nargs; - char *p = b; - while (*p) { - nargs.push_back(p); - while (*p && *p != ' ') p++; - if (!*p) - break; - *p++ = 0; - while (*p && *p == ' ') p++; - } - ret = parse_injectargs(nargs, oss); - if (!nargs.empty()) { - *oss << " failed to parse arguments: "; - std::string prefix; - for (std::vector::const_iterator i = nargs.begin(); - i != nargs.end(); ++i) { - *oss << prefix << *i; - prefix = ","; + rev_obs_map_t rev_obs; + { + Mutex::Locker l(lock); + + char b[s.length()+1]; + strcpy(b, s.c_str()); + std::vector nargs; + char *p = b; + while (*p) { + nargs.push_back(p); + while (*p && *p != ' ') p++; + if (!*p) + break; + *p++ = 0; + while (*p && *p == ' ') p++; + } + ret = parse_injectargs(nargs, oss); + if (!nargs.empty()) { + *oss << " failed to parse arguments: "; + std::string prefix; + for (std::vector::const_iterator i = nargs.begin(); + i != nargs.end(); ++i) { + *oss << prefix << *i; + prefix = ","; + } + *oss << "\n"; + ret = -EINVAL; } - *oss << "\n"; - ret = -EINVAL; + + for_each_change( + oss, [this, &rev_obs](md_config_obs_t *obs, const std::string &key) { + map_observer_changes(obs, key, &rev_obs); + }); } - _apply_changes(oss); + + call_observers(rev_obs); return ret; } @@ -1389,3 +1394,26 @@ void md_config_t::complain_about_parse_errors(CephContext *cct) ::complain_about_parse_errors(cct, &parse_errors); } +void md_config_t::call_observers(rev_obs_map_t &rev_obs) { + for (auto p : rev_obs) { + p.first->handle_conf_change(this, p.second); + // this can be done outside the lock as call_gate_enter() + // and remove_observer() are serialized via lock + call_gate_leave(p.first); + } +} + +void 
md_config_t::map_observer_changes(md_config_obs_t *obs, const std::string &key, + rev_obs_map_t *rev_obs) { + ceph_assert(lock.is_locked()); + + auto p = rev_obs->emplace(obs, std::set{}); + + p.first->second.emplace(key); + if (p.second) { + // this needs to be done under lock as once this lock is + // dropped (before calling observers) a remove_observer() + // can sneak in and cause havoc. + call_gate_enter(p.first->first); + } +} diff --git a/ceph/src/common/config.h b/ceph/src/common/config.h index 612f083d8..1145e12e3 100644 --- a/ceph/src/common/config.h +++ b/ceph/src/common/config.h @@ -19,6 +19,7 @@ #include "common/entity_name.h" #include "common/code_environment.h" #include "common/Mutex.h" +#include "common/CondVar.h" #include "log/SubsystemMap.h" #include "common/config_obs.h" #include "common/options.h" @@ -65,6 +66,62 @@ extern const char *CEPH_CONF_FILE_DEFAULT; * while another thread is reading them, either. */ struct md_config_t { +private: + class CallGate { + private: + uint32_t call_count = 0; + Mutex lock; + Cond cond; + public: + CallGate() + : lock("call::gate::lock", false, true) { + } + + void enter() { + Mutex::Locker locker(lock); + ++call_count; + } + void leave() { + Mutex::Locker locker(lock); + ceph_assert(call_count > 0); + if (--call_count == 0) { + cond.Signal(); + } + } + void close() { + Mutex::Locker locker(lock); + while (call_count != 0) { + cond.Wait(lock); + } + } + }; + + void call_gate_enter(md_config_obs_t *obs) { + auto p = obs_call_gate.find(obs); + ceph_assert(p != obs_call_gate.end()); + p->second->enter(); + } + void call_gate_leave(md_config_obs_t *obs) { + auto p = obs_call_gate.find(obs); + ceph_assert(p != obs_call_gate.end()); + p->second->leave(); + } + void call_gate_close(md_config_obs_t *obs) { + auto p = obs_call_gate.find(obs); + ceph_assert(p != obs_call_gate.end()); + p->second->close(); + } + + typedef std::unique_ptr CallGateRef; + std::map obs_call_gate; + + typedef std::map> rev_obs_map_t; + typedef std::function config_gather_cb; + + void call_observers(rev_obs_map_t &rev_obs); + void map_observer_changes(md_config_obs_t *obs, const std::string &key, + rev_obs_map_t *rev_obs); + public: typedef boost::variant' afterwards // max xattr kv pairs size for each dir/file OPTION(mds_max_xattr_pairs_size, OPT_U32) OPTION(mds_max_file_recover, OPT_U32) @@ -440,17 +439,15 @@ OPTION(mds_beacon_interval, OPT_FLOAT) OPTION(mds_beacon_grace, OPT_FLOAT) OPTION(mds_enforce_unique_name, OPT_BOOL) -OPTION(mds_session_timeout, OPT_FLOAT) // cap bits and leases time out if client unresponsive or not returning its caps OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist clients whose sessions are dropped due to timeout OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation? 
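
The CallGate added to md_config_t above lets configuration observers be called outside the config lock while remove_observer() waits for in-flight callbacks to drain. A minimal, self-contained sketch of the same gate, using std::mutex/std::condition_variable rather than ceph::Mutex/Cond, is shown here.

    #include <condition_variable>
    #include <cstdint>
    #include <mutex>

    // Sketch of the call-gate idea: enter()/leave() bracket an observer
    // callback made without the config lock held; close() blocks until every
    // in-flight callback for that observer has finished.
    class CallGate {
      std::mutex lock;
      std::condition_variable cond;
      uint32_t call_count = 0;
    public:
      void enter() {
        std::lock_guard<std::mutex> l(lock);
        ++call_count;
      }
      void leave() {
        std::lock_guard<std::mutex> l(lock);
        if (--call_count == 0)
          cond.notify_all();
      }
      void close() {
        std::unique_lock<std::mutex> l(lock);
        cond.wait(l, [this] { return call_count == 0; });
      }
    };
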
OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock -OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many' OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart - // make it (mds_session_timeout - mds_beacon_grace) + // make it (mdsmap.session_timeout - mds_beacon_grace) OPTION(mds_tick_interval, OPT_FLOAT) OPTION(mds_dirstat_min_interval, OPT_FLOAT) // try to avoid propagating more often than this OPTION(mds_scatter_nudge_interval, OPT_FLOAT) // how quickly dirstat changes propagate up the hierarchy @@ -467,7 +464,6 @@ OPTION(mds_bal_export_pin, OPT_BOOL) // allow clients to pin directory trees to OPTION(mds_bal_sample_interval, OPT_DOUBLE) // every 3 seconds OPTION(mds_bal_replicate_threshold, OPT_FLOAT) OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT) -OPTION(mds_bal_frag, OPT_BOOL) OPTION(mds_bal_split_size, OPT_INT) OPTION(mds_bal_split_rd, OPT_FLOAT) OPTION(mds_bal_split_wr, OPT_FLOAT) @@ -665,7 +661,8 @@ OPTION(osd_peering_wq_threads, OPT_INT) OPTION(osd_peering_wq_batch_size, OPT_U64) OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64) OPTION(osd_op_pq_min_cost, OPT_U64) -OPTION(osd_disk_threads, OPT_INT) +OPTION(osd_remove_threads, OPT_INT) +OPTION(osd_recovery_threads, OPT_INT) OPTION(osd_disk_thread_ioprio_class, OPT_STR) // rt realtime be best effort idle OPTION(osd_disk_thread_ioprio_priority, OPT_INT) // 0-7 OPTION(osd_recover_clone_overlap, OPT_BOOL) // preserve clone_overlap during recovery/migration @@ -847,6 +844,7 @@ OPTION(osd_op_history_duration, OPT_U32) // Oldest completed op to track OPTION(osd_op_history_slow_op_size, OPT_U32) // Max number of slow ops to track OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over this threshold OPTION(osd_target_transaction_size, OPT_INT) // to adjust various transactions that batch smaller items +OPTION(osd_delete_sleep, OPT_FLOAT) // seconds to sleep between removal transactions OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe) OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections @@ -1003,6 +1001,9 @@ OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT) // max fs free / total free OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT) // how much to add at a time OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT) // how much to reclaim at a time OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT) // how often (sec) to balance free space between bluefs and bluestore +// how often (sec) to dump allocation failure happened during bluefs rebalance +OPTION(bluestore_bluefs_balance_failure_dump_interval, OPT_FLOAT) + // If you want to use spdk driver, you need to specify NVMe serial number here // with "spdk:" prefix. 
// Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to @@ -1031,6 +1032,7 @@ OPTION(bluestore_block_preallocate_file, OPT_BOOL) //whether preallocate space i OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8 OPTION(bluestore_csum_min_block, OPT_U32) OPTION(bluestore_csum_max_block, OPT_U32) +OPTION(bluestore_retry_disk_reads, OPT_U64) OPTION(bluestore_min_alloc_size, OPT_U32) OPTION(bluestore_min_alloc_size_hdd, OPT_U32) OPTION(bluestore_min_alloc_size_ssd, OPT_U32) @@ -1124,6 +1126,7 @@ OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL) OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL) OPTION(bluestore_shard_finishers, OPT_BOOL) OPTION(bluestore_debug_random_read_err, OPT_DOUBLE) +OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT) OPTION(kstore_max_ops, OPT_U64) OPTION(kstore_max_bytes, OPT_U64) @@ -1540,6 +1543,7 @@ OPTION(rgw_shard_warning_threshold, OPT_DOUBLE) // pct of safe max OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled +OPTION(rgw_trust_forwarded_https, OPT_BOOL) // trust Forwarded and X-Forwarded-Proto headers for ssl termination OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms diff --git a/ceph/src/common/options.cc b/ceph/src/common/options.cc index ff3bb1a1b..231a7651b 100644 --- a/ceph/src/common/options.cc +++ b/ceph/src/common/options.cc @@ -898,7 +898,7 @@ std::vector