From a8e162989c7c62b2dc22e83129e3cbdaba113a4d Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Fri, 12 Apr 2019 11:40:44 +0200 Subject: [PATCH] import ceph 12.2.12 Signed-off-by: Thomas Lamprecht --- Makefile | 2 +- ceph/CMakeLists.txt | 2 +- ceph/PendingReleaseNotes | 31 +- ceph/admin/doc-requirements.txt | 2 +- ceph/alpine/APKBUILD | 6 +- ceph/ceph.spec | 10 +- ceph/ceph.spec.in | 4 +- ceph/changelog.upstream | 6 + ceph/doc/api/libcephfs-java.rst | 9 + ceph/doc/ceph-volume/simple/scan.rst | 11 + ceph/doc/man/8/ceph-volume.rst | 6 +- ceph/doc/man/8/ceph.rst | 10 + ceph/doc/man/8/rbdmap.rst | 9 +- .../rados/configuration/mon-config-ref.rst | 2 +- .../rados/configuration/osd-config-ref.rst | 18 +- ceph/doc/rados/operations/health-checks.rst | 2 +- .../troubleshooting/troubleshooting-mon.rst | 4 +- ceph/doc/radosgw/index.rst | 1 + ceph/doc/radosgw/placement.rst | 180 +++ ceph/doc/radosgw/s3/authentication.rst | 121 ++ .../objectstore-ec/bluestore-bitmap.yaml | 1 + .../tasks/cfuse_workunit_suites_fsstress.yaml | 1 + ceph/qa/objectstore/bluestore-bitmap.yaml | 4 + .../bluestore-stupid.yaml} | 1 + ceph/qa/objectstore/bluestore.yaml | 38 - .../objectstore_cephfs/bluestore-bitmap.yaml | 1 + ceph/qa/objectstore_cephfs/bluestore.yaml | 1 - ceph/qa/overrides/short_pg_log.yaml | 4 +- ceph/qa/packages/packages.yaml | 4 - ceph/qa/run-standalone.sh | 2 + ceph/qa/standalone/ceph-helpers.sh | 16 +- ceph/qa/standalone/osd/osd-backfill-prio.sh | 504 ++++++ ceph/qa/standalone/osd/osd-markdown.sh | 5 +- ceph/qa/standalone/osd/osd-recovery-prio.sh | 500 ++++++ ceph/qa/standalone/scrub/osd-scrub-repair.sh | 89 +- .../special/ceph_objectstore_tool.py | 47 +- .../basic/objectstore/bluestore-bitmap.yaml | 1 + .../basic/objectstore/bluestore.yaml | 1 - .../objectstore/bluestore-bitmap.yaml | 1 + .../objectstore/bluestore.yaml | 1 - .../tasks/libcephfs_java.yaml | 14 - .../objectstore/bluestore-bitmap.yaml | 1 + .../objectstore/bluestore.yaml | 1 - .../client_trim_caps/tasks/trim-i22073.yaml | 1 - .../suites/fs/verify/validater/valgrind.yaml | 11 +- .../powercycle/osd/whitelist_health.yaml | 1 + .../monthrash/workloads/rados_api_tests.yaml | 1 + ceph/qa/suites/rados/rest/rest_test.yaml | 1 + .../objectstore/bluestore-bitmap.yaml | 1 + .../objectstore/bluestore.yaml | 1 - .../all/osd-recovery-incomplete.yaml | 1 + .../rados/singleton/all/osd-recovery.yaml | 2 +- .../rados/singleton/all/thrash-eio.yaml | 1 + .../bluestore-bitmap.yaml | 1 + .../bluestore.yaml | 1 - .../rados/verify/tasks/rados_api_tests.yaml | 1 + .../rbd_python_api_tests_old_format.yaml | 1 + .../objectstore/bluestore-bitmap.yaml | 1 + .../objectstore/bluestore.yaml | 1 - ceph/qa/suites/rgw/multisite/overrides.yaml | 2 + .../basic/objectstore/bluestore-bitmap.yaml | 1 + .../smoke/basic/objectstore/bluestore.yaml | 1 - .../objectstore/bluestore-bitmap.yaml | 1 + .../stress-split/objectstore/bluestore.yaml | 1 - ceph/qa/tasks/cephfs/test_client_limits.py | 38 +- ceph/qa/tasks/cephfs/test_misc.py | 3 + ceph/qa/tasks/radosbench.py | 4 +- ceph/qa/valgrind.supp | 622 ++++++++ ceph/qa/workunits/cephtool/test.sh | 8 +- ceph/qa/workunits/libcephfs-java/test.sh | 39 - .../workunits/rados/test_health_warnings.sh | 1 + ceph/src/.git_version | 4 +- ceph/src/CMakeLists.txt | 2 +- ceph/src/auth/Crypto.cc | 5 +- ceph/src/ceph-disk/run-tox.sh | 2 +- .../ceph_volume/devices/simple/activate.py | 34 +- .../ceph_volume/devices/simple/scan.py | 54 +- .../ceph_volume/systemd/systemctl.py | 23 + .../tests/devices/simple/test_activate.py | 20 + 
.../tests/devices/simple/test_scan.py | 8 - .../tests/functional/batch/tox.ini | 8 +- .../lvm/centos7/bluestore/dmcrypt/test.yml | 5 + .../lvm/centos7/filestore/dmcrypt/test.yml | 5 + .../lvm/playbooks/test_bluestore.yml | 5 + .../lvm/playbooks/test_filestore.yml | 5 + .../ceph_volume/tests/functional/lvm/tox.ini | 8 +- .../lvm/xenial/bluestore/dmcrypt/test.yml | 5 + .../lvm/xenial/filestore/dmcrypt/test.yml | 5 + .../centos7/bluestore/dmcrypt-luks/test.yml | 22 +- .../centos7/filestore/activate/test.yml | 4 +- .../tests/functional/simple/tox.ini | 6 +- .../simple/xenial/filestore/activate/test.yml | 22 +- .../tests/functional/tests/__init__.py | 0 .../tests/functional/tests/conftest.py | 103 ++ .../tests/functional/tests/osd/__init__.py | 0 .../tests/functional/tests/osd/test_osds.py | 60 + .../tests/systemd/test_systemctl.py | 21 + .../ceph_volume/tests/util/test_device.py | 36 + .../ceph_volume/tests/util/test_disk.py | 22 - .../ceph-volume/ceph_volume/util/device.py | 15 +- ceph/src/ceph-volume/ceph_volume/util/disk.py | 4 - ceph/src/ceph-volume/tox.ini | 2 +- ceph/src/ceph.in | 21 +- ceph/src/client/Client.cc | 153 +- ceph/src/client/Client.h | 7 +- ceph/src/common/AsyncReserver.h | 72 + ceph/src/common/ceph_crypto.cc | 119 ++ ceph/src/common/ceph_crypto.h | 18 +- ceph/src/common/ceph_timer.h | 2 + ceph/src/common/legacy_config_opts.h | 2 +- ceph/src/common/options.cc | 73 +- ceph/src/common/str_map.cc | 20 +- ceph/src/crush/CrushWrapper.cc | 103 +- ceph/src/crush/CrushWrapper.h | 16 +- ceph/src/journal/Journaler.cc | 4 +- ceph/src/log/test.cc | 20 +- ceph/src/mds/Beacon.cc | 53 +- ceph/src/mds/Beacon.h | 2 +- ceph/src/mds/CInode.cc | 10 +- ceph/src/mds/Capability.cc | 82 +- ceph/src/mds/Capability.h | 134 +- ceph/src/mds/Locker.cc | 236 +-- ceph/src/mds/Locker.h | 6 +- ceph/src/mds/MDBalancer.cc | 2 +- ceph/src/mds/MDCache.cc | 295 ++-- ceph/src/mds/MDCache.h | 41 +- ceph/src/mds/MDSDaemon.cc | 7 +- ceph/src/mds/MDSDaemon.h | 2 +- ceph/src/mds/MDSRank.cc | 137 +- ceph/src/mds/MDSRank.h | 6 +- ceph/src/mds/Migrator.cc | 5 +- ceph/src/mds/Server.cc | 308 ++-- ceph/src/mds/Server.h | 16 +- ceph/src/mds/SessionMap.cc | 108 +- ceph/src/mds/SessionMap.h | 119 +- ceph/src/mds/SimpleLock.h | 5 +- ceph/src/messages/MMDSFragmentNotify.h | 39 +- ceph/src/messages/MMDSFragmentNotifyAck.h | 57 + ceph/src/mgr/ActivePyModules.cc | 12 +- ceph/src/mgr/BaseMgrModule.cc | 13 + ceph/src/mgr/DaemonServer.cc | 3 + ceph/src/mon/AuthMonitor.cc | 46 +- ceph/src/mon/AuthMonitor.h | 15 +- ceph/src/mon/CMakeLists.txt | 1 + ceph/src/mon/FSCommands.cc | 10 + ceph/src/mon/MDSMonitor.cc | 7 +- ceph/src/mon/MgrStatMonitor.cc | 4 +- ceph/src/mon/PGMap.cc | 108 +- ceph/src/msg/Message.cc | 5 + ceph/src/msg/Message.h | 1 + ceph/src/msg/async/AsyncConnection.cc | 27 +- ceph/src/msg/async/AsyncConnection.h | 1 - ceph/src/msg/async/EventEpoll.cc | 14 +- ceph/src/msg/msg_types.h | 32 +- ceph/src/msg/simple/Pipe.cc | 6 +- ceph/src/os/CMakeLists.txt | 4 +- ceph/src/os/bluestore/Allocator.cc | 13 +- ceph/src/os/bluestore/Allocator.h | 17 +- ceph/src/os/bluestore/BitAllocator.cc | 1420 ----------------- ceph/src/os/bluestore/BitAllocator.h | 569 ------- ceph/src/os/bluestore/BitMapAllocator.cc | 220 --- ceph/src/os/bluestore/BitMapAllocator.h | 50 - ceph/src/os/bluestore/BitmapAllocator.cc | 101 ++ ceph/src/os/bluestore/BitmapAllocator.h | 50 + ceph/src/os/bluestore/BlockDevice.h | 2 +- ceph/src/os/bluestore/BlueFS.cc | 49 +- ceph/src/os/bluestore/BlueFS.h | 2 +- ceph/src/os/bluestore/BlueStore.cc | 167 +- 
ceph/src/os/bluestore/BlueStore.h | 11 +- ceph/src/os/bluestore/KernelDevice.cc | 6 +- ceph/src/os/bluestore/StupidAllocator.cc | 84 +- ceph/src/os/bluestore/StupidAllocator.h | 9 +- ceph/src/os/bluestore/bluefs_types.h | 7 +- ceph/src/os/bluestore/bluestore_types.cc | 30 +- ceph/src/os/bluestore/bluestore_types.h | 99 +- .../os/bluestore/fastbmap_allocator_impl.cc | 544 +++++++ .../os/bluestore/fastbmap_allocator_impl.h | 774 +++++++++ ceph/src/os/filestore/FileStore.cc | 26 +- ceph/src/os/filestore/LFNIndex.cc | 10 +- ceph/src/os/filestore/WBThrottle.cc | 9 +- ceph/src/osd/ECBackend.cc | 5 +- ceph/src/osd/OSD.cc | 99 +- ceph/src/osd/OSDMap.cc | 644 +++++--- ceph/src/osd/OSDMap.h | 1 + ceph/src/osd/PG.cc | 77 +- ceph/src/osd/PG.h | 23 +- ceph/src/osd/PrimaryLogPG.cc | 11 +- ceph/src/osd/osd_types.h | 7 +- ceph/src/osdc/Objecter.cc | 13 +- ceph/src/pybind/mgr/balancer/module.py | 45 +- ceph/src/pybind/mgr/dashboard/module.py | 15 + ceph/src/pybind/mgr/prometheus/module.py | 44 +- ceph/src/pybind/mgr/restful/api/crush.py | 6 +- ceph/src/pybind/mgr/restful/common.py | 54 +- ceph/src/pybind/mgr/restful/module.py | 7 +- ceph/src/rbdmap | 57 +- ceph/src/rgw/CMakeLists.txt | 8 +- ceph/src/rgw/rgw_admin.cc | 23 +- ceph/src/rgw/rgw_auth_s3.cc | 34 +- ceph/src/rgw/rgw_auth_s3.h | 18 +- ceph/src/rgw/rgw_bucket.cc | 51 +- ceph/src/rgw/rgw_common.cc | 3 - ceph/src/rgw/rgw_common.h | 28 +- ceph/src/rgw/rgw_cr_rados.cc | 4 +- ceph/src/rgw/rgw_cr_rest.h | 87 +- ceph/src/rgw/rgw_crypt.cc | 103 +- ceph/src/rgw/rgw_crypt.h | 4 + ceph/src/rgw/rgw_data_sync.cc | 83 +- ceph/src/rgw/rgw_data_sync.h | 8 +- ceph/src/rgw/rgw_es_query.cc | 34 +- ceph/src/rgw/rgw_file.h | 27 +- ceph/src/rgw/rgw_gc.cc | 1 + ceph/src/rgw/rgw_iam_policy.h | 2 + ceph/src/rgw/rgw_ldap.cc | 2 +- ceph/src/rgw/rgw_loadgen.cc | 1 + ceph/src/rgw/rgw_op.cc | 7 +- ceph/src/rgw/rgw_rados.cc | 178 ++- ceph/src/rgw/rgw_rados.h | 10 +- ceph/src/rgw/rgw_reshard.cc | 5 + ceph/src/rgw/rgw_rest_client.cc | 2 +- ceph/src/rgw/rgw_rest_conn.h | 42 +- ceph/src/rgw/rgw_rest_s3.cc | 56 +- ceph/src/rgw/rgw_rest_s3.h | 2 + ceph/src/rgw/rgw_sync_module.cc | 4 +- ceph/src/rgw/rgw_sync_module_es.cc | 329 +++- ceph/src/rgw/rgw_sync_module_es.h | 28 + ceph/src/rgw/rgw_sync_module_es_rest.cc | 17 +- ceph/src/test/cli/osdmaptool/upmap-out.t | 13 +- ceph/src/test/cli/osdmaptool/upmap.t | 11 +- ceph/src/test/cli/rbd/help.t | 10 +- ceph/src/test/common/test_str_map.cc | 12 + ceph/src/test/librbd/fsx.cc | 11 +- ceph/src/test/mds/TestSessionFilter.cc | 22 +- ceph/src/test/objectstore/Allocator_bench.cc | 340 ++++ ceph/src/test/objectstore/Allocator_test.cc | 221 +-- .../src/test/objectstore/BitAllocator_test.cc | 593 ------- ceph/src/test/objectstore/CMakeLists.txt | 25 +- .../objectstore/fastbmap_allocator_test.cc | 933 +++++++++++ .../test/objectstore/test_bluestore_types.cc | 18 +- ceph/src/test/osd/TestOSDMap.cc | 372 ++++- ceph/src/test/osd/TestPGLog.cc | 2 + ceph/src/test/rgw/rgw_multi/tests.py | 129 +- ceph/src/test/rgw/rgw_multi/zone_rados.py | 42 +- ceph/src/test/rgw/test_rgw_crypto.cc | 170 +- ceph/src/test/smoke.sh | 11 +- ceph/src/tools/ceph_monstore_tool.cc | 179 +-- ceph/src/tools/ceph_objectstore_tool.cc | 17 + ceph/src/tools/rbd/ArgumentTypes.cc | 4 +- ceph/src/tools/rbd_mirror/ImageReplayer.cc | 4 +- ceph/src/tools/rebuild_mondb.cc | 60 - ceph/src/valgrind.supp | 263 --- 251 files changed, 10165 insertions(+), 5421 deletions(-) create mode 100644 ceph/doc/radosgw/placement.rst create mode 120000 ceph/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml rename 
ceph/qa/{cephfs/objectstore-ec/bluestore.yaml => objectstore/bluestore-stupid.yaml} (96%) delete mode 100644 ceph/qa/objectstore/bluestore.yaml create mode 120000 ceph/qa/objectstore_cephfs/bluestore-bitmap.yaml delete mode 120000 ceph/qa/objectstore_cephfs/bluestore.yaml create mode 100755 ceph/qa/standalone/osd/osd-backfill-prio.sh create mode 100755 ceph/qa/standalone/osd/osd-recovery-prio.sh create mode 120000 ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml create mode 120000 ceph/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/fs/basic_functional/objectstore/bluestore.yaml delete mode 100644 ceph/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml create mode 120000 ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml create mode 120000 ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml create mode 120000 ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml create mode 120000 ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml create mode 120000 ceph/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/smoke/basic/objectstore/bluestore.yaml create mode 120000 ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml delete mode 120000 ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml create mode 100644 ceph/qa/valgrind.supp delete mode 100755 ceph/qa/workunits/libcephfs-java/test.sh create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/tests/__init__.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/tests/osd/__init__.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py create mode 100644 ceph/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py create mode 100644 ceph/src/messages/MMDSFragmentNotifyAck.h delete mode 100644 ceph/src/os/bluestore/BitAllocator.cc delete mode 100644 ceph/src/os/bluestore/BitAllocator.h delete mode 100644 ceph/src/os/bluestore/BitMapAllocator.cc delete mode 100644 ceph/src/os/bluestore/BitMapAllocator.h create mode 100755 ceph/src/os/bluestore/BitmapAllocator.cc create mode 100755 ceph/src/os/bluestore/BitmapAllocator.h create mode 100755 ceph/src/os/bluestore/fastbmap_allocator_impl.cc create mode 100755 ceph/src/os/bluestore/fastbmap_allocator_impl.h create mode 100755 ceph/src/test/objectstore/Allocator_bench.cc delete mode 100644 ceph/src/test/objectstore/BitAllocator_test.cc create mode 100755 ceph/src/test/objectstore/fastbmap_allocator_test.cc delete mode 100644 ceph/src/valgrind.supp diff --git a/Makefile b/Makefile index 582d27904..72c31d182 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ RELEASE=5.3 PACKAGE=ceph -VER=12.2.11 +VER=12.2.12 DEBREL=pve1 SRCDIR=ceph diff --git a/ceph/CMakeLists.txt b/ceph/CMakeLists.txt index 5403de8f4..bffe5649b 100644 --- a/ceph/CMakeLists.txt +++ b/ceph/CMakeLists.txt @@ -1,7 +1,7 @@ 
cmake_minimum_required(VERSION 2.8.11) project(ceph) -set(VERSION 12.2.11) +set(VERSION 12.2.12) if(POLICY CMP0046) # Tweak policies (this one disables "missing" dependency warning) diff --git a/ceph/PendingReleaseNotes b/ceph/PendingReleaseNotes index b75c79fb1..03daee301 100644 --- a/ceph/PendingReleaseNotes +++ b/ceph/PendingReleaseNotes @@ -1,5 +1,12 @@ ->= 12.2.11 ----------- +12.2.12 +------- +* In 12.2.9 and earlier releases, keyring caps were not checked for validity, + so the caps string could be anything. As of 12.2.10, caps strings are + validated and providing a keyring with an invalid caps string to, e.g., + "ceph auth add" will result in an error. + +12.2.11 +------- * `cephfs-journal-tool` makes rank argument (--rank) mandatory. Rank is of format `filesystem:rank`, where `filesystem` is the cephfs filesystem and `rank` is the MDS rank on which the operation is to be executed. To @@ -8,6 +15,26 @@ suffixed dump files. Importing journal information from dump files is disallowed if operation is targetted for all ranks. +* The MDS cache trimming is now throttled. Dropping the MDS cache + via the `ceph tell mds. cache drop` command or large reductions in the + cache size will no longer cause service unavailability. + +* The CephFS MDS behavior with recalling caps has been significantly improved + to not attempt recalling too many caps at once, leading to instability. + MDS with a large cache (64GB+) should be more stable. + +* MDS now provides a config option "mds_max_caps_per_client" (default: 1M) to + limit the number of caps a client session may hold. Long running client + sessions with a large number of caps have been a source of instability in the + MDS when all of these caps need to be processed during certain session + events. It is recommended to not unnecessarily increase this value. + +* The MDS config mds_recall_state_timeout has been removed. Late client recall + warnings are now generated based on the number of caps the MDS has recalled + which have not been released. The new configs mds_recall_warning_threshold + (default: 32K) and mds_recall_warning_decay_rate (default: 60s) sets the + threshold for this warning. 
+ >= 12.1.2 --------- * When running 'df' on a CephFS filesystem comprising exactly one data pool, diff --git a/ceph/admin/doc-requirements.txt b/ceph/admin/doc-requirements.txt index dc1411303..44920d4bc 100644 --- a/ceph/admin/doc-requirements.txt +++ b/ceph/admin/doc-requirements.txt @@ -1,3 +1,3 @@ Sphinx == 1.6.3 -e git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa --e git+https://github.com/michaeljones/breathe#egg=breathe +breathe == 4.11.1 diff --git a/ceph/alpine/APKBUILD b/ceph/alpine/APKBUILD index 220346e45..f29f0428b 100644 --- a/ceph/alpine/APKBUILD +++ b/ceph/alpine/APKBUILD @@ -1,7 +1,7 @@ # Contributor: John Coyle # Maintainer: John Coyle pkgname=ceph -pkgver=12.2.11 +pkgver=12.2.12 pkgrel=0 pkgdesc="Ceph is a distributed object store and file system" pkgusers="ceph" @@ -63,7 +63,7 @@ makedepends=" xmlstarlet yasm " -source="ceph-12.2.11.tar.bz2" +source="ceph-12.2.12.tar.bz2" subpackages=" $pkgname-base $pkgname-common @@ -116,7 +116,7 @@ _sysconfdir=/etc _udevrulesdir=/etc/udev/rules.d _python_sitelib=/usr/lib/python2.7/site-packages -builddir=$srcdir/ceph-12.2.11 +builddir=$srcdir/ceph-12.2.12 build() { export CEPH_BUILD_VIRTUALENV=$builddir diff --git a/ceph/ceph.spec b/ceph/ceph.spec index d10206738..9c42008a1 100644 --- a/ceph/ceph.spec +++ b/ceph/ceph.spec @@ -61,7 +61,7 @@ # main package definition ################################################################################# Name: ceph -Version: 12.2.11 +Version: 12.2.12 Release: 0%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 @@ -77,7 +77,7 @@ License: LGPL-2.1 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and Group: System/Filesystems %endif URL: http://ceph.com/ -Source0: http://ceph.com/download/ceph-12.2.11.tar.bz2 +Source0: http://ceph.com/download/ceph-12.2.12.tar.bz2 %if 0%{?suse_version} %if 0%{?is_opensuse} ExclusiveArch: x86_64 aarch64 ppc64 ppc64le @@ -382,7 +382,7 @@ Summary: Ceph daemon for mirroring RBD images %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ Summary: Rados REST gateway %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif @@ -788,7 +788,7 @@ python-rbd, python-rgw or python-cephfs instead. 
# common ################################################################################# %prep -%autosetup -p1 -n ceph-12.2.11 +%autosetup -p1 -n ceph-12.2.12 %build %if 0%{with cephfs_java} diff --git a/ceph/ceph.spec.in b/ceph/ceph.spec.in index fa34ade2d..aa3c47abe 100644 --- a/ceph/ceph.spec.in +++ b/ceph/ceph.spec.in @@ -382,7 +382,7 @@ Summary: Ceph daemon for mirroring RBD images %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ Summary: Rados REST gateway %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif diff --git a/ceph/changelog.upstream b/ceph/changelog.upstream index eaed6bfbb..3f8f429e8 100644 --- a/ceph/changelog.upstream +++ b/ceph/changelog.upstream @@ -1,3 +1,9 @@ +ceph (12.2.12-1) stable; urgency=medium + + * New upstream release + + -- Ceph Release Team Thu, 11 Apr 2019 12:33:49 +0000 + ceph (12.2.11-1) stable; urgency=medium * New upstream release diff --git a/ceph/doc/api/libcephfs-java.rst b/ceph/doc/api/libcephfs-java.rst index 85b5c3112..83b5a6638 100644 --- a/ceph/doc/api/libcephfs-java.rst +++ b/ceph/doc/api/libcephfs-java.rst @@ -2,8 +2,17 @@ Libcephfs (JavaDoc) =================== +.. warning:: + + CephFS Java bindings are no longer tested by CI. They may not work properly + or corrupt data. + + Developers interested in reviving these bindings by fixing and writing tests + are encouraged to contribute! + .. The admin/build-docs script runs Ant to build the JavaDoc files, and copies them to api/libcephfs-java/javadoc/. + View the auto-generated `JavaDoc pages for the CephFS Java bindings `_. diff --git a/ceph/doc/ceph-volume/simple/scan.rst b/ceph/doc/ceph-volume/simple/scan.rst index 320fee8fb..2749b14b6 100644 --- a/ceph/doc/ceph-volume/simple/scan.rst +++ b/ceph/doc/ceph-volume/simple/scan.rst @@ -9,6 +9,7 @@ PLAIN formats is fully supported. The command has the ability to inspect a running OSD, by inspecting the directory where the OSD data is stored, or by consuming the data partition. +The command can also scan all running OSDs if no path or device is provided. Once scanned, information will (by default) persist the metadata as JSON in a file in ``/etc/ceph/osd``. This ``JSON`` file will use the naming convention @@ -31,6 +32,16 @@ the contents to ``stdout`` (no file will be written):: .. _ceph-volume-simple-scan-directory: +Running OSDs scan +----------------- +Using this command without providing an OSD directory or device will scan the +directories of any currently running OSDs. If a running OSD was not created +by ceph-disk it will be ignored and not scanned. + +To scan all running ceph-disk OSDs, the command would look like:: + + ceph-volume simple scan + Directory scan -------------- The directory scan will capture OSD file contents from interesting files. There diff --git a/ceph/doc/man/8/ceph-volume.rst b/ceph/doc/man/8/ceph-volume.rst index 9ad5a5237..5b1035ef7 100644 --- a/ceph/doc/man/8/ceph-volume.rst +++ b/ceph/doc/man/8/ceph-volume.rst @@ -280,6 +280,10 @@ directory as well. 
Optionally, the JSON blob can be sent to stdout for further inspection. +Usage on all running OSDs:: + + ceph-voume simple scan + Usage on data devices:: ceph-volume simple scan @@ -295,7 +299,7 @@ Optional arguments: * [--stdout] Send the JSON blob to stdout * [--force] If the JSON file exists at destination, overwrite it -Required Positional arguments: +Optional Positional arguments: * Actual data partition or a path to the running OSD diff --git a/ceph/doc/man/8/ceph.rst b/ceph/doc/man/8/ceph.rst index 32482a7d2..27efc67d1 100644 --- a/ceph/doc/man/8/ceph.rst +++ b/ceph/doc/man/8/ceph.rst @@ -1465,6 +1465,16 @@ Options reply to outfile. Only specific monitor commands (e.g. osd getmap) return a payload. +.. option:: --setuser user + + will apply the appropriate user ownership to the file specified by + the option '-o'. + +.. option:: --setgroup group + + will apply the appropriate group ownership to the file specified by + the option '-o'. + .. option:: -c ceph.conf, --conf=ceph.conf Use ceph.conf configuration file instead of the default diff --git a/ceph/doc/man/8/rbdmap.rst b/ceph/doc/man/8/rbdmap.rst index ba8001ff8..e6980ab7e 100644 --- a/ceph/doc/man/8/rbdmap.rst +++ b/ceph/doc/man/8/rbdmap.rst @@ -46,6 +46,8 @@ This will cause the script to issue an ``rbd map`` command like the following:: rbd map POOLNAME/IMAGENAME --PARAM1 VAL1 --PARAM2 VAL2 (See the ``rbd`` manpage for a full list of possible options.) +For parameters and values which contain commas or equality signs, a simple +apostrophe can be used to prevent replacing them. When run as ``rbdmap map``, the script parses the configuration file, and for each RBD image specified attempts to first map the image (using the ``rbd map`` @@ -77,11 +79,12 @@ sequence.) Examples ======== -Example ``/etc/ceph/rbdmap`` for two RBD images called "bar1" and "bar2", both -in pool "foopool":: +Example ``/etc/ceph/rbdmap`` for three RBD images called "bar1", "bar2" and "bar3", +which are in pool "foopool":: foopool/bar1 id=admin,keyring=/etc/ceph/ceph.client.admin.keyring foopool/bar2 id=admin,keyring=/etc/ceph/ceph.client.admin.keyring + foopool/bar3 id=admin,keyring=/etc/ceph/ceph.client.admin.keyring,options='lock_on_read,queue_depth=1024' Each line in the file contains two strings: the image spec and the options to be passed to ``rbd map``. These two lines get transformed into the following @@ -89,12 +92,14 @@ commands:: rbd map foopool/bar1 --id admin --keyring /etc/ceph/ceph.client.admin.keyring rbd map foopool/bar2 --id admin --keyring /etc/ceph/ceph.client.admin.keyring + rbd map foopool/bar2 --id admin --keyring /etc/ceph/ceph.client.admin.keyring --options lock_on_read,queue_depth=1024 If the images had XFS filesystems on them, the corresponding ``/etc/fstab`` entries might look like this:: /dev/rbd/foopool/bar1 /mnt/bar1 xfs noauto 0 0 /dev/rbd/foopool/bar2 /mnt/bar2 xfs noauto 0 0 + /dev/rbd/foopool/bar3 /mnt/bar3 xfs noauto 0 0 After creating the images and populating the ``/etc/ceph/rbdmap`` file, making the images get automatically mapped and mounted at boot is just a matter of diff --git a/ceph/doc/rados/configuration/mon-config-ref.rst b/ceph/doc/rados/configuration/mon-config-ref.rst index 6c8e92b17..640e38203 100644 --- a/ceph/doc/rados/configuration/mon-config-ref.rst +++ b/ceph/doc/rados/configuration/mon-config-ref.rst @@ -1193,7 +1193,7 @@ Miscellaneous will be splitted on all OSDs serving that pool. We want to avoid extreme multipliers on PG splits. 
:Type: Integer -:Default: 300 +:Default: 32 ``mon session timeout`` diff --git a/ceph/doc/rados/configuration/osd-config-ref.rst b/ceph/doc/rados/configuration/osd-config-ref.rst index f839122cf..15b78d583 100644 --- a/ceph/doc/rados/configuration/osd-config-ref.rst +++ b/ceph/doc/rados/configuration/osd-config-ref.rst @@ -850,30 +850,14 @@ Ceph performs well as the OSD map grows larger. :Description: The number of OSD maps to keep cached. :Type: 32-bit Integer -:Default: ``500`` - - -``osd map cache bl size`` - -:Description: The size of the in-memory OSD map cache in OSD daemons. -:Type: 32-bit Integer :Default: ``50`` -``osd map cache bl inc size`` - -:Description: The size of the in-memory OSD map cache incrementals in - OSD daemons. - -:Type: 32-bit Integer -:Default: ``100`` - - ``osd map message max`` :Description: The maximum map entries allowed per MOSDMap message. :Type: 32-bit Integer -:Default: ``100`` +:Default: ``40`` diff --git a/ceph/doc/rados/operations/health-checks.rst b/ceph/doc/rados/operations/health-checks.rst index c1e22004a..e141f6bcd 100644 --- a/ceph/doc/rados/operations/health-checks.rst +++ b/ceph/doc/rados/operations/health-checks.rst @@ -514,7 +514,7 @@ PG_NOT_DEEP_SCRUBBED ____________________ One or more PGs has not been deep scrubbed recently. PGs are normally -scrubbed every ``osd_deep_mon_scrub_interval`` seconds, and this warning +scrubbed every ``osd_deep_scrub_interval`` seconds, and this warning triggers when ``mon_warn_not_deep_scrubbed`` such intervals have elapsed without a scrub. diff --git a/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst b/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst index 642b2e07b..6b3ec15e2 100644 --- a/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst +++ b/ceph/doc/rados/troubleshooting/troubleshooting-mon.rst @@ -460,12 +460,12 @@ Following information are not recoverable using the steps above: using ``ceph-monstore-tool``. But the MDS keyrings and other keyrings are missing in the recovered monitor store. You might need to re-add them manually. -- **pg settings**: the ``full ratio`` and ``nearfull ratio`` settings configured using - ``ceph pg set_full_ratio`` and ``ceph pg set_nearfull_ratio`` will be lost. +- **creating pools**: If any RADOS pools were in the process of being creating, that state is lost. The recovery tool assumes that all pools have been created. If there are PGs that are stuck in the 'unknown' after the recovery for a partially created pool, you can force creation of the *empty* PG with the ``ceph osd force-create-pg`` command. Note that this will create an *empty* PG, so only do this if you know the pool is empty. - **MDS Maps**: the MDS maps are lost. + Everything Failed! Now What? ============================= diff --git a/ceph/doc/radosgw/index.rst b/ceph/doc/radosgw/index.rst index 2e25fdbf1..eac49177f 100644 --- a/ceph/doc/radosgw/index.rst +++ b/ceph/doc/radosgw/index.rst @@ -40,6 +40,7 @@ you may write data with one API and retrieve it with the other. Manual Install w/Civetweb <../../install/install-ceph-gateway> HTTP Frontends + Pool Placement Multisite Configuration Configuring Pools Config Reference diff --git a/ceph/doc/radosgw/placement.rst b/ceph/doc/radosgw/placement.rst new file mode 100644 index 000000000..ce9ecbc2e --- /dev/null +++ b/ceph/doc/radosgw/placement.rst @@ -0,0 +1,180 @@ +============== +Pool Placement +============== + +.. contents:: + +Placement Targets +================= + +.. 
versionadded:: Jewel + +Placement targets control which `Pools`_ are associated with a particular +bucket. A bucket's placement target is selected on creation, and cannot be +modified. The ``radosgw-admin bucket stats`` command will display its +``placement_rule``. + +The zonegroup configuration contains a list of placement targets with an +initial target named ``default-placement``. The zone configuration then maps +each zonegroup placement target name onto its local storage. This zone +placement information includes the ``index_pool`` name for the bucket index, +the ``data_extra_pool`` name for metadata about incomplete multipart uploads, +and a ``data_pool`` name for object data. + +Zonegroup/Zone Configuration +============================ + +Placement configuration is performed with ``radosgw-admin`` commands on +the zonegroups and zones. + +The zonegroup placement configuration can be queried with: + +:: + + $ radosgw-admin zonegroup get + { + "id": "ab01123f-e0df-4f29-9d71-b44888d67cd5", + "name": "default", + "api_name": "default", + ... + "placement_targets": [ + { + "name": "default-placement", + "tags": [], + } + ], + "default_placement": "default-placement", + ... + } + +The zone placement configuration can be queried with: + +:: + + $ radosgw-admin zone get + { + "id": "557cdcee-3aae-4e9e-85c7-2f86f5eddb1f", + "name": "default", + "domain_root": "default.rgw.meta:root", + ... + "placement_pools": [ + { + "key": "default-placement", + "val": { + "index_pool": "default.rgw.buckets.index", + "data_pool": "default.rgw.buckets.data", + "data_extra_pool": "default.rgw.buckets.non-ec", + "index_type": 0 + } + } + ], + ... + } + +.. note:: If you have not done any previous `Multisite Configuration`_, + a ``default`` zone and zonegroup are created for you, and changes + to the zone/zonegroup will not take effect until the Ceph Object + Gateways are restarted. If you have created a realm for multisite, + the zone/zonegroup changes will take effect once the changes are + committed with ``radosgw-admin period update --commit``. + +Adding a Placement Target +------------------------- + +To create a new placement target named ``temporary``, start by adding it to +the zonegroup: + +:: + + $ radosgw-admin zonegroup placement add \ + --rgw-zonegroup default \ + --placement-id temporary + +Then provide the zone placement info for that target: + +:: + + $ radosgw-admin zone placement add \ + --rgw-zone default \ + --placement-id temporary \ + --data-pool default.rgw.temporary.data \ + --index-pool default.rgw.temporary.index \ + --data-extra-pool default.rgw.temporary.non-ec \ + --compression lz4 + +Customizing Placement +===================== + +Default Placement +----------------- + +By default, new buckets will use the zonegroup's ``default_placement`` target. +This zonegroup setting can be changed with: + +:: + + $ radosgw-admin zonegroup placement default \ + --rgw-zonegroup default \ + --placement-id new-placement + +User Placement +-------------- + +A Ceph Object Gateway user can override the zonegroup's default placement +target by setting a non-empty ``default_placement`` field in the user info. + +:: + + $ radosgw-admin user info --uid testid + { + ... + "default_placement": "", + "placement_tags": [], + ... + } + +If a zonegroup's placement target contains any ``tags``, users will be unable +to create buckets with that placement target unless their user info contains +at least one matching tag in its ``placement_tags`` field. 
This can be useful +to restrict access to certain types of storage. + +The ``radosgw-admin`` command cannot modify these fields directly, so the json +format must be edited manually: + +:: + + $ radosgw-admin metadata get user: > user.json + $ vi user.json + $ radosgw-admin metadata put user: < user.json + +S3 Bucket Placement +------------------- + +When creating a bucket with the S3 protocol, a placement target can be +provided as part of the LocationConstraint to override the default placement +targets from the user and zonegroup. + +Normally, the LocationConstraint must match the zonegroup's ``api_name``: + +:: + + default + +A custom placement target can be added to the ``api_name`` following a colon: + +:: + + default:new-placement + +Swift Bucket Placement +---------------------- + +When creating a bucket with the Swift protocol, a placement target can be +provided in the HTTP header ``X-Storage-Policy``: + +:: + + X-Storage-Policy: new-placement + +.. _`Pools`: ../pools +.. _`Multisite Configuration`: ../multisite diff --git a/ceph/doc/radosgw/s3/authentication.rst b/ceph/doc/radosgw/s3/authentication.rst index b1875385b..3cdacc495 100644 --- a/ceph/doc/radosgw/s3/authentication.rst +++ b/ceph/doc/radosgw/s3/authentication.rst @@ -71,5 +71,126 @@ an object: | ``FULL_CONTROL`` | Grantee has full permissions for object in the bucket. | Grantee can read or write to the object ACL. | +------------------+--------------------------------------------------------+----------------------------------------------+ +Internally, S3 operations are mapped to ACL permissions thus: + ++---------------------------------------+---------------+ +| Operation | Permission | ++=======================================+===============+ +| ``s3:GetObject`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectTorrent`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersion`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersionTorrent`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectTagging`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersionTagging`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListAllMyBuckets`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListBucket`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListBucketMultipartUploads`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListBucketVersions`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListMultipartUploadParts`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:AbortMultipartUpload`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:CreateBucket`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteBucket`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteObject`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:s3DeleteObjectVersion`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:PutObject`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:PutObjectTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| 
``s3:PutObjectVersionTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteObjectTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteObjectVersionTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:RestoreObject`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:GetAccelerateConfiguration`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketAcl`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketCORS`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketLocation`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketLogging`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketNotification`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketPolicy`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketRequestPayment`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketTagging`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketVersioning`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketWebsite`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetLifecycleConfiguration`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectAcl`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersionAcl`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetReplicationConfiguration`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:DeleteBucketPolicy`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:DeleteBucketWebsite`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:DeleteReplicationConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutAccelerateConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketAcl`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketCORS`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketLogging`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketNotification`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketPolicy`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketRequestPayment`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketTagging`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutPutBucketVersioning`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketWebsite`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutLifecycleConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutObjectAcl`` | ``WRITE_ACP`` | 
++---------------------------------------+---------------+ +| ``s3:PutObjectVersionAcl`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutReplicationConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ + +Some mappings, (e.g. ``s3:CreateBucket`` to ``WRITE``) are not +applicable to S3 operation, but are required to allow Swift and S3 to +access the same resources when things like Swift user ACLs are in +play. This is one of the many reasons that you should use S3 bucket +policies rather than S3 ACLs when possible. + + .. _RFC 2104: http://www.ietf.org/rfc/rfc2104.txt .. _HMAC: http://en.wikipedia.org/wiki/HMAC diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml b/ceph/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml new file mode 120000 index 000000000..9fb86b9fe --- /dev/null +++ b/ceph/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml @@ -0,0 +1 @@ +../../objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml b/ceph/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml index ddb18fb79..bae220292 100644 --- a/ceph/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml +++ b/ceph/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff --git a/ceph/qa/objectstore/bluestore-bitmap.yaml b/ceph/qa/objectstore/bluestore-bitmap.yaml index 88dca3a21..b18e04bee 100644 --- a/ceph/qa/objectstore/bluestore-bitmap.yaml +++ b/ceph/qa/objectstore/bluestore-bitmap.yaml @@ -20,6 +20,8 @@ overrides: osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ overrides: mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff --git a/ceph/qa/cephfs/objectstore-ec/bluestore.yaml b/ceph/qa/objectstore/bluestore-stupid.yaml similarity index 96% rename from ceph/qa/cephfs/objectstore-ec/bluestore.yaml rename to ceph/qa/objectstore/bluestore-stupid.yaml index 19dfeb036..1d28ccbce 100644 --- a/ceph/qa/cephfs/objectstore-ec/bluestore.yaml +++ b/ceph/qa/objectstore/bluestore-stupid.yaml @@ -12,6 +12,7 @@ overrides: debug bluefs: 20 debug rocksdb: 10 bluestore fsck on mount: true + bluestore allocator: stupid # lower the full ratios since we can fill up a 100gb osd so quickly mon osd full ratio: .9 mon osd backfillfull_ratio: .85 diff --git a/ceph/qa/objectstore/bluestore.yaml b/ceph/qa/objectstore/bluestore.yaml deleted file mode 100644 index 19dfeb036..000000000 --- a/ceph/qa/objectstore/bluestore.yaml +++ /dev/null @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd 
objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff --git a/ceph/qa/objectstore_cephfs/bluestore-bitmap.yaml b/ceph/qa/objectstore_cephfs/bluestore-bitmap.yaml new file mode 120000 index 000000000..951e65ac0 --- /dev/null +++ b/ceph/qa/objectstore_cephfs/bluestore-bitmap.yaml @@ -0,0 +1 @@ +../objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/objectstore_cephfs/bluestore.yaml b/ceph/qa/objectstore_cephfs/bluestore.yaml deleted file mode 120000 index ad17c0eb7..000000000 --- a/ceph/qa/objectstore_cephfs/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/overrides/short_pg_log.yaml b/ceph/qa/overrides/short_pg_log.yaml index 6ac1bca7f..20cc101de 100644 --- a/ceph/qa/overrides/short_pg_log.yaml +++ b/ceph/qa/overrides/short_pg_log.yaml @@ -2,5 +2,5 @@ overrides: ceph: conf: global: - osd_min_pg_log_entries: 300 - osd_max_pg_log_entries: 600 + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff --git a/ceph/qa/packages/packages.yaml b/ceph/qa/packages/packages.yaml index 398656450..31fb66aa9 100644 --- a/ceph/qa/packages/packages.yaml +++ b/ceph/qa/packages/packages.yaml @@ -11,8 +11,6 @@ ceph: - python-ceph - libcephfs2 - libcephfs-dev - - libcephfs-java - - libcephfs-jni - librados2 - librbd1 - rbd-fuse @@ -40,8 +38,6 @@ ceph: - ceph - ceph-mgr - ceph-fuse - - cephfs-java - - libcephfs_jni1 - libcephfs2 - libcephfs-devel - librados2 diff --git a/ceph/qa/run-standalone.sh b/ceph/qa/run-standalone.sh index 2c7ceaa34..acb486e1a 100755 --- a/ceph/qa/run-standalone.sh +++ b/ceph/qa/run-standalone.sh @@ -36,6 +36,8 @@ trap finish TERM HUP INT PATH=$(pwd)/bin:$PATH +export LD_LIBRARY_PATH="$(pwd)/lib" + # TODO: Use getops dryrun=false if [[ "$1" = "--dry-run" ]]; then diff --git a/ceph/qa/standalone/ceph-helpers.sh b/ceph/qa/standalone/ceph-helpers.sh index 3883a6f58..9a4bae2a5 100755 --- a/ceph/qa/standalone/ceph-helpers.sh +++ b/ceph/qa/standalone/ceph-helpers.sh @@ -19,7 +19,7 @@ # TIMEOUT=300 PG_NUM=4 -TMPDIR=${TMPDIR:-/tmp} +TMPDIR=${TMPDIR:-${CEPH_BUILD_DIR}} CEPH_BUILD_VIRTUALENV=${TMPDIR} TESTDIR=${TESTDIR:-${TMPDIR}} @@ -389,6 +389,17 @@ function test_kill_daemons() { teardown $dir || return 1 } +# +# return a random TCP port which is not used yet +# +# please note, there could be racing if we use this function for +# a free port, and then try to bind on this port. 
+# +function get_unused_port() { + local ip=127.0.0.1 + python3 -c "import socket; s=socket.socket(); s.bind(('$ip', 0)); print(s.getsockname()[1]); s.close()" +} + ####################################################################### ## @@ -1411,6 +1422,7 @@ function test_get_timeout_delays() { # @return 0 if the cluster is clean, 1 otherwise # function wait_for_clean() { + local cmd=$1 local num_active_clean=-1 local cur_active_clean local -a delays=($(get_timeout_delays $TIMEOUT .1)) @@ -1436,6 +1448,8 @@ function wait_for_clean() { ceph report return 1 fi + # eval is a no-op if cmd is empty + eval $cmd sleep ${delays[$loop]} loop+=1 done diff --git a/ceph/qa/standalone/osd/osd-backfill-prio.sh b/ceph/qa/standalone/osd/osd-backfill-prio.sh new file mode 100755 index 000000000..248ac6fb0 --- /dev/null +++ b/ceph/qa/standalone/osd/osd-backfill-prio.sh @@ -0,0 +1,504 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2019 Red Hat +# +# Author: David Zafman +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + # Fix port???? + export CEPH_MON="127.0.0.1:7114" # git grep '\<7114\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON --osd_max_backfills=1 --debug_reserver=20 " + CEPH_ARGS+="--osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10 " + export objects=50 + export poolprefix=test + export FORCE_PRIO="254" # See OSD_BACKFILL_PRIORITY_FORCED + export DEGRADED_PRIO="140" # See OSD_BACKFILL_DEGRADED_PRIORITY_BASE + export NORMAL_PRIO="100" # See OSD_BACKFILL_PRIORITY_BASE + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + + +function TEST_backfill_priority() { + local dir=$1 + local pools=10 + local OSDS=5 + # size 2 -> 1 means degraded by 1, so add 1 to base prio + local degraded_prio=$(expr $DEGRADED_PRIO + 1) + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 3 pools with a pg with the same primaries but second + # replica on another osd. 
+ local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2 + + local PG3 + local POOLNUM3 + local pool3 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ -z "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2=$test_osd2 + elif [ -n "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 -a "$chk_osd2" != $test_osd2 ]; + then + PG3="${p}.0" + POOLNUM3=$p + pool3="${poolprefix}$p" + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" -o "pool3" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 -a $p != $POOLNUM3 ]; + then + delete_pool ${poolprefix}$p + fi + done + + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool3 size 1 + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 $pool3 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + ceph pg dump pgs + ERRORS=0 + + ceph osd set nobackfill + ceph osd set noout + + # Get a pg to want to backfill and quickly force it + # to be preempted. + ceph osd pool set $pool3 size 2 + sleep 2 + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 3. Item is in progress, adjust priority with no higher priority waiting + while(ceph pg force-backfill $PG3 2>&1 | grep -q "doesn't require backfilling") + do + sleep 2 + done + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + ceph osd out osd.$chk_osd1_2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + ceph pg dump pgs + + ceph osd pool set $pool2 size 2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + ceph pg dump pgs + + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG1}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "The normal PG ${PG1} doesn't have prio $NORMAL_PRIO queued waiting" + ERRORS=$(expr $ERRORS + 1) + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The force-backfill PG $PG3 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The force-backfill PG ${PG3} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # 1. 
Item is queued, re-queue with new priority + while(ceph pg force-backfill $PG2 2>&1 | grep -q "doesn't require backfilling") + do + sleep 2 + done + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$FORCE_PRIO" ]; + then + echo "The second force-backfill PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + flush_pg_stats || return 1 + + # 4. Item is in progress, if higher priority items waiting prempt item + ceph pg cancel-force-backfill $PG3 || return 1 + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG3}\")).prio") + if [ "$PRIO" != "$degraded_prio" ]; + then + echo "After cancel-force-backfill PG ${PG3} doesn't have prio $degraded_prio" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The force-recovery PG $PG2 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The first force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph pg cancel-force-backfill $PG2 || return 1 + sleep 5 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 2. Item is queued, re-queue and preempt because new priority higher than an in progress item + flush_pg_stats || return 1 + ceph pg force-backfill $PG3 || return 1 + sleep 2 + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$degraded_prio" ]; + then + echo "After cancel-force-backfill PG ${PG2} doesn't have prio $degraded_prio" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The force-backfill PG $PG3 didn't get promoted to an in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The force-backfill PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph osd unset noout + ceph osd unset nobackfill + + wait_for_clean "CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations" || return 1 + + ceph pg dump pgs + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_pgstate_history + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + delete_pool $pool3 + kill_daemons $dir || return 1 + return $ERRORS +} + +# +# Show that pool recovery_priority is added to the backfill priority +# +# Create 2 pools with 2 OSDs with different primarys +# pool 1 with recovery_priority 1 +# pool 2 with recovery_priority 2 +# +# Start backfill by changing the pool sizes from 1 to 2 +# Use dump_reservations to verify priorities +function 
TEST_backfill_pool_priority() { + local dir=$1 + local pools=3 # Don't assume the first 2 pools are exact what we want + local OSDS=2 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 2 pools with different primaries which + # means the replica must be on another osd. + local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2_1 + local chk_osd2_2 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ $chk_osd1_1 != $test_osd1 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2_1=$test_osd1 + chk_osd2_2=$test_osd2 + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ]; + then + delete_pool ${poolprefix}$p + fi + done + + pool1_extra_prio=1 + pool2_extra_prio=2 + # size 2 -> 1 means degraded by 1, so add 1 to base prio + pool1_prio=$(expr $DEGRADED_PRIO + 1 + $pool1_extra_prio) + pool2_prio=$(expr $DEGRADED_PRIO + 1 + $pool2_extra_prio) + + ceph osd pool set $pool1 size 1 + ceph osd pool set $pool1 recovery_priority $pool1_extra_prio + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool2 recovery_priority $pool2_extra_prio + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + ceph pg dump pgs + ERRORS=0 + + ceph osd pool set $pool1 size 2 + ceph osd pool set $pool2 size 2 + sleep 5 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/dump.${chk_osd1_1}.out + echo osd.${chk_osd1_1} + cat $dir/dump.${chk_osd1_1}.out + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_2}) dump_reservations > $dir/dump.${chk_osd1_2}.out + echo osd.${chk_osd1_2} + cat $dir/dump.${chk_osd1_2}.out + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG ${PG1} didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG ${PG1} didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != 
$pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG ${PG2} didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG $PG2 didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + wait_for_clean || return 1 + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + kill_daemons $dir || return 1 + return $ERRORS +} + +main osd-backfill-prio "$@" + +# Local Variables: +# compile-command: "make -j4 && ../qa/run-standalone.sh osd-backfill-prio.sh" +# End: diff --git a/ceph/qa/standalone/osd/osd-markdown.sh b/ceph/qa/standalone/osd/osd-markdown.sh index 6a28a305c..64157537d 100755 --- a/ceph/qa/standalone/osd/osd-markdown.sh +++ b/ceph/qa/standalone/osd/osd-markdown.sh @@ -45,7 +45,10 @@ function markdown_N_impl() { ceph osd tree ceph osd tree | grep osd.0 |grep up || return 1 # mark the OSD down. - ceph osd down 0 + # override any dup setting in the environment to ensure we do this + # exactly once (modulo messenger failures, at least; we can't *actually* + # provide exactly-once semantics for mon commands). + CEPH_CLI_TEST_DUP_COMMAND=0 ceph osd down 0 sleep $sleeptime done } diff --git a/ceph/qa/standalone/osd/osd-recovery-prio.sh b/ceph/qa/standalone/osd/osd-recovery-prio.sh new file mode 100755 index 000000000..25ecb2651 --- /dev/null +++ b/ceph/qa/standalone/osd/osd-recovery-prio.sh @@ -0,0 +1,500 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2019 Red Hat +# +# Author: David Zafman +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + # Fix port???? 
+ export CEPH_MON="127.0.0.1:7114" # git grep '\<7114\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON --osd_max_backfills=1 --debug_reserver=20" + export objects=200 + export poolprefix=test + export FORCE_PRIO="255" # See OSD_RECOVERY_PRIORITY_FORCED + export NORMAL_PRIO="180" # See OSD_RECOVERY_PRIORITY_BASE + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + + +function TEST_recovery_priority() { + local dir=$1 + local pools=10 + local OSDS=5 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 3 pools with a pg with the same primaries but second + # replica on another osd. + local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2 + + local PG3 + local POOLNUM3 + local pool3 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ -z "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2=$test_osd2 + elif [ -n "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 -a "$chk_osd2" != $test_osd2 ]; + then + PG3="${p}.0" + POOLNUM3=$p + pool3="${poolprefix}$p" + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" -o "pool3" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 -a $p != $POOLNUM3 ]; + then + delete_pool ${poolprefix}$p + fi + done + + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool3 size 1 + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 $pool3 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + ceph pg dump pgs + ERRORS=0 + + ceph osd set norecover + ceph osd set noout + + # Get a pg to want to recover and quickly force it + # to be preempted. + ceph osd pool set $pool3 size 2 + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 3. 
Item is in progress, adjust priority with no higher priority waiting + while(ceph pg force-recovery $PG3 2>&1 | grep -q "doesn't require recovery") + do + sleep 2 + done + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + ceph osd out osd.$chk_osd1_2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + ceph pg dump pgs + + ceph osd pool set $pool2 size 2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + ceph pg dump pgs + + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG1}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "The normal PG ${PG1} doesn't have prio $NORMAL_PRIO queued waiting" + ERRORS=$(expr $ERRORS + 1) + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The first force-recovery PG $PG3 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The first force-recovery PG ${PG3} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # 1. Item is queued, re-queue with new priority + while(ceph pg force-recovery $PG2 2>&1 | grep -q "doesn't require recovery") + do + sleep 2 + done + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$FORCE_PRIO" ]; + then + echo "The second force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + flush_pg_stats || return 1 + + # 4. Item is in progress, if higher priority items waiting prempt item + #ceph osd unset norecover + ceph pg cancel-force-recovery $PG3 || return 1 + sleep 2 + #ceph osd set norecover + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG3}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "After cancel-recovery PG ${PG3} doesn't have prio $NORMAL_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The force-recovery PG $PG2 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The first force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph pg cancel-force-recovery $PG2 || return 1 + sleep 5 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 2. 
Item is queued, re-queue and preempt because new priority higher than an in progress item + flush_pg_stats || return 1 + ceph pg force-recovery $PG3 || return 1 + sleep 2 + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "After cancel-force-recovery PG ${PG3} doesn't have prio $NORMAL_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The force-recovery PG $PG3 didn't get promoted to an in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph osd unset noout + ceph osd unset norecover + + wait_for_clean "CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations" || return 1 + + ceph pg dump pgs + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_pgstate_history + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + delete_pool $pool3 + kill_daemons $dir || return 1 + return $ERRORS +} + +# +# Show that pool recovery_priority is added to recovery priority +# +# Create 2 pools with 2 OSDs with different primarys +# pool 1 with recovery_priority 1 +# pool 2 with recovery_priority 2 +# +# Start recovery by changing the pool sizes from 1 to 2 +# Use dump_reservations to verify priorities +function TEST_recovery_pool_priority() { + local dir=$1 + local pools=3 # Don't assume the first 2 pools are exact what we want + local OSDS=2 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 2 pools with different primaries which + # means the replica must be on another osd. 
+ local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2_1 + local chk_osd2_2 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ $chk_osd1_1 != $test_osd1 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2_1=$test_osd1 + chk_osd2_2=$test_osd2 + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ]; + then + delete_pool ${poolprefix}$p + fi + done + + pool1_extra_prio=1 + pool2_extra_prio=2 + pool1_prio=$(expr $NORMAL_PRIO + $pool1_extra_prio) + pool2_prio=$(expr $NORMAL_PRIO + $pool2_extra_prio) + + ceph osd pool set $pool1 size 1 + ceph osd pool set $pool1 recovery_priority $pool1_extra_prio + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool2 recovery_priority $pool2_extra_prio + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + ceph pg dump pgs + ERRORS=0 + + ceph osd pool set $pool1 size 2 + ceph osd pool set $pool2 size 2 + sleep 10 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/dump.${chk_osd1_1}.out + echo osd.${chk_osd1_1} + cat $dir/dump.${chk_osd1_1}.out + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_2}) dump_reservations > $dir/dump.${chk_osd1_2}.out + echo osd.${chk_osd1_2} + cat $dir/dump.${chk_osd1_2}.out + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG for $pool1 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG for $pool1 didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG for $pool2 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio" + ERRORS=$(expr $ERRORS + 1) + fi + fi 
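
Each of the checks in this block repeats the same jq/eval pattern: dump the OSD's reservation state over the admin socket, take the first in_progress entry on the local (primary) or remote (replica) side, and compare its item and prio against the expected PG and priority, where the expected value is the base recovery priority plus the pool's recovery_priority. The Python sketch below restates that pattern once for readability; it is not part of the patch, it assumes only the JSON layout implied by the test's own jq expressions, and the helper names dump_reservations and check_in_progress are invented for illustration.

    # Illustrative sketch (not part of the patch): the jq/eval checks used by this
    # test, expressed once in Python. Assumes the dump_reservations JSON shape the
    # test queries: {"local_reservations": {"in_progress": [{"item": ..., "prio": ...}],
    #                "queues": [...]}, "remote_reservations": {...}}.
    import json
    import os
    import subprocess

    def dump_reservations(asok_path):
        # Same call the test issues: CEPH_ARGS='' ceph --admin-daemon <asok> dump_reservations
        out = subprocess.check_output(
            ['ceph', '--admin-daemon', asok_path, 'dump_reservations'],
            env=dict(os.environ, CEPH_ARGS=''))
        return json.loads(out)

    def check_in_progress(dump, side, expected_pg, expected_prio):
        # side is 'local_reservations' (primary) or 'remote_reservations' (replica);
        # returns 1 on a mismatch so it can be added to an error counter.
        entry = dump[side]['in_progress'][0]
        if entry['item'] != expected_pg:
            print("PG %s didn't become the in progress item" % expected_pg)
            return 1
        if entry['prio'] != expected_prio:
            print("PG %s doesn't have prio %s" % (expected_pg, expected_prio))
            return 1
        return 0

    # Expected priority here is NORMAL_PRIO (180) plus the pool's recovery_priority;
    # the analogous backfill test earlier uses DEGRADED_PRIO + 1 + recovery_priority.
    # errors += check_in_progress(dump_reservations(asok1), 'local_reservations', '1.0', 181)
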
+ + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG $PG2 didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + wait_for_clean || return 1 + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + kill_daemons $dir || return 1 + return $ERRORS +} + +main osd-recovery-prio "$@" + +# Local Variables: +# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-prio.sh" +# End: diff --git a/ceph/qa/standalone/scrub/osd-scrub-repair.sh b/ceph/qa/standalone/scrub/osd-scrub-repair.sh index b6d541bb3..8b228784e 100755 --- a/ceph/qa/standalone/scrub/osd-scrub-repair.sh +++ b/ceph/qa/standalone/scrub/osd-scrub-repair.sh @@ -194,7 +194,7 @@ function create_ec_pool() { local pool_name=$1 local allow_overwrites=$2 - ceph osd erasure-code-profile set myprofile crush-failure-domain=osd $3 $4 $5 $6 $7 || return 1 + ceph osd erasure-code-profile set myprofile crush-failure-domain=osd "$@" || return 1 create_pool "$poolname" 1 1 erasure myprofile || return 1 @@ -5245,7 +5245,7 @@ function TEST_periodic_scrub_replicated() { # Can't upgrade with this set ceph osd set nodeep-scrub # Let map change propagate to OSDs - flush pg_stats + flush_pg_stats sleep 5 # Fake a schedule scrub @@ -5274,6 +5274,91 @@ function TEST_periodic_scrub_replicated() { rados list-inconsistent-obj $pg | jq '.' 
| grep -qv $objname || return 1 } +function TEST_scrub_warning() { + local dir=$1 + local poolname=psr_pool + local objname=POBJ + local scrubs=5 + local deep_scrubs=5 + local i1_day=86400 + local i7_days=$(calc $i1_day \* 7) + local i14_days=$(calc $i1_day \* 14) + local overdue=$i1_day + local conf_overdue_seconds=$(calc $i7_days + $overdue ) + local pool_overdue_seconds=$(calc $i14_days + $overdue ) + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x --mon_warn_not_scrubbed=${overdue} --mon_warn_not_deep_scrubbed=${overdue} || return 1 + run_osd $dir 0 $ceph_osd_args --osd_scrub_backoff_ratio=0 || return 1 + + for i in $(seq 1 $(expr $scrubs + $deep_scrubs)) + do + create_pool $poolname-$i 1 1 || return 1 + wait_for_clean || return 1 + if [ $i = "1" ]; + then + ceph osd pool set $poolname-$i scrub_max_interval $i14_days + fi + if [ $i = $(expr $scrubs + 1) ]; + then + ceph osd pool set $poolname-$i deep_scrub_interval $i14_days + fi + done + + # Only 1 osd + local primary=0 + + ceph osd set noscrub || return 1 + ceph osd set nodeep-scrub || return 1 + ceph config set global osd_scrub_interval_randomize_ratio 0 + ceph config set global osd_deep_scrub_randomize_ratio 0 + ceph config set global osd_scrub_max_interval ${i7_days} + ceph config set global osd_deep_scrub_interval ${i7_days} + + # Fake schedule scrubs + for i in $(seq 1 $scrubs) + do + if [ $i = "1" ]; + then + overdue_seconds=$pool_overdue_seconds + else + overdue_seconds=$conf_overdue_seconds + fi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.${primary}) \ + trigger_scrub ${i}.0 $(expr ${overdue_seconds} + ${i}00) || return 1 + done + # Fake schedule deep scrubs + for i in $(seq $(expr $scrubs + 1) $(expr $scrubs + $deep_scrubs)) + do + if [ $i = "$(expr $scrubs + 1)" ]; + then + overdue_seconds=$pool_overdue_seconds + else + overdue_seconds=$conf_overdue_seconds + fi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.${primary}) \ + trigger_deep_scrub ${i}.0 $(expr ${overdue_seconds} + ${i}00) || return 1 + done + flush_pg_stats + + ceph health + ceph health detail + ceph health | grep -q "$deep_scrubs pgs not deep-scrubbed in time" || return 1 + ceph health | grep -q "$scrubs pgs not scrubbed in time" || return 1 + COUNT=$(ceph health detail | grep "not scrubbed since" | wc -l) + if [ "$COUNT" != $scrubs ]; then + ceph health detail | grep "not scrubbed since" + return 1 + fi + COUNT=$(ceph health detail | grep "not deep-scrubbed since" | wc -l) + if [ "$COUNT" != $deep_scrubs ]; then + ceph health detail | grep "not deep-scrubbed since" + return 1 + fi + return 0 +} + # # Corrupt snapset in replicated pool # diff --git a/ceph/qa/standalone/special/ceph_objectstore_tool.py b/ceph/qa/standalone/special/ceph_objectstore_tool.py index 0c6097c1f..1bde02b76 100755 --- a/ceph/qa/standalone/special/ceph_objectstore_tool.py +++ b/ceph/qa/standalone/special/ceph_objectstore_tool.py @@ -686,8 +686,8 @@ def main(argv): EC_NAME = "ECobject" if len(argv) > 0 and argv[0] == 'large': PG_COUNT = 12 - NUM_REP_OBJECTS = 800 - NUM_CLONED_REP_OBJECTS = 100 + NUM_REP_OBJECTS = 200 + NUM_CLONED_REP_OBJECTS = 50 NUM_EC_OBJECTS = 12 NUM_NSPACES = 4 # Larger data sets for first object per namespace @@ -1470,7 +1470,7 @@ def main(argv): for basename in db[nspace].keys(): file = os.path.join(DATADIR, nspace + "-" + basename + "__head") JSON = db[nspace][basename]['json'] - GETNAME = "/tmp/getbytes.{pid}".format(pid=pid) + jsondict = json.loads(JSON) for pg in OBJREPPGS: OSDS = get_osds(pg, OSDDIR) for osd 
in OSDS: @@ -1481,12 +1481,33 @@ def main(argv): continue if int(basename.split(REP_NAME)[1]) > int(NUM_CLONED_REP_OBJECTS): continue + logging.debug("REPobject " + JSON) cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON) logging.debug(cmd) ret = call(cmd, shell=True) if ret != 0: logging.error("Invalid dump for {json}".format(json=JSON)) ERRORS += 1 + if 'shard_id' in jsondict[1]: + logging.debug("ECobject " + JSON) + for pg in OBJECPGS: + OSDS = get_osds(pg, OSDDIR) + jsondict = json.loads(JSON) + for osd in OSDS: + DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg)))) + fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f)) + and f.split("_")[0] == basename and f.split("_")[4] == nspace] + if not fnames: + continue + if int(basename.split(EC_NAME)[1]) > int(NUM_EC_OBJECTS): + continue + # Fix shard_id since we only have one json instance for each object + jsondict[1]['shard_id'] = int(pg.split('s')[1]) + cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"hinfo\": [{{]' > /dev/null").format(osd=osd, json=json.dumps((pg, jsondict[1]))) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Invalid dump for {json}".format(json=JSON)) print("Test list-attrs get-attr") ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid) @@ -1497,16 +1518,16 @@ def main(argv): JSON = db[nspace][basename]['json'] jsondict = json.loads(JSON) - if 'shard_id' in jsondict: + if 'shard_id' in jsondict[1]: logging.debug("ECobject " + JSON) found = 0 for pg in OBJECPGS: OSDS = get_osds(pg, OSDDIR) # Fix shard_id since we only have one json instance for each object - jsondict['shard_id'] = int(pg.split('s')[1]) - JSON = json.dumps(jsondict) + jsondict[1]['shard_id'] = int(pg.split('s')[1]) + JSON = json.dumps((pg, jsondict[1])) for osd in OSDS: - cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr hinfo_key").format(osd=osd, pg=pg, json=JSON) + cmd = (CFSD_PREFIX + " '{json}' get-attr hinfo_key").format(osd=osd, json=JSON) logging.debug("TRY: " + cmd) try: out = check_output(cmd, shell=True, stderr=subprocess.STDOUT) @@ -1522,12 +1543,12 @@ def main(argv): for pg in ALLPGS: # Make sure rep obj with rep pg or ec obj with ec pg - if ('shard_id' in jsondict) != (pg.find('s') > 0): + if ('shard_id' in jsondict[1]) != (pg.find('s') > 0): continue - if 'shard_id' in jsondict: + if 'shard_id' in jsondict[1]: # Fix shard_id since we only have one json instance for each object - jsondict['shard_id'] = int(pg.split('s')[1]) - JSON = json.dumps(jsondict) + jsondict[1]['shard_id'] = int(pg.split('s')[1]) + JSON = json.dumps((pg, jsondict[1])) OSDS = get_osds(pg, OSDDIR) for osd in OSDS: DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg)))) @@ -1536,7 +1557,7 @@ def main(argv): if not fnames: continue afd = open(ATTRFILE, "wb") - cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' list-attrs").format(osd=osd, pg=pg, json=JSON) + cmd = (CFSD_PREFIX + " '{json}' list-attrs").format(osd=osd, json=JSON) logging.debug(cmd) ret = call(cmd, shell=True, stdout=afd) afd.close() @@ -1556,7 +1577,7 @@ def main(argv): continue exp = values.pop(key) vfd = open(VALFILE, "wb") - cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key="_" + key) + cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key="_" + key) logging.debug(cmd) ret = call(cmd, shell=True, stdout=vfd) vfd.close() diff --git 
a/ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml b/ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml b/ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml deleted file mode 120000 index bd7d7e004..000000000 --- a/ceph/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml b/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/basic_functional/objectstore/bluestore.yaml b/ceph/qa/suites/fs/basic_functional/objectstore/bluestore.yaml deleted file mode 120000 index bd7d7e004..000000000 --- a/ceph/qa/suites/fs/basic_functional/objectstore/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml b/ceph/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml deleted file mode 100644 index aaffa0338..000000000 --- a/ceph/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml +++ /dev/null @@ -1,14 +0,0 @@ - -os_type: ubuntu -os_version: "14.04" - -overrides: - ceph-fuse: - disabled: true - kclient: - disabled: true -tasks: -- workunit: - clients: - client.0: - - libcephfs-java/test.sh diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml deleted file mode 120000 index 1728accf9..000000000 --- a/ceph/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../../cephfs/objectstore-ec/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml b/ceph/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml index 410606225..f0ed3366c 100644 --- a/ceph/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml +++ b/ceph/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml @@ -10,7 +10,6 @@ overrides: tasks: - exec: mon.a: - - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1" - "ceph tell mds.* config set mds_min_caps_per_client 1" - background_exec: mon.a: diff --git a/ceph/qa/suites/fs/verify/validater/valgrind.yaml b/ceph/qa/suites/fs/verify/validater/valgrind.yaml index 6982cedfc..b25b71487 100644 --- a/ceph/qa/suites/fs/verify/validater/valgrind.yaml +++ b/ceph/qa/suites/fs/verify/validater/valgrind.yaml @@ -1,21 +1,20 @@ # see http://tracker.ceph.com/issues/20360 and http://tracker.ceph.com/issues/18126 os_type: centos 
-# Valgrind makes everything slow, so ignore slow requests -overrides: - ceph: - log-whitelist: - - slow requests are blocked - overrides: install: ceph: flavor: notcmalloc debuginfo: true ceph: + # Valgrind makes everything slow, so ignore slow requests and extend heartbeat grace + log-whitelist: + - slow requests are blocked conf: global: osd heartbeat grace: 40 + mds: + mds heartbeat grace: 60 mon: mon osd crush smoke test: false valgrind: diff --git a/ceph/qa/suites/powercycle/osd/whitelist_health.yaml b/ceph/qa/suites/powercycle/osd/whitelist_health.yaml index f9ab0a62b..f724302a4 100644 --- a/ceph/qa/suites/powercycle/osd/whitelist_health.yaml +++ b/ceph/qa/suites/powercycle/osd/whitelist_health.yaml @@ -3,4 +3,5 @@ overrides: log-whitelist: - \(MDS_TRIM\) - \(MDS_SLOW_REQUEST\) + - MDS_SLOW_METADATA_IO - Behind on trimming diff --git a/ceph/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml b/ceph/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml index 3b821bc0c..def613b67 100644 --- a/ceph/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml +++ b/ceph/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml @@ -11,6 +11,7 @@ overrides: - \(PG_ - \(POOL_APP_NOT_ENABLED\) - \(SMALLER_PGP_NUM\) + - slow request conf: global: debug objecter: 20 diff --git a/ceph/qa/suites/rados/rest/rest_test.yaml b/ceph/qa/suites/rados/rest/rest_test.yaml index 0fdb9dc6a..bc8eb8360 100644 --- a/ceph/qa/suites/rados/rest/rest_test.yaml +++ b/ceph/qa/suites/rados/rest/rest_test.yaml @@ -32,6 +32,7 @@ tasks: - \(SLOW_OPS\) - \(TOO_FEW_PGS\) - but it is still running + - slow request conf: client.rest0: debug ms: 1 diff --git a/ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml b/ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml b/ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml deleted file mode 120000 index bd7d7e004..000000000 --- a/ceph/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml b/ceph/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml index a63400be3..01088ab89 100644 --- a/ceph/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml +++ b/ceph/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml @@ -22,6 +22,7 @@ tasks: - \(PG_ - \(OBJECT_ - \(REQUEST_SLOW\) + - slow request conf: osd: osd min pg log entries: 5 diff --git a/ceph/qa/suites/rados/singleton/all/osd-recovery.yaml b/ceph/qa/suites/rados/singleton/all/osd-recovery.yaml index 5479f79b7..fa8fa704b 100644 --- a/ceph/qa/suites/rados/singleton/all/osd-recovery.yaml +++ b/ceph/qa/suites/rados/singleton/all/osd-recovery.yaml @@ -20,8 +20,8 @@ tasks: - \(OSD_ - \(PG_ - \(OBJECT_DEGRADED\) - - \(SLOW_OPS\) - \(REQUEST_SLOW\) + - slow request conf: osd: osd min pg log entries: 5 diff --git a/ceph/qa/suites/rados/singleton/all/thrash-eio.yaml b/ceph/qa/suites/rados/singleton/all/thrash-eio.yaml index e184d911d..f4db6ca18 100644 --- a/ceph/qa/suites/rados/singleton/all/thrash-eio.yaml +++ b/ceph/qa/suites/rados/singleton/all/thrash-eio.yaml @@ -30,6 +30,7 @@ 
tasks: - \(PG_ - \(OBJECT_MISPLACED\) - \(OSD_ + - slow request - thrashosds: op_delay: 30 clean_interval: 120 diff --git a/ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml b/ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml new file mode 120000 index 000000000..635085f7f --- /dev/null +++ b/ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml @@ -0,0 +1 @@ +../thrash-erasure-code/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml b/ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml deleted file mode 120000 index 1249ffda0..000000000 --- a/ceph/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../thrash-erasure-code/objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/suites/rados/verify/tasks/rados_api_tests.yaml b/ceph/qa/suites/rados/verify/tasks/rados_api_tests.yaml index 4a06055b5..9fd048c91 100644 --- a/ceph/qa/suites/rados/verify/tasks/rados_api_tests.yaml +++ b/ceph/qa/suites/rados/verify/tasks/rados_api_tests.yaml @@ -10,6 +10,7 @@ overrides: - \(CACHE_POOL_NEAR_FULL\) - \(POOL_APP_NOT_ENABLED\) - \(PG_AVAILABILITY\) + - slow request conf: client: debug ms: 1 diff --git a/ceph/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml b/ceph/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml index 7ab3185ec..32283b6ca 100644 --- a/ceph/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml +++ b/ceph/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml @@ -2,6 +2,7 @@ overrides: ceph: log-whitelist: - \(REQUEST_SLOW\) + - slow request tasks: - workunit: clients: diff --git a/ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml b/ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml b/ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml deleted file mode 120000 index bd7d7e004..000000000 --- a/ceph/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/suites/rgw/multisite/overrides.yaml b/ceph/qa/suites/rgw/multisite/overrides.yaml index 7fbd27605..0c2cd2e19 100644 --- a/ceph/qa/suites/rgw/multisite/overrides.yaml +++ b/ceph/qa/suites/rgw/multisite/overrides.yaml @@ -7,5 +7,7 @@ overrides: rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= rgw crypt require ssl: false rgw sync log trim interval: 0 + rgw md log max shards: 4 + rgw data log num shards: 4 rgw: compression type: random diff --git a/ceph/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml b/ceph/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/smoke/basic/objectstore/bluestore.yaml b/ceph/qa/suites/smoke/basic/objectstore/bluestore.yaml deleted file mode 120000 index bd7d7e004..000000000 --- 
a/ceph/qa/suites/smoke/basic/objectstore/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml b/ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml b/ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml deleted file mode 120000 index d6445987d..000000000 --- a/ceph/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../../objectstore/bluestore.yaml \ No newline at end of file diff --git a/ceph/qa/tasks/cephfs/test_client_limits.py b/ceph/qa/tasks/cephfs/test_client_limits.py index 1f1d54670..bc029cd8a 100644 --- a/ceph/qa/tasks/cephfs/test_client_limits.py +++ b/ceph/qa/tasks/cephfs/test_client_limits.py @@ -42,12 +42,14 @@ class TestClientLimits(CephFSTestCase): cache_size = open_files/2 self.set_conf('mds', 'mds cache size', cache_size) + self.set_conf('mds', 'mds_recall_max_caps', open_files/2) + self.set_conf('mds', 'mds_recall_warning_threshold', open_files) self.fs.mds_fail_restart() self.fs.wait_for_daemons() mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate") self.assertTrue(open_files >= mds_min_caps_per_client) - mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client")) mount_a_client_id = self.mount_a.get_global_id() path = "subdir/mount_a" if use_subdir else "mount_a" @@ -64,13 +66,11 @@ class TestClientLimits(CephFSTestCase): # MDS should not be happy about that, as the client is failing to comply # with the SESSION_RECALL messages it is being sent - mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout")) - self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10) + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2) # We can also test that the MDS health warning for oversized # cache is functioning as intended. 
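
The updated assertions below drop the old ratio-based expectation and reduce the check to two bounds: the client's cap count must never fall below mds_min_caps_per_client, and it must settle either at that floor or under an upper limit (the MDS cache size in _test_client_cache_size, mds_max_caps_per_client in the new test_client_max_caps). A minimal stand-alone sketch of that predicate-plus-poll idiom follows; it is not the test code itself, and get_num_caps is a placeholder for the session query (self.get_session(...)['num_caps']) the real test performs.

    # Editor's sketch (not part of the patch): the bounded cap-count check as a
    # generic predicate plus poll loop.
    import time

    def caps_ok(num_caps, min_caps, upper_bound):
        # Never allowed to drop below the per-client minimum.
        if num_caps < min_caps:
            raise RuntimeError("client caps fell below min!")
        # Either trimmed all the way down to the floor, or within the upper bound
        # (the real checks use "< cache_size" in one place and
        # "<= mds_max_caps_per_client" in the other; "<=" is shown here).
        return num_caps == min_caps or num_caps <= upper_bound

    def wait_until(predicate, timeout, interval=5):
        deadline = time.time() + timeout
        while time.time() < deadline:
            if predicate():
                return True
            time.sleep(interval)
        raise RuntimeError("timed out waiting for client caps to drop")

    # Example usage (names hypothetical):
    # wait_until(lambda: caps_ok(get_num_caps(), mds_min_caps_per_client,
    #                            mds_max_caps_per_client), timeout=60)
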
- self.wait_for_health("MDS_CACHE_OVERSIZED", - mds_recall_state_timeout + 10) + self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2) # When the client closes the files, it should retain only as many caps as allowed # under the SESSION_RECALL policy @@ -84,14 +84,13 @@ class TestClientLimits(CephFSTestCase): # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message, # which depend on the caps outstanding, cache size and overall ratio - recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2)) def expected_caps(): num_caps = self.get_session(mount_a_client_id)['num_caps'] if num_caps < mds_min_caps_per_client: raise RuntimeError("client caps fell below min!") elif num_caps == mds_min_caps_per_client: return True - elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05: + elif num_caps < cache_size: return True else: return False @@ -237,3 +236,28 @@ class TestClientLimits(CephFSTestCase): def test_client_cache_size(self): self._test_client_cache_size(False) self._test_client_cache_size(True) + + def test_client_max_caps(self): + """ + That the MDS will not let a client sit above mds_max_caps_per_client caps. + """ + + mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + mds_max_caps_per_client = 2*mds_min_caps_per_client + self.set_conf('mds', 'mds_max_caps_per_client', mds_max_caps_per_client) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + self.mount_a.create_n_files("foo/", 3*mds_max_caps_per_client, sync=True) + + mount_a_client_id = self.mount_a.get_global_id() + def expected_caps(): + num_caps = self.get_session(mount_a_client_id)['num_caps'] + if num_caps < mds_min_caps_per_client: + raise RuntimeError("client caps fell below min!") + elif num_caps <= mds_max_caps_per_client: + return True + else: + return False + + self.wait_until_true(expected_caps, timeout=60) diff --git a/ceph/qa/tasks/cephfs/test_misc.py b/ceph/qa/tasks/cephfs/test_misc.py index c27278008..9c44e6c09 100644 --- a/ceph/qa/tasks/cephfs/test_misc.py +++ b/ceph/qa/tasks/cephfs/test_misc.py @@ -52,6 +52,9 @@ class TestMisc(CephFSTestCase): self.assertGreaterEqual(rctime, t-10) def test_fs_new(self): + self.mount_a.umount_wait() + self.mount_b.umount_wait() + data_pool_name = self.fs.get_data_pool_name() self.fs.mds_stop() diff --git a/ceph/qa/tasks/radosbench.py b/ceph/qa/tasks/radosbench.py index 530a6f149..dd1f85dee 100644 --- a/ceph/qa/tasks/radosbench.py +++ b/ceph/qa/tasks/radosbench.py @@ -76,12 +76,12 @@ def task(ctx, config): else: pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name) - osize = config.get('objectsize', 0) + osize = config.get('objectsize', 65536) if osize is 0: objectsize = [] else: objectsize = ['-o', str(osize)] - size = ['-b', str(config.get('size', 4<<20))] + size = ['-b', str(config.get('size', 65536))] # If doing a reading run then populate data if runtype != "write": proc = remote.run( diff --git a/ceph/qa/valgrind.supp b/ceph/qa/valgrind.supp new file mode 100644 index 000000000..cbd41a29e --- /dev/null +++ b/ceph/qa/valgrind.supp @@ -0,0 +1,622 @@ +{ + older boost mersenne twister uses uninitialized memory for randomness + Memcheck:Cond + ... + fun:*Monitor::prepare_new_fingerprint* + ... +} +{ + older boost mersenne twister uses uninitialized memory for randomness + Memcheck:Value8 + ... + fun:*Monitor::prepare_new_fingerprint* + ... 
+} +{ + apparent TLS leak in eglibc + Memcheck:Leak + fun:calloc + fun:_dl_allocate_tls + fun:pthread_create* + ... +} +{ + osd: ignore ec plugin loading (FIXME SOMEDAY) + Memcheck:Leak + ... + fun:*ErasureCodePluginRegistry*load* + ... +} +{ + osd: ignore ec plugin factory (FIXME SOMEDAY) + Memcheck:Leak + ... + fun:*ErasureCodePluginRegistry*factory* + ... +} +{ + tcmalloc: libboost_thread-mt.so.1.53 is linked with tcmalloc + Memcheck:Param + msync(start) + obj:/usr/lib64/libpthread-2.17.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + ... + fun:*tcmalloc*ThreadCache* + ... + obj:/usr/lib64/libboost_thread-mt.so.1.53.0 +} +{ + tcmalloc: msync heap allocation points to uninit bytes (centos 6.5) + Memcheck:Param + msync(start) + obj:/lib64/libpthread-2.12.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to unaddressible bytes (centos 6.5 #2) + Memcheck:Param + msync(start) + obj:/lib64/libpthread-2.12.so + obj:/usr/lib64/libunwind.so.7.0.0 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (rhel7) + Memcheck:Param + msync(start) + obj:/usr/lib64/libpthread-2.17.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (rhel7 #2) + Memcheck:Param + msync(start) + obj:/usr/lib64/libpthread-2.17.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + fun:_ULx86_64_step + obj:/usr/lib64/libtcmalloc.so.4.2.6 + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (wheezy) + Memcheck:Param + msync(start) + obj:/lib/x86_64-linux-gnu/libpthread-2.13.so + obj:/usr/lib/libunwind.so.7.0.0 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (precise) + Memcheck:Param + msync(start) + obj:/lib/x86_64-linux-gnu/libpthread-2.15.so + obj:/usr/lib/libunwind.so.7.0.0 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm + obj:/usr/lib/libtcmalloc.so.0.1.0 +} +{ + tcmalloc: msync heap allocation points to uninit bytes (trusty) + Memcheck:Param + msync(start) + obj:/lib/x86_64-linux-gnu/libpthread-2.19.so + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes 2 (trusty) + Memcheck:Param + msync(start) + 
fun:__msync_nocancel + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm + fun:_ZN8tcmalloc15CentralFreeList8PopulateEv + fun:_ZN8tcmalloc15CentralFreeList18FetchFromSpansSafeEv + fun:_ZN8tcmalloc15CentralFreeList11RemoveRangeEPPvS2_i +} +{ + tcmalloc: msync (xenial) + Memcheck:Param + msync(start) + fun:__msync_nocancel + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:*tcmalloc* + fun:*GetStackTrace* +} +{ + tcmalloc: string + Memcheck:Leak + ... + obj:*tcmalloc* + fun:call_init* + ... +} +{ + ceph global: deliberate onexit leak + Memcheck:Leak + ... + fun:*set_flush_on_exit* + ... +} +{ + libleveldb: ignore all static leveldb leaks + Memcheck:Leak + ... + fun:*leveldb* + ... +} +{ + libleveldb: ignore all dynamic libleveldb leaks + Memcheck:Leak + ... + obj:*libleveldb.so* + ... +} +{ + libcurl: ignore libcurl leaks + Memcheck:Leak + ... + fun:*curl_global_init +} +{ + ignore gnutls leaks + Memcheck:Leak + ... + fun:gnutls_global_init +} +{ + ignore libfcgi leak; OS_LibShutdown has no callers! + Memcheck:Leak + ... + fun:OS_LibInit + fun:FCGX_Init +} +{ + ignore libnss3 leaks + Memcheck:Leak + ... + obj:*libnss3* + ... +} +{ + strptime suckage + Memcheck:Cond + fun:__GI___strncasecmp_l + fun:__strptime_internal + ... +} +{ + strptime suckage 2 + Memcheck:Value8 + fun:__GI___strncasecmp_l + fun:__strptime_internal + ... +} +{ + strptime suckage 3 + Memcheck:Addr8 + fun:__GI___strncasecmp_l + fun:__strptime_internal + ... +} +{ + inet_ntop does something lame on local stack + Memcheck:Value8 + ... + fun:inet_ntop + ... +} +{ + inet_ntop does something lame on local stack + Memcheck:Addr8 + ... + fun:inet_ntop + ... +} +{ + dl-lookup.c thing .. Invalid write of size 8 + Memcheck:Value8 + fun:do_lookup_x + ... + fun:_dl_lookup_symbol_x + ... +} +{ + dl-lookup.c thing .. Invalid write of size 8 + Memcheck:Addr8 + fun:do_lookup_x + ... + fun:_dl_lookup_symbol_x + ... +} +{ + weird thing from libc + Memcheck:Leak + ... + fun:*sub_I_comparator* + fun:__libc_csu_init + ... +} +{ + libfuse leak + Memcheck:Leak + ... + fun:fuse_parse_cmdline + ... +} +{ + boost thread leaks on exit + Memcheck:Leak + ... + fun:*boost*detail* + ... + fun:exit +} +{ + lttng appears to not clean up state + Memcheck:Leak + ... + fun:lttng_ust_baddr_statedump_init + fun:lttng_ust_init + fun:call_init.part.0 + ... +} +{ + fun:PK11_CreateContextBySymKey race + Helgrind:Race + obj:/usr/*lib*/libfreebl*3.so + ... + obj:/usr/*lib*/libsoftokn3.so + ... + obj:/usr/*lib*/libnss3.so + fun:PK11_CreateContextBySymKey + ... +} +{ + thread init race + Helgrind:Race + fun:mempcpy + fun:_dl_allocate_tls_init + ... + fun:pthread_create@* + ... +} +{ + thread_local memory is falsely detected (https://svn.boost.org/trac/boost/ticket/3296) + Memcheck:Leak + ... + fun:*boost*detail*get_once_per_thread_epoch* + fun:*boost*call_once* + fun:*boost*detail*get_current_thread_data* + ... +} +{ + rocksdb thread local singletons + Memcheck:Leak + ... + fun:rocksdb::Env::Default() + ... +} +{ + rocksdb column thread local leaks + Memcheck:Leak + ... 
+ fun:rocksdb::ThreadLocalPtr::StaticMeta::SetHandler* + fun:rocksdb::ColumnFamilyData::ColumnFamilyData* + ... +} +{ + rocksdb thread crap + Memcheck:Leak + ... + fun:*ThreadLocalPtr* + ... +} +{ + rocksdb singleton Env leak, blech + Memcheck:Leak + ... + fun:CreateThreadStatusUpdater + fun:PosixEnv + ... +} +{ + rocksdb::Env::Default() + Memcheck:Leak + ... + fun:*rocksdb*Env*Default* + ... +} +{ + rocksdb BGThreadWrapper + Memcheck:Leak + ... + fun:*BGThreadWrapper* + ... +} +{ + libstdc++ leak on xenial + Memcheck:Leak + fun:malloc + ... + fun:call_init.part.0 + fun:call_init + fun:_dl_init + ... +} +{ + strange leak of std::string memory from md_config_t seen in radosgw + Memcheck:Leak + ... + fun:_ZNSs4_Rep9_S_createEmmRKSaIcE + fun:_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag + ... + fun:_ZN11md_config_tC1Ev + fun:_ZN11CephContextC1Eji + ... +} +{ + python does not reset the member field when dealloc an object + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py_InitializeEx + ... +} +{ + statically allocated python types don't get members freed + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyType_Ready + ... +} +{ + manually constructed python module members don't get freed + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py_InitModule4_64 + ... +} +{ + manually constructed python module members don't get freed + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyModule_AddObject + ... +} +{ + python subinterpreters may not clean up properly + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py_NewInterpreter + ... +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyEval_EvalCode +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyImport_ImportModuleLevel +} +{ + python-owned threads may not full clean up after themselves + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyEval_CallObjectWithKeywords +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyEval_EvalFrameEx + ... + obj:/usr/lib64/libpython2.7.so.1.0 +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyObject_Call +} + +{ + rados cython constants + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:PyObject_Malloc + fun:PyCode_New + fun:__Pyx_InitCachedConstants + fun:initrados + fun:_PyImport_LoadDynamicModule + ... + fun:PyImport_ImportModuleLevel + ... + fun:PyObject_Call + fun:PyEval_CallObjectWithKeywords + fun:PyEval_EvalFrameEx +} + +{ + rbd cython constants + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:PyObject_Malloc + fun:PyCode_New + fun:__Pyx_InitCachedConstants + fun:initrbd + fun:_PyImport_LoadDynamicModule + ... + fun:PyImport_ImportModuleLevel + ... + fun:PyObject_Call + fun:PyEval_CallObjectWithKeywords + fun:PyEval_EvalFrameEx +} + +{ + dlopen() with -lceph-common https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=700899 + Memcheck:Leak + match-leak-kinds: reachable + fun:*alloc + ... + fun:_dlerror_run + fun:dlopen@@GLIBC_2.2.5 +} + +{ + ethdev_init_log thing + Memcheck:Leak + match-leak-kinds: reachable + ... + fun:ethdev_init_log + ... +} + +{ + rte_log_init() in DPDK fails to reset strdup()'ed string at exit + Memcheck:Leak + match-leak-kinds: reachable + fun:*alloc + ... + fun:rte_log_init + fun:__libc_csu_init +} + +{ + libc_csu_init (strdup, rte_log_register, etc.) + Memcheck:Leak + match-leak-kinds: reachable + ... 
+ fun:__libc_csu_init + ... +} + +{ + Boost.Thread fails to call tls_destructor() when the thread exists + Memcheck:Leak + match-leak-kinds: reachable + ... + fun:*boost*detail*make_external_thread_data* + fun:*boost*detail*add_new_tss_node* + fun:*boost*detail*set_tss_data* + ... +} + +{ + ignore *all* ceph-mgr python crap. this is overkill, but better than nothing + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py* + ... +} + +{ + something in glibc + Memcheck:Leak + match-leak-kinds: all + ... + fun:strdup + fun:__trans_list_add + ... + fun:_dl_init + ... +} + +# "Conditional jump or move depends on uninitialised value(s)" in OpenSSL +# while using aes-128-gcm with AES-NI enabled. Not observed while running +# with `OPENSSL_ia32cap="~0x200000200000000"`. +{ + + Memcheck:Cond + ... + fun:EVP_DecryptFinal_ex + fun:_ZN4ceph6crypto6onwire25AES128GCM_OnWireRxHandler34authenticated_decrypt_update_finalEONS_6buffer7v14_2_04listEj + fun:_ZN10ProtocolV231handle_read_frame_epilogue_mainEOSt10unique_ptrIN4ceph6buffer7v14_2_08ptr_nodeENS4_8disposerEEi + fun:_ZN10ProtocolV216run_continuationER2CtIS_E + ... + fun:_ZN15AsyncConnection7processEv + fun:_ZN11EventCenter14process_eventsEjPNSt6chrono8durationImSt5ratioILl1ELl1000000000EEEE + fun:operator() + fun:_ZNSt17_Function_handlerIFvvEZN12NetworkStack10add_threadEjEUlvE_E9_M_invokeERKSt9_Any_data + fun:execute_native_thread_routine + fun:start_thread + fun:clone +} + +{ + + Memcheck:Cond + fun:_ZN4ceph6crypto6onwire25AES128GCM_OnWireRxHandler34authenticated_decrypt_update_finalEONS_6buffer7v14_2_04listEj + fun:_ZN10ProtocolV231handle_read_frame_epilogue_mainEOSt10unique_ptrIN4ceph6buffer7v14_2_08ptr_nodeENS4_8disposerEEi + fun:_ZN10ProtocolV216run_continuationER2CtIS_E + ... + fun:_ZN15AsyncConnection7processEv + fun:_ZN11EventCenter14process_eventsEjPNSt6chrono8durationImSt5ratioILl1ELl1000000000EEEE + fun:operator() + fun:_ZNSt17_Function_handlerIFvvEZN12NetworkStack10add_threadEjEUlvE_E9_M_invokeERKSt9_Any_data + fun:execute_native_thread_routine + fun:start_thread + fun:clone +} diff --git a/ceph/qa/workunits/cephtool/test.sh b/ceph/qa/workunits/cephtool/test.sh index 36f9dc43e..b0ab5c051 100755 --- a/ceph/qa/workunits/cephtool/test.sh +++ b/ceph/qa/workunits/cephtool/test.sh @@ -49,7 +49,7 @@ function expect_false() } -TEMP_DIR=$(mktemp -d ${TMPDIR-/tmp}/cephtool.XXX) +TEMP_DIR=$(mktemp -d ${TMPDIR:-/tmp}/cephtool.XXX) trap "rm -fr $TEMP_DIR" 0 TMPFILE=$(mktemp $TEMP_DIR/test_invalid.XXX) @@ -578,7 +578,9 @@ function test_tiering_9() function test_auth() { - ceph auth add client.xx mon allow osd "allow *" + expect_false ceph auth add client.xx mon 'invalid' osd "allow *" + expect_false ceph auth add client.xx mon 'allow *' osd "allow *" invalid "allow *" + ceph auth add client.xx mon 'allow *' osd "allow *" ceph auth export client.xx >client.xx.keyring ceph auth add client.xx -i client.xx.keyring rm -f client.xx.keyring @@ -602,7 +604,7 @@ function test_auth() expect_false ceph auth get client.xx # (almost) interactive mode - echo -e 'auth add client.xx mon allow osd "allow *"\n' | ceph + echo -e 'auth add client.xx mon "allow *" osd "allow *"\n' | ceph ceph auth get client.xx # script mode echo 'auth del client.xx' | ceph diff --git a/ceph/qa/workunits/libcephfs-java/test.sh b/ceph/qa/workunits/libcephfs-java/test.sh deleted file mode 100755 index f299e9597..000000000 --- a/ceph/qa/workunits/libcephfs-java/test.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh -e - -echo "starting libcephfs-java tests" -# configure CEPH_CONF and LD_LIBRARY_PATH 
if they're not already set -conf="$CEPH_CONF" -if [ -z "$conf" ] ; then - echo "Setting conf to /etc/ceph/ceph.conf" - conf="/etc/ceph/ceph.conf" -else - echo "conf is set to $conf" -fi - -ld_lib_path="$LD_LIBRARY_PATH" -if [ -z "$ld_lib_path" ] ; then - echo "Setting ld_lib_path to /usr/lib/jni:/usr/lib64" - ld_lib_path="/usr/lib/jni:/usr/lib64" -else - echo "ld_lib_path was set to $ld_lib_path" -fi - -ceph_java="$CEPH_JAVA_PATH" -if [ -z "$ceph_java" ] ; then - echo "Setting ceph_java to /usr/share/java" - ceph_java="/usr/share/java" -else - echo "ceph_java was set to $ceph_java" -fi - -command="java -DCEPH_CONF_FILE=$conf -Djava.library.path=$ld_lib_path -cp /usr/share/java/junit4.jar:$ceph_java/libcephfs.jar:$ceph_java/libcephfs-test.jar org.junit.runner.JUnitCore com.ceph.fs.CephAllTests" - -echo "----------------------" -echo $command -echo "----------------------" - -$command - -echo "completed libcephfs-java tests" - -exit 0 diff --git a/ceph/qa/workunits/rados/test_health_warnings.sh b/ceph/qa/workunits/rados/test_health_warnings.sh index a4a9c11c6..19dec9a84 100755 --- a/ceph/qa/workunits/rados/test_health_warnings.sh +++ b/ceph/qa/workunits/rados/test_health_warnings.sh @@ -7,6 +7,7 @@ crushtool -o crushmap --build --num_osds 10 host straw 2 rack straw 2 row straw ceph osd setcrushmap -i crushmap ceph osd tree ceph tell osd.* injectargs --osd_max_markdown_count 1024 --osd_max_markdown_period 1 +ceph osd set noout wait_for_healthy() { while ceph health | grep down diff --git a/ceph/src/.git_version b/ceph/src/.git_version index 268e02b63..63186701d 100644 --- a/ceph/src/.git_version +++ b/ceph/src/.git_version @@ -1,2 +1,2 @@ -26dc3775efc7bb286a1d6d66faee0ba30ea23eee -v12.2.11 +1436006594665279fe734b4c15d7e08c13ebd777 +v12.2.12 diff --git a/ceph/src/CMakeLists.txt b/ceph/src/CMakeLists.txt index b0837ab1d..4ff19154f 100644 --- a/ceph/src/CMakeLists.txt +++ b/ceph/src/CMakeLists.txt @@ -742,7 +742,7 @@ install(TARGETS librados-config DESTINATION bin) # virtualenv base directory for ceph-disk and ceph-detect-init set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR}) if(NOT CEPH_BUILD_VIRTUALENV) - set(CEPH_BUILD_VIRTUALENV /tmp) + set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR}) endif() add_subdirectory(pybind) diff --git a/ceph/src/auth/Crypto.cc b/ceph/src/auth/Crypto.cc index 150052bfe..626367858 100644 --- a/ceph/src/auth/Crypto.cc +++ b/ceph/src/auth/Crypto.cc @@ -291,8 +291,9 @@ public: keyItem.type = siBuffer; keyItem.data = (unsigned char*)secret.c_str(); keyItem.len = secret.length(); - key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, - &keyItem, NULL); + using ceph::crypto::PK11_ImportSymKey_FIPS; + key = PK11_ImportSymKey_FIPS(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, + &keyItem, NULL); if (!key) { err << "cannot convert AES key for NSS: " << PR_GetError(); return -1; diff --git a/ceph/src/ceph-disk/run-tox.sh b/ceph/src/ceph-disk/run-tox.sh index 76935b9e1..5c51d149a 100755 --- a/ceph/src/ceph-disk/run-tox.sh +++ b/ceph/src/ceph-disk/run-tox.sh @@ -16,7 +16,7 @@ # # run from the ceph-disk directory or from its parent -: ${CEPH_DISK_VIRTUALENV:=/tmp/ceph-disk-virtualenv} +: ${CEPH_DISK_VIRTUALENV:=$CEPH_BUILD_DIR/ceph-disk-virtualenv} test -d ceph-disk && cd ceph-disk if [ -e tox.ini ]; then diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py index 814c6fe37..3cf414fdc 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py +++ 
b/ceph/src/ceph-volume/ceph_volume/devices/simple/activate.py @@ -1,6 +1,7 @@ from __future__ import print_function import argparse import base64 +import glob import json import logging import os @@ -230,6 +231,12 @@ class Activate(object): nargs='?', help='The FSID of the OSD, similar to a SHA1' ) + parser.add_argument( + '--all', + help='Activate all OSDs with a OSD JSON config', + action='store_true', + default=False, + ) parser.add_argument( '--file', help='The path to a JSON file, from a scanned OSD' @@ -244,7 +251,7 @@ class Activate(object): print(sub_command_help) return args = parser.parse_args(self.argv) - if not args.file: + if not args.file and not args.all: if not args.osd_id and not args.osd_fsid: terminal.error('ID and FSID are required to find the right OSD to activate') terminal.error('from a scanned OSD location in /etc/ceph/osd/') @@ -253,13 +260,22 @@ class Activate(object): # implicitly indicate that it would be possible to activate a json file # at a non-default location which would not work at boot time if the # custom location is not exposed through an ENV var + self.skip_systemd = args.skip_systemd json_dir = os.environ.get('CEPH_VOLUME_SIMPLE_JSON_DIR', '/etc/ceph/osd/') - if args.file: - json_config = args.file + if args.all: + if args.file or args.osd_id: + mlogger.warn('--all was passed, ignoring --file and ID/FSID arguments') + json_configs = glob.glob('{}/*.json'.format(json_dir)) + for json_config in json_configs: + mlogger.info('activating OSD specified in {}'.format(json_config)) + args.json_config = json_config + self.activate(args) else: - json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid)) - if not os.path.exists(json_config): - raise RuntimeError('Expected JSON config path not found: %s' % json_config) - args.json_config = json_config - self.skip_systemd = args.skip_systemd - self.activate(args) + if args.file: + json_config = args.file + else: + json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid)) + if not os.path.exists(json_config): + raise RuntimeError('Expected JSON config path not found: %s' % json_config) + args.json_config = json_config + self.activate(args) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py b/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py index f2f7d3dc9..78a1493bd 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/simple/scan.py @@ -7,6 +7,7 @@ import os from textwrap import dedent from ceph_volume import decorators, terminal, conf from ceph_volume.api import lvm +from ceph_volume.systemd import systemctl from ceph_volume.util import arg_validators, system, disk, encryption from ceph_volume.util.device import Device @@ -40,7 +41,7 @@ def parse_keyring(file_contents): class Scan(object): - help = 'Capture metadata from an OSD data partition or directory' + help = 'Capture metadata from all running ceph-disk OSDs, OSD data partition or directory' def __init__(self, argv): self.argv = argv @@ -283,7 +284,7 @@ class Scan(object): def main(self): sub_command_help = dedent(""" - Scan an OSD directory (or data device) for files and configurations + Scan running OSDs, an OSD directory (or data device) for files and configurations that will allow to take over the management of the OSD. 
Scanned OSDs will get their configurations stored in @@ -298,13 +299,19 @@ class Scan(object): /etc/ceph/osd/0-a9d50838-e823-43d6-b01f-2f8d0a77afc2.json - To a scan an existing, running, OSD: + To scan all running OSDs: + + ceph-volume simple scan + + To a scan a specific running OSD: ceph-volume simple scan /var/lib/ceph/osd/{cluster}-{osd id} And to scan a device (mounted or unmounted) that has OSD data in it, for example /dev/sda1 ceph-volume simple scan /dev/sda1 + + Scanning a device or directory that belongs to an OSD not created by ceph-disk will be ingored. """) parser = argparse.ArgumentParser( prog='ceph-volume simple scan', @@ -329,25 +336,40 @@ class Scan(object): metavar='OSD_PATH', type=arg_validators.OSDPath(), nargs='?', + default=None, help='Path to an existing OSD directory or OSD data partition' ) - if len(self.argv) == 0: - print(sub_command_help) - return - args = parser.parse_args(self.argv) - device = Device(args.osd_path) - if device.is_partition: - if device.ceph_disk.type != 'data': - label = device.ceph_disk.partlabel - msg = 'Device must be the ceph data partition, but PARTLABEL reported: "%s"' % label - raise RuntimeError(msg) + paths = [] + if args.osd_path: + paths.append(args.osd_path) + else: + osd_ids = systemctl.get_running_osd_ids() + for osd_id in osd_ids: + paths.append("/var/lib/ceph/osd/{}-{}".format( + conf.cluster, + osd_id, + )) # Capture some environment status, so that it can be reused all over self.device_mounts = system.get_mounts(devices=True) self.path_mounts = system.get_mounts(paths=True) - self.encryption_metadata = encryption.legacy_encrypted(args.osd_path) - self.is_encrypted = self.encryption_metadata['encrypted'] - self.scan(args) + for path in paths: + args.osd_path = path + device = Device(args.osd_path) + if device.is_partition: + if device.ceph_disk.type != 'data': + label = device.ceph_disk.partlabel + msg = 'Device must be the ceph data partition, but PARTLABEL reported: "%s"' % label + raise RuntimeError(msg) + + self.encryption_metadata = encryption.legacy_encrypted(args.osd_path) + self.is_encrypted = self.encryption_metadata['encrypted'] + + device = Device(self.encryption_metadata['device']) + if not device.is_ceph_disk_member: + terminal.warning("Ignoring %s because it's not a ceph-disk created osd." 
% path) + else: + self.scan(args) diff --git a/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py b/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py index 41dbbc19e..778ad1479 100644 --- a/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py +++ b/ceph/src/ceph-volume/ceph_volume/systemd/systemctl.py @@ -1,8 +1,11 @@ """ Utilities to control systemd units """ +import logging + from ceph_volume import process +logger = logging.getLogger(__name__) def start(unit): process.run(['systemctl', 'start', unit]) @@ -34,6 +37,26 @@ def is_active(unit): ) return rc == 0 +def get_running_osd_ids(): + out, err, rc = process.call([ + 'systemctl', + 'show', + '--no-pager', + '--property=Id', + '--state=running', + 'ceph-osd@*', + ]) + osd_ids = [] + if rc == 0: + for line in out: + if line: + # example line looks like: Id=ceph-osd@1.service + try: + osd_id = line.split("@")[1].split(".service")[0] + osd_ids.append(osd_id) + except (IndexError, TypeError): + logger.warning("Failed to parse output from systemctl: %s", line) + return osd_ids def start_osd(id_): return start(osd_unit % id_) diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py index a275bdd00..885a6ec25 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py @@ -22,6 +22,26 @@ class TestActivate(object): stdout, stderr = capsys.readouterr() assert 'Activate OSDs by mounting devices previously configured' in stdout + def test_activate_all(self, is_root, monkeypatch): + ''' + make sure Activate calls activate for each file returned by glob + ''' + mocked_glob = [] + def mock_glob(glob): + path = os.path.dirname(glob) + mocked_glob.extend(['{}/{}.json'.format(path, file_) for file_ in + ['1', '2', '3']]) + return mocked_glob + activate_files = [] + def mock_activate(self, args): + activate_files.append(args.json_config) + monkeypatch.setattr('glob.glob', mock_glob) + monkeypatch.setattr(activate.Activate, 'activate', mock_activate) + activate.Activate(['--all']).main() + assert activate_files == mocked_glob + + + class TestEnableSystemdUnits(object): diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py index 08ca37f66..118493625 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py @@ -3,14 +3,6 @@ import pytest from ceph_volume.devices.simple import scan -class TestScan(object): - - def test_main_spits_help_with_no_arguments(self, capsys): - scan.Scan([]).main() - stdout, stderr = capsys.readouterr() - assert 'Scan an OSD directory (or data device) for files' in stdout - - class TestGetContents(object): def test_multiple_lines_are_left_as_is(self, tmpfile): diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini index 4c3af6811..db9652436 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini @@ -48,20 +48,20 @@ commands= # prepare nodes for testing with testinfra ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml - # test cluster state using ceph-ansible tests - testinfra -n 4 --sudo -v --connection=ansible 
--ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # test cluster state using testinfra + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # reboot all vms - attempt bash {toxinidir}/../scripts/vagrant_reload.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox} # retest to ensure cluster came back up correctly after rebooting - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # destroy an OSD, zap it's device and recreate it using it's ID ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml # retest to ensure cluster came back up correctly - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # test zap OSDs by ID ansible-playbook -vv -i {changedir}/hosts {changedir}/test_zap.yml diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml index 8caa1ce38..bbd5b45d3 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml @@ -93,6 +93,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml index 17b74d524..91c9a1b84 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml @@ -97,6 +97,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml index 353df127c..1e9b8c3e0 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml @@ -98,6 +98,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml index e896c41b0..4e43839e8 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml @@ -119,6 +119,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + 
command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini index d2432c8a8..d61c23719 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini @@ -56,19 +56,19 @@ commands= # prepare nodes for testing with testinfra ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml - # test cluster state using ceph-ansible tests - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # test cluster state using testinfra + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # reboot all vms - attempt bash {toxinidir}/../scripts/vagrant_reload.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox} # retest to ensure cluster came back up correctly after rebooting - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # destroy an OSD, zap it's device and recreate it using it's ID ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml # retest to ensure cluster came back up correctly - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests vagrant destroy --force diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml index 3e032e202..27290d933 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml @@ -93,6 +93,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml index 17b74d524..91c9a1b84 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml @@ -97,6 +97,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml index 24e2c0353..55ae7cc8e 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml @@ 
-4,28 +4,12 @@ become: yes tasks: - - name: list all OSD directories - find: - paths: /var/lib/ceph/osd - file_type: directory - register: osd_paths - - - name: scan all OSD directories - command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}" + - name: scan all running OSDs + command: "ceph-volume --cluster={{ cluster }} simple scan" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_paths.files }}" - - - name: list all OSD JSON files - find: - paths: /etc/ceph/osd - file_type: file - register: osd_configs - name: activate all scanned OSDs - command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + command: "ceph-volume --cluster={{ cluster }} simple activate --all" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_configs.files }}" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml index 24e2c0353..0745f2571 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml @@ -24,8 +24,6 @@ register: osd_configs - name: activate all scanned OSDs - command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + command: "ceph-volume --cluster={{ cluster }} simple activate --all" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_configs.files }}" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini index 391fb4ae9..2856d9ad0 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini @@ -46,8 +46,8 @@ commands= # prepare nodes for testing with testinfra ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml - # test cluster state using ceph-ansible tests - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # test cluster state testinfra + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # make ceph-volume simple take over all the OSDs that got deployed, disabling ceph-disk ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml @@ -59,6 +59,6 @@ commands= sleep 120 # retest to ensure cluster came back up correctly after rebooting - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests vagrant destroy --force diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml index 24e2c0353..55ae7cc8e 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml @@ -4,28 +4,12 @@ become: yes tasks: - - name: list all OSD directories - find: - paths: /var/lib/ceph/osd - file_type: directory - register: osd_paths - - - name: scan all OSD directories - command: "ceph-volume --cluster={{ cluster }} 
simple scan {{ item.path }}" + - name: scan all running OSDs + command: "ceph-volume --cluster={{ cluster }} simple scan" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_paths.files }}" - - - name: list all OSD JSON files - find: - paths: /etc/ceph/osd - file_type: file - register: osd_configs - name: activate all scanned OSDs - command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + command: "ceph-volume --cluster={{ cluster }} simple activate --all" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_configs.files }}" diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/__init__.py b/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py b/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py new file mode 100644 index 000000000..05c9aa521 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py @@ -0,0 +1,103 @@ +import pytest +import os + + +@pytest.fixture() +def node(host, request): + """ This fixture represents a single node in the ceph cluster. Using the + host.ansible fixture provided by testinfra it can access all the ansible + variables provided to it by the specific test scenario being ran. + + You must include this fixture on any tests that operate on specific type + of node because it contains the logic to manage which tests a node + should run. + """ + ansible_vars = host.ansible.get_variables() + # tox/jenkins/user will pass in this environment variable. we need to do it this way + # because testinfra does not collect and provide ansible config passed in + # from using --extra-vars + ceph_dev_branch = os.environ.get("CEPH_DEV_BRANCH", "master") + group_names = ansible_vars["group_names"] + num_osd_ports = 4 + if ceph_dev_branch in ['luminous', 'mimic']: + num_osd_ports = 2 + + # capture the initial/default state + test_is_applicable = False + for marker in request.node.iter_markers(): + if marker.name in group_names or marker.name == 'all': + test_is_applicable = True + break + # Check if any markers on the test method exist in the nodes group_names. + # If they do not, this test is not valid for the node being tested. + if not test_is_applicable: + reason = "%s: Not a valid test for node type: %s" % ( + request.function, group_names) + pytest.skip(reason) + + osd_ids = [] + osds = [] + cluster_address = "" + # I can assume eth1 because I know all the vagrant + # boxes we test with use that interface + address = host.interface("eth1").addresses[0] + subnet = ".".join(ansible_vars["public_network"].split(".")[0:-1]) + num_mons = len(ansible_vars["groups"]["mons"]) + num_osds = len(ansible_vars.get("devices", [])) + if not num_osds: + num_osds = len(ansible_vars.get("lvm_volumes", [])) + osds_per_device = ansible_vars.get("osds_per_device", 1) + num_osds = num_osds * osds_per_device + + # If number of devices doesn't map to number of OSDs, allow tests to define + # that custom number, defaulting it to ``num_devices`` + num_osds = ansible_vars.get('num_osds', num_osds) + cluster_name = ansible_vars.get("cluster", "ceph") + conf_path = "/etc/ceph/{}.conf".format(cluster_name) + if "osds" in group_names: + # I can assume eth2 because I know all the vagrant + # boxes we test with use that interface. OSDs are the only + # nodes that have this interface. 
+ cluster_address = host.interface("eth2").addresses[0] + cmd = host.run('sudo ls /var/lib/ceph/osd/ | sed "s/.*-//"') + if cmd.rc == 0: + osd_ids = cmd.stdout.rstrip("\n").split("\n") + osds = osd_ids + + data = dict( + address=address, + subnet=subnet, + vars=ansible_vars, + osd_ids=osd_ids, + num_mons=num_mons, + num_osds=num_osds, + num_osd_ports=num_osd_ports, + cluster_name=cluster_name, + conf_path=conf_path, + cluster_address=cluster_address, + osds=osds, + ) + return data + + +def pytest_collection_modifyitems(session, config, items): + for item in items: + test_path = item.location[0] + if "mon" in test_path: + item.add_marker(pytest.mark.mons) + elif "osd" in test_path: + item.add_marker(pytest.mark.osds) + elif "mds" in test_path: + item.add_marker(pytest.mark.mdss) + elif "mgr" in test_path: + item.add_marker(pytest.mark.mgrs) + elif "rbd-mirror" in test_path: + item.add_marker(pytest.mark.rbdmirrors) + elif "rgw" in test_path: + item.add_marker(pytest.mark.rgws) + elif "nfs" in test_path: + item.add_marker(pytest.mark.nfss) + elif "iscsi" in test_path: + item.add_marker(pytest.mark.iscsigws) + else: + item.add_marker(pytest.mark.all) diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/osd/__init__.py b/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/osd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py b/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py new file mode 100644 index 000000000..6d12babdb --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py @@ -0,0 +1,60 @@ +import json + + +class TestOSDs(object): + + def test_ceph_osd_package_is_installed(self, node, host): + assert host.package("ceph-osd").is_installed + + def test_osds_listen_on_public_network(self, node, host): + # TODO: figure out way to paramaterize this test + nb_port = (node["num_osds"] * node["num_osd_ports"]) + assert host.check_output( + "netstat -lntp | grep ceph-osd | grep %s | wc -l" % (node["address"])) == str(nb_port) # noqa E501 + + def test_osds_listen_on_cluster_network(self, node, host): + # TODO: figure out way to paramaterize this test + nb_port = (node["num_osds"] * node["num_osd_ports"]) + assert host.check_output("netstat -lntp | grep ceph-osd | grep %s | wc -l" % # noqa E501 + (node["cluster_address"])) == str(nb_port) + + def test_osd_services_are_running(self, node, host): + # TODO: figure out way to paramaterize node['osds'] for this test + for osd in node["osds"]: + assert host.service("ceph-osd@%s" % osd).is_running + + def test_osd_are_mounted(self, node, host): + # TODO: figure out way to paramaterize node['osd_ids'] for this test + for osd_id in node["osd_ids"]: + osd_path = "/var/lib/ceph/osd/{cluster}-{osd_id}".format( + cluster=node["cluster_name"], + osd_id=osd_id, + ) + assert host.mount_point(osd_path).exists + + def test_ceph_volume_is_installed(self, node, host): + host.exists('ceph-volume') + + def test_ceph_volume_systemd_is_installed(self, node, host): + host.exists('ceph-volume-systemd') + + def _get_osd_id_from_host(self, node, osd_tree): + children = [] + for n in osd_tree['nodes']: + if n['name'] == node['vars']['inventory_hostname'] and n['type'] == 'host': # noqa E501 + children = n['children'] + return children + + def _get_nb_up_osds_from_ids(self, node, osd_tree): + nb_up = 0 + ids = self._get_osd_id_from_host(node, osd_tree) + for n in osd_tree['nodes']: + if n['id'] in ids and 
n['status'] == 'up': + nb_up += 1 + return nb_up + + def test_all_osds_are_up_and_in(self, node, host): + cmd = "sudo ceph --cluster={cluster} --connect-timeout 5 --keyring /var/lib/ceph/bootstrap-osd/{cluster}.keyring -n client.bootstrap-osd osd tree -f json".format( # noqa E501 + cluster=node["cluster_name"]) + output = json.loads(host.check_output(cmd)) + assert node["num_osds"] == self._get_nb_up_osds_from_ids(node, output) diff --git a/ceph/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py b/ceph/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py new file mode 100644 index 000000000..8eec4a3d4 --- /dev/null +++ b/ceph/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py @@ -0,0 +1,21 @@ +import pytest +from ceph_volume.systemd import systemctl + +class TestSystemctl(object): + + @pytest.mark.parametrize("stdout,expected", [ + (['Id=ceph-osd@1.service', '', 'Id=ceph-osd@2.service'], ['1','2']), + (['Id=ceph-osd1.service',], []), + (['Id=ceph-osd@1'], ['1']), + ([], []), + ]) + def test_get_running_osd_ids(self, stub_call, stdout, expected): + stub_call((stdout, [], 0)) + osd_ids = systemctl.get_running_osd_ids() + assert osd_ids == expected + + def test_returns_empty_list_on_nonzero_return_code(self, stub_call): + stdout = ['Id=ceph-osd@1.service', '', 'Id=ceph-osd@2.service'] + stub_call((stdout, [], 1)) + osd_ids = systemctl.get_running_osd_ids() + assert osd_ids == [] diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py index 8be5f8e4b..00cb5a885 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py @@ -43,6 +43,42 @@ class TestDevice(object): disk = device.Device("/dev/sda") assert disk.is_device is True + def test_device_is_rotational(self, device_info, pvolumes): + data = {"/dev/sda": {"rotational": "1"}} + lsblk = {"TYPE": "device"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.rotational + + def test_device_is_not_rotational(self, device_info, pvolumes): + data = {"/dev/sda": {"rotational": "0"}} + lsblk = {"TYPE": "device"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert not disk.rotational + + def test_device_is_rotational_lsblk(self, device_info, pvolumes): + data = {"/dev/sda": {"foo": "bar"}} + lsblk = {"TYPE": "device", "ROTA": "1"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.rotational + + def test_device_is_not_rotational_lsblk(self, device_info, pvolumes): + data = {"/dev/sda": {"rotational": "0"}} + lsblk = {"TYPE": "device", "ROTA": "0"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert not disk.rotational + + def test_device_is_rotational_defaults_true(self, device_info, pvolumes): + # rotational will default true if no info from sys_api or lsblk is found + data = {"/dev/sda": {"foo": "bar"}} + lsblk = {"TYPE": "device", "foo": "bar"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.rotational + def test_disk_is_device(self, device_info, pvolumes): data = {"/dev/sda": {"foo": "bar"}} lsblk = {"TYPE": "disk"} diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py index e40c982d1..3fae20094 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py +++ 
b/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py @@ -267,28 +267,6 @@ class TestGetDevices(object): assert len(result) == 1 assert result == [ceph_data_path] - def test_sda1_partition(self, tmpfile, tmpdir): - block_path, dev_path, mapper_path = self.setup_paths(tmpdir) - block_sda_path = os.path.join(block_path, 'sda') - block_sda1_path = os.path.join(block_sda_path, 'sda1') - block_sda1_holders = os.path.join(block_sda1_path, 'holders') - dev_sda_path = os.path.join(dev_path, 'sda') - dev_sda1_path = os.path.join(dev_path, 'sda1') - os.makedirs(block_sda_path) - os.makedirs(block_sda1_path) - os.makedirs(dev_sda1_path) - os.makedirs(block_sda1_holders) - os.makedirs(dev_sda_path) - tmpfile('size', '1024', directory=block_sda_path) - tmpfile('partition', '1', directory=block_sda1_path) - result = disk.get_devices( - _sys_block_path=block_path, - _dev_path=dev_path, - _mapper_path=mapper_path) - assert dev_sda_path in list(result.keys()) - assert '/dev/sda1' in list(result.keys()) - assert result['/dev/sda1']['holders'] == [] - def test_sda_size(self, tmpfile, tmpdir): block_path, dev_path, mapper_path = self.setup_paths(tmpdir) block_sda_path = os.path.join(block_path, 'sda') diff --git a/ceph/src/ceph-volume/ceph_volume/util/device.py b/ceph/src/ceph-volume/ceph_volume/util/device.py index 06f90cd37..29a01effa 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/device.py +++ b/ceph/src/ceph-volume/ceph_volume/util/device.py @@ -110,6 +110,14 @@ class Device(object): if not sys_info.devices: sys_info.devices = disk.get_devices() self.sys_api = sys_info.devices.get(self.abspath, {}) + if not self.sys_api: + # if no device was found check if we are a partition + partname = self.abspath.split('/')[-1] + for device, info in sys_info.devices.items(): + part = info['partitions'].get(partname, {}) + if part: + self.sys_api = part + break # start with lvm since it can use an absolute or relative path lv = lvm.get_lv_from_argument(self.path) @@ -257,7 +265,12 @@ class Device(object): @property def rotational(self): - return self.sys_api['rotational'] == '1' + rotational = self.sys_api.get('rotational') + if rotational is None: + # fall back to lsblk if not found in sys_api + # default to '1' if no value is found with lsblk either + rotational = self.disk_api.get('ROTA', '1') + return rotational == '1' @property def model(self): diff --git a/ceph/src/ceph-volume/ceph_volume/util/disk.py b/ceph/src/ceph-volume/ceph_volume/util/disk.py index c85d3be9a..da6411329 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/disk.py +++ b/ceph/src/ceph-volume/ceph_volume/util/disk.py @@ -815,9 +815,5 @@ def get_devices(_sys_block_path='/sys/block', _dev_path='/dev', _mapper_path='/d metadata['path'] = diskname metadata['locked'] = is_locked_raw_device(metadata['path']) - for part_name, part_metadata in metadata['partitions'].items(): - part_abspath = '/dev/%s' % part_name - device_facts[part_abspath] = part_metadata - device_facts[diskname] = metadata return device_facts diff --git a/ceph/src/ceph-volume/tox.ini b/ceph/src/ceph-volume/tox.ini index 514d208fa..fce465def 100644 --- a/ceph/src/ceph-volume/tox.ini +++ b/ceph/src/ceph-volume/tox.ini @@ -4,7 +4,7 @@ envlist = py27, py35, py36, flake8 [testenv] deps= pytest -commands=py.test -v {posargs:ceph_volume/tests} +commands=py.test -v {posargs:ceph_volume/tests} --ignore=ceph_volume/tests/functional [testenv:flake8] deps=flake8 diff --git a/ceph/src/ceph.in b/ceph/src/ceph.in index 7c1eda2c0..bde104763 100755 --- a/ceph/src/ceph.in +++ 
b/ceph/src/ceph.in @@ -21,7 +21,9 @@ Foundation. See file COPYING. from __future__ import print_function import codecs +import grp import os +import pwd import sys import platform @@ -270,7 +272,10 @@ def parse_cmdargs(args=None, target=''): help='input file, or "-" for stdin') parser.add_argument('-o', '--out-file', dest='output_file', help='output file, or "-" for stdout') - + parser.add_argument('--setuser', dest='setuser', + help='set user file permission') + parser.add_argument('--setgroup', dest='setgroup', + help='set group file permission') parser.add_argument('--id', '--user', dest='client_id', help='client id for authentication') parser.add_argument('--name', '-n', dest='client_name', @@ -990,6 +995,20 @@ def main(): except Exception as e: print('Can\'t open output file {0}: {1}'.format(parsed_args.output_file, e), file=sys.stderr) return 1 + if parsed_args.setuser: + try: + ownerid = pwd.getpwnam(parsed_args.setuser).pw_uid + os.fchown(outf.fileno(), ownerid, -1) + except OSError as e: + print('Failed to change user ownership of {0} to {1}: {2}'.format(outf, parsed_args.setuser, e)) + return 1 + if parsed_args.setgroup: + try: + groupid = grp.getgrnam(parsed_args.setgroup).gr_gid + os.fchown(outf.fileno(), -1, groupid) + except OSError as e: + print('Failed to change group ownership of {0} to {1}: {2}'.format(outf, parsed_args.setgroup, e)) + return 1 # -s behaves like a command (ceph status). if parsed_args.status: diff --git a/ceph/src/client/Client.cc b/ceph/src/client/Client.cc index 9f78b24a5..0b240cbf7 100644 --- a/ceph/src/client/Client.cc +++ b/ceph/src/client/Client.cc @@ -909,9 +909,9 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, return in; // as with readdir returning indoes in different snaprealms (no caps!) 
if (in->snapid == CEPH_NOSNAP) { - add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq, - st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags, - request_perms); + add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted, + st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm), + st->cap.flags, request_perms); if (in->auth_cap && in->auth_cap->session == session) { in->max_size = st->max_size; in->rstat = st->rstat; @@ -2087,9 +2087,11 @@ void Client::handle_client_session(MClientSession *m) case CEPH_SESSION_RENEWCAPS: if (session->cap_renew_seq == m->get_seq()) { + bool was_stale = ceph_clock_now() >= session->cap_ttl; session->cap_ttl = session->last_cap_renew_request + mdsmap->get_session_timeout(); - wake_inode_waiters(session); + if (was_stale) + wake_up_session_caps(session, false); } break; @@ -2106,6 +2108,14 @@ void Client::handle_client_session(MClientSession *m) break; case CEPH_SESSION_FLUSHMSG: + /* flush cap release */ + { + auto& m = session->release; + if (m) { + session->con->send_message(std::move(m)); + m = nullptr; + } + } session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq())); break; @@ -2703,8 +2713,7 @@ void Client::handle_mds_map(MMDSMap* m) kick_requests(session); kick_flushing_caps(session); signal_context_list(session->waiting_for_open); - kick_maxsize_requests(session); - wake_inode_waiters(session); + wake_up_session_caps(session, true); } connect_mds_targets(mds); } else if (newstate == MDSMap::STATE_NULL && @@ -3255,10 +3264,8 @@ int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff) return ret; continue; } - if ((mds_wanted & file_wanted) == - (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) { + if (!(file_wanted & ~mds_wanted)) in->flags &= ~I_CAP_DROPPED; - } } if (waitfor_caps) @@ -3424,23 +3431,30 @@ void Client::check_caps(Inode *in, unsigned flags) unsigned used = get_caps_used(in); unsigned cap_used; - if (in->is_dir() && (in->flags & I_COMPLETE)) { - // we do this here because we don't want to drop to Fs (and then - // drop the Fs if we do a create!) if that alone makes us send lookups - // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere - wanted |= CEPH_CAP_FILE_EXCL; - } - int implemented; int issued = in->caps_issued(&implemented); int revoking = implemented & ~issued; int retain = wanted | used | CEPH_CAP_PIN; - if (!unmounting) { - if (wanted) + if (!unmounting && in->nlink > 0) { + if (wanted) { retain |= CEPH_CAP_ANY; - else + } else if (in->is_dir() && + (issued & CEPH_CAP_FILE_SHARED) && + (in->flags & I_COMPLETE)) { + // we do this here because we don't want to drop to Fs (and then + // drop the Fs if we do a create!) if that alone makes us send lookups + // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere + wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= wanted; + } else { retain |= CEPH_CAP_ANY_SHARED; + // keep RD only if we didn't have the file open RW, + // because then the mds would revoke it anyway to + // journal max_size=0. + if (in->max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } } ldout(cct, 10) << "check_caps on " << *in @@ -3520,9 +3534,8 @@ void Client::check_caps(Inode *in, unsigned flags) if (!revoking && unmounting && (cap_used == 0)) goto ack; - if (wanted == cap->wanted && // mds knows what we want. 
- ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like - !in->dirty_caps) // and we have no dirty caps + if ((cap->issued & ~retain) == 0 && // and we don't have anything we wouldn't like + !in->dirty_caps) // and we have no dirty caps continue; if (now < in->hold_caps_until) { @@ -3743,12 +3756,26 @@ void Client::signal_context_list(list& ls) } } -void Client::wake_inode_waiters(MetaSession *s) +void Client::wake_up_session_caps(MetaSession *s, bool reconnect) { xlist::iterator iter = s->caps.begin(); while (!iter.end()){ - signal_cond_list((*iter)->inode->waitfor_caps); + auto cap = *iter; + auto in = cap->inode; ++iter; + if (reconnect) { + in->requested_max_size = 0; + in->wanted_max_size = 0; + } else { + if (cap->gen < s->cap_gen) { + // mds did not re-issue stale cap. + cap->issued = cap->implemented = CEPH_CAP_PIN; + // make sure mds knows what we want. + if (in->caps_file_wanted() & ~cap->wanted) + in->flags |= I_CAP_DROPPED; + } + } + signal_cond_list(in->waitfor_caps); } } @@ -3912,13 +3939,16 @@ void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued) } void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id, - unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm, - int flags, const UserPerm& cap_perms) + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, + inodeno_t realm, int flags, const UserPerm& cap_perms) { Cap *cap = 0; mds_rank_t mds = mds_session->mds_num; - if (in->caps.count(mds)) { - cap = in->caps[mds]; + auto it = in->caps.find(mds); + if (it != in->caps.end()) { + cap = it->second; + if (cap->gen < mds_session->cap_gen) + cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export @@ -3971,15 +4001,17 @@ void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; + if (ceph_seq_cmp(mseq, cap->mseq) > 0) + cap->wanted = wanted; + else + cap->wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; cap->gen = mds_session->cap_gen; cap->latest_perms = cap_perms; ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued) - << " from mds." << mds - << " on " << *in - << dendl; + << " from mds." << mds << " on " << *in << dendl; if ((issued & ~old_caps) && in->auth_cap == cap) { // non-auth MDS is revoking the newly grant caps ? 
@@ -4055,10 +4087,10 @@ void Client::remove_session_caps(MetaSession *s) dirty_caps = in->dirty_caps | in->flushing_caps; in->wanted_max_size = 0; in->requested_max_size = 0; - in->flags |= I_CAP_DROPPED; } + if (cap->wanted | cap->issued) + in->flags |= I_CAP_DROPPED; remove_cap(cap, false); - signal_cond_list(in->waitfor_caps); if (cap_snaps) { InodeRef tmp_ref(in); in->cap_snaps.clear(); @@ -4073,6 +4105,7 @@ void Client::remove_session_caps(MetaSession *s) in->mark_caps_clean(); put_inode(in); } + signal_cond_list(in->waitfor_caps); } s->flushing_caps_tids.clear(); sync_cond.Signal(); @@ -4425,17 +4458,6 @@ void Client::early_kick_flushing_caps(MetaSession *session) } } -void Client::kick_maxsize_requests(MetaSession *session) -{ - xlist::iterator iter = session->caps.begin(); - while (!iter.end()){ - (*iter)->inode->requested_max_size = 0; - (*iter)->inode->wanted_max_size = 0; - signal_cond_list((*iter)->inode->waitfor_caps); - ++iter; - } -} - void SnapRealm::build_snap_context() { set snaps; @@ -4840,8 +4862,8 @@ void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m) update_snap_trace(m->snapbl, &realm); add_update_cap(in, session, m->get_cap_id(), - m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(), - CEPH_CAP_FLAG_AUTH, cap_perms); + m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(), + m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms); if (cap && cap->cap_id == m->peer.cap_id) { remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE)); @@ -4870,10 +4892,9 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m) if (in->caps.count(mds)) cap = in->caps[mds]; - const mds_rank_t peer_mds = mds_rank_t(m->peer.mds); - if (cap && cap->cap_id == m->get_cap_id()) { if (m->peer.cap_id) { + const mds_rank_t peer_mds = mds_rank_t(m->peer.mds); MetaSession *tsession = _get_or_open_mds_session(peer_mds); if (in->caps.count(peer_mds)) { Cap *tcap = in->caps[peer_mds]; @@ -4890,13 +4911,13 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m) adjust_session_flushing_caps(in, session, tsession); } } else { - add_update_cap(in, tsession, m->peer.cap_id, cap->issued, + add_update_cap(in, tsession, m->peer.cap_id, cap->issued, 0, m->peer.seq - 1, m->peer.mseq, (uint64_t)-1, cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0, cap->latest_perms); } } else { - if (cap == in->auth_cap) + if (cap->wanted | cap->issued) in->flags |= I_CAP_DROPPED; } @@ -5106,15 +5127,21 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient int used = get_caps_used(in); int wanted = in->caps_wanted(); - const int old_caps = cap->issued; - const int new_caps = m->get_caps(); + const unsigned new_caps = m->get_caps(); + const bool was_stale = session->cap_gen > cap->gen; ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino() << " mds." << mds << " seq " << m->get_seq() << " caps now " << ccap_string(new_caps) - << " was " << ccap_string(old_caps) << dendl; + << " was " << ccap_string(cap->issued) + << (was_stale ? 
"" : " (stale)") << dendl; + + if (was_stale) + cap->issued = cap->implemented = CEPH_CAP_PIN; cap->seq = m->get_seq(); cap->gen = session->cap_gen; + check_cap_issue(in, cap, new_caps); + // update inode int issued; in->caps_issued(&issued); @@ -5181,13 +5208,21 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient } bool check = false; - if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted) + if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) && + (wanted & ~(cap->wanted | new_caps))) { + // If mds is importing cap, prior cap messages that update 'wanted' + // may get dropped by mds (migrate seq mismatch). + // + // We don't send cap message to update 'wanted' if what we want are + // already issued. If mds revokes caps, cap message that releases caps + // also tells mds what we want. But if caps got revoked by mds forcedly + // (session stale). We may haven't told mds what we want. check = true; + } - check_cap_issue(in, cap, new_caps); // update caps - int revoked = old_caps & ~new_caps; + auto revoked = cap->issued & ~new_caps; if (revoked) { ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl; cap->issued = new_caps; @@ -5209,10 +5244,10 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient cap->wanted = 0; // don't let check_caps skip sending a response to MDS check = true; } - } else if (old_caps == new_caps) { - ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl; + } else if (cap->issued == new_caps) { + ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl; } else { - ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl; + ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl; cap->issued = new_caps; cap->implemented |= new_caps; diff --git a/ceph/src/client/Client.h b/ceph/src/client/Client.h index dd9e70ace..4a304466a 100644 --- a/ceph/src/client/Client.h +++ b/ceph/src/client/Client.h @@ -483,7 +483,7 @@ protected: Mutex client_lock; // helpers - void wake_inode_waiters(MetaSession *s); + void wake_up_session_caps(MetaSession *s, bool reconnect); void wait_on_context_list(list& ls); void signal_context_list(list& ls); @@ -630,8 +630,8 @@ protected: // file caps void check_cap_issue(Inode *in, Cap *cap, unsigned issued); void add_update_cap(Inode *in, MetaSession *session, uint64_t cap_id, - unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm, - int flags, const UserPerm& perms); + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, + inodeno_t realm, int flags, const UserPerm& perms); void remove_cap(Cap *cap, bool queue_release); void remove_all_caps(Inode *in); void remove_session_caps(MetaSession *session); @@ -641,7 +641,6 @@ protected: void flush_caps(Inode *in, MetaSession *session, bool sync=false); void kick_flushing_caps(MetaSession *session); void early_kick_flushing_caps(MetaSession *session); - void kick_maxsize_requests(MetaSession *session); int get_caps(Inode *in, int need, int want, int *have, loff_t endoff); int get_caps_used(Inode *in); diff --git a/ceph/src/common/AsyncReserver.h b/ceph/src/common/AsyncReserver.h index d5c7a852d..6695b7d29 100644 --- a/ceph/src/common/AsyncReserver.h +++ b/ceph/src/common/AsyncReserver.h @@ -143,6 +143,78 @@ public: do_queues(); } + /** + * Update the priority of a reservation + * + * Note, on_reserved may be called following update_priority. Thus, + * the callback must be safe in that case. 
Callback will be called + * with no locks held. cancel_reservation must be called to release the + * reservation slot. + * + * Cases + * 1. Item is queued, re-queue with new priority + * 2. Item is queued, re-queue and preempt if new priority higher than an in progress item + * 3. Item is in progress, just adjust priority if no higher priority waiting + * 4. Item is in progress, adjust priority if higher priority items waiting preempt item + * + */ + void update_priority(T item, unsigned newprio) { + Mutex::Locker l(lock); + auto i = queue_pointers.find(item); + if (i != queue_pointers.end()) { + unsigned prio = i->second.first; + if (newprio == prio) + return; + Reservation r = *i->second.second; + rdout(10) << __func__ << " update " << r << " (was queued)" << dendl; + // Like cancel_reservation() without preempting + queues[prio].erase(i->second.second); + if (queues[prio].empty()) { + queues.erase(prio); + } + queue_pointers.erase(i); + + // Like request_reservation() to re-queue it but with new priority + assert(!queue_pointers.count(item) && + !in_progress.count(item)); + r.prio = newprio; + queues[newprio].push_back(r); + queue_pointers.insert(make_pair(item, + make_pair(newprio,--(queues[newprio]).end()))); + } else { + auto p = in_progress.find(item); + if (p != in_progress.end()) { + if (p->second.prio == newprio) + return; + rdout(10) << __func__ << " update " << p->second + << " (in progress)" << dendl; + // We want to preempt if priority goes down + // and smaller then highest priority waiting + if (p->second.preempt) { + if (newprio < p->second.prio && !queues.empty()) { + // choose highest priority queue + auto it = queues.end(); + --it; + assert(!it->second.empty()); + if (it->first > newprio) { + rdout(10) << __func__ << " update " << p->second + << " lowered priority let do_queues() preempt it" << dendl; + } + } + preempt_by_prio.erase(make_pair(p->second.prio, p->second.item)); + p->second.prio = newprio; + preempt_by_prio.insert(make_pair(p->second.prio, p->second.item)); + } else { + p->second.prio = newprio; + } + } else { + rdout(10) << __func__ << " update " << item << " (not found)" << dendl; + } + } + do_queues(); + return; + } + void dump(Formatter *f) { Mutex::Locker l(lock); _dump(f); diff --git a/ceph/src/common/ceph_crypto.cc b/ceph/src/common/ceph_crypto.cc index a0aa8767e..77454fb60 100644 --- a/ceph/src/common/ceph_crypto.cc +++ b/ceph/src/common/ceph_crypto.cc @@ -14,6 +14,7 @@ #include "common/config.h" #include "ceph_crypto.h" +#include "include/scope_guard.h" #ifdef USE_CRYPTOPP void ceph::crypto::init(CephContext *cct) @@ -44,6 +45,124 @@ static uint32_t crypto_refs = 0; static NSSInitContext *crypto_context = NULL; static pid_t crypto_init_pid = 0; +PK11SymKey *ceph::crypto::PK11_ImportSymKey_FIPS( + PK11SlotInfo * const slot, + const CK_MECHANISM_TYPE type, + const PK11Origin origin, + const CK_ATTRIBUTE_TYPE operation, + SECItem * const raw_key, + void * const wincx) +{ + if (PK11_IsFIPS() == PR_FALSE) { + // This isn't the FIPS mode, and thus PK11_ImportSymKey is available. Let's + // make use of it to avoid overhead related to e.g. creating extra PK11Ctx. + PK11SymKey *ret_key = nullptr; + ret_key = PK11_ImportSymKey(slot, type, origin, operation, raw_key, wincx); + + return ret_key; + } + + ceph_assert_always(wincx == nullptr); + + std::vector wrapped_key; + + // getting 306 on my system which is CKM_DES3_ECB. + const CK_MECHANISM_TYPE wrap_mechanism = PK11_GetBestWrapMechanism(slot); + + // Generate a wrapping key. 
It will be used exactly twice over the scope: + // * to encrypt raw_key giving wrapped_key, + // * to decrypt wrapped_key in the internals of PK11_UnwrapSymKey(). + PK11SymKey * const wrapping_key = PK11_KeyGen( + slot, + wrap_mechanism, + nullptr, + PK11_GetBestKeyLength(slot, wrap_mechanism), + nullptr); + if (wrapping_key == nullptr) { + return nullptr; + } + auto wk_guard = make_scope_guard([wrapping_key] { + PK11_FreeSymKey(wrapping_key); + }); + + // Prepare a PK11 context for the raw_key -> wrapped_key encryption. + SECItem tmp_sec_item; + ::memset(&tmp_sec_item, 0, sizeof(tmp_sec_item)); + PK11Context * const wrap_key_crypt_context = PK11_CreateContextBySymKey( + wrap_mechanism, + CKA_ENCRYPT, + wrapping_key, + &tmp_sec_item); + if (wrap_key_crypt_context == nullptr) { + return nullptr; + } + auto wkcc_guard = make_scope_guard([wrap_key_crypt_context] { + PK11_DestroyContext(wrap_key_crypt_context, PR_TRUE); + }); + + + // Finally wrap the key. Important note is that the wrapping mechanism + // selection (read: just grabbing a cipher) offers, at least in my NSS + // copy, mostly CKM_*_ECB ciphers (with 3DES as the leading one, see + // wrapMechanismList[] in pk11mech.c). There is no CKM_*_*_PAD variant + // which means that plaintext we are providing to PK11_CipherOp() must + // be aligned to cipher's block size. For 3DES it's 64 bits. + { + const auto block_size = PK11_GetBlockSize(wrap_mechanism, nullptr); + SECItem * const raw_key_aligned = PK11_BlockData(raw_key, block_size); + if (raw_key_aligned == nullptr) { + return nullptr; + } + auto rka_guard = make_scope_guard([raw_key_aligned] { + SECITEM_FreeItem(raw_key_aligned, PR_TRUE); + }); + + // PARANOIA: always add space for one extra cipher's block. This seems + // unnecessary at the moment as padding is never used (see the comment + // above) but let's assume it can change in the future. Just in case. + wrapped_key.resize(raw_key_aligned->len + block_size, 0x0); + int out_len = 0; + + int ret = PK11_CipherOp( + wrap_key_crypt_context, + wrapped_key.data(), + &out_len, + wrapped_key.size(), // max space + raw_key_aligned->data, + raw_key_aligned->len); + if (ret != SECSuccess) { + return nullptr; + } + + ret = PK11_Finalize(wrap_key_crypt_context); + if (ret != SECSuccess) { + return nullptr; + } + + ceph_assert(out_len <= static_cast(wrapped_key.size())); + wrapped_key.resize(out_len); + } + + // Key is wrapped now so we can acquire the ultimate PK11SymKey through + // unwrapping it. Of course these two opposite operations form NOP with + // a side effect: FIPS level 1 compatibility. 
+ ::memset(&tmp_sec_item, 0, sizeof(tmp_sec_item)); + + SECItem wrapped_key_item; + ::memset(&wrapped_key_item, 0, sizeof(wrapped_key_item)); + wrapped_key_item.data = wrapped_key.data(); + wrapped_key_item.len = wrapped_key.size(); + + return PK11_UnwrapSymKey( + wrapping_key, + wrap_mechanism, + &tmp_sec_item, + &wrapped_key_item, + type, + operation, + raw_key->len); +} + void ceph::crypto::init(CephContext *cct) { pid_t pid = getpid(); diff --git a/ceph/src/common/ceph_crypto.h b/ceph/src/common/ceph_crypto.h index 9c3023929..c58f1d0b5 100644 --- a/ceph/src/common/ceph_crypto.h +++ b/ceph/src/common/ceph_crypto.h @@ -67,6 +67,20 @@ namespace ceph { // ugly bit of CryptoPP that we have to emulate here :( typedef unsigned char byte; +namespace ceph { + namespace crypto { + // workaround for no PK11_ImportSymKey in FIPS mode + PK11SymKey *PK11_ImportSymKey_FIPS( + PK11SlotInfo *slot, + CK_MECHANISM_TYPE type, + PK11Origin origin, + CK_ATTRIBUTE_TYPE operation, + SECItem *key, + void *wincx); + } // namespace crypto +} // namespace + + namespace ceph { namespace crypto { void assert_init(); @@ -136,8 +150,8 @@ namespace ceph { keyItem.type = siBuffer; keyItem.data = (unsigned char*)key; keyItem.len = length; - symkey = PK11_ImportSymKey(slot, cktype, PK11_OriginUnwrap, - CKA_SIGN, &keyItem, NULL); + symkey = PK11_ImportSymKey_FIPS(slot, cktype, PK11_OriginUnwrap, + CKA_SIGN, &keyItem, NULL); assert(symkey); SECItem param; param.type = siBuffer; diff --git a/ceph/src/common/ceph_timer.h b/ceph/src/common/ceph_timer.h index 4b7438672..8e9330122 100644 --- a/ceph/src/common/ceph_timer.h +++ b/ceph/src/common/ceph_timer.h @@ -138,6 +138,8 @@ namespace ceph { } // Otherwise the event requeued itself } + if (suspended) + break; if (schedule.empty()) cond.wait(l); else diff --git a/ceph/src/common/legacy_config_opts.h b/ceph/src/common/legacy_config_opts.h index 828697758..7dac8782d 100644 --- a/ceph/src/common/legacy_config_opts.h +++ b/ceph/src/common/legacy_config_opts.h @@ -443,7 +443,6 @@ OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist cl OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation? -OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many' OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart @@ -1099,6 +1098,7 @@ OPTION(bluestore_fsck_on_umount_deep, OPT_BOOL) OPTION(bluestore_fsck_on_mkfs, OPT_BOOL) OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL) OPTION(bluestore_sync_submit_transaction, OPT_BOOL) // submit kv txn in queueing thread (not kv_sync_thread) +OPTION(bluestore_fsck_read_bytes_cap, OPT_U64) OPTION(bluestore_throttle_bytes, OPT_U64) OPTION(bluestore_throttle_deferred_bytes, OPT_U64) OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64) diff --git a/ceph/src/common/options.cc b/ceph/src/common/options.cc index 231a7651b..fdbc23312 100644 --- a/ceph/src/common/options.cc +++ b/ceph/src/common/options.cc @@ -444,7 +444,7 @@ std::vector